/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1510 - (show annotations)
Wed Nov 5 15:08:03 2014 UTC (4 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 217980 byte(s)
Fix bug when there are unset groups prior to (*ACCEPT) within a capturing 
group.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h" /* NOTE(review): the three macros above are defined before this include, presumably for use by macros in the header - confirm */
53
54 /* Undefine some potentially clashing cpp symbols: min and max are used as variable names (frame-field macros under NO_RECURSE) inside match() */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996)
95 #define MATCH_PRUNE (-995)
96 #define MATCH_SKIP (-994)
97 #define MATCH_SKIP_ARG (-993)
98 #define MATCH_THEN (-992)
99 #define MATCH_BACKTRACK_MAX MATCH_THEN /* Inclusive upper bound of that range (-992) */
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Inclusive lower bound of that range (-996) */
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30 /* Dimension of the stacksave array in match() and of Xstacksave in the heap frame */
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity. Each table has 11 entries. NOTE(review): the indexing expression is outside this part of the file - confirm before reordering entries */
109
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
112
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of code units in printable format, stopping at the end of
the subject if requested. A unit that is a printable single-byte character is
written as itself; every other unit is written in the form \x{hh}.

Arguments:
  p           points to characters
  length      number to print; nothing is printed if this is not positive
  is_subject  TRUE if printing from within md->start_subject, in which case
                printing is cut short at md->end_subject
  md          pointer to matching data block; must not be NULL if is_subject
                is TRUE. When it is NULL, UTF mode is taken to be off.

Returns:     nothing
*/

static void
pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
{
pcre_uint32 c;

/* Not referenced by name below; presumably it is picked up by the
UCHAR21INCTEST macro, which is defined outside this file. The NULL test makes
the code agree with the documented contract that md is needed only for subject
text. */

BOOL utf = (md != NULL) && md->utf;

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = UCHAR21INCTEST(p);

  /* The <ctype.h> classification functions are defined only for arguments
  that are representable as an unsigned char (or are EOF). A code unit from a
  wide-character build can be larger than that, so screen it before calling
  isprint(). */

  if (c < 256 && isprint((int)c)) printf("%c", (char)c);
    else printf("\\x{%02x}", c);
  }
}
#endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
199 if (utf)
200 {
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions code have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
208 wrong). */
209
210 PCRE_PUCHAR endptr = p + length;
211 while (p < endptr)
212 {
213 pcre_uint32 c, d;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
216 GETCHARINC(c, eptr);
217 GETCHARINC(d, p);
218 ur = GET_UCD(d);
219 if (c != d && c != d + ur->other_case)
220 {
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
222 for (;;)
223 {
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
226 }
227 }
228 }
229 }
230 else
231 #endif
232
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
235 {
236 while (length-- > 0)
237 {
238 pcre_uint32 cc, cp;
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
241 cp = UCHAR21TEST(p);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
243 p++;
244 eptr++;
245 }
246 }
247 }
248
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
251
252 else
253 {
254 while (length-- > 0)
255 {
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
258 }
259 }
260
261 return (int)(eptr - eptr_start);
262 }
263
264
265
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
269
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
275 fine.
276
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
281
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
286 always used to.
287
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
294
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
303
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. When this list is changed, the code at HEAP_RETURN below must be
updated in sync. The first name on each row carries an explicit value purely
as a reading aid; the values are the same as plain sequential numbering. */

enum {
  RM1  = 1,  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61 = 61, RM62, RM63, RM64, RM65, RM66, RM67
};
314
315 /* These versions of the macros use the stack, as normal. There are debugging
316 versions and production versions. Note that the "rw" argument of RMATCH isn't
317 actually used in this definition. In every version the result of the inner call is delivered in the variable rrc, and mstart and rdepth are taken from the enclosing match() scope rather than being macro arguments. */
318
319 #ifndef NO_RECURSE
320 #define REGISTER register /* Qualifier applied to the eptr and ecode arguments of match() */
321
322 #ifdef PCRE_DEBUG
323 #define RMATCH(ra,rb,rc,rd,re,rw) \
324 { \
325 printf("match() called in line %d\n", __LINE__); \
326 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
327 printf("to line %d\n", __LINE__); \
328 }
329 #define RRETURN(ra) \
330 { \
331 printf("match() returned %d from line %d\n", ra, __LINE__); \
332 return ra; \
333 }
334 #else /* Production versions: a plain recursive call and a plain return */
335 #define RMATCH(ra,rb,rc,rd,re,rw) \
336 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
337 #define RRETURN(ra) return ra
338 #endif
339
340 #else /* NO_RECURSE is defined */
341
342
343 /* These versions of the macros manage a private stack on the heap. Note that
344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345 argument of match(), which never changes. */
346
347 #define REGISTER /* Expands to nothing in this build */
348
349 #define RMATCH(ra,rb,rc,rd,re,rw)\
350 {\
351 heapframe *newframe = frame->Xnextframe; /* Use the frame already linked after this one, if there is one */\
352 if (newframe == NULL)\
353 {\
354 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356 newframe->Xnextframe = NULL;\
357 frame->Xnextframe = newframe;\
358 }\
359 frame->Xwhere = rw; /* Record which call site this is, in the calling frame */\
360 newframe->Xeptr = ra;\
361 newframe->Xecode = rb;\
362 newframe->Xmstart = mstart;\
363 newframe->Xoffset_top = rc;\
364 newframe->Xeptrb = re;\
365 newframe->Xrdepth = frame->Xrdepth + 1;\
366 newframe->Xprevframe = frame;\
367 frame = newframe;\
368 DPRINTF(("restarting from line %d\n", __LINE__));\
369 goto HEAP_RECURSE;\
370 L_##rw: /* Per-call-site label; presumably the jump target chosen at HEAP_RETURN, which is outside this part of the file */\
371 DPRINTF(("jumped back to line %d\n", __LINE__));\
372 }
373
374 #define RRETURN(ra)\
375 {\
376 heapframe *oldframe = frame;\
377 frame = oldframe->Xprevframe; /* Step back to the calling frame */\
378 if (frame != NULL)\
379 {\
380 rrc = ra;\
381 goto HEAP_RETURN;\
382 }\
383 return ra; /* No calling frame, so leave the C function itself */\
384 }
385
386
387 /* Structure for remembering the local variables in a private frame. Under NO_RECURSE the names used in match() are macros that expand to these fields of the current frame. */
388
389 typedef struct heapframe {
390 struct heapframe *Xprevframe; /* Calling frame; RRETURN goes back to it, or really returns if it is NULL */
391 struct heapframe *Xnextframe; /* Following frame, obtained on demand by RMATCH; NULL if none yet */
392
393 /* Function arguments that may change */
394
395 PCRE_PUCHAR Xeptr;
396 const pcre_uchar *Xecode;
397 PCRE_PUCHAR Xmstart;
398 int Xoffset_top;
399 eptrblock *Xeptrb;
400 unsigned int Xrdepth; /* RMATCH sets this to the caller's value plus one */
401
402 /* Function local variables */
403
404 PCRE_PUCHAR Xcallpat;
405 #ifdef SUPPORT_UTF
406 PCRE_PUCHAR Xcharptr;
407 #endif
408 PCRE_PUCHAR Xdata; /* Also used under the name save_mark */
409 PCRE_PUCHAR Xnext;
410 PCRE_PUCHAR Xpp;
411 PCRE_PUCHAR Xprev;
412 PCRE_PUCHAR Xsaved_eptr;
413
414 recursion_info Xnew_recursive;
415
416 BOOL Xcur_is_word; /* Also used under the name allow_zero */
417 BOOL Xcondition; /* Also used under the names cbegroup and condassert */
418 BOOL Xprev_is_word; /* Also used under the name matched_once */
419
420 #ifdef SUPPORT_UCP
421 int Xprop_type;
422 unsigned int Xprop_value;
423 int Xprop_fail_result;
424 int Xoclength;
425 pcre_uchar Xocchars[6];
426 #endif
427
428 int Xcodelink; /* Also used under the name code_offset */
429 int Xctype;
430 unsigned int Xfc; /* Without NO_RECURSE, fc is just another name for c */
431 int Xfi; /* Without NO_RECURSE, fi is just another name for i */
432 int Xlength;
433 int Xmax;
434 int Xmin;
435 unsigned int Xnumber; /* Also used under the name foc */
436 int Xoffset;
437 unsigned int Xop;
438 pcre_int32 Xsave_capture_last;
439 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440 int Xstacksave[REC_STACK_SAVE_MAX];
441
442 eptrblock Xnewptrb; /* Chained onto eptrb by match() when md->match_function_type is MATCH_CBEGROUP */
443
444 /* Where to jump back to */
445
446 int Xwhere; /* The "rw" value stored by RMATCH */
447
448 } heapframe;
449
450 #endif /* NO_RECURSE */
451
452
453 /***************************************************************************
454 ***************************************************************************/
455
456
457
458 /*************************************************
459 * Match from current position *
460 *************************************************/
461
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
464 same response. */
465
466 /* These macros pack up tests that are used for partial matching, and which
467 appear several times in the code. We set the "hit end" flag if the pointer is
468 at the end of the subject and also past md->start_used_ptr (i.e.
469 something has been matched). For hard partial matching (md->partial > 1), we then return
470 PCRE_ERROR_PARTIAL immediately via RRETURN. The second one is used when we already know we are past the end of
471 the subject, so it leaves out the end-of-subject test. Both expand to a bare "if" statement with no do/while wrapper, so they must not be placed directly before an "else". */
472
473 #define CHECK_PARTIAL()\
474 if (md->partial != 0 && eptr >= md->end_subject && \
475 eptr > md->start_used_ptr) \
476 { \
477 md->hitend = TRUE; \
478 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
479 }
480
481 #define SCHECK_PARTIAL()\
482 if (md->partial != 0 && eptr > md->start_used_ptr) \
483 { \
484 md->hitend = TRUE; \
485 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
486 }
487
488
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
493
494 Arguments:
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
498 by encountering \K)
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
504
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
510 */
511
512 static int
513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515 unsigned int rdepth)
516 {
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
520
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
525
526 BOOL minimize, possessive; /* Quantifier options */
527 BOOL caseless;
528 int condcode;
529
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
536
537 #ifdef NO_RECURSE
538 heapframe *frame = (heapframe *)md->match_frames_base;
539
540 /* Copy in the original argument variables */
541
542 frame->Xeptr = eptr;
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
548
549 /* This is where control jumps back to in order to effect "recursion" */
550
551 HEAP_RECURSE:
552
553 /* Macros make the argument variables come from the current frame */
554
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
561
562 /* Ditto for the local variables */
563
564 #ifdef SUPPORT_UTF
565 #define charptr frame->Xcharptr
566 #endif
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
574
575 #define new_recursive frame->Xnew_recursive
576
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
580
581 #ifdef SUPPORT_UCP
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
587 #endif
588
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
603
604 #define newptrb frame->Xnewptrb
605
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
609
610 #else /* NO_RECURSE not defined */
611 #define fi i
612 #define fc c
613
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
620 to RMATCH(). */
621
622 #ifdef SUPPORT_UTF
623 const pcre_uchar *charptr;
624 #endif
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
628 PCRE_PUCHAR pp;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
631
632 recursion_info new_recursive;
633
634 BOOL cur_is_word;
635 BOOL condition;
636 BOOL prev_is_word;
637
638 #ifdef SUPPORT_UCP
639 int prop_type;
640 unsigned int prop_value;
641 int prop_fail_result;
642 int oclength;
643 pcre_uchar occhars[6];
644 #endif
645
646 int codelink;
647 int ctype;
648 int length;
649 int max;
650 int min;
651 unsigned int number;
652 int offset;
653 unsigned int op;
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
657
658 eptrblock newptrb;
659
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
664 size. */
665
666 if (ecode == NULL)
667 {
668 if (rdepth == 0)
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670 else
671 {
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
674 }
675 }
676 #endif /* NO_RECURSE */
677
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
682
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
688 #define foc number
689 #define save_mark data
690
691 /* These statements are here to stop the compiler complaining about uninitialized
692 variables. */
693
694 #ifdef SUPPORT_UCP
695 prop_value = 0;
696 prop_fail_result = 0;
697 #endif
698
699
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
703 original patch. */
704
705 TAIL_RECURSE:
706
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
714
715 #ifdef SUPPORT_UTF
716 utf = md->utf; /* Local copy of the flag */
717 #else
718 utf = FALSE;
719 #endif
720
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
723
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
726
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
731
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
738
739 if (md->match_function_type == MATCH_CBEGROUP)
740 {
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
743 eptrb = &newptrb;
744 md->match_function_type = 0;
745 }
746
747 /* Now start processing the opcodes. */
748
749 for (;;)
750 {
751 minimize = possessive = FALSE;
752 op = *ecode;
753
754 switch(op)
755 {
756 case OP_MARK:
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
760 eptrb, RM55);
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
763
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
769 unaltered. */
770
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
773 {
774 md->start_match_ptr = eptr;
775 RRETURN(MATCH_SKIP);
776 }
777 RRETURN(rrc);
778
779 case OP_FAIL:
780 RRETURN(MATCH_NOMATCH);
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
787
788 case OP_PRUNE:
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 eptrb, RM51);
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
793
794 case OP_PRUNE_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
798 eptrb, RM56);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
803
804 case OP_SKIP:
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
806 eptrb, RM53);
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
809 RRETURN(MATCH_SKIP);
810
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
818
819 case OP_SKIP_ARG:
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
822 {
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
824 break;
825 }
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
827 eptrb, RM57);
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
829
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
834
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
837
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
841
842 case OP_THEN:
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
844 eptrb, RM54);
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
847 RRETURN(MATCH_THEN);
848
849 case OP_THEN_ARG:
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
853 md, eptrb, RM58);
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
858 RRETURN(MATCH_THEN);
859
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
867
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
872
873 case OP_ONCE_NC:
874 prev = ecode;
875 saved_eptr = eptr;
876 save_mark = md->mark;
877 do
878 {
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
881 {
882 mstart = md->start_match_ptr;
883 break;
884 }
885 if (rrc == MATCH_THEN)
886 {
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
890 rrc = MATCH_NOMATCH;
891 }
892
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
896 }
897 while (*ecode == OP_ALT);
898
899 /* If hit the end of the group (which could be repeated), fail */
900
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
902
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
905
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
907
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
910
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
914 5.005. */
915
916 if (*ecode == OP_KET || eptr == saved_eptr)
917 {
918 ecode += 1+LINK_SIZE;
919 break;
920 }
921
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
925
926 if (*ecode == OP_KETRMIN)
927 {
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930 ecode = prev;
931 goto TAIL_RECURSE;
932 }
933 else /* OP_KETRMAX */
934 {
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
938 goto TAIL_RECURSE;
939 }
940 /* Control never gets here */
941
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
951
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
955
956 case OP_CBRA:
957 case OP_SCBRA:
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
960
961 #ifdef PCRE_DEBUG
962 printf("start bracket %d\n", number);
963 printf("subject=");
964 pchars(eptr, 16, TRUE, md);
965 printf("\n");
966 #endif
967
968 if (offset < md->offset_max)
969 {
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
975
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
979
980 for (;;)
981 {
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
984 eptrb, RM1);
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
986
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
997
998 if (rrc == MATCH_THEN)
999 {
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1004 }
1005
1006 /* Anything other than NOMATCH is passed back. */
1007
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1013 }
1014
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1019
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1021
1022 RRETURN(rrc);
1023 }
1024
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1027
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1032
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1038
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1045
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1048
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1055
1056 case OP_ONCE:
1057 case OP_BRA:
1058 case OP_SBRA:
1059 DPRINTF(("start non-capturing bracket\n"));
1060
1061 for (;;)
1062 {
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1065
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1068 above. */
1069
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1071 {
1072 ecode += PRIV(OP_lengths)[*ecode];
1073 goto TAIL_RECURSE;
1074 }
1075
1076 /* In all other cases, we have to make another call to match(). */
1077
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1081 RM2);
1082
1083 /* See comment in the code for capturing groups above about handling
1084 THEN. */
1085
1086 if (rrc == MATCH_THEN)
1087 {
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1092 }
1093
1094 if (rrc != MATCH_NOMATCH)
1095 {
1096 if (rrc == MATCH_ONCE)
1097 {
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1100 {
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1103 }
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1105 }
1106 RRETURN(rrc);
1107 }
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1112 }
1113
1114 RRETURN(MATCH_NOMATCH);
1115
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1123
1124 case OP_CBRAPOS:
1125 case OP_SCBRAPOS:
1126 allow_zero = FALSE;
1127
1128 POSSESSIVE_CAPTURE:
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1131
1132 #ifdef PCRE_DEBUG
1133 printf("start possessive bracket %d\n", number);
1134 printf("subject=");
1135 pchars(eptr, 16, TRUE, md);
1136 printf("\n");
1137 #endif
1138
1139 if (offset < md->offset_max)
1140 {
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1143
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1148
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1150
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1159
1160 for (;;)
1161 {
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1166 eptrb, RM63);
1167 if (rrc == MATCH_KETRPOS)
1168 {
1169 offset_top = md->end_offset_top;
1170 ecode = md->start_code + code_offset;
1171 save_capture_last = md->capture_last;
1172 matched_once = TRUE;
1173 mstart = md->start_match_ptr; /* In case \K changed it */
1174 if (eptr == md->end_match_ptr) /* Matched an empty string */
1175 {
1176 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1177 break;
1178 }
1179 eptr = md->end_match_ptr;
1180 continue;
1181 }
1182
1183 /* See comment in the code for capturing groups above about handling
1184 THEN. */
1185
1186 if (rrc == MATCH_THEN)
1187 {
1188 next = ecode + GET(ecode,1);
1189 if (md->start_match_ptr < next &&
1190 (*ecode == OP_ALT || *next == OP_ALT))
1191 rrc = MATCH_NOMATCH;
1192 }
1193
1194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1195 md->capture_last = save_capture_last;
1196 ecode += GET(ecode, 1);
1197 if (*ecode != OP_ALT) break;
1198 }
1199
1200 if (!matched_once)
1201 {
1202 md->offset_vector[offset] = save_offset1;
1203 md->offset_vector[offset+1] = save_offset2;
1204 md->offset_vector[md->offset_end - number] = save_offset3;
1205 }
1206
1207 if (allow_zero || matched_once)
1208 {
1209 ecode += 1 + LINK_SIZE;
1210 break;
1211 }
1212
1213 RRETURN(MATCH_NOMATCH);
1214 }
1215
1216 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1217 as a non-capturing bracket. */
1218
1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221
1222 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1223
1224 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1225 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1226
1227 /* Non-capturing possessive bracket with unlimited repeat. We come here
1228 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1229 without the capturing complication. It is written out separately for speed
1230 and cleanliness. */
1231
1232 case OP_BRAPOS:
1233 case OP_SBRAPOS:
1234 allow_zero = FALSE;
1235
1236 POSSESSIVE_NON_CAPTURE:
1237 matched_once = FALSE;
1238 code_offset = (int)(ecode - md->start_code);
1239 save_capture_last = md->capture_last;
1240
1241 for (;;)
1242 {
1243 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1244 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1245 eptrb, RM48);
1246 if (rrc == MATCH_KETRPOS)
1247 {
1248 offset_top = md->end_offset_top;
1249 ecode = md->start_code + code_offset;
1250 matched_once = TRUE;
1251 mstart = md->start_match_ptr; /* In case \K reset it */
1252 if (eptr == md->end_match_ptr) /* Matched an empty string */
1253 {
1254 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1255 break;
1256 }
1257 eptr = md->end_match_ptr;
1258 continue;
1259 }
1260
1261 /* See comment in the code for capturing groups above about handling
1262 THEN. */
1263
1264 if (rrc == MATCH_THEN)
1265 {
1266 next = ecode + GET(ecode,1);
1267 if (md->start_match_ptr < next &&
1268 (*ecode == OP_ALT || *next == OP_ALT))
1269 rrc = MATCH_NOMATCH;
1270 }
1271
1272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1273 ecode += GET(ecode, 1);
1274 if (*ecode != OP_ALT) break;
1275 md->capture_last = save_capture_last;
1276 }
1277
1278 if (matched_once || allow_zero)
1279 {
1280 ecode += 1 + LINK_SIZE;
1281 break;
1282 }
1283 RRETURN(MATCH_NOMATCH);
1284
1285 /* Control never reaches here. */
1286
1287 /* Conditional group: compilation checked that there are no more than two
1288 branches. If the condition is false, skipping the first branch takes us
1289 past the end of the item if there is only one branch, but that's exactly
1290 what we want. */
1291
1292 case OP_COND:
1293 case OP_SCOND:
1294
1295 /* The variable codelink will be added to ecode when the condition is
1296 false, to get to the second branch. Setting it to the offset to the ALT
1297 or KET, then incrementing ecode achieves this effect. We now have ecode
1298 pointing to the condition or callout. */
1299
1300 codelink = GET(ecode, 1); /* Offset to the second branch */
1301 ecode += 1 + LINK_SIZE; /* From this opcode */
1302
1303 /* Because of the way auto-callout works during compile, a callout item is
1304 inserted between OP_COND and an assertion condition. */
1305
1306 if (*ecode == OP_CALLOUT)
1307 {
1308 if (PUBL(callout) != NULL)
1309 {
1310 PUBL(callout_block) cb;
1311 cb.version = 2; /* Version 2 of the callout block */
1312 cb.callout_number = ecode[1];
1313 cb.offset_vector = md->offset_vector;
1314 #if defined COMPILE_PCRE8
1315 cb.subject = (PCRE_SPTR)md->start_subject;
1316 #elif defined COMPILE_PCRE16
1317 cb.subject = (PCRE_SPTR16)md->start_subject;
1318 #elif defined COMPILE_PCRE32
1319 cb.subject = (PCRE_SPTR32)md->start_subject;
1320 #endif
1321 cb.subject_length = (int)(md->end_subject - md->start_subject);
1322 cb.start_match = (int)(mstart - md->start_subject);
1323 cb.current_position = (int)(eptr - md->start_subject);
1324 cb.pattern_position = GET(ecode, 2);
1325 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1326 cb.capture_top = offset_top/2;
1327 cb.capture_last = md->capture_last & CAPLMASK;
1328 /* Internal change requires this for API compatibility. */
1329 if (cb.capture_last == 0) cb.capture_last = -1;
1330 cb.callout_data = md->callout_data;
1331 cb.mark = md->nomatch_mark;
1332 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1333 if (rrc < 0) RRETURN(rrc);
1334 }
1335
1336 /* Advance ecode past the callout, so it now points to the condition. We
1337 must adjust codelink so that the value of ecode+codelink is unchanged. */
1338
1339 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1340 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1341 }
1342
1343 /* Test the various possible conditions */
1344
1345 condition = FALSE;
1346 switch(condcode = *ecode)
1347 {
1348 case OP_RREF: /* Numbered group recursion test */
1349 if (md->recursive != NULL) /* Not recursing => FALSE */
1350 {
1351 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1352 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1353 }
1354 break;
1355
1356 case OP_DNRREF: /* Duplicate named group recursion test */
1357 if (md->recursive != NULL)
1358 {
1359 int count = GET2(ecode, 1 + IMM2_SIZE);
1360 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1361 while (count-- > 0)
1362 {
1363 unsigned int recno = GET2(slot, 0);
1364 condition = recno == md->recursive->group_num;
1365 if (condition) break;
1366 slot += md->name_entry_size;
1367 }
1368 }
1369 break;
1370
1371 case OP_CREF: /* Numbered group used test */
1372 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 break;
1375
1376 case OP_DNCREF: /* Duplicate named group used test */
1377 {
1378 int count = GET2(ecode, 1 + IMM2_SIZE);
1379 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1380 while (count-- > 0)
1381 {
1382 offset = GET2(slot, 0) << 1;
1383 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1384 if (condition) break;
1385 slot += md->name_entry_size;
1386 }
1387 }
1388 break;
1389
1390 case OP_DEF: /* DEFINE - always false */
1391 break;
1392
1393 /* The condition is an assertion. Call match() to evaluate it - setting
1394 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1395 of an assertion. */
1396
1397 default:
1398 md->match_function_type = MATCH_CONDASSERT;
1399 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1400 if (rrc == MATCH_MATCH)
1401 {
1402 if (md->end_offset_top > offset_top)
1403 offset_top = md->end_offset_top; /* Captures may have happened */
1404 condition = TRUE;
1405
1406 /* Advance ecode past the assertion to the start of the first branch,
1407 but adjust it so that the general choosing code below works. */
1408
1409 ecode += GET(ecode, 1);
1410 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1411 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1412 }
1413
1414 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1415 assertion; it is therefore treated as NOMATCH. Any other return is an
1416 error. */
1417
1418 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1419 {
1420 RRETURN(rrc); /* Need braces because of following else */
1421 }
1422 break;
1423 }
1424
1425 /* Choose branch according to the condition */
1426
1427 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1428
1429 /* We are now at the branch that is to be obeyed. As there is only one, we
1430 can use tail recursion to avoid using another stack frame, except when
1431 there is unlimited repeat of a possibly empty group. In the latter case, a
1432 recursive call to match() is always required, unless the second alternative
1433 doesn't exist, in which case we can just plough on. Note that, for
1434 compatibility with Perl, the | in a conditional group is NOT treated as
1435 creating two alternatives. If a THEN is encountered in the branch, it
1436 propagates out to the enclosing alternative (unless nested in a deeper set
1437 of alternatives, of course). */
1438
1439 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1440 {
1441 if (op != OP_SCOND)
1442 {
1443 goto TAIL_RECURSE;
1444 }
1445
1446 md->match_function_type = MATCH_CBEGROUP;
1447 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1448 RRETURN(rrc);
1449 }
1450
1451 /* Condition false & no alternative; continue after the group. */
1452
1453 else
1454 {
1455 }
1456 break;
1457
1458
1459 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1460 to close any currently open capturing brackets. */
1461
1462 case OP_CLOSE:
1463 number = GET2(ecode, 1); /* Must be less than 65536 */
1464 offset = number << 1;
1465
1466 #ifdef PCRE_DEBUG
1467 printf("end bracket %d at *ACCEPT", number);
1468 printf("\n");
1469 #endif
1470
1471 md->capture_last = (md->capture_last & OVFLMASK) | number;
1472 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1473 {
1474 md->offset_vector[offset] =
1475 md->offset_vector[md->offset_end - number];
1476 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1477
1478 /* If this group is at or above the current highwater mark, ensure that
1479 any groups between the current high water mark and this group are marked
1480 unset and then update the high water mark. */
1481
1482 if (offset >= offset_top)
1483 {
1484 register int *iptr = md->offset_vector + offset_top;
1485 register int *iend = md->offset_vector + offset;
1486 while (iptr < iend) *iptr++ = -1;
1487 offset_top = offset + 2;
1488 }
1489 }
1490 ecode += 1 + IMM2_SIZE;
1491 break;
1492
1493
1494 /* End of the pattern, either real or forced. */
1495
1496 case OP_END:
1497 case OP_ACCEPT:
1498 case OP_ASSERT_ACCEPT:
1499
1500 /* If we have matched an empty string, fail if not in an assertion and not
1501 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1502 is set and we have matched at the start of the subject. In both cases,
1503 backtracking will then try other alternatives, if any. */
1504
1505 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1506 md->recursive == NULL &&
1507 (md->notempty ||
1508 (md->notempty_atstart &&
1509 mstart == md->start_subject + md->start_offset)))
1510 RRETURN(MATCH_NOMATCH);
1511
1512 /* Otherwise, we have a match. */
1513
1514 md->end_match_ptr = eptr; /* Record where we ended */
1515 md->end_offset_top = offset_top; /* and how many extracts were taken */
1516 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1517
1518 /* For some reason, the macros don't work properly if an expression is
1519 given as the argument to RRETURN when the heap is in use. */
1520
1521 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1522 RRETURN(rrc);
1523
1524 /* Assertion brackets. Check the alternative branches in turn - the
1525 matching won't pass the KET for an assertion. If any one branch matches,
1526 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1527 start of each branch to move the current point backwards, so the code at
1528 this level is identical to the lookahead case. When the assertion is part
1529 of a condition, we want to return immediately afterwards. The caller of
1530 this incarnation of the match() function will have set MATCH_CONDASSERT in
1531 md->match_function_type, and one of these opcodes will be the first opcode
1532 that is processed. We use a local variable that is preserved over calls to
1533 match() to remember this case. */
1534
1535 case OP_ASSERT:
1536 case OP_ASSERTBACK:
1537 save_mark = md->mark;
1538 if (md->match_function_type == MATCH_CONDASSERT)
1539 {
1540 condassert = TRUE;
1541 md->match_function_type = 0;
1542 }
1543 else condassert = FALSE;
1544
1545 /* Loop for each branch */
1546
1547 do
1548 {
1549 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1550
1551 /* A match means that the assertion is true; break out of the loop
1552 that matches its alternatives. */
1553
1554 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1555 {
1556 mstart = md->start_match_ptr; /* In case \K reset it */
1557 break;
1558 }
1559
1560 /* If not matched, restore the previous mark setting. */
1561
1562 md->mark = save_mark;
1563
1564 /* See comment in the code for capturing groups above about handling
1565 THEN. */
1566
1567 if (rrc == MATCH_THEN)
1568 {
1569 next = ecode + GET(ecode,1);
1570 if (md->start_match_ptr < next &&
1571 (*ecode == OP_ALT || *next == OP_ALT))
1572 rrc = MATCH_NOMATCH;
1573 }
1574
1575 /* Anything other than NOMATCH causes the entire assertion to fail,
1576 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1577 uncaptured THEN, which means they take their normal effect. This
1578 consistent approach does not always have exactly the same effect as in
1579 Perl. */
1580
1581 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1582 ecode += GET(ecode, 1);
1583 }
1584 while (*ecode == OP_ALT); /* Continue for next alternative */
1585
1586 /* If we have tried all the alternative branches, the assertion has
1587 failed. If not, we broke out after a match. */
1588
1589 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1590
1591 /* If checking an assertion for a condition, return MATCH_MATCH. */
1592
1593 if (condassert) RRETURN(MATCH_MATCH);
1594
1595 /* Continue from after a successful assertion, updating the offsets high
1596 water mark, since extracts may have been taken during the assertion. */
1597
1598 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1599 ecode += 1 + LINK_SIZE;
1600 offset_top = md->end_offset_top;
1601 continue;
1602
1603 /* Negative assertion: all branches must fail to match for the assertion to
1604 succeed. */
1605
1606 case OP_ASSERT_NOT:
1607 case OP_ASSERTBACK_NOT:
1608 save_mark = md->mark;
1609 if (md->match_function_type == MATCH_CONDASSERT)
1610 {
1611 condassert = TRUE;
1612 md->match_function_type = 0;
1613 }
1614 else condassert = FALSE;
1615
1616 /* Loop for each alternative branch. */
1617
1618 do
1619 {
1620 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1621 md->mark = save_mark; /* Always restore the mark setting */
1622
1623 switch(rrc)
1624 {
1625 case MATCH_MATCH: /* A successful match means */
1626 case MATCH_ACCEPT: /* the assertion has failed. */
1627 RRETURN(MATCH_NOMATCH);
1628
1629 case MATCH_NOMATCH: /* Carry on with next branch */
1630 break;
1631
1632 /* See comment in the code for capturing groups above about handling
1633 THEN. */
1634
1635 case MATCH_THEN:
1636 next = ecode + GET(ecode,1);
1637 if (md->start_match_ptr < next &&
1638 (*ecode == OP_ALT || *next == OP_ALT))
1639 {
1640 rrc = MATCH_NOMATCH;
1641 break;
1642 }
1643 /* Otherwise fall through. */
1644
1645 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1646 assertion to fail to match, without considering any more alternatives.
1647 Failing to match means the assertion is true. This is a consistent
1648 approach, but does not always have the same effect as in Perl. */
1649
1650 case MATCH_COMMIT:
1651 case MATCH_SKIP:
1652 case MATCH_SKIP_ARG:
1653 case MATCH_PRUNE:
1654 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1655 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1656
1657 /* Anything else is an error */
1658
1659 default:
1660 RRETURN(rrc);
1661 }
1662
1663 /* Continue with next branch */
1664
1665 ecode += GET(ecode,1);
1666 }
1667 while (*ecode == OP_ALT);
1668
1669 /* All branches in the assertion failed to match. */
1670
1671 NEG_ASSERT_TRUE:
1672 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1673 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1674 continue;
1675
1676 /* Move the subject pointer back. This occurs only at the start of
1677 each branch of a lookbehind assertion. If we are too close to the start to
1678 move back, this match function fails. When working with UTF-8 we move
1679 back a number of characters, not bytes. */
1680
1681 case OP_REVERSE:
1682 #ifdef SUPPORT_UTF
1683 if (utf)
1684 {
1685 i = GET(ecode, 1);
1686 while (i-- > 0)
1687 {
1688 eptr--;
1689 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1690 BACKCHAR(eptr);
1691 }
1692 }
1693 else
1694 #endif
1695
1696 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1697
1698 {
1699 eptr -= GET(ecode, 1);
1700 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1701 }
1702
1703 /* Save the earliest consulted character, then skip to next op code */
1704
1705 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1706 ecode += 1 + LINK_SIZE;
1707 break;
1708
1709 /* The callout item calls an external function, if one is provided, passing
1710 details of the match so far. This is mainly for debugging, though the
1711 function is able to force a failure. */
1712
1713 case OP_CALLOUT:
1714 if (PUBL(callout) != NULL)
1715 {
1716 PUBL(callout_block) cb;
1717 cb.version = 2; /* Version 2 of the callout block */
1718 cb.callout_number = ecode[1];
1719 cb.offset_vector = md->offset_vector;
1720 #if defined COMPILE_PCRE8
1721 cb.subject = (PCRE_SPTR)md->start_subject;
1722 #elif defined COMPILE_PCRE16
1723 cb.subject = (PCRE_SPTR16)md->start_subject;
1724 #elif defined COMPILE_PCRE32
1725 cb.subject = (PCRE_SPTR32)md->start_subject;
1726 #endif
1727 cb.subject_length = (int)(md->end_subject - md->start_subject);
1728 cb.start_match = (int)(mstart - md->start_subject);
1729 cb.current_position = (int)(eptr - md->start_subject);
1730 cb.pattern_position = GET(ecode, 2);
1731 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1732 cb.capture_top = offset_top/2;
1733 cb.capture_last = md->capture_last & CAPLMASK;
1734 /* Internal change requires this for API compatibility. */
1735 if (cb.capture_last == 0) cb.capture_last = -1;
1736 cb.callout_data = md->callout_data;
1737 cb.mark = md->nomatch_mark;
1738 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1739 if (rrc < 0) RRETURN(rrc);
1740 }
1741 ecode += 2 + 2*LINK_SIZE;
1742 break;
1743
1744 /* Recursion either matches the current regex, or some subexpression. The
1745 offset data is the offset to the starting bracket from the start of the
1746 whole pattern. (This is so that it works from duplicated subpatterns.)
1747
1748 The state of the capturing groups is preserved over recursion, and
1749 re-instated afterwards. We don't know how many are started and not yet
1750 finished (offset_top records the completed total) so we just have to save
1751 all the potential data. There may be up to 65535 such values, which is too
1752 large to put on the stack, but using malloc for small numbers seems
1753 expensive. As a compromise, the stack is used when there are no more than
1754 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1755
1756 There are also other values that have to be saved. We use a chained
1757 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1758 for the original version of this logic. It has, however, been hacked around
1759 a lot, so he is not to blame for the current way it works. */
1760
1761 case OP_RECURSE:
1762 {
1763 recursion_info *ri;
1764 unsigned int recno;
1765
1766 callpat = md->start_code + GET(ecode, 1);
1767 recno = (callpat == md->start_code)? 0 :
1768 GET2(callpat, 1 + LINK_SIZE);
1769
1770 /* Check for repeating a recursion without advancing the subject pointer.
1771 This should catch convoluted mutual recursions. (Some simple cases are
1772 caught at compile time.) */
1773
1774 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1775 if (recno == ri->group_num && eptr == ri->subject_position)
1776 RRETURN(PCRE_ERROR_RECURSELOOP);
1777
1778 /* Add to "recursing stack" */
1779
1780 new_recursive.group_num = recno;
1781 new_recursive.saved_capture_last = md->capture_last;
1782 new_recursive.subject_position = eptr;
1783 new_recursive.prevrec = md->recursive;
1784 md->recursive = &new_recursive;
1785
1786 /* Where to continue from afterwards */
1787
1788 ecode += 1 + LINK_SIZE;
1789
1790 /* Now save the offset data */
1791
1792 new_recursive.saved_max = md->offset_end;
1793 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1794 new_recursive.offset_save = stacksave;
1795 else
1796 {
1797 new_recursive.offset_save =
1798 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1799 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1800 }
1801 memcpy(new_recursive.offset_save, md->offset_vector,
1802 new_recursive.saved_max * sizeof(int));
1803
1804 /* OK, now we can do the recursion. After processing each alternative,
1805 restore the offset data and the last captured value. If there were nested
1806 recursions, md->recursive might be changed, so reset it before looping.
1807 */
1808
1809 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1810 cbegroup = (*callpat >= OP_SBRA);
1811 do
1812 {
1813 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1814 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1815 md, eptrb, RM6);
1816 memcpy(md->offset_vector, new_recursive.offset_save,
1817 new_recursive.saved_max * sizeof(int));
1818 md->capture_last = new_recursive.saved_capture_last;
1819 md->recursive = new_recursive.prevrec;
1820 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1821 {
1822 DPRINTF(("Recursion matched\n"));
1823 if (new_recursive.offset_save != stacksave)
1824 (PUBL(free))(new_recursive.offset_save);
1825
1826 /* Set where we got to in the subject, and reset the start in case
1827 it was changed by \K. This *is* propagated back out of a recursion,
1828 for Perl compatibility. */
1829
1830 eptr = md->end_match_ptr;
1831 mstart = md->start_match_ptr;
1832 goto RECURSION_MATCHED; /* Exit loop; end processing */
1833 }
1834
1835 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1836 recursion; they cause a NOMATCH for the entire recursion. These codes
1837 are defined in a range that can be tested for. */
1838
1839 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1840 RRETURN(MATCH_NOMATCH);
1841
1842 /* Any return code other than NOMATCH is an error. */
1843
1844 if (rrc != MATCH_NOMATCH)
1845 {
1846 DPRINTF(("Recursion gave error %d\n", rrc));
1847 if (new_recursive.offset_save != stacksave)
1848 (PUBL(free))(new_recursive.offset_save);
1849 RRETURN(rrc);
1850 }
1851
1852 md->recursive = &new_recursive;
1853 callpat += GET(callpat, 1);
1854 }
1855 while (*callpat == OP_ALT);
1856
1857 DPRINTF(("Recursion didn't match\n"));
1858 md->recursive = new_recursive.prevrec;
1859 if (new_recursive.offset_save != stacksave)
1860 (PUBL(free))(new_recursive.offset_save);
1861 RRETURN(MATCH_NOMATCH);
1862 }
1863
1864 RECURSION_MATCHED:
1865 break;
1866
1867 /* An alternation is the end of a branch; scan along to find the end of the
1868 bracketed group and go to there. */
1869
1870 case OP_ALT:
1871 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1872 break;
1873
1874 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1875 indicating that it may occur zero times. It may repeat infinitely, or not
1876 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1877 with fixed upper repeat limits are compiled as a number of copies, with the
1878 optional ones preceded by BRAZERO or BRAMINZERO. */
1879
1880 case OP_BRAZERO:
1881 next = ecode + 1;
1882 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1884 do next += GET(next, 1); while (*next == OP_ALT);
1885 ecode = next + 1 + LINK_SIZE;
1886 break;
1887
1888 case OP_BRAMINZERO:
1889 next = ecode + 1;
1890 do next += GET(next, 1); while (*next == OP_ALT);
1891 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1893 ecode++;
1894 break;
1895
1896 case OP_SKIPZERO:
1897 next = ecode+1;
1898 do next += GET(next,1); while (*next == OP_ALT);
1899 ecode = next + 1 + LINK_SIZE;
1900 break;
1901
1902 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1903 here; just jump to the group, with allow_zero set TRUE. */
1904
1905 case OP_BRAPOSZERO:
1906 op = *(++ecode);
1907 allow_zero = TRUE;
1908 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1909 goto POSSESSIVE_NON_CAPTURE;
1910
1911 /* End of a group, repeated or non-repeating. */
1912
1913 case OP_KET:
1914 case OP_KETRMIN:
1915 case OP_KETRMAX:
1916 case OP_KETRPOS:
1917 prev = ecode - GET(ecode, 1);
1918
1919 /* If this was a group that remembered the subject start, in order to break
1920 infinite repeats of empty string matches, retrieve the subject start from
1921 the chain. Otherwise, set it NULL. */
1922
1923 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1924 {
1925 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1926 eptrb = eptrb->epb_prev; /* Backup to previous group */
1927 }
1928 else saved_eptr = NULL;
1929
1930 /* If we are at the end of an assertion group or a non-capturing atomic
1931 group, stop matching and return MATCH_MATCH, but record the current high
1932 water mark for use by positive assertions. We also need to record the match
1933 start in case it was changed by \K. */
1934
1935 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1936 *prev == OP_ONCE_NC)
1937 {
1938 md->end_match_ptr = eptr; /* For ONCE_NC */
1939 md->end_offset_top = offset_top;
1940 md->start_match_ptr = mstart;
1941 RRETURN(MATCH_MATCH); /* Sets md->mark */
1942 }
1943
1944 /* For capturing groups we have to check the group number back at the start
1945 and if necessary complete handling an extraction by setting the offsets and
1946 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1947 into group 0, so it won't be picked up here. Instead, we catch it when the
1948 OP_END is reached. Other recursion is handled here. We just have to record
1949 the current subject position and start match pointer and give a MATCH
1950 return. */
1951
1952 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1953 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1954 {
1955 number = GET2(prev, 1+LINK_SIZE);
1956 offset = number << 1;
1957
1958 #ifdef PCRE_DEBUG
1959 printf("end bracket %d", number);
1960 printf("\n");
1961 #endif
1962
1963 /* Handle a recursively called group. */
1964
1965 if (md->recursive != NULL && md->recursive->group_num == number)
1966 {
1967 md->end_match_ptr = eptr;
1968 md->start_match_ptr = mstart;
1969 RRETURN(MATCH_MATCH);
1970 }
1971
1972 /* Deal with capturing */
1973
1974 md->capture_last = (md->capture_last & OVFLMASK) | number;
1975 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1976 {
1977 /* If offset is greater than offset_top, it means that we are
1978 "skipping" a capturing group, and that group's offsets must be marked
1979 unset. In earlier versions of PCRE, all the offsets were unset at the
1980 start of matching, but this doesn't work because atomic groups and
1981 assertions can cause a value to be set that should later be unset.
1982 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1983 part of the atomic group, but this is not on the final matching path,
1984 so must be unset when 2 is set. (If there is no group 2, there is no
1985 problem, because offset_top will then be 2, indicating no capture.) */
1986
1987 if (offset > offset_top)
1988 {
1989 register int *iptr = md->offset_vector + offset_top;
1990 register int *iend = md->offset_vector + offset;
1991 while (iptr < iend) *iptr++ = -1;
1992 }
1993
1994 /* Now make the extraction */
1995
1996 md->offset_vector[offset] =
1997 md->offset_vector[md->offset_end - number];
1998 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1999 if (offset_top <= offset) offset_top = offset + 2;
2000 }
2001 }
2002
2003 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2004 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2005 at a time from the outer level, thus saving stack. This must precede the
2006 empty string test - in this case that test is done at the outer level. */
2007
2008 if (*ecode == OP_KETRPOS)
2009 {
2010 md->start_match_ptr = mstart; /* In case \K reset it */
2011 md->end_match_ptr = eptr;
2012 md->end_offset_top = offset_top;
2013 RRETURN(MATCH_KETRPOS);
2014 }
2015
2016 /* For an ordinary non-repeating ket, just continue at this level. This
2017 also happens for a repeating ket if no characters were matched in the
2018 group. This is the forcible breaking of infinite loops as implemented in
2019 Perl 5.005. For a non-repeating atomic group that includes captures,
2020 establish a backup point by processing the rest of the pattern at a lower
2021 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2022 original OP_ONCE level, thereby bypassing intermediate backup points, but
2023 resetting any captures that happened along the way. */
2024
2025 if (*ecode == OP_KET || eptr == saved_eptr)
2026 {
2027 if (*prev == OP_ONCE)
2028 {
2029 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2030 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2031 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2032 RRETURN(MATCH_ONCE);
2033 }
2034 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2035 break;
2036 }
2037
2038 /* The normal repeating kets try the rest of the pattern or restart from
2039 the preceding bracket, in the appropriate order. In the second case, we can
2040 use tail recursion to avoid using another stack frame, unless we have an
2041 atomic group or an unlimited repeat of a group that can match an empty
2042 string. */
2043
2044 if (*ecode == OP_KETRMIN)
2045 {
2046 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 if (*prev == OP_ONCE)
2049 {
2050 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2052 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2053 RRETURN(MATCH_ONCE);
2054 }
2055 if (*prev >= OP_SBRA) /* Could match an empty string */
2056 {
2057 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2058 RRETURN(rrc);
2059 }
2060 ecode = prev;
2061 goto TAIL_RECURSE;
2062 }
2063 else /* OP_KETRMAX */
2064 {
2065 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2066 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2067 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2068 if (*prev == OP_ONCE)
2069 {
2070 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2072 md->once_target = prev;
2073 RRETURN(MATCH_ONCE);
2074 }
2075 ecode += 1 + LINK_SIZE;
2076 goto TAIL_RECURSE;
2077 }
2078 /* Control never gets here */
2079
2080 /* Not multiline mode: start of subject assertion, unless notbol. */
2081
2082 case OP_CIRC:
2083 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2084
2085 /* Start of subject assertion */
2086
2087 case OP_SOD:
2088 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2089 ecode++;
2090 break;
2091
2092 /* Multiline mode: start of subject unless notbol, or after any newline. */
2093
2094 case OP_CIRCM:
2095 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2096 if (eptr != md->start_subject &&
2097 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2098 RRETURN(MATCH_NOMATCH);
2099 ecode++;
2100 break;
2101
2102 /* Start of match assertion */
2103
2104 case OP_SOM:
2105 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2106 ecode++;
2107 break;
2108
2109 /* Reset the start of match point */
2110
2111 case OP_SET_SOM:
2112 mstart = eptr;
2113 ecode++;
2114 break;
2115
2116 /* Multiline mode: assert before any newline, or before end of subject
2117 unless noteol is set. */
2118
2119 case OP_DOLLM:
2120 if (eptr < md->end_subject)
2121 {
2122 if (!IS_NEWLINE(eptr))
2123 {
2124 if (md->partial != 0 &&
2125 eptr + 1 >= md->end_subject &&
2126 NLBLOCK->nltype == NLTYPE_FIXED &&
2127 NLBLOCK->nllen == 2 &&
2128 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2129 {
2130 md->hitend = TRUE;
2131 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2132 }
2133 RRETURN(MATCH_NOMATCH);
2134 }
2135 }
2136 else
2137 {
2138 if (md->noteol) RRETURN(MATCH_NOMATCH);
2139 SCHECK_PARTIAL();
2140 }
2141 ecode++;
2142 break;
2143
2144 /* Not multiline mode: assert before a terminating newline or before end of
2145 subject unless noteol is set. */
2146
2147 case OP_DOLL:
2148 if (md->noteol) RRETURN(MATCH_NOMATCH);
2149 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2150
2151 /* ... else fall through for endonly */
2152
2153 /* End of subject assertion (\z) */
2154
2155 case OP_EOD:
2156 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2157 SCHECK_PARTIAL();
2158 ecode++;
2159 break;
2160
2161 /* End of subject or ending \n assertion (\Z) */
2162
2163 case OP_EODN:
2164 ASSERT_NL_OR_EOS:
2165 if (eptr < md->end_subject &&
2166 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2167 {
2168 if (md->partial != 0 &&
2169 eptr + 1 >= md->end_subject &&
2170 NLBLOCK->nltype == NLTYPE_FIXED &&
2171 NLBLOCK->nllen == 2 &&
2172 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2173 {
2174 md->hitend = TRUE;
2175 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2176 }
2177 RRETURN(MATCH_NOMATCH);
2178 }
2179
2180 /* Either at end of string or \n before end. */
2181
2182 SCHECK_PARTIAL();
2183 ecode++;
2184 break;
2185
2186 /* Word boundary assertions */
2187
2188 case OP_NOT_WORD_BOUNDARY:
2189 case OP_WORD_BOUNDARY:
2190 {
2191
2192 /* Find out if the previous and current characters are "word" characters.
2193 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2194 be "non-word" characters. Remember the earliest consulted character for
2195 partial matching. */
2196
2197 #ifdef SUPPORT_UTF
2198 if (utf)
2199 {
2200 /* Get status of previous character */
2201
2202 if (eptr == md->start_subject) prev_is_word = FALSE; else
2203 {
2204 PCRE_PUCHAR lastptr = eptr - 1;
2205 BACKCHAR(lastptr);
2206 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2207 GETCHAR(c, lastptr);
2208 #ifdef SUPPORT_UCP
2209 if (md->use_ucp)
2210 {
2211 if (c == '_') prev_is_word = TRUE; else
2212 {
2213 int cat = UCD_CATEGORY(c);
2214 prev_is_word = (cat == ucp_L || cat == ucp_N);
2215 }
2216 }
2217 else
2218 #endif
2219 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2220 }
2221
2222 /* Get status of next character */
2223
2224 if (eptr >= md->end_subject)
2225 {
2226 SCHECK_PARTIAL();
2227 cur_is_word = FALSE;
2228 }
2229 else
2230 {
2231 GETCHAR(c, eptr);
2232 #ifdef SUPPORT_UCP
2233 if (md->use_ucp)
2234 {
2235 if (c == '_') cur_is_word = TRUE; else
2236 {
2237 int cat = UCD_CATEGORY(c);
2238 cur_is_word = (cat == ucp_L || cat == ucp_N);
2239 }
2240 }
2241 else
2242 #endif
2243 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2244 }
2245 }
2246 else
2247 #endif
2248
2249 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2250 consistency with the behaviour of \w we do use it in this case. */
2251
2252 {
2253 /* Get status of previous character */
2254
2255 if (eptr == md->start_subject) prev_is_word = FALSE; else
2256 {
2257 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2258 #ifdef SUPPORT_UCP
2259 if (md->use_ucp)
2260 {
2261 c = eptr[-1];
2262 if (c == '_') prev_is_word = TRUE; else
2263 {
2264 int cat = UCD_CATEGORY(c);
2265 prev_is_word = (cat == ucp_L || cat == ucp_N);
2266 }
2267 }
2268 else
2269 #endif
2270 prev_is_word = MAX_255(eptr[-1])
2271 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2272 }
2273
2274 /* Get status of next character */
2275
2276 if (eptr >= md->end_subject)
2277 {
2278 SCHECK_PARTIAL();
2279 cur_is_word = FALSE;
2280 }
2281 else
2282 #ifdef SUPPORT_UCP
2283 if (md->use_ucp)
2284 {
2285 c = *eptr;
2286 if (c == '_') cur_is_word = TRUE; else
2287 {
2288 int cat = UCD_CATEGORY(c);
2289 cur_is_word = (cat == ucp_L || cat == ucp_N);
2290 }
2291 }
2292 else
2293 #endif
2294 cur_is_word = MAX_255(*eptr)
2295 && ((md->ctypes[*eptr] & ctype_word) != 0);
2296 }
2297
2298 /* Now see if the situation is what we want */
2299
2300 if ((*ecode++ == OP_WORD_BOUNDARY)?
2301 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2302 RRETURN(MATCH_NOMATCH);
2303 }
2304 break;
2305
2306 /* Match any single character type except newline; have to take care with
2307 CRLF newlines and partial matching. */
2308
2309 case OP_ANY:
2310 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2311 if (md->partial != 0 &&
2312 eptr + 1 >= md->end_subject &&
2313 NLBLOCK->nltype == NLTYPE_FIXED &&
2314 NLBLOCK->nllen == 2 &&
2315 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2316 {
2317 md->hitend = TRUE;
2318 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2319 }
2320
2321 /* Fall through */
2322
2323 /* Match any single character whatsoever. */
2324
2325 case OP_ALLANY:
2326 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2327 { /* not be updated before SCHECK_PARTIAL. */
2328 SCHECK_PARTIAL();
2329 RRETURN(MATCH_NOMATCH);
2330 }
2331 eptr++;
2332 #ifdef SUPPORT_UTF
2333 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2334 #endif
2335 ecode++;
2336 break;
2337
2338 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2339 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2340
2341 case OP_ANYBYTE:
2342 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2343 { /* not be updated before SCHECK_PARTIAL. */
2344 SCHECK_PARTIAL();
2345 RRETURN(MATCH_NOMATCH);
2346 }
2347 eptr++;
2348 ecode++;
2349 break;
2350
2351 case OP_NOT_DIGIT:
2352 if (eptr >= md->end_subject)
2353 {
2354 SCHECK_PARTIAL();
2355 RRETURN(MATCH_NOMATCH);
2356 }
2357 GETCHARINCTEST(c, eptr);
2358 if (
2359 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2360 c < 256 &&
2361 #endif
2362 (md->ctypes[c] & ctype_digit) != 0
2363 )
2364 RRETURN(MATCH_NOMATCH);
2365 ecode++;
2366 break;
2367
2368 case OP_DIGIT:
2369 if (eptr >= md->end_subject)
2370 {
2371 SCHECK_PARTIAL();
2372 RRETURN(MATCH_NOMATCH);
2373 }
2374 GETCHARINCTEST(c, eptr);
2375 if (
2376 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2377 c > 255 ||
2378 #endif
2379 (md->ctypes[c] & ctype_digit) == 0
2380 )
2381 RRETURN(MATCH_NOMATCH);
2382 ecode++;
2383 break;
2384
2385 case OP_NOT_WHITESPACE:
2386 if (eptr >= md->end_subject)
2387 {
2388 SCHECK_PARTIAL();
2389 RRETURN(MATCH_NOMATCH);
2390 }
2391 GETCHARINCTEST(c, eptr);
2392 if (
2393 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2394 c < 256 &&
2395 #endif
2396 (md->ctypes[c] & ctype_space) != 0
2397 )
2398 RRETURN(MATCH_NOMATCH);
2399 ecode++;
2400 break;
2401
2402 case OP_WHITESPACE:
2403 if (eptr >= md->end_subject)
2404 {
2405 SCHECK_PARTIAL();
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 GETCHARINCTEST(c, eptr);
2409 if (
2410 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2411 c > 255 ||
2412 #endif
2413 (md->ctypes[c] & ctype_space) == 0
2414 )
2415 RRETURN(MATCH_NOMATCH);
2416 ecode++;
2417 break;
2418
2419 case OP_NOT_WORDCHAR:
2420 if (eptr >= md->end_subject)
2421 {
2422 SCHECK_PARTIAL();
2423 RRETURN(MATCH_NOMATCH);
2424 }
2425 GETCHARINCTEST(c, eptr);
2426 if (
2427 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2428 c < 256 &&
2429 #endif
2430 (md->ctypes[c] & ctype_word) != 0
2431 )
2432 RRETURN(MATCH_NOMATCH);
2433 ecode++;
2434 break;
2435
2436 case OP_WORDCHAR:
2437 if (eptr >= md->end_subject)
2438 {
2439 SCHECK_PARTIAL();
2440 RRETURN(MATCH_NOMATCH);
2441 }
2442 GETCHARINCTEST(c, eptr);
2443 if (
2444 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2445 c > 255 ||
2446 #endif
2447 (md->ctypes[c] & ctype_word) == 0
2448 )
2449 RRETURN(MATCH_NOMATCH);
2450 ecode++;
2451 break;
2452
2453 case OP_ANYNL:
2454 if (eptr >= md->end_subject)
2455 {
2456 SCHECK_PARTIAL();
2457 RRETURN(MATCH_NOMATCH);
2458 }
2459 GETCHARINCTEST(c, eptr);
2460 switch(c)
2461 {
2462 default: RRETURN(MATCH_NOMATCH);
2463
2464 case CHAR_CR:
2465 if (eptr >= md->end_subject)
2466 {
2467 SCHECK_PARTIAL();
2468 }
2469 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2470 break;
2471
2472 case CHAR_LF:
2473 break;
2474
2475 case CHAR_VT:
2476 case CHAR_FF:
2477 case CHAR_NEL:
2478 #ifndef EBCDIC
2479 case 0x2028:
2480 case 0x2029:
2481 #endif /* Not EBCDIC */
2482 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2483 break;
2484 }
2485 ecode++;
2486 break;
2487
2488 case OP_NOT_HSPACE:
2489 if (eptr >= md->end_subject)
2490 {
2491 SCHECK_PARTIAL();
2492 RRETURN(MATCH_NOMATCH);
2493 }
2494 GETCHARINCTEST(c, eptr);
2495 switch(c)
2496 {
2497 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2498 default: break;
2499 }
2500 ecode++;
2501 break;
2502
2503 case OP_HSPACE:
2504 if (eptr >= md->end_subject)
2505 {
2506 SCHECK_PARTIAL();
2507 RRETURN(MATCH_NOMATCH);
2508 }
2509 GETCHARINCTEST(c, eptr);
2510 switch(c)
2511 {
2512 HSPACE_CASES: break; /* Byte and multibyte cases */
2513 default: RRETURN(MATCH_NOMATCH);
2514 }
2515 ecode++;
2516 break;
2517
2518 case OP_NOT_VSPACE:
2519 if (eptr >= md->end_subject)
2520 {
2521 SCHECK_PARTIAL();
2522 RRETURN(MATCH_NOMATCH);
2523 }
2524 GETCHARINCTEST(c, eptr);
2525 switch(c)
2526 {
2527 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2528 default: break;
2529 }
2530 ecode++;
2531 break;
2532
2533 case OP_VSPACE:
2534 if (eptr >= md->end_subject)
2535 {
2536 SCHECK_PARTIAL();
2537 RRETURN(MATCH_NOMATCH);
2538 }
2539 GETCHARINCTEST(c, eptr);
2540 switch(c)
2541 {
2542 VSPACE_CASES: break;
2543 default: RRETURN(MATCH_NOMATCH);
2544 }
2545 ecode++;
2546 break;
2547
2548 #ifdef SUPPORT_UCP
2549 /* Check the next character by Unicode property. We will get here only
2550 if the support is in the binary; otherwise a compile-time error occurs. */
2551
2552 case OP_PROP:
2553 case OP_NOTPROP:
2554 if (eptr >= md->end_subject)
2555 {
2556 SCHECK_PARTIAL();
2557 RRETURN(MATCH_NOMATCH);
2558 }
2559 GETCHARINCTEST(c, eptr);
2560 {
2561 const pcre_uint32 *cp;
2562 const ucd_record *prop = GET_UCD(c);
2563
2564 switch(ecode[1])
2565 {
2566 case PT_ANY:
2567 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2568 break;
2569
2570 case PT_LAMP:
2571 if ((prop->chartype == ucp_Lu ||
2572 prop->chartype == ucp_Ll ||
2573 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2574 RRETURN(MATCH_NOMATCH);
2575 break;
2576
2577 case PT_GC:
2578 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2579 RRETURN(MATCH_NOMATCH);
2580 break;
2581
2582 case PT_PC:
2583 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2586
2587 case PT_SC:
2588 if ((ecode[2] != prop->script) == (op == OP_PROP))
2589 RRETURN(MATCH_NOMATCH);
2590 break;
2591
2592 /* These are specials */
2593
2594 case PT_ALNUM:
2595 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2596 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2597 RRETURN(MATCH_NOMATCH);
2598 break;
2599
2600 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2601 which means that Perl space and POSIX space are now identical. PCRE
2602 was changed at release 8.34. */
2603
2604 case PT_SPACE: /* Perl space */
2605 case PT_PXSPACE: /* POSIX space */
2606 switch(c)
2607 {
2608 HSPACE_CASES:
2609 VSPACE_CASES:
2610 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2611 break;
2612
2613 default:
2614 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2615 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2616 break;
2617 }
2618 break;
2619
2620 case PT_WORD:
2621 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2622 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2623 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2624 RRETURN(MATCH_NOMATCH);
2625 break;
2626
2627 case PT_CLIST:
2628 cp = PRIV(ucd_caseless_sets) + ecode[2];
2629 for (;;)
2630 {
2631 if (c < *cp)
2632 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2633 if (c == *cp++)
2634 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2635 }
2636 break;
2637
2638 case PT_UCNC:
2639 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2640 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2641 c >= 0xe000) == (op == OP_NOTPROP))
2642 RRETURN(MATCH_NOMATCH);
2643 break;
2644
2645 /* This should never occur */
2646
2647 default:
2648 RRETURN(PCRE_ERROR_INTERNAL);
2649 }
2650
2651 ecode += 3;
2652 }
2653 break;
2654
2655 /* Match an extended Unicode sequence. We will get here only if the support
2656 is in the binary; otherwise a compile-time error occurs. */
2657
2658 case OP_EXTUNI:
2659 if (eptr >= md->end_subject)
2660 {
2661 SCHECK_PARTIAL();
2662 RRETURN(MATCH_NOMATCH);
2663 }
2664 else
2665 {
2666 int lgb, rgb;
2667 GETCHARINCTEST(c, eptr);
2668 lgb = UCD_GRAPHBREAK(c);
2669 while (eptr < md->end_subject)
2670 {
2671 int len = 1;
2672 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2673 rgb = UCD_GRAPHBREAK(c);
2674 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2675 lgb = rgb;
2676 eptr += len;
2677 }
2678 }
2679 CHECK_PARTIAL();
2680 ecode++;
2681 break;
2682 #endif /* SUPPORT_UCP */
2683
2684
2685 /* Match a back reference, possibly repeatedly. Look past the end of the
2686 item to see if there is repeat information following. The code is similar
2687 to that for character classes, but repeated for efficiency. Then obey
2688 similar code to character type repeats - written out again for speed.
2689 However, if the referenced string is the empty string, always treat
2690 it as matched, any number of times (otherwise there could be infinite
2691 loops). If the reference is unset, there are two possibilities:
2692
2693 (a) In the default, Perl-compatible state, set the length negative;
2694 this ensures that every attempt at a match fails. We can't just fail
2695 here, because of the possibility of quantifiers with zero minima.
2696
2697 (b) If the JavaScript compatibility flag is set, set the length to zero
2698 so that the back reference matches an empty string.
2699
2700 Otherwise, set the length to the length of what was matched by the
2701 referenced subpattern.
2702
2703 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2704 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2705 and OP_DNREFI are used. In this case we must scan the list of groups to
2706 which the name refers, and use the first one that is set. */
2707
2708 case OP_DNREF:
2709 case OP_DNREFI:
2710 caseless = op == OP_DNREFI;
2711 {
2712 int count = GET2(ecode, 1+IMM2_SIZE);
2713 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2714 ecode += 1 + 2*IMM2_SIZE;
2715
2716 /* Setting the default length first and initializing 'offset' avoids
2717 compiler warnings in the REF_REPEAT code. */
2718
2719 length = (md->jscript_compat)? 0 : -1;
2720 offset = 0;
2721
2722 while (count-- > 0)
2723 {
2724 offset = GET2(slot, 0) << 1;
2725 if (offset < offset_top && md->offset_vector[offset] >= 0)
2726 {
2727 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2728 break;
2729 }
2730 slot += md->name_entry_size;
2731 }
2732 }
2733 goto REF_REPEAT;
2734
2735 case OP_REF:
2736 case OP_REFI:
2737 caseless = op == OP_REFI;
2738 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2739 ecode += 1 + IMM2_SIZE;
2740 if (offset >= offset_top || md->offset_vector[offset] < 0)
2741 length = (md->jscript_compat)? 0 : -1;
2742 else
2743 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2744
2745 /* Set up for repetition, or handle the non-repeated case */
2746
2747 REF_REPEAT:
2748 switch (*ecode)
2749 {
2750 case OP_CRSTAR:
2751 case OP_CRMINSTAR:
2752 case OP_CRPLUS:
2753 case OP_CRMINPLUS:
2754 case OP_CRQUERY:
2755 case OP_CRMINQUERY:
2756 c = *ecode++ - OP_CRSTAR;
2757 minimize = (c & 1) != 0;
2758 min = rep_min[c]; /* Pick up values from tables; */
2759 max = rep_max[c]; /* zero for max => infinity */
2760 if (max == 0) max = INT_MAX;
2761 break;
2762
2763 case OP_CRRANGE:
2764 case OP_CRMINRANGE:
2765 minimize = (*ecode == OP_CRMINRANGE);
2766 min = GET2(ecode, 1);
2767 max = GET2(ecode, 1 + IMM2_SIZE);
2768 if (max == 0) max = INT_MAX;
2769 ecode += 1 + 2 * IMM2_SIZE;
2770 break;
2771
2772 default: /* No repeat follows */
2773 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2774 {
2775 if (length == -2) eptr = md->end_subject; /* Partial match */
2776 CHECK_PARTIAL();
2777 RRETURN(MATCH_NOMATCH);
2778 }
2779 eptr += length;
2780 continue; /* With the main loop */
2781 }
2782
2783 /* Handle repeated back references. If the length of the reference is
2784 zero, just continue with the main loop. If the length is negative, it
2785 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2786 zero, we can continue at the same level without recursion. For any other
2787 minimum, carrying on will result in NOMATCH. */
2788
2789 if (length == 0) continue;
2790 if (length < 0 && min == 0) continue;
2791
2792 /* First, ensure the minimum number of matches are present. We get back
2793 the length of the reference string explicitly rather than passing the
2794 address of eptr, so that eptr can be a register variable. */
2795
2796 for (i = 1; i <= min; i++)
2797 {
2798 int slength;
2799 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2800 {
2801 if (slength == -2) eptr = md->end_subject; /* Partial match */
2802 CHECK_PARTIAL();
2803 RRETURN(MATCH_NOMATCH);
2804 }
2805 eptr += slength;
2806 }
2807
2808 /* If min = max, continue at the same level without recursion.
2809 They are not both allowed to be zero. */
2810
2811 if (min == max) continue;
2812
2813 /* If minimizing, keep trying and advancing the pointer */
2814
2815 if (minimize)
2816 {
2817 for (fi = min;; fi++)
2818 {
2819 int slength;
2820 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2821 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2822 if (fi >= max) RRETURN(MATCH_NOMATCH);
2823 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2824 {
2825 if (slength == -2) eptr = md->end_subject; /* Partial match */
2826 CHECK_PARTIAL();
2827 RRETURN(MATCH_NOMATCH);
2828 }
2829 eptr += slength;
2830 }
2831 /* Control never gets here */
2832 }
2833
2834 /* If maximizing, find the longest string and work backwards */
2835
2836 else
2837 {
2838 pp = eptr;
2839 for (i = min; i < max; i++)
2840 {
2841 int slength;
2842 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2843 {
2844 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2845 the soft partial matching case. */
2846
2847 if (slength == -2 && md->partial != 0 &&
2848 md->end_subject > md->start_used_ptr)
2849 {
2850 md->hitend = TRUE;
2851 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2852 }
2853 break;
2854 }
2855 eptr += slength;
2856 }
2857
2858 while (eptr >= pp)
2859 {
2860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2862 eptr -= length;
2863 }
2864 RRETURN(MATCH_NOMATCH);
2865 }
2866 /* Control never gets here */
2867
2868 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2869 used when all the characters in the class have values in the range 0-255,
2870 and either the matching is caseful, or the characters are in the range
2871 0-127 when UTF-8 processing is enabled. The only difference between
2872 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2873 encountered.
2874
2875 First, look past the end of the item to see if there is repeat information
2876 following. Then obey similar code to character type repeats - written out
2877 again for speed. */
2878
2879 case OP_NCLASS:
2880 case OP_CLASS:
2881 {
2882 /* The data variable is saved across frames, so the byte map needs to
2883 be stored there. */
2884 #define BYTE_MAP ((pcre_uint8 *)data)
2885 data = ecode + 1; /* Save for matching */
2886 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2887
2888 switch (*ecode)
2889 {
2890 case OP_CRSTAR:
2891 case OP_CRMINSTAR:
2892 case OP_CRPLUS:
2893 case OP_CRMINPLUS:
2894 case OP_CRQUERY:
2895 case OP_CRMINQUERY:
2896 case OP_CRPOSSTAR:
2897 case OP_CRPOSPLUS:
2898 case OP_CRPOSQUERY:
2899 c = *ecode++ - OP_CRSTAR;
2900 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2901 else possessive = TRUE;
2902 min = rep_min[c]; /* Pick up values from tables; */
2903 max = rep_max[c]; /* zero for max => infinity */
2904 if (max == 0) max = INT_MAX;
2905 break;
2906
2907 case OP_CRRANGE:
2908 case OP_CRMINRANGE:
2909 case OP_CRPOSRANGE:
2910 minimize = (*ecode == OP_CRMINRANGE);
2911 possessive = (*ecode == OP_CRPOSRANGE);
2912 min = GET2(ecode, 1);
2913 max = GET2(ecode, 1 + IMM2_SIZE);
2914 if (max == 0) max = INT_MAX;
2915 ecode += 1 + 2 * IMM2_SIZE;
2916 break;
2917
2918 default: /* No repeat follows */
2919 min = max = 1;
2920 break;
2921 }
2922
2923 /* First, ensure the minimum number of matches are present. */
2924
2925 #ifdef SUPPORT_UTF
2926 if (utf)
2927 {
2928 for (i = 1; i <= min; i++)
2929 {
2930 if (eptr >= md->end_subject)
2931 {
2932 SCHECK_PARTIAL();
2933 RRETURN(MATCH_NOMATCH);
2934 }
2935 GETCHARINC(c, eptr);
2936 if (c > 255)
2937 {
2938 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2939 }
2940 else
2941 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2942 }
2943 }
2944 else
2945 #endif
2946 /* Not UTF mode */
2947 {
2948 for (i = 1; i <= min; i++)
2949 {
2950 if (eptr >= md->end_subject)
2951 {
2952 SCHECK_PARTIAL();
2953 RRETURN(MATCH_NOMATCH);
2954 }
2955 c = *eptr++;
2956 #ifndef COMPILE_PCRE8
2957 if (c > 255)
2958 {
2959 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2960 }
2961 else
2962 #endif
2963 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2964 }
2965 }
2966
2967 /* If max == min we can continue with the main loop without the
2968 need to recurse. */
2969
2970 if (min == max) continue;
2971
2972 /* If minimizing, keep testing the rest of the expression and advancing
2973 the pointer while it matches the class. */
2974
2975 if (minimize)
2976 {
2977 #ifdef SUPPORT_UTF
2978 if (utf)
2979 {
2980 for (fi = min;; fi++)
2981 {
2982 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2983 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2984 if (fi >= max) RRETURN(MATCH_NOMATCH);
2985 if (eptr >= md->end_subject)
2986 {
2987 SCHECK_PARTIAL();
2988 RRETURN(MATCH_NOMATCH);
2989 }
2990 GETCHARINC(c, eptr);
2991 if (c > 255)
2992 {
2993 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2994 }
2995 else
2996 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2997 }
2998 }
2999 else
3000 #endif
3001 /* Not UTF mode */
3002 {
3003 for (fi = min;; fi++)
3004 {
3005 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3007 if (fi >= max) RRETURN(MATCH_NOMATCH);
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 RRETURN(MATCH_NOMATCH);
3012 }
3013 c = *eptr++;
3014 #ifndef COMPILE_PCRE8
3015 if (c > 255)
3016 {
3017 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3018 }
3019 else
3020 #endif
3021 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3022 }
3023 }
3024 /* Control never gets here */
3025 }
3026
3027 /* If maximizing, find the longest possible run, then work backwards. */
3028
3029 else
3030 {
3031 pp = eptr;
3032
3033 #ifdef SUPPORT_UTF
3034 if (utf)
3035 {
3036 for (i = min; i < max; i++)
3037 {
3038 int len = 1;
3039 if (eptr >= md->end_subject)
3040 {
3041 SCHECK_PARTIAL();
3042 break;
3043 }
3044 GETCHARLEN(c, eptr, len);
3045 if (c > 255)
3046 {
3047 if (op == OP_CLASS) break;
3048 }
3049 else
3050 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3051 eptr += len;
3052 }
3053
3054 if (possessive) continue; /* No backtracking */
3055
3056 for (;;)
3057 {
3058 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3060 if (eptr-- == pp) break; /* Stop if tried at original pos */
3061 BACKCHAR(eptr);
3062 }
3063 }
3064 else
3065 #endif
3066 /* Not UTF mode */
3067 {
3068 for (i = min; i < max; i++)
3069 {
3070 if (eptr >= md->end_subject)
3071 {
3072 SCHECK_PARTIAL();
3073 break;
3074 }
3075 c = *eptr;
3076 #ifndef COMPILE_PCRE8
3077 if (c > 255)
3078 {
3079 if (op == OP_CLASS) break;
3080 }
3081 else
3082 #endif
3083 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3084 eptr++;
3085 }
3086
3087 if (possessive) continue; /* No backtracking */
3088
3089 while (eptr >= pp)
3090 {
3091 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3092 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3093 eptr--;
3094 }
3095 }
3096
3097 RRETURN(MATCH_NOMATCH);
3098 }
3099 #undef BYTE_MAP
3100 }
3101 /* Control never gets here */
3102
3103
3104 /* Match an extended character class. In the 8-bit library, this opcode is
3105 encountered only when UTF-8 mode is supported. In the 16-bit and
3106 32-bit libraries, codepoints greater than 255 may be encountered even when
3107 UTF is not supported. */
3108
3109 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3110 case OP_XCLASS:
3111 {
3112 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3113 ecode += GET(ecode, 1); /* Advance past the item */
3114
3115 switch (*ecode)
3116 {
3117 case OP_CRSTAR:
3118 case OP_CRMINSTAR:
3119 case OP_CRPLUS:
3120 case OP_CRMINPLUS:
3121 case OP_CRQUERY:
3122 case OP_CRMINQUERY:
3123 case OP_CRPOSSTAR:
3124 case OP_CRPOSPLUS:
3125 case OP_CRPOSQUERY:
3126 c = *ecode++ - OP_CRSTAR;
3127 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3128 else possessive = TRUE;
3129 min = rep_min[c]; /* Pick up values from tables; */
3130 max = rep_max[c]; /* zero for max => infinity */
3131 if (max == 0) max = INT_MAX;
3132 break;
3133
3134 case OP_CRRANGE:
3135 case OP_CRMINRANGE:
3136 case OP_CRPOSRANGE:
3137 minimize = (*ecode == OP_CRMINRANGE);
3138 possessive = (*ecode == OP_CRPOSRANGE);
3139 min = GET2(ecode, 1);
3140 max = GET2(ecode, 1 + IMM2_SIZE);
3141 if (max == 0) max = INT_MAX;
3142 ecode += 1 + 2 * IMM2_SIZE;
3143 break;
3144
3145 default: /* No repeat follows */
3146 min = max = 1;
3147 break;
3148 }
3149
3150 /* First, ensure the minimum number of matches are present. */
3151
3152 for (i = 1; i <= min; i++)
3153 {
3154 if (eptr >= md->end_subject)
3155 {
3156 SCHECK_PARTIAL();
3157 RRETURN(MATCH_NOMATCH);
3158 }
3159 GETCHARINCTEST(c, eptr);
3160 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3161 }
3162
3163 /* If max == min we can continue with the main loop without the
3164 need to recurse. */
3165
3166 if (min == max) continue;
3167
3168 /* If minimizing, keep testing the rest of the expression and advancing
3169 the pointer while it matches the class. */
3170
3171 if (minimize)
3172 {
3173 for (fi = min;; fi++)
3174 {
3175 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3176 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3177 if (fi >= max) RRETURN(MATCH_NOMATCH);
3178 if (eptr >= md->end_subject)
3179 {
3180 SCHECK_PARTIAL();
3181 RRETURN(MATCH_NOMATCH);
3182 }
3183 GETCHARINCTEST(c, eptr);
3184 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3185 }
3186 /* Control never gets here */
3187 }
3188
3189 /* If maximizing, find the longest possible run, then work backwards. */
3190
3191 else
3192 {
3193 pp = eptr;
3194 for (i = min; i < max; i++)
3195 {
3196 int len = 1;
3197 if (eptr >= md->end_subject)
3198 {
3199 SCHECK_PARTIAL();
3200 break;
3201 }
3202 #ifdef SUPPORT_UTF
3203 GETCHARLENTEST(c, eptr, len);
3204 #else
3205 c = *eptr;
3206 #endif
3207 if (!PRIV(xclass)(c, data, utf)) break;
3208 eptr += len;
3209 }
3210
3211 if (possessive) continue; /* No backtracking */
3212
3213 for(;;)
3214 {
3215 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3216 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3217 if (eptr-- == pp) break; /* Stop if tried at original pos */
3218 #ifdef SUPPORT_UTF
3219 if (utf) BACKCHAR(eptr);
3220 #endif
3221 }
3222 RRETURN(MATCH_NOMATCH);
3223 }
3224
3225 /* Control never gets here */
3226 }
3227 #endif /* End of XCLASS */
3228
3229 /* Match a single character, casefully */
3230
3231 case OP_CHAR:
3232 #ifdef SUPPORT_UTF
3233 if (utf)
3234 {
3235 length = 1;
3236 ecode++;
3237 GETCHARLEN(fc, ecode, length);
3238 if (length > md->end_subject - eptr)
3239 {
3240 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3241 RRETURN(MATCH_NOMATCH);
3242 }
3243 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3244 }
3245 else
3246 #endif
3247 /* Not UTF mode */
3248 {
3249 if (md->end_subject - eptr < 1)
3250 {
3251 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3252 RRETURN(MATCH_NOMATCH);
3253 }
3254 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3255 ecode += 2;
3256 }
3257 break;
3258
3259 /* Match a single character, caselessly. If we are at the end of the
3260 subject, give up immediately. */
3261
3262 case OP_CHARI:
3263 if (eptr >= md->end_subject)
3264 {
3265 SCHECK_PARTIAL();
3266 RRETURN(MATCH_NOMATCH);
3267 }
3268
3269 #ifdef SUPPORT_UTF
3270 if (utf)
3271 {
3272 length = 1;
3273 ecode++;
3274 GETCHARLEN(fc, ecode, length);
3275
3276 /* If the pattern character's value is < 128, we have only one byte, and
3277 we know that its other case must also be one byte long, so we can use the
3278 fast lookup table. We know that there is at least one byte left in the
3279 subject. */
3280
3281 if (fc < 128)
3282 {
3283 pcre_uint32 cc = UCHAR21(eptr);
3284 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3285 ecode++;
3286 eptr++;
3287 }
3288
3289 /* Otherwise we must pick up the subject character. Note that we cannot
3290 use the value of "length" to check for sufficient bytes left, because the
3291 other case of the character may have more or fewer bytes. */
3292
3293 else
3294 {
3295 pcre_uint32 dc;
3296 GETCHARINC(dc, eptr);
3297 ecode += length;
3298
3299 /* If we have Unicode property support, we can use it to test the other
3300 case of the character, if there is one. */
3301
3302 if (fc != dc)
3303 {
3304 #ifdef SUPPORT_UCP
3305 if (dc != UCD_OTHERCASE(fc))
3306 #endif
3307 RRETURN(MATCH_NOMATCH);
3308 }
3309 }
3310 }
3311 else
3312 #endif /* SUPPORT_UTF */
3313
3314 /* Not UTF mode */
3315 {
3316 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3317 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3318 eptr++;
3319 ecode += 2;
3320 }
3321 break;
3322
3323 /* Match a single character repeatedly. */
3324
3325 case OP_EXACT:
3326 case OP_EXACTI:
3327 min = max = GET2(ecode, 1);
3328 ecode += 1 + IMM2_SIZE;
3329 goto REPEATCHAR;
3330
3331 case OP_POSUPTO:
3332 case OP_POSUPTOI:
3333 possessive = TRUE;
3334 /* Fall through */
3335
3336 case OP_UPTO:
3337 case OP_UPTOI:
3338 case OP_MINUPTO:
3339 case OP_MINUPTOI:
3340 min = 0;
3341 max = GET2(ecode, 1);
3342 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3343 ecode += 1 + IMM2_SIZE;
3344 goto REPEATCHAR;
3345
3346 case OP_POSSTAR:
3347 case OP_POSSTARI:
3348 possessive = TRUE;
3349 min = 0;
3350 max = INT_MAX;
3351 ecode++;
3352 goto REPEATCHAR;
3353
3354 case OP_POSPLUS:
3355 case OP_POSPLUSI:
3356 possessive = TRUE;
3357 min = 1;
3358 max = INT_MAX;
3359 ecode++;
3360 goto REPEATCHAR;
3361
3362 case OP_POSQUERY:
3363 case OP_POSQUERYI:
3364 possessive = TRUE;
3365 min = 0;
3366 max = 1;
3367 ecode++;
3368 goto REPEATCHAR;
3369
3370 case OP_STAR:
3371 case OP_STARI:
3372 case OP_MINSTAR:
3373 case OP_MINSTARI:
3374 case OP_PLUS:
3375 case OP_PLUSI:
3376 case OP_MINPLUS:
3377 case OP_MINPLUSI:
3378 case OP_QUERY:
3379 case OP_QUERYI:
3380 case OP_MINQUERY:
3381 case OP_MINQUERYI:
3382 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3383 minimize = (c & 1) != 0;
3384 min = rep_min[c]; /* Pick up values from tables; */
3385 max = rep_max[c]; /* zero for max => infinity */
3386 if (max == 0) max = INT_MAX;
3387
3388 /* Common code for all repeated single-character matches. We first check
3389 for the minimum number of characters. If the minimum equals the maximum, we
3390 are done. Otherwise, if minimizing, check the rest of the pattern for a
3391 match; if there isn't one, advance up to the maximum, one character at a
3392 time.
3393
3394 If maximizing, advance up to the maximum number of matching characters,
3395 until eptr is past the end of the maximum run. If possessive, we are
3396 then done (no backing up). Otherwise, match at this position; anything
3397 other than no match is immediately returned. For nomatch, back up one
3398 character, unless we are matching \R and the last thing matched was
3399 \r\n, in which case, back up two bytes. When we reach the first optional
3400 character position, we can save stack by doing a tail recurse.
3401
3402 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3403 for speed. */
3404
3405 REPEATCHAR:
3406 #ifdef SUPPORT_UTF
3407 if (utf)
3408 {
3409 length = 1;
3410 charptr = ecode;
3411 GETCHARLEN(fc, ecode, length);
3412 ecode += length;
3413
3414 /* Handle multibyte character matching specially here. There is
3415 support for caseless matching if UCP support is present. */
3416
3417 if (length > 1)
3418 {
3419 #ifdef SUPPORT_UCP
3420 pcre_uint32 othercase;
3421 if (op >= OP_STARI && /* Caseless */
3422 (othercase = UCD_OTHERCASE(fc)) != fc)
3423 oclength = PRIV(ord2utf)(othercase, occhars);
3424 else oclength = 0;
3425 #endif /* SUPPORT_UCP */
3426
3427 for (i = 1; i <= min; i++)
3428 {
3429 if (eptr <= md->end_subject - length &&
3430 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3431 #ifdef SUPPORT_UCP
3432 else if (oclength > 0 &&
3433 eptr <= md->end_subject - oclength &&
3434 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3435 #endif /* SUPPORT_UCP */
3436 else
3437 {
3438 CHECK_PARTIAL();
3439 RRETURN(MATCH_NOMATCH);
3440 }
3441 }
3442
3443 if (min == max) continue;
3444
3445 if (minimize)
3446 {
3447 for (fi = min;; fi++)
3448 {
3449 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3451 if (fi >= max) RRETURN(MATCH_NOMATCH);
3452 if (eptr <= md->end_subject - length &&
3453 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3454 #ifdef SUPPORT_UCP
3455 else if (oclength > 0 &&
3456 eptr <= md->end_subject - oclength &&
3457 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3458 #endif /* SUPPORT_UCP */
3459 else
3460 {
3461 CHECK_PARTIAL();
3462 RRETURN(MATCH_NOMATCH);
3463 }
3464 }
3465 /* Control never gets here */
3466 }
3467
3468 else /* Maximize */
3469 {
3470 pp = eptr;
3471 for (i = min; i < max; i++)
3472 {
3473 if (eptr <= md->end_subject - length &&
3474 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3475 #ifdef SUPPORT_UCP
3476 else if (oclength > 0 &&
3477 eptr <= md->end_subject - oclength &&
3478 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3479 #endif /* SUPPORT_UCP */
3480 else
3481 {
3482 CHECK_PARTIAL();
3483 break;
3484 }
3485 }
3486
3487 if (possessive) continue; /* No backtracking */
3488 for(;;)
3489 {
3490 if (eptr == pp) goto TAIL_RECURSE;
3491 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3492 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3493 #ifdef SUPPORT_UCP
3494 eptr--;
3495 BACKCHAR(eptr);
3496 #else /* without SUPPORT_UCP */
3497 eptr -= length;
3498 #endif /* SUPPORT_UCP */
3499 }
3500 }
3501 /* Control never gets here */
3502 }
3503
3504 /* If the length of a UTF-8 character is 1, we fall through here, and
3505 obey the code as for non-UTF-8 characters below, though in this case the
3506 value of fc will always be < 128. */
3507 }
3508 else
3509 #endif /* SUPPORT_UTF */
3510 /* When not in UTF-8 mode, load a single-byte character. */
3511 fc = *ecode++;
3512
3513 /* The value of fc at this point is always one character, though we may
3514 or may not be in UTF mode. The code is duplicated for the caseless and
3515 caseful cases, for speed, since matching characters is likely to be quite
3516 common. First, ensure the minimum number of matches are present. If min =
3517 max, continue at the same level without recursing. Otherwise, if
3518 minimizing, keep trying the rest of the expression and advancing one
3519 matching character if failing, up to the maximum. Alternatively, if
3520 maximizing, find the maximum number of characters and work backwards. */
3521
3522 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3523 max, (char *)eptr));
3524
3525 if (op >= OP_STARI) /* Caseless */
3526 {
3527 #ifdef COMPILE_PCRE8
3528 /* fc must be < 128 if UTF is enabled. */
3529 foc = md->fcc[fc];
3530 #else
3531 #ifdef SUPPORT_UTF
3532 #ifdef SUPPORT_UCP
3533 if (utf && fc > 127)
3534 foc = UCD_OTHERCASE(fc);
3535 #else
3536 if (utf && fc > 127)
3537 foc = fc;
3538 #endif /* SUPPORT_UCP */
3539 else
3540 #endif /* SUPPORT_UTF */
3541 foc = TABLE_GET(fc, md->fcc, fc);
3542 #endif /* COMPILE_PCRE8 */
3543
3544 for (i = 1; i <= min; i++)
3545 {
3546 pcre_uint32 cc; /* Faster than pcre_uchar */
3547 if (eptr >= md->end_subject)
3548 {
3549 SCHECK_PARTIAL();
3550 RRETURN(MATCH_NOMATCH);
3551 }
3552 cc = UCHAR21TEST(eptr);
3553 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3554 eptr++;
3555 }
3556 if (min == max) continue;
3557 if (minimize)
3558 {
3559 for (fi = min;; fi++)
3560 {
3561 pcre_uint32 cc; /* Faster than pcre_uchar */
3562 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3563 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3564 if (fi >= max) RRETURN(MATCH_NOMATCH);
3565 if (eptr >= md->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 cc = UCHAR21TEST(eptr);
3571 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3572 eptr++;
3573 }
3574 /* Control never gets here */
3575 }
3576 else /* Maximize */
3577 {
3578 pp = eptr;
3579 for (i = min; i < max; i++)
3580 {
3581 pcre_uint32 cc; /* Faster than pcre_uchar */
3582 if (eptr >= md->end_subject)
3583 {
3584 SCHECK_PARTIAL();
3585 break;
3586 }
3587 cc = UCHAR21TEST(eptr);
3588 if (fc != cc && foc != cc) break;
3589 eptr++;
3590 }
3591 if (possessive) continue; /* No backtracking */
3592 for (;;)
3593 {
3594 if (eptr == pp) goto TAIL_RECURSE;
3595 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3596 eptr--;
3597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3598 }
3599 /* Control never gets here */
3600 }
3601 }
3602
3603 /* Caseful comparisons (includes all multi-byte characters) */
3604
3605 else
3606 {
3607 for (i = 1; i <= min; i++)
3608 {
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 RRETURN(MATCH_NOMATCH);
3613 }
3614 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3615 }
3616
3617 if (min == max) continue;
3618
3619 if (minimize)
3620 {
3621 for (fi = min;; fi++)
3622 {
3623 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3624 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3625 if (fi >= max) RRETURN(MATCH_NOMATCH);
3626 if (eptr >= md->end_subject)
3627 {
3628 SCHECK_PARTIAL();
3629 RRETURN(MATCH_NOMATCH);
3630 }
3631 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3632 }
3633 /* Control never gets here */
3634 }
3635 else /* Maximize */
3636 {
3637 pp = eptr;
3638 for (i = min; i < max; i++)
3639 {
3640 if (eptr >= md->end_subject)
3641 {
3642 SCHECK_PARTIAL();
3643 break;
3644 }
3645 if (fc != UCHAR21TEST(eptr)) break;
3646 eptr++;
3647 }
3648 if (possessive) continue; /* No backtracking */
3649 for (;;)
3650 {
3651 if (eptr == pp) goto TAIL_RECURSE;
3652 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3653 eptr--;
3654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3655 }
3656 /* Control never gets here */
3657 }
3658 }
3659 /* Control never gets here */
3660
3661 /* Match a negated single character, that is, any one character other than
3662 the one given in the pattern. In UTF mode either character can be multibyte. */
3663
3664 case OP_NOT:
3665 case OP_NOTI:
3666 if (eptr >= md->end_subject)
3667 {
3668 SCHECK_PARTIAL();
3669 RRETURN(MATCH_NOMATCH);
3670 }
3671 #ifdef SUPPORT_UTF
3672 if (utf)
3673 {
3674 register pcre_uint32 ch, och;
3675
3676 ecode++;
3677 GETCHARINC(ch, ecode);
3678 GETCHARINC(c, eptr);
3679
3680 if (op == OP_NOT)
3681 {
3682 if (ch == c) RRETURN(MATCH_NOMATCH);
3683 }
3684 else
3685 {
3686 #ifdef SUPPORT_UCP
3687 if (ch > 127)
3688 och = UCD_OTHERCASE(ch);
3689 #else
3690 if (ch > 127)
3691 och = ch;
3692 #endif /* SUPPORT_UCP */
3693 else
3694 och = TABLE_GET(ch, md->fcc, ch);
3695 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3696 }
3697 }
3698 else
3699 #endif
3700 {
3701 register pcre_uint32 ch = ecode[1];
3702 c = *eptr++;
3703 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3704 RRETURN(MATCH_NOMATCH);
3705 ecode += 2;
3706 }
3707 break;
3708
3709 /* Match a negated single one-byte character repeatedly. This is almost a
3710 repeat of the code for a repeated single character, but I haven't found a
3711 nice way of commoning these up that doesn't require a test of the
3712 positive/negative option for each character match. Maybe that wouldn't add
3713 very much to the time taken, but character matching *is* what this is all
3714 about... */
3715
3716 case OP_NOTEXACT:
3717 case OP_NOTEXACTI:
3718 min = max = GET2(ecode, 1);
3719 ecode += 1 + IMM2_SIZE;
3720 goto REPEATNOTCHAR;
3721
3722 case OP_NOTUPTO:
3723 case OP_NOTUPTOI:
3724 case OP_NOTMINUPTO:
3725 case OP_NOTMINUPTOI:
3726 min = 0;
3727 max = GET2(ecode, 1);
3728 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3729 ecode += 1 + IMM2_SIZE;
3730 goto REPEATNOTCHAR;
3731
3732 case OP_NOTPOSSTAR:
3733 case OP_NOTPOSSTARI:
3734 possessive = TRUE;
3735 min = 0;
3736 max = INT_MAX;
3737 ecode++;
3738 goto REPEATNOTCHAR;
3739
3740 case OP_NOTPOSPLUS:
3741 case OP_NOTPOSPLUSI:
3742 possessive = TRUE;
3743 min = 1;
3744 max = INT_MAX;
3745 ecode++;
3746 goto REPEATNOTCHAR;
3747
3748 case OP_NOTPOSQUERY:
3749 case OP_NOTPOSQUERYI:
3750 possessive = TRUE;
3751 min = 0;
3752 max = 1;
3753 ecode++;
3754 goto REPEATNOTCHAR;
3755
3756 case OP_NOTPOSUPTO:
3757 case OP_NOTPOSUPTOI:
3758 possessive = TRUE;
3759 min = 0;
3760 max = GET2(ecode, 1);
3761 ecode += 1 + IMM2_SIZE;
3762 goto REPEATNOTCHAR;
3763
3764 case OP_NOTSTAR:
3765 case OP_NOTSTARI:
3766 case OP_NOTMINSTAR:
3767 case OP_NOTMINSTARI:
3768 case OP_NOTPLUS:
3769 case OP_NOTPLUSI:
3770 case OP_NOTMINPLUS:
3771 case OP_NOTMINPLUSI:
3772 case OP_NOTQUERY:
3773 case OP_NOTQUERYI:
3774 case OP_NOTMINQUERY:
3775 case OP_NOTMINQUERYI:
3776 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3777 minimize = (c & 1) != 0;
3778 min = rep_min[c]; /* Pick up values from tables; */
3779 max = rep_max[c]; /* zero for max => infinity */
3780 if (max == 0) max = INT_MAX;
3781
3782 /* Common code for all repeated single-byte matches. */
3783
3784 REPEATNOTCHAR:
3785 GETCHARINCTEST(fc, ecode);
3786
3787 /* The code is duplicated for the caseless and caseful cases, for speed,
3788 since matching characters is likely to be quite common. First, ensure the
3789 minimum number of matches are present. If min = max, continue at the same
3790 level without recursing. Otherwise, if minimizing, keep trying the rest of
3791 the expression and advancing one matching character if failing, up to the
3792 maximum. Alternatively, if maximizing, find the maximum number of
3793 characters and work backwards. */
3794
3795 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3796 max, (char *)eptr));
3797
3798 if (op >= OP_NOTSTARI) /* Caseless */
3799 {
3800 #ifdef SUPPORT_UTF
3801 #ifdef SUPPORT_UCP
3802 if (utf && fc > 127)
3803 foc = UCD_OTHERCASE(fc);
3804 #else
3805 if (utf && fc > 127)
3806 foc = fc;
3807 #endif /* SUPPORT_UCP */
3808 else
3809 #endif /* SUPPORT_UTF */
3810 foc = TABLE_GET(fc, md->fcc, fc);
3811
3812 #ifdef SUPPORT_UTF
3813 if (utf)
3814 {
3815 register pcre_uint32 d;
3816 for (i = 1; i <= min; i++)
3817 {
3818 if (eptr >= md->end_subject)
3819 {
3820 SCHECK_PARTIAL();
3821 RRETURN(MATCH_NOMATCH);
3822 }
3823 GETCHARINC(d, eptr);
3824 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3825 }
3826 }
3827 else
3828 #endif /* SUPPORT_UTF */
3829 /* Not UTF mode */
3830 {
3831 for (i = 1; i <= min; i++)
3832 {
3833 if (eptr >= md->end_subject)
3834 {
3835 SCHECK_PARTIAL();
3836 RRETURN(MATCH_NOMATCH);
3837 }
3838 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3839 eptr++;
3840 }
3841 }
3842
3843 if (min == max) continue;
3844
3845 if (minimize)
3846 {
3847 #ifdef SUPPORT_UTF
3848 if (utf)
3849 {
3850 register pcre_uint32 d;
3851 for (fi = min;; fi++)
3852 {
3853 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3854 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3855 if (fi >= max) RRETURN(MATCH_NOMATCH);
3856 if (eptr >= md->end_subject)
3857 {
3858 SCHECK_PARTIAL();
3859 RRETURN(MATCH_NOMATCH);
3860 }
3861 GETCHARINC(d, eptr);
3862 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3863 }
3864 }
3865 else
3866 #endif /*SUPPORT_UTF */
3867 /* Not UTF mode */
3868 {
3869 for (fi = min;; fi++)
3870 {
3871 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3873 if (fi >= max) RRETURN(MATCH_NOMATCH);
3874 if (eptr >= md->end_subject)
3875 {
3876 SCHECK_PARTIAL();
3877 RRETURN(MATCH_NOMATCH);
3878 }
3879 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3880 eptr++;
3881 }
3882 }
3883 /* Control never gets here */
3884 }
3885
3886 /* Maximize case */
3887
3888 else
3889 {
3890 pp = eptr;
3891
3892 #ifdef SUPPORT_UTF
3893 if (utf)
3894 {
3895 register pcre_uint32 d;
3896 for (i = min; i < max; i++)
3897 {
3898 int len = 1;
3899 if (eptr >= md->end_subject)
3900 {
3901 SCHECK_PARTIAL();
3902 break;
3903 }
3904 GETCHARLEN(d, eptr, len);
3905 if (fc == d || (unsigned int)foc == d) break;
3906 eptr += len;
3907 }
3908 if (possessive) continue; /* No backtracking */
3909 for(;;)
3910 {
3911 if (eptr == pp) goto TAIL_RECURSE;
3912 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3914 eptr--;
3915 BACKCHAR(eptr);
3916 }
3917 }
3918 else
3919 #endif /* SUPPORT_UTF */
3920 /* Not UTF mode */
3921 {
3922 for (i = min; i < max; i++)
3923 {
3924 if (eptr >= md->end_subject)
3925 {
3926 SCHECK_PARTIAL();
3927 break;
3928 }
3929 if (fc == *eptr || foc == *eptr) break;
3930 eptr++;
3931 }
3932 if (possessive) continue; /* No backtracking */
3933 for (;;)
3934 {
3935 if (eptr == pp) goto TAIL_RECURSE;
3936 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3938 eptr--;
3939 }
3940 }
3941 /* Control never gets here */
3942 }
3943 }
3944
3945 /* Caseful comparisons */
3946
3947 else
3948 {
3949 #ifdef SUPPORT_UTF
3950 if (utf)
3951 {
3952 register pcre_uint32 d;
3953 for (i = 1; i <= min; i++)
3954 {
3955 if (eptr >= md->end_subject)
3956 {
3957 SCHECK_PARTIAL();
3958 RRETURN(MATCH_NOMATCH);
3959 }
3960 GETCHARINC(d, eptr);
3961 if (fc == d) RRETURN(MATCH_NOMATCH);
3962 }
3963 }
3964 else
3965 #endif
3966 /* Not UTF mode */
3967 {
3968 for (i = 1; i <= min; i++)
3969 {
3970 if (eptr >= md->end_subject)
3971 {
3972 SCHECK_PARTIAL();
3973 RRETURN(MATCH_NOMATCH);
3974 }
3975 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3976 }
3977 }
3978
3979 if (min == max) continue;
3980
3981 if (minimize)
3982 {
3983 #ifdef SUPPORT_UTF
3984 if (utf)
3985 {
3986 register pcre_uint32 d;
3987 for (fi = min;; fi++)
3988 {
3989 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3991 if (fi >= max) RRETURN(MATCH_NOMATCH);
3992 if (eptr >= md->end_subject)
3993 {
3994 SCHECK_PARTIAL();
3995 RRETURN(MATCH_NOMATCH);
3996 }
3997 GETCHARINC(d, eptr);
3998 if (fc == d) RRETURN(MATCH_NOMATCH);
3999 }
4000 }
4001 else
4002 #endif
4003 /* Not UTF mode */
4004 {
4005 for (fi = min;; fi++)
4006 {
4007 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4009 if (fi >= max) RRETURN(MATCH_NOMATCH);
4010 if (eptr >= md->end_subject)
4011 {
4012 SCHECK_PARTIAL();
4013 RRETURN(MATCH_NOMATCH);
4014 }
4015 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4016 }
4017 }
4018 /* Control never gets here */
4019 }
4020
4021 /* Maximize case */
4022
4023 else
4024 {
4025 pp = eptr;
4026
4027 #ifdef SUPPORT_UTF
4028 if (utf)
4029 {
4030 register pcre_uint32 d;
4031 for (i = min; i < max; i++)
4032 {
4033 int len = 1;
4034 if (eptr >= md->end_subject)
4035 {
4036 SCHECK_PARTIAL();
4037 break;
4038 }
4039 GETCHARLEN(d, eptr, len);
4040 if (fc == d) break;
4041 eptr += len;
4042 }
4043 if (possessive) continue; /* No backtracking */
4044 for(;;)
4045 {
4046 if (eptr == pp) goto TAIL_RECURSE;
4047 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4049 eptr--;
4050 BACKCHAR(eptr);
4051 }
4052 }
4053 else
4054 #endif
4055 /* Not UTF mode */
4056 {
4057 for (i = min; i < max; i++)
4058 {
4059 if (eptr >= md->end_subject)
4060 {
4061 SCHECK_PARTIAL();
4062 break;
4063 }
4064 if (fc == *eptr) break;
4065 eptr++;
4066 }
4067 if (possessive) continue; /* No backtracking */
4068 for (;;)
4069 {
4070 if (eptr == pp) goto TAIL_RECURSE;
4071 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4072 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4073 eptr--;
4074 }
4075 }
4076 /* Control never gets here */
4077 }
4078 }
4079 /* Control never gets here */
4080
4081 /* Match a single character type repeatedly; several different opcodes
4082 share code. This is very similar to the code for single characters, but we
4083 repeat it in the interests of efficiency. */
4084
4085 case OP_TYPEEXACT:
4086 min = max = GET2(ecode, 1);
4087 minimize = TRUE;
4088 ecode += 1 + IMM2_SIZE;
4089 goto REPEATTYPE;
4090
4091 case OP_TYPEUPTO:
4092 case OP_TYPEMINUPTO:
4093 min = 0;
4094 max = GET2(ecode, 1);
4095 minimize = *ecode == OP_TYPEMINUPTO;
4096 ecode += 1 + IMM2_SIZE;
4097 goto REPEATTYPE;
4098
4099 case OP_TYPEPOSSTAR:
4100 possessive = TRUE;
4101 min = 0;
4102 max = INT_MAX;
4103 ecode++;
4104 goto REPEATTYPE;
4105
4106 case OP_TYPEPOSPLUS:
4107 possessive = TRUE;
4108 min = 1;
4109 max = INT_MAX;
4110 ecode++;
4111 goto REPEATTYPE;
4112
4113 case OP_TYPEPOSQUERY:
4114 possessive = TRUE;
4115 min = 0;
4116 max = 1;
4117 ecode++;
4118 goto REPEATTYPE;
4119
4120 case OP_TYPEPOSUPTO:
4121 possessive = TRUE;
4122 min = 0;
4123 max = GET2(ecode, 1);
4124 ecode += 1 + IMM2_SIZE;
4125 goto REPEATTYPE;
4126
4127 case OP_TYPESTAR:
4128 case OP_TYPEMINSTAR:
4129 case OP_TYPEPLUS:
4130 case OP_TYPEMINPLUS:
4131 case OP_TYPEQUERY:
4132 case OP_TYPEMINQUERY:
4133 c = *ecode++ - OP_TYPESTAR;
4134 minimize = (c & 1) != 0;
4135 min = rep_min[c]; /* Pick up values from tables; */
4136 max = rep_max[c]; /* zero for max => infinity */
4137 if (max == 0) max = INT_MAX;
4138
4139 /* Common code for all repeated single character type matches. Note that
4140 in UTF-8 mode, '.' matches a character of any length, but for the other
4141 character types, the valid characters are all one-byte long. */
4142
4143 REPEATTYPE:
4144 ctype = *ecode++; /* Code for the character type */
4145
4146 #ifdef SUPPORT_UCP
4147 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4148 {
4149 prop_fail_result = ctype == OP_NOTPROP;
4150 prop_type = *ecode++;
4151 prop_value = *ecode++;
4152 }
4153 else prop_type = -1;
4154 #endif
4155
4156 /* First, ensure the minimum number of matches are present. Use inline
4157 code for maximizing the speed, and do the type test once at the start
4158 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4159 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4160 and single-bytes. */
4161
4162 if (min > 0)
4163 {
4164 #ifdef SUPPORT_UCP
4165 if (prop_type >= 0)
4166 {
4167 switch(prop_type)
4168 {
4169 case PT_ANY:
4170 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4171 for (i = 1; i <= min; i++)
4172 {
4173 if (eptr >= md->end_subject)
4174 {
4175 SCHECK_PARTIAL();
4176 RRETURN(MATCH_NOMATCH);
4177 }
4178 GETCHARINCTEST(c, eptr);
4179 }
4180 break;
4181
4182 case PT_LAMP:
4183 for (i = 1; i <= min; i++)
4184 {
4185 int chartype;
4186 if (eptr >= md->end_subject)
4187 {
4188 SCHECK_PARTIAL();
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 GETCHARINCTEST(c, eptr);
4192 chartype = UCD_CHARTYPE(c);
4193 if ((chartype == ucp_Lu ||
4194 chartype == ucp_Ll ||
4195 chartype == ucp_Lt) == prop_fail_result)
4196 RRETURN(MATCH_NOMATCH);
4197 }
4198 break;
4199
4200 case PT_GC:
4201 for (i = 1; i <= min; i++)
4202 {
4203 if (eptr >= md->end_subject)
4204 {
4205 SCHECK_PARTIAL();
4206 RRETURN(MATCH_NOMATCH);
4207 }
4208 GETCHARINCTEST(c, eptr);
4209 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4210 RRETURN(MATCH_NOMATCH);
4211 }
4212 break;
4213
4214 case PT_PC:
4215 for (i = 1; i <= min; i++)
4216 {
4217 if (eptr >= md->end_subject)
4218 {
4219 SCHECK_PARTIAL();
4220 RRETURN(MATCH_NOMATCH);
4221 }
4222 GETCHARINCTEST(c, eptr);
4223 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4224 RRETURN(MATCH_NOMATCH);
4225 }
4226 break;
4227
4228 case PT_SC:
4229 for (i = 1; i <= min; i++)
4230 {
4231 if (eptr >= md->end_subject)
4232 {
4233 SCHECK_PARTIAL();
4234 RRETURN(MATCH_NOMATCH);
4235 }
4236 GETCHARINCTEST(c, eptr);
4237 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4238 RRETURN(MATCH_NOMATCH);
4239 }
4240 break;
4241
4242 case PT_ALNUM:
4243 for (i = 1; i <= min; i++)
4244 {
4245 int category;
4246 if (eptr >= md->end_subject)
4247 {
4248 SCHECK_PARTIAL();
4249 RRETURN(MATCH_NOMATCH);
4250 }
4251 GETCHARINCTEST(c, eptr);
4252 category = UCD_CATEGORY(c);
4253 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4254 RRETURN(MATCH_NOMATCH);
4255 }
4256 break;
4257
4258 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4259 which means that Perl space and POSIX space are now identical. PCRE
4260 was changed at release 8.34. */
4261
4262 case PT_SPACE: /* Perl space */
4263 case PT_PXSPACE: /* POSIX space */
4264 for (i = 1; i <= min; i++)
4265 {
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 RRETURN(MATCH_NOMATCH);
4270 }
4271 GETCHARINCTEST(c, eptr);
4272 switch(c)
4273 {
4274 HSPACE_CASES:
4275 VSPACE_CASES:
4276 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4277 break;
4278
4279 default:
4280 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4281 RRETURN(MATCH_NOMATCH);
4282 break;
4283 }
4284 }
4285 break;
4286
4287 case PT_WORD:
4288 for (i = 1; i <= min; i++)
4289 {
4290 int category;
4291 if (eptr >= md->end_subject)
4292 {
4293 SCHECK_PARTIAL();
4294 RRETURN(MATCH_NOMATCH);
4295 }
4296 GETCHARINCTEST(c, eptr);
4297 category = UCD_CATEGORY(c);
4298 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4299 == prop_fail_result)
4300 RRETURN(MATCH_NOMATCH);
4301 }
4302 break;
4303
4304 case PT_CLIST:
4305 for (i = 1; i <= min; i++)
4306 {
4307 const pcre_uint32 *cp;
4308 if (eptr >= md->end_subject)
4309 {
4310 SCHECK_PARTIAL();
4311 RRETURN(MATCH_NOMATCH);
4312 }
4313 GETCHARINCTEST(c, eptr);
4314 cp = PRIV(ucd_caseless_sets) + prop_value;
4315 for (;;)
4316 {
4317 if (c < *cp)
4318 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4319 if (c == *cp++)
4320 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4321 }
4322 }
4323 break;
4324
4325 case PT_UCNC:
4326 for (i = 1; i <= min; i++)
4327 {
4328 if (eptr >= md->end_subject)
4329 {
4330 SCHECK_PARTIAL();
4331 RRETURN(MATCH_NOMATCH);
4332 }
4333 GETCHARINCTEST(c, eptr);
4334 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4335 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4336 c >= 0xe000) == prop_fail_result)
4337 RRETURN(MATCH_NOMATCH);
4338 }
4339 break;
4340
4341 /* This should not occur */
4342
4343 default:
4344 RRETURN(PCRE_ERROR_INTERNAL);
4345 }
4346 }
4347
4348 /* Match extended Unicode grapheme clusters. We will get here only if the
4349 support is in the binary; otherwise a compile-time error occurs. */
4350
4351 else if (ctype == OP_EXTUNI)
4352 {
4353 for (i = 1; i <= min; i++)
4354 {
4355 if (eptr >= md->end_subject)
4356 {
4357 SCHECK_PARTIAL();
4358 RRETURN(MATCH_NOMATCH);
4359 }
4360 else
4361 {
4362 int lgb, rgb;
4363 GETCHARINCTEST(c, eptr);
4364 lgb = UCD_GRAPHBREAK(c);
4365 while (eptr < md->end_subject)
4366 {
4367 int len = 1;
4368 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4369 rgb = UCD_GRAPHBREAK(c);
4370 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4371 lgb = rgb;
4372 eptr += len;
4373 }
4374 }
4375 CHECK_PARTIAL();
4376 }
4377 }
4378
4379 else
4380 #endif /* SUPPORT_UCP */
4381
4382 /* Handle all other cases when the coding is UTF-8 */
4383
4384 #ifdef SUPPORT_UTF
4385 if (utf) switch(ctype)
4386 {
4387 case OP_ANY:
4388 for (i = 1; i <= min; i++)
4389 {
4390 if (eptr >= md->end_subject)
4391 {
4392 SCHECK_PARTIAL();
4393 RRETURN(MATCH_NOMATCH);
4394 }
4395 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4396 if (md->partial != 0 &&
4397 eptr + 1 >= md->end_subject &&
4398 NLBLOCK->nltype == NLTYPE_FIXED &&
4399 NLBLOCK->nllen == 2 &&
4400 UCHAR21(eptr) == NLBLOCK->nl[0])
4401 {
4402 md->hitend = TRUE;
4403 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4404 }
4405 eptr++;
4406 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4407 }
4408 break;
4409
4410 case OP_ALLANY:
4411 for (i = 1; i <= min; i++)
4412 {
4413 if (eptr >= md->end_subject)
4414 {
4415 SCHECK_PARTIAL();
4416 RRETURN(MATCH_NOMATCH);
4417 }
4418 eptr++;
4419 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4420 }
4421 break;
4422
4423 case OP_ANYBYTE:
4424 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4425 eptr += min;
4426 break;
4427
4428 case OP_ANYNL:
4429 for (i = 1; i <= min; i++)
4430 {
4431 if (eptr >= md->end_subject)
4432 {
4433 SCHECK_PARTIAL();
4434 RRETURN(MATCH_NOMATCH);
4435 }
4436 GETCHARINC(c, eptr);
4437 switch(c)
4438 {
4439 default: RRETURN(MATCH_NOMATCH);
4440
4441 case CHAR_CR:
4442 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4443 break;
4444
4445 case CHAR_LF:
4446 break;
4447
4448 case CHAR_VT:
4449 case CHAR_FF:
4450 case CHAR_NEL:
4451 #ifndef EBCDIC
4452 case 0x2028:
4453 case 0x2029:
4454 #endif /* Not EBCDIC */
4455 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4456 break;
4457 }
4458 }
4459 break;
4460
4461 case OP_NOT_HSPACE:
4462 for (i = 1; i <= min; i++)
4463 {
4464 if (eptr >= md->end_subject)
4465 {
4466 SCHECK_PARTIAL();
4467 RRETURN(MATCH_NOMATCH);
4468 }
4469 GETCHARINC(c, eptr);
4470 switch(c)
4471 {
4472 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4473 default: break;
4474 }
4475 }
4476 break;
4477
4478 case OP_HSPACE:
4479 for (i = 1; i <= min; i++)
4480 {
4481 if (eptr >= md->end_subject)
4482 {
4483 SCHECK_PARTIAL();
4484 RRETURN(MATCH_NOMATCH);
4485 }
4486 GETCHARINC(c, eptr);
4487 switch(c)
4488 {
4489 HSPACE_CASES: break; /* Byte and multibyte cases */
4490 default: RRETURN(MATCH_NOMATCH);
4491 }
4492 }
4493 break;
4494
4495 case OP_NOT_VSPACE:
4496 for (i = 1; i <= min; i++)
4497 {
4498 if (eptr >= md->end_subject)
4499 {
4500 SCHECK_PARTIAL();
4501 RRETURN(MATCH_NOMATCH);
4502 }
4503 GETCHARINC(c, eptr);
4504 switch(c)
4505 {
4506 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4507 default: break;
4508 }
4509 }
4510 break;
4511
4512 case OP_VSPACE:
4513 for (i = 1; i <= min; i++)
4514 {
4515 if (eptr >= md->end_subject)
4516 {
4517 SCHECK_PARTIAL();
4518 RRETURN(MATCH_NOMATCH);
4519 }
4520 GETCHARINC(c, eptr);
4521 switch(c)
4522 {
4523 VSPACE_CASES: break;
4524 default: RRETURN(MATCH_NOMATCH);
4525 }
4526 }
4527 break;
4528
4529 case OP_NOT_DIGIT:
4530 for (i = 1; i <= min; i++)
4531 {
4532 if (eptr >= md->end_subject)
4533 {
4534 SCHECK_PARTIAL();
4535 RRETURN(MATCH_NOMATCH);
4536 }
4537 GETCHARINC(c, eptr);
4538 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4539 RRETURN(MATCH_NOMATCH);
4540 }
4541 break;
4542
4543 case OP_DIGIT:
4544 for (i = 1; i <= min; i++)
4545 {
4546 pcre_uint32 cc;
4547 if (eptr >= md->end_subject)
4548 {
4549 SCHECK_PARTIAL();
4550 RRETURN(MATCH_NOMATCH);
4551 }
4552 cc = UCHAR21(eptr);
4553 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4554 RRETURN(MATCH_NOMATCH);
4555 eptr++;
4556 /* No need to skip more bytes - we know it's a 1-byte character */
4557 }
4558 break;
4559
4560 case OP_NOT_WHITESPACE:
4561 for (i = 1; i <= min; i++)
4562 {
4563 pcre_uint32 cc;
4564 if (eptr >= md->end_subject)
4565 {
4566 SCHECK_PARTIAL();
4567 RRETURN(MATCH_NOMATCH);
4568 }
4569 cc = UCHAR21(eptr);
4570 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4571 RRETURN(MATCH_NOMATCH);
4572 eptr++;
4573 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4574 }
4575 break;
4576
4577 case OP_WHITESPACE:
4578 for (i = 1; i <= min; i++)
4579 {
4580 pcre_uint32 cc;
4581 if (eptr >= md->end_subject)
4582 {
4583 SCHECK_PARTIAL();
4584 RRETURN(MATCH_NOMATCH);
4585 }
4586 cc = UCHAR21(eptr);
4587 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4588 RRETURN(MATCH_NOMATCH);
4589 eptr++;
4590 /* No need to skip more bytes - we know it's a 1-byte character */
4591 }
4592 break;
4593
4594 case OP_NOT_WORDCHAR:
4595 for (i = 1; i <= min; i++)
4596 {
4597 pcre_uint32 cc;
4598 if (eptr >= md->end_subject)
4599 {
4600 SCHECK_PARTIAL();
4601 RRETURN(MATCH_NOMATCH);
4602 }
4603 cc = UCHAR21(eptr);
4604 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4605 RRETURN(MATCH_NOMATCH);
4606 eptr++;
4607 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4608 }
4609 break;
4610
4611 case OP_WORDCHAR:
4612 for (i = 1; i <= min; i++)
4613 {
4614 pcre_uint32 cc;
4615 if (eptr >= md->end_subject)
4616 {
4617 SCHECK_PARTIAL();
4618 RRETURN(MATCH_NOMATCH);
4619 }
4620 cc = UCHAR21(eptr);
4621 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4622 RRETURN(MATCH_NOMATCH);
4623 eptr++;
4624 /* No need to skip more bytes - we know it's a 1-byte character */
4625 }
4626 break;
4627
4628 default:
4629 RRETURN(PCRE_ERROR_INTERNAL);
4630 } /* End switch(ctype) */
4631
4632 else
4633 #endif /* SUPPORT_UTF */
4634
4635 /* Code for the non-UTF-8 case for minimum matching of operators other
4636 than OP_PROP and OP_NOTPROP. */
4637
4638 switch(ctype)
4639 {
4640 case OP_ANY:
4641 for (i = 1; i <= min; i++)
4642 {
4643 if (eptr >= md->end_subject)
4644 {
4645 SCHECK_PARTIAL();
4646 RRETURN(MATCH_NOMATCH);
4647 }
4648 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4649 if (md->partial != 0 &&
4650 eptr + 1 >= md->end_subject &&
4651 NLBLOCK->nltype == NLTYPE_FIXED &&
4652 NLBLOCK->nllen == 2 &&
4653 *eptr == NLBLOCK->nl[0])
4654 {
4655 md->hitend = TRUE;
4656 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4657 }
4658 eptr++;
4659 }
4660 break;
4661
4662 case OP_ALLANY:
4663 if (eptr > md->end_subject - min)
4664 {
4665 SCHECK_PARTIAL();
4666 RRETURN(MATCH_NOMATCH);
4667 }
4668 eptr += min;
4669 break;
4670
4671 case OP_ANYBYTE:
4672 if (eptr > md->end_subject - min)
4673 {
4674 SCHECK_PARTIAL();
4675 RRETURN(MATCH_NOMATCH);
4676 }
4677 eptr += min;
4678 break;
4679
4680 case OP_ANYNL:
4681 for (i = 1; i <= min; i++)
4682 {
4683 if (eptr >= md->end_subject)
4684 {
4685 SCHECK_PARTIAL();
4686 RRETURN(MATCH_NOMATCH);
4687 }
4688 switch(*eptr++)
4689 {
4690 default: RRETURN(MATCH_NOMATCH);
4691
4692 case CHAR_CR:
4693 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4694 break;
4695
4696 case CHAR_LF:
4697 break;
4698
4699 case CHAR_VT:
4700 case CHAR_FF:
4701 case CHAR_NEL:
4702 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4703 case 0x2028:
4704 case 0x2029:
4705 #endif
4706 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4707 break;
4708 }
4709 }
4710 break;
4711
4712 case OP_NOT_HSPACE:
4713 for (i = 1; i <= min; i++)
4714 {
4715 if (eptr >= md->end_subject)
4716 {
4717 SCHECK_PARTIAL();
4718 RRETURN(MATCH_NOMATCH);
4719 }
4720 switch(*eptr++)
4721 {
4722 default: break;
4723 HSPACE_BYTE_CASES:
4724 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4725 HSPACE_MULTIBYTE_CASES:
4726 #endif
4727 RRETURN(MATCH_NOMATCH);
4728 }
4729 }
4730 break;
4731
4732 case OP_HSPACE:
4733 for (i = 1; i <= min; i++)
4734 {
4735 if (eptr >= md->end_subject)
4736 {
4737 SCHECK_PARTIAL();
4738 RRETURN(MATCH_NOMATCH);
4739 }
4740 switch(*eptr++)
4741 {
4742 default: RRETURN(MATCH_NOMATCH);
4743 HSPACE_BYTE_CASES:
4744 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4745 HSPACE_MULTIBYTE_CASES:
4746 #endif
4747 break;
4748 }
4749 }
4750 break;
4751
4752 case OP_NOT_VSPACE:
4753 for (i = 1; i <= min; i++)
4754 {
4755 if (eptr >= md->end_subject)
4756 {
4757 SCHECK_PARTIAL();
4758 RRETURN(MATCH_NOMATCH);
4759 }
4760 switch(*eptr++)
4761 {
4762 VSPACE_BYTE_CASES:
4763 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4764 VSPACE_MULTIBYTE_CASES:
4765 #endif
4766 RRETURN(MATCH_NOMATCH);
4767 default: break;
4768 }
4769 }
4770 break;
4771
4772 case OP_VSPACE:
4773 for (i = 1; i <= min; i++)
4774 {
4775 if (eptr >= md->end_subject)
4776 {
4777 SCHECK_PARTIAL();
4778 RRETURN(MATCH_NOMATCH);
4779 }
4780 switch(*eptr++)
4781 {
4782 default: RRETURN(MATCH_NOMATCH);
4783 VSPACE_BYTE_CASES:
4784 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4785 VSPACE_MULTIBYTE_CASES:
4786 #endif
4787 break;
4788 }
4789 }
4790 break;
4791
4792 case OP_NOT_DIGIT:
4793 for (i = 1; i <= min; i++)
4794 {
4795 if (eptr >= md->end_subject)
4796 {
4797 SCHECK_PARTIAL();
4798 RRETURN(MATCH_NOMATCH);
4799 }
4800 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4801 RRETURN(MATCH_NOMATCH);
4802 eptr++;
4803 }
4804 break;
4805
4806 case OP_DIGIT:
4807 for (i = 1; i <= min; i++)
4808 {
4809 if (eptr >= md->end_subject)
4810 {
4811 SCHECK_PARTIAL();
4812 RRETURN(MATCH_NOMATCH);
4813 }
4814 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4815 RRETURN(MATCH_NOMATCH);
4816 eptr++;
4817 }
4818 break;
4819
4820 case OP_NOT_WHITESPACE:
4821 for (i = 1; i <= min; i++)
4822 {
4823 if (eptr >= md->end_subject)
4824 {
4825 SCHECK_PARTIAL();
4826 RRETURN(MATCH_NOMATCH);
4827 }
4828 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4829 RRETURN(MATCH_NOMATCH);
4830 eptr++;
4831 }
4832 break;
4833
4834 case OP_WHITESPACE:
4835 for (i = 1; i <= min; i++)
4836 {
4837 if (eptr >= md->end_subject)
4838 {
4839 SCHECK_PARTIAL();
4840 RRETURN(MATCH_NOMATCH);
4841 }
4842 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4843 RRETURN(MATCH_NOMATCH);
4844 eptr++;
4845 }
4846 break;
4847
4848 case OP_NOT_WORDCHAR:
4849 for (i = 1; i <= min; i++)
4850 {
4851 if (eptr >= md->end_subject)
4852 {
4853 SCHECK_PARTIAL();
4854 RRETURN(MATCH_NOMATCH);
4855 }
4856 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4857 RRETURN(MATCH_NOMATCH);
4858 eptr++;
4859 }
4860 break;
4861
4862 case OP_WORDCHAR:
4863 for (i = 1; i <= min; i++)
4864 {
4865 if (eptr >= md->end_subject)
4866 {
4867 SCHECK_PARTIAL();
4868 RRETURN(MATCH_NOMATCH);
4869 }
4870 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4871 RRETURN(MATCH_NOMATCH);
4872 eptr++;
4873 }
4874 break;
4875
4876 default:
4877 RRETURN(PCRE_ERROR_INTERNAL);
4878 }
4879 }
4880
4881 /* If min = max, continue at the same level without recursing */
4882
4883 if (min == max) continue;
4884
4885 /* If minimizing, we have to test the rest of the pattern before each
4886 subsequent match. Again, separate the UTF-8 case for speed, and also
4887 separate the UCP cases. */
4888
4889 if (minimize)
4890 {
4891 #ifdef SUPPORT_UCP
4892 if (prop_type >= 0)
4893 {
4894 switch(prop_type)
4895 {
4896 case PT_ANY:
4897 for (fi = min;; fi++)
4898 {
4899 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4901 if (fi >= max) RRETURN(MATCH_NOMATCH);
4902 if (eptr >= md->end_subject)
4903 {
4904 SCHECK_PARTIAL();
4905 RRETURN(MATCH_NOMATCH);
4906 }
4907 GETCHARINCTEST(c, eptr);
4908 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4909 }
4910 /* Control never gets here */
4911
4912 case PT_LAMP:
4913 for (fi = min;; fi++)
4914 {
4915 int chartype;
4916 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4917 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4918 if (fi >= max) RRETURN(MATCH_NOMATCH);
4919 if (eptr >= md->end_subject)
4920 {
4921 SCHECK_PARTIAL();
4922 RRETURN(MATCH_NOMATCH);
4923 }
4924 GETCHARINCTEST(c, eptr);
4925 chartype = UCD_CHARTYPE(c);
4926 if ((chartype == ucp_Lu ||
4927 chartype == ucp_Ll ||
4928 chartype == ucp_Lt) == prop_fail_result)
4929 RRETURN(MATCH_NOMATCH);
4930 }
4931 /* Control never gets here */
4932
4933 case PT_GC:
4934 for (fi = min;; fi++)
4935 {
4936 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4938 if (fi >= max) RRETURN(MATCH_NOMATCH);
4939 if (eptr >= md->end_subject)
4940 {
4941 SCHECK_PARTIAL();
4942 RRETURN(MATCH_NOMATCH);
4943 }
4944 GETCHARINCTEST(c, eptr);
4945 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4946 RRETURN(MATCH_NOMATCH);
4947 }
4948 /* Control never gets here */
4949
4950 case PT_PC:
4951 for (fi = min;; fi++)
4952 {
4953 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4955 if (fi >= max) RRETURN(MATCH_NOMATCH);
4956 if (eptr >= md->end_subject)
4957 {
4958 SCHECK_PARTIAL();
4959 RRETURN(MATCH_NOMATCH);
4960 }
4961 GETCHARINCTEST(c, eptr);
4962 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4963 RRETURN(MATCH_NOMATCH);
4964 }
4965 /* Control never gets here */
4966
4967 case PT_SC:
4968 for (fi = min;; fi++)
4969 {
4970 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4972 if (fi >= max) RRETURN(MATCH_NOMATCH);
4973 if (eptr >= md->end_subject)
4974 {
4975 SCHECK_PARTIAL();
4976 RRETURN(MATCH_NOMATCH);
4977 }
4978 GETCHARINCTEST(c, eptr);
4979 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4980 RRETURN(MATCH_NOMATCH);
4981 }
4982 /* Control never gets here */
4983
4984 case PT_ALNUM:
4985 for (fi = min;; fi++)
4986 {
4987 int category;
4988 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4990 if (fi >= max) RRETURN(MATCH_NOMATCH);
4991 if (eptr >= md->end_subject)
4992 {
4993 SCHECK_PARTIAL();
4994 RRETURN(MATCH_NOMATCH);
4995 }
4996 GETCHARINCTEST(c, eptr);
4997 category = UCD_CATEGORY(c);
4998 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4999 RRETURN(MATCH_NOMATCH);
5000 }
5001 /* Control never gets here */
5002
5003 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5004 which means that Perl space and POSIX space are now identical. PCRE
5005 was changed at release 8.34. */
5006
5007 case PT_SPACE: /* Perl space */
5008 case PT_PXSPACE: /* POSIX space */
5009 for (fi = min;; fi++)
5010 {
5011 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5013 if (fi >= max) RRETURN(MATCH_NOMATCH);
5014 if (eptr >= md->end_subject)
5015 {
5016 SCHECK_PARTIAL();
5017 RRETURN(MATCH_NOMATCH);
5018 }
5019 GETCHARINCTEST(c, eptr);
5020 switch(c)
5021 {
5022 HSPACE_CASES:
5023 VSPACE_CASES:
5024 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5025 break;
5026
5027 default:
5028 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5029 RRETURN(MATCH_NOMATCH);
5030 break;
5031 }
5032 }
5033 /* Control never gets here */
5034
5035 case PT_WORD:
5036 for (fi = min;; fi++)
5037 {
5038 int category;
5039 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5041 if (fi >= max) RRETURN(MATCH_NOMATCH);
5042 if (eptr >= md->end_subject)
5043 {
5044 SCHECK_PARTIAL();
5045 RRETURN(MATCH_NOMATCH);
5046 }
5047 GETCHARINCTEST(c, eptr);
5048 category = UCD_CATEGORY(c);
5049 if ((category == ucp_L ||
5050 category == ucp_N ||
5051 c == CHAR_UNDERSCORE)
5052 == prop_fail_result)
5053 RRETURN(MATCH_NOMATCH);
5054 }
5055 /* Control never gets here */
5056
5057 case PT_CLIST:
5058 for (fi = min;; fi++)
5059 {
5060 const pcre_uint32 *cp;
5061 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5062 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5063 if (fi >= max) RRETURN(MATCH_NOMATCH);
5064 if (eptr >= md->end_subject)
5065 {
5066 SCHECK_PARTIAL();
5067 RRETURN(MATCH_NOMATCH);
5068 }
5069 GETCHARINCTEST(c, eptr);
5070 cp = PRIV(ucd_caseless_sets) + prop_value;
5071 for (;;)
5072 {
5073 if (c < *cp)
5074 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5075 if (c == *cp++)
5076 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5077 }
5078 }
5079 /* Control never gets here */
5080
5081 case PT_UCNC:
5082 for (fi = min;; fi++)
5083 {
5084 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5086 if (fi >= max) RRETURN(MATCH_NOMATCH);
5087 if (eptr >= md->end_subject)
5088 {
5089 SCHECK_PARTIAL();
5090 RRETURN(MATCH_NOMATCH);
5091 }
5092 GETCHARINCTEST(c, eptr);
5093 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5094 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5095 c >= 0xe000) == prop_fail_result)
5096 RRETURN(MATCH_NOMATCH);
5097 }
5098 /* Control never gets here */
5099
5100 /* This should never occur */
5101 default:
5102 RRETURN(PCRE_ERROR_INTERNAL);
5103 }
5104 }
5105
5106 /* Match extended Unicode grapheme clusters. We will get here only if the
5107 support is in the binary; otherwise a compile-time error occurs. */
5108
5109 else if (ctype == OP_EXTUNI)
5110 {
5111 for (fi = min;; fi++)
5112 {
5113 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5114 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5115 if (fi >= max) RRETURN(MATCH_NOMATCH);
5116 if (eptr >= md->end_subject)
5117 {
5118 SCHECK_PARTIAL();
5119 RRETURN(MATCH_NOMATCH);
5120 }
5121 else
5122 {
5123 int lgb, rgb;
5124 GETCHARINCTEST(c, eptr);
5125 lgb = UCD_GRAPHBREAK(c);
5126 while (eptr < md->end_subject)
5127 {
5128 int len = 1;
5129 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5130 rgb = UCD_GRAPHBREAK(c);
5131 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5132 lgb = rgb;
5133 eptr += len;
5134 }
5135 }
5136 CHECK_PARTIAL();
5137 }
5138 }
5139 else
5140 #endif /* SUPPORT_UCP */
5141
5142 #ifdef SUPPORT_UTF
5143 if (utf)
5144 {
5145 for (fi = min;; fi++)
5146 {
5147 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5148 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5149 if (fi >= max) RRETURN(MATCH_NOMATCH);
5150 if (eptr >= md->end_subject)
5151 {
5152 SCHECK_PARTIAL();
5153 RRETURN(MATCH_NOMATCH);
5154 }
5155 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5156 RRETURN(MATCH_NOMATCH);
5157 GETCHARINC(c, eptr);
5158 switch(ctype)
5159 {
5160 case OP_ANY: /* This is the non-NL case */
5161 if (md->partial != 0 && /* Take care with CRLF partial */
5162 eptr >= md->end_subject &&
5163 NLBLOCK->nltype == NLTYPE_FIXED &&
5164 NLBLOCK->nllen == 2 &&
5165 c == NLBLOCK->nl[0])
5166 {
5167 md->hitend = TRUE;
5168 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5169 }
5170 break;
5171
5172 case OP_ALLANY:
5173 case OP_ANYBYTE:
5174 break;
5175
5176 case OP_ANYNL:
5177 switch(c)
5178 {
5179 default: RRETURN(MATCH_NOMATCH);
5180 case CHAR_CR:
5181 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5182 break;
5183
5184 case CHAR_LF:
5185 break;
5186
5187 case CHAR_VT:
5188 case CHAR_FF:
5189 case CHAR_NEL:
5190 #ifndef EBCDIC
5191 case 0x2028:
5192 case 0x2029:
5193 #endif /* Not EBCDIC */
5194 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5195 break;
5196 }
5197 break;
5198
5199 case OP_NOT_HSPACE:
5200 switch(c)
5201 {
5202 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5203 default: break;
5204 }
5205 break;
5206
5207 case OP_HSPACE:
5208 switch(c)
5209 {
5210 HSPACE_CASES: break;
5211 default: RRETURN(MATCH_NOMATCH);
5212 }
5213 break;
5214
5215 case OP_NOT_VSPACE:
5216 switch(c)
5217 {
5218 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5219 default: break;
5220 }
5221 break;
5222
5223 case OP_VSPACE:
5224 switch(c)
5225 {
5226 VSPACE_CASES: break;
5227 default: RRETURN(MATCH_NOMATCH);
5228 }
5229 break;
5230
5231 case OP_NOT_DIGIT:
5232 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5233 RRETURN(MATCH_NOMATCH);
5234 break;
5235
5236 case OP_DIGIT:
5237 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5238 RRETURN(MATCH_NOMATCH);
5239 break;
5240
5241 case OP_NOT_WHITESPACE:
5242 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5243 RRETURN(MATCH_NOMATCH);
5244 break;
5245
5246 case OP_WHITESPACE:
5247 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5248 RRETURN(MATCH_NOMATCH);
5249 break;
5250
5251 case OP_NOT_WORDCHAR:
5252 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5253 RRETURN(MATCH_NOMATCH);
5254 break;
5255
5256 case OP_WORDCHAR:
5257 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5258 RRETURN(MATCH_NOMATCH);
5259 break;
5260
5261 default:
5262 RRETURN(PCRE_ERROR_INTERNAL);
5263 }
5264 }
5265 }
5266 else
5267 #endif
5268 /* Not UTF mode */
5269 {
5270 for (fi = min;; fi++)
5271 {
5272 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5273 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5274 if (fi >= max) RRETURN(MATCH_NOMATCH);
5275 if (eptr >= md->end_subject)
5276 {
5277 SCHECK_PARTIAL();
5278 RRETURN(MATCH_NOMATCH);
5279 }
5280 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5281 RRETURN(MATCH_NOMATCH);
5282 c = *eptr++;
5283 switch(ctype)
5284 {
5285 case OP_ANY: /* This is the non-NL case */
5286 if (md->partial != 0 && /* Take care with CRLF partial */
5287 eptr >= md->end_subject &&
5288 NLBLOCK->nltype == NLTYPE_FIXED &&
5289 NLBLOCK->nllen == 2 &&
5290 c == NLBLOCK->nl[0])
5291 {
5292 md->hitend = TRUE;
5293 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5294 }
5295 break;
5296
5297 case OP_ALLANY:
5298 case OP_ANYBYTE:
5299 break;
5300
5301 case OP_ANYNL:
5302 switch(c)
5303 {
5304 default: RRETURN(MATCH_NOMATCH);
5305 case CHAR_CR:
5306 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5307 break;
5308
5309 case CHAR_LF:
5310 break;
5311
5312 case CHAR_VT:
5313 case CHAR_FF:
5314 case CHAR_NEL:
5315 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5316 case 0x2028:
5317 case 0x2029:
5318 #endif
5319 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5320 break;
5321 }
5322 break;
5323
5324 case OP_NOT_HSPACE:
5325 switch(c)
5326 {
5327 default: break;
5328 HSPACE_BYTE_CASES:
5329 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5330 HSPACE_MULTIBYTE_CASES:
5331 #endif
5332 RRETURN(MATCH_NOMATCH);
5333 }
5334 break;
5335
5336 case OP_HSPACE:
5337 switch(c)
5338 {
5339 default: RRETURN(MATCH_NOMATCH);
5340 HSPACE_BYTE_CASES:
5341 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5342 HSPACE_MULTIBYTE_CASES:
5343 #endif
5344 break;
5345 }
5346 break;
5347
5348 case OP_NOT_VSPACE:
5349 switch(c)
5350 {
5351 default: break;
5352 VSPACE_BYTE_CASES:
5353 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5354 VSPACE_MULTIBYTE_CASES:
5355 #endif
5356 RRETURN(MATCH_NOMATCH);
5357 }
5358 break;
5359
5360 case OP_VSPACE:
5361 switch(c)
5362 {
5363 default: RRETURN(MATCH_NOMATCH);
5364 VSPACE_BYTE_CASES:
5365 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5366 VSPACE_MULTIBYTE_CASES:
5367 #endif
5368 break;
5369 }
5370 break;
5371
5372 case OP_NOT_DIGIT:
5373 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5374 break;
5375
5376 case OP_DIGIT:
5377 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5378 break;
5379
5380 case OP_NOT_WHITESPACE:
5381 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5382 break;
5383
5384 case OP_WHITESPACE:
5385 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5386 break;
5387
5388 case OP_NOT_WORDCHAR:
5389 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5390 break;
5391
5392 case OP_WORDCHAR:
5393 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5394 break;
5395
5396 default:
5397 RRETURN(PCRE_ERROR_INTERNAL);
5398 }
5399 }
5400 }
5401 /* Control never gets here */
5402 }
5403
5404 /* If maximizing, it is worth using inline code for speed, doing the type
5405 test once at the start (i.e. keep it out of the loop). Again, keep the
5406 UTF-8 and UCP stuff separate. */
5407
5408 else
5409 {
5410 pp = eptr; /* Remember where we started */
5411
5412 #ifdef SUPPORT_UCP
5413 if (prop_type >= 0)
5414 {
5415 switch(prop_type)
5416 {
5417 case PT_ANY:
5418 for (i = min; i < max; i++)
5419 {
5420 int len = 1;
5421 if (eptr >= md->end_subject)
5422 {
5423 SCHECK_PARTIAL();
5424 break;
5425 }
5426 GETCHARLENTEST(c, eptr, len);
5427 if (prop_fail_result) break;
5428 eptr+= len;
5429 }
5430 break;
5431
5432 case PT_LAMP:
5433 for (i = min; i < max; i++)
5434 {
5435 int chartype;
5436 int len = 1;
5437 if (eptr >= md->end_subject)
5438 {
5439 SCHECK_PARTIAL();
5440 break;
5441 }
5442 GETCHARLENTEST(c, eptr, len);
5443 chartype = UCD_CHARTYPE(c);
5444 if ((chartype == ucp_Lu ||
5445 chartype == ucp_Ll ||
5446 chartype == ucp_Lt) == prop_fail_result)
5447 break;
5448 eptr+= len;
5449 }
5450 break;
5451
5452 case PT_GC:
5453 for (i = min; i < max; i++)
5454 {
5455 int len = 1;
5456 if (eptr >= md->end_subject)
5457 {
5458 SCHECK_PARTIAL();
5459 break;
5460 }
5461 GETCHARLENTEST(c, eptr, len);
5462 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5463 eptr+= len;
5464 }
5465 break;
5466
5467 case PT_PC:
5468 for (i = min; i < max; i++)
5469 {
5470 int len = 1;
5471 if (eptr >= md->end_subject)
5472 {
5473 SCHECK_PARTIAL();
5474 break;
5475 }
5476 GETCHARLENTEST(c, eptr, len);
5477 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5478 eptr+= len;
5479 }
5480 break;
5481
5482 case PT_SC:
5483 for (i = min; i < max; i++)
5484 {
5485 int len = 1;
5486 if (eptr >= md->end_subject)
5487 {
5488 SCHECK_PARTIAL();
5489 break;
5490 }
5491 GETCHARLENTEST(c, eptr, len);
5492 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5493 eptr+= len;
5494 }
5495 break;
5496
5497 case PT_ALNUM:
5498 for (i = min; i < max; i++)
5499 {
5500 int category;
5501 int len = 1;
5502 if (eptr >= md->end_subject)
5503 {
5504 SCHECK_PARTIAL();
5505 break;
5506 }
5507 GETCHARLENTEST(c, eptr, len);
5508 category = UCD_CATEGORY(c);
5509 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5510 break;
5511 eptr+= len;
5512 }
5513 break;
5514
5515 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5516 which means that Perl space and POSIX space are now identical. PCRE
5517 was changed at release 8.34. */
5518
5519 case PT_SPACE: /* Perl space */
5520 case PT_PXSPACE: /* POSIX space */
5521 for (i = min; i < max; i++)
5522 {
5523 int len = 1;
5524 if (eptr >= md->end_subject)
5525 {
5526 SCHECK_PARTIAL();
5527 break;
5528 }
5529 GETCHARLENTEST(c, eptr, len);
5530 switch(c)
5531 {
5532 HSPACE_CASES:
5533 VSPACE_CASES:
5534 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5535 break;
5536
5537 default:
5538 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5539 goto ENDLOOP99; /* Break the loop */
5540 break;
5541 }
5542 eptr+= len;
5543 }
5544 ENDLOOP99:
5545 break;
5546
5547 case PT_WORD:
5548 for (i = min; i < max; i++)
5549 {
5550 int category;
5551 int len = 1;
5552 if (eptr >= md->end_subject)
5553 {
5554 SCHECK_PARTIAL();
5555 break;
5556 }
5557 GETCHARLENTEST(c, eptr, len);
5558 category = UCD_CATEGORY(c);
5559 if ((category == ucp_L || category == ucp_N ||
5560 c == CHAR_UNDERSCORE) == prop_fail_result)
5561 break;
5562 eptr+= len;
5563 }
5564 break;
5565
5566 case PT_CLIST:
5567 for (i = min; i < max; i++)
5568 {
5569 const pcre_uint32 *cp;
5570 int len = 1;
5571 if (eptr >= md->end_subject)
5572 {
5573 SCHECK_PARTIAL();
5574 break;
5575 }
5576 GETCHARLENTEST(c, eptr, len);
5577 cp = PRIV(ucd_caseless_sets) + prop_value;
5578 for (;;)
5579 {
5580 if (c < *cp)
5581 { if (prop_fail_result) break; else goto GOT_MAX; }
5582 if (c == *cp++)
5583 { if (prop_fail_result) goto GOT_MAX; else break; }
5584 }
5585 eptr += len;
5586 }
5587 GOT_MAX:
5588 break;
5589
5590 case PT_UCNC:
5591 for (i = min; i < max; i++)
5592 {
5593 int len = 1;
5594 if (eptr >= md->end_subject)
5595 {
5596 SCHECK_PARTIAL();
5597 break;
5598 }
5599 GETCHARLENTEST(c, eptr, len);
5600 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5601 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5602 c >= 0xe000) == prop_fail_result)
5603 break;
5604 eptr += len;
5605 }
5606 break;
5607
5608 default:
5609 RRETURN(PCRE_ERROR_INTERNAL);
5610 }
5611
5612 /* eptr is now past the end of the maximum run */
5613
5614 if (possessive) continue; /* No backtracking */
5615 for(;;)
5616 {
5617 if (eptr == pp) goto TAIL_RECURSE;
5618 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5620 eptr--;
5621 if (utf) BACKCHAR(eptr);
5622 }
5623 }
5624
5625 /* Match extended Unicode grapheme clusters. We will get here only if the
5626 support is in the binary; otherwise a compile-time error occurs. */
5627
5628 else if (ctype == OP_EXTUNI)
5629 {
5630 for (i = min; i < max; i++)
5631 {
5632 if (eptr >= md->end_subject)
5633 {
5634 SCHECK_PARTIAL();
5635 break;
5636 }
5637 else
5638 {
5639 int lgb, rgb;
5640 GETCHARINCTEST(c, eptr);
5641 lgb = UCD_GRAPHBREAK(c);
5642 while (eptr < md->end_subject)
5643 {
5644 int len = 1;
5645 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5646 rgb = UCD_GRAPHBREAK(c);
5647 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5648 lgb = rgb;
5649 eptr += len;
5650 }
5651 }
5652 CHECK_PARTIAL();
5653 }
5654
5655 /* eptr is now past the end of the maximum run */
5656
5657 if (possessive) continue; /* No backtracking */
5658
5659 for(;;)
5660 {
5661 int lgb, rgb;
5662 PCRE_PUCHAR fptr;
5663
5664 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5665 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5666 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5667
5668 /* Backtracking over an extended grapheme cluster involves inspecting
5669 the previous two characters (if present) to see if a break is
5670 permitted between them. */
5671
5672 eptr--;
5673 if (!utf) c = *eptr; else
5674 {
5675 BACKCHAR(eptr);
5676 GETCHAR(c, eptr);
5677 }
5678 rgb = UCD_GRAPHBREAK(c);
5679
5680 for (;;)
5681 {
5682 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5683 fptr = eptr - 1;
5684 if (!utf) c = *fptr; else
5685 {
5686 BACKCHAR(fptr);
5687 GETCHAR(c, fptr);
5688 }
5689 lgb = UCD_GRAPHBREAK(c);
5690 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5691 eptr = fptr;
5692 rgb = lgb;
5693 }
5694 }
5695 }
5696
5697 else
5698 #endif /* SUPPORT_UCP */
5699
5700 #ifdef SUPPORT_UTF
5701 if (utf)
5702 {
5703 switch(ctype)
5704 {
5705 case OP_ANY:
5706 for (i = min; i < max; i++)
5707 {
5708 if (eptr >= md->end_subject)
5709 {
5710 SCHECK_PARTIAL();
5711 break;
5712 }
5713 if (IS_NEWLINE(eptr)) break;
5714 if (md->partial != 0 && /* Take care with CRLF partial */
5715 eptr + 1 >= md->end_subject &&
5716 NLBLOCK->nltype == NLTYPE_FIXED &&
5717 NLBLOCK->nllen == 2 &&
5718 UCHAR21(eptr) == NLBLOCK->nl[0])
5719 {
5720 md->hitend = TRUE;
5721 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5722 }
5723 eptr++;
5724 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5725 }
5726 break;
5727
5728 case OP_ALLANY:
5729 if (max < INT_MAX)
5730 {
5731 for (i = min; i < max; i++)
5732 {
5733 if (eptr >= md->end_subject)
5734 {
5735 SCHECK_PARTIAL();
5736 break;
5737 }
5738 eptr++;
5739 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5740 }
5741 }
5742 else
5743 {
5744 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5745 SCHECK_PARTIAL();
5746 }
5747 break;
5748
5749 /* The byte case is the same as non-UTF8 */
5750
5751 case OP_ANYBYTE:
5752 c = max - min;
5753 if (c > (unsigned int)(md->end_subject - eptr))
5754 {
5755 eptr = md->end_subject;
5756 SCHECK_PARTIAL();
5757 }
5758 else eptr += c;
5759 break;
5760
5761 case OP_ANYNL:
5762 for (i = min; i < max; i++)
5763 {
5764 int len = 1;
5765 if (eptr >= md->end_subject)
5766 {
5767 SCHECK_PARTIAL();
5768 break;
5769 }
5770 GETCHARLEN(c, eptr, len);
5771 if (c == CHAR_CR)
5772 {
5773 if (++eptr >= md->end_subject) break;
5774 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5775 }
5776 else
5777 {
5778 if (c != CHAR_LF &&
5779 (md->bsr_anycrlf ||
5780 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5781 #ifndef EBCDIC
5782 && c != 0x2028 && c != 0x2029
5783 #endif /* Not EBCDIC */
5784 )))
5785 break;
5786 eptr += len;
5787 }
5788 }
5789 break;
5790
5791 case OP_NOT_HSPACE:
5792 case OP_HSPACE:
5793 for (i = min; i < max; i++)
5794 {
5795 BOOL gotspace;
5796 int len = 1;
5797 if (eptr >= md->end_subject)
5798 {
5799 SCHECK_PARTIAL();
5800 break;
5801 }
5802 GETCHARLEN(c, eptr, len);
5803 switch(c)
5804 {
5805 HSPACE_CASES: gotspace = TRUE; break;
5806 default: gotspace = FALSE; break;
5807 }
5808 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5809 eptr += len;
5810 }
5811 break;
5812
5813 case OP_NOT_VSPACE:
5814 case OP_VSPACE:
5815 for (i = min; i < max; i++)
5816 {
5817 BOOL gotspace;
5818 int len = 1;
5819 if (eptr >= md->end_subject)
5820 {
5821 SCHECK_PARTIAL();
5822 break;
5823 }
5824 GETCHARLEN(c, eptr, len);
5825 switch(c)
5826 {
5827 VSPACE_CASES: gotspace = TRUE; break;
5828 default: gotspace = FALSE; break;
5829 }
5830 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5831 eptr += len;
5832 }
5833 break;
5834
5835 case OP_NOT_DIGIT:
5836 for (i = min; i < max; i++)
5837 {
5838 int len = 1;
5839 if (eptr >= md->end_subject)
5840 {
5841 SCHECK_PARTIAL();
5842 break;
5843 }
5844 GETCHARLEN(c, eptr, len);
5845 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5846 eptr+= len;
5847 }
5848 break;
5849
5850 case OP_DIGIT:
5851 for (i = min; i < max; i++)
5852 {
5853 int len = 1;
5854 if (eptr >= md->end_subject)
5855 {
5856 SCHECK_PARTIAL();
5857 break;
5858 }
5859 GETCHARLEN(c, eptr, len);
5860 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5861 eptr+= len;
5862 }
5863 break;
5864
5865 case OP_NOT_WHITESPACE:
5866 for (i = min; i < max; i++)
5867 {
5868 int len = 1;
5869 if (eptr >= md->end_subject)
5870 {
5871 SCHECK_PARTIAL();
5872 break;
5873 }
5874 GETCHARLEN(c, eptr, len);
5875 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5876 eptr+= len;
5877 }
5878 break;
5879
5880 case OP_WHITESPACE:
5881 for (i = min; i < max; i++)
5882 {
5883 int len = 1;
5884 if (eptr >= md->end_subject)
5885 {
5886 SCHECK_PARTIAL();
5887 break;
5888 }
5889 GETCHARLEN(c, eptr, len);
5890 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5891 eptr+= len;
5892 }
5893 break;
5894
5895 case OP_NOT_WORDCHAR:
5896 for (i = min; i < max; i++)
5897 {
5898 int len = 1;
5899 if (eptr >= md->end_subject)
5900 {
5901 SCHECK_PARTIAL();
5902 break;
5903 }
5904 GETCHARLEN(c, eptr, len);
5905 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5906 eptr+= len;
5907 }
5908 break;
5909
5910 case OP_WORDCHAR:
5911 for (i = min; i < max; i++)
5912 {
5913 int len = 1;
5914 if (eptr >= md->end_subject)
5915 {
5916 SCHECK_PARTIAL();
5917 break;
5918 }
5919 GETCHARLEN(c, eptr, len);
5920 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5921 eptr+= len;
5922 }
5923 break;
5924
5925 default:
5926 RRETURN(PCRE_ERROR_INTERNAL);
5927 }
5928
5929 if (possessive) continue; /* No backtracking */
5930 for(;;)
5931 {
5932 if (eptr == pp) goto TAIL_RECURSE;
5933 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5935 eptr--;
5936 BACKCHAR(eptr);
5937 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5938 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5939 }
5940 }
5941 else
5942 #endif /* SUPPORT_UTF */
5943 /* Not UTF mode */
5944 {
5945 switch(ctype)
5946 {
5947 case OP_ANY:
5948 for (i = min; i < max; i++)
5949 {
5950 if (eptr >= md->end_subject)
5951 {
5952 SCHECK_PARTIAL();
5953 break;
5954 }
5955 if (IS_NEWLINE(eptr)) break;
5956 if (md->partial != 0 && /* Take care with CRLF partial */
5957 eptr + 1 >= md->end_subject &&
5958 NLBLOCK->nltype == NLTYPE_FIXED &&
5959 NLBLOCK->nllen == 2 &&
5960 *eptr == NLBLOCK->nl[0])
5961 {
5962 md->hitend = TRUE;
5963 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5964 }
5965 eptr++;
5966 }
5967 break;
5968
5969 case OP_ALLANY:
5970 case OP_ANYBYTE:
5971 c = max - min;
5972 if (c > (unsigned int)(md->end_subject - eptr))
5973 {
5974 eptr = md->end_subject;
5975 SCHECK_PARTIAL();
5976 }
5977 else eptr += c;
5978 break;
5979
5980 case OP_ANYNL:
5981 for (i = min; i < max; i++)
5982 {
5983 if (eptr >= md->end_subject)
5984 {
5985 SCHECK_PARTIAL();
5986 break;
5987 }
5988 c = *eptr;
5989 if (c == CHAR_CR)
5990 {
5991 if (++eptr >= md->end_subject) break;
5992 if (*eptr == CHAR_LF) eptr++;
5993 }
5994 else
5995 {
5996 if (c != CHAR_LF && (md->bsr_anycrlf ||
5997 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5998 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5999 && c != 0x2028 && c != 0x2029
6000 #endif
6001 ))) break;
6002 eptr++;
6003 }
6004 }
6005 break;
6006
6007 case OP_NOT_HSPACE:
6008 for (i = min; i < max; i++)
6009 {
6010 if (eptr >= md->end_subject)
6011 {
6012 SCHECK_PARTIAL();
6013 break;
6014 }
6015 switch(*eptr)
6016 {
6017 default: eptr++; break;
6018 HSPACE_BYTE_CASES:
6019 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6020 HSPACE_MULTIBYTE_CASES:
6021 #endif
6022 goto ENDLOOP00;
6023 }
6024 }
6025 ENDLOOP00:
6026 break;
6027
6028 case OP_HSPACE:
6029 for (i = min; i < max; i++)
6030 {
6031 if (eptr >= md->end_subject)
6032 {
6033 SCHECK_PARTIAL();
6034 break;
6035 }
6036 switch(*eptr)
6037 {
6038 default: goto ENDLOOP01;
6039 HSPACE_BYTE_CASES:
6040 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6041 HSPACE_MULTIBYTE_CASES:
6042 #endif
6043 eptr++; break;
6044 }
6045 }
6046 ENDLOOP01:
6047 break;
6048
6049 case OP_NOT_VSPACE:
6050 for (i = min; i < max; i++)
6051 {
6052 if (eptr >= md->end_subject)
6053 {
6054 SCHECK_PARTIAL();
6055 break;
6056 }
6057 switch(*eptr)
6058 {
6059 default: eptr++; break;
6060 VSPACE_BYTE_CASES:
6061 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6062 VSPACE_MULTIBYTE_CASES:
6063 #endif
6064 goto ENDLOOP02;
6065 }
6066 }
6067 ENDLOOP02:
6068 break;
6069
6070 case OP_VSPACE:
6071 for (i = min; i < max; i++)
6072 {
6073 if (eptr >= md->end_subject)
6074 {
6075 SCHECK_PARTIAL();
6076 break;
6077 }
6078 switch(*eptr)
6079 {
6080 default: goto ENDLOOP03;
6081 VSPACE_BYTE_CASES:
6082 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6083 VSPACE_MULTIBYTE_CASES:
6084 #endif
6085 eptr++; break;
6086 }
6087 }
6088 ENDLOOP03:
6089 break;
6090
6091 case OP_NOT_DIGIT:
6092 for (i = min; i < max; i++)
6093 {
6094 if (eptr >= md->end_subject)
6095 {
6096 SCHECK_PARTIAL();
6097 break;
6098 }
6099 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6100 eptr++;
6101 }
6102 break;
6103
6104 case OP_DIGIT:
6105 for (i = min; i < max; i++)
6106 {
6107 if (eptr >= md->end_subject)
6108 {
6109 SCHECK_PARTIAL();
6110 break;
6111 }
6112 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6113 eptr++;
6114 }
6115 break;
6116
6117 case OP_NOT_WHITESPACE:
6118 for (i = min; i < max; i++)
6119 {
6120 if (eptr >= md->end_subject)
6121 {
6122 SCHECK_PARTIAL();
6123 break;
6124 }
6125 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6126 eptr++;
6127 }
6128 break;
6129
6130 case OP_WHITESPACE:
6131 for (i = min; i < max; i++)
6132 {
6133 if (eptr >= md->end_subject)
6134 {
6135 SCHECK_PARTIAL();
6136 break;
6137 }
6138 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6139 eptr++;
6140 }
6141 break;
6142
6143 case OP_NOT_WORDCHAR:
6144 for (i = min; i < max; i++)
6145 {
6146 if (eptr >= md->end_subject)
6147 {
6148 SCHECK_PARTIAL();
6149 break;
6150 }
6151 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6152 eptr++;
6153 }
6154 break;
6155
6156 case OP_WORDCHAR:
6157 for (i = min; i < max; i++)
6158 {
6159 if (eptr >= md->end_subject)
6160 {
6161 SCHECK_PARTIAL();
6162 break;
6163 }
6164 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6165 eptr++;
6166 }
6167 break;
6168
6169 default:
6170 RRETURN(PCRE_ERROR_INTERNAL);
6171 }
6172
6173 if (possessive) continue; /* No backtracking */
6174 for (;;)
6175 {
6176 if (eptr == pp) goto TAIL_RECURSE;
6177 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6178 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6179 eptr--;
6180 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6181 eptr[-1] == CHAR_CR) eptr--;
6182 }
6183 }
6184
6185 /* Control never gets here */
6186 }
6187
6188 /* There's been some horrible disaster. Arrival here can only mean there is
6189 something seriously wrong in the code above or the OP_xxx definitions. */
6190
6191 default:
6192 DPRINTF(("Unknown opcode %d\n", *ecode));
6193 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6194 }
6195
6196 /* Do not stick any code in here without much thought; it is assumed
6197 that "continue" in the code above comes out to here to repeat the main
6198 loop. */
6199
6200 } /* End of main loop */
6201 /* Control never reaches here */
6202
6203
6204 /* When compiling to use the heap rather than the stack for recursive calls to
6205 match(), the RRETURN() macro jumps here. The number that is saved in
6206 frame->Xwhere indicates which label we actually want to return to. */
6207
6208 #ifdef NO_RECURSE
6209 #define LBL(val) case val: goto L_RM##val;
6210 HEAP_RETURN:
6211 switch (frame->Xwhere)
6212 {
6213 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6214 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6215 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6216 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6217 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6218 LBL(65) LBL(66)
6219 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6220 LBL(20) LBL(21)
6221 #endif
6222 #ifdef SUPPORT_UTF
6223 LBL(16) LBL(18)
6224 LBL(22) LBL(23) LBL(28) LBL(30)
6225 LBL(32) LBL(34) LBL(42) LBL(46)
6226 #ifdef SUPPORT_UCP
6227 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6228 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6229 #endif /* SUPPORT_UCP */
6230 #endif /* SUPPORT_UTF */
6231 default:
6232 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6233 return PCRE_ERROR_INTERNAL;
6234 }
6235 #undef LBL
6236 #endif /* NO_RECURSE */
6237 }
6238
6239
6240 /***************************************************************************
6241 ****************************************************************************
6242 RECURSION IN THE match() FUNCTION
6243
6244 Undefine all the macros that were defined above to handle this. */
6245
6246 #ifdef NO_RECURSE /* Build that uses the heap, not the stack, for match() recursion */
6247 #undef eptr
6248 #undef ecode
6249 #undef mstart
6250 #undef offset_top
6251 #undef eptrb
6252 #undef flags /* pcre_exec() below declares a local variable called flags */
6253
6254 #undef callpat
6255 #undef charptr
6256 #undef data
6257 #undef next
6258 #undef pp
6259 #undef prev
6260 #undef saved_eptr
6261
6262 #undef new_recursive
6263
6264 #undef cur_is_word
6265 #undef condition
6266 #undef prev_is_word
6267
6268 #undef ctype
6269 #undef length /* pcre_exec() below has a parameter called length */
6270 #undef max
6271 #undef min
6272 #undef number
6273 #undef offset
6274 #undef op
6275 #undef save_capture_last
6276 #undef save_offset1
6277 #undef save_offset2
6278 #undef save_offset3
6279 #undef stacksave
6280
6281 #undef newptrb
6282
6283 #endif /* NO_RECURSE */
6284
6285 /* These two are defined as macros in both cases, so they are undefined
   outside the NO_RECURSE conditional. */
6286
6287 #undef fc
6288 #undef fi
6289
6290 /***************************************************************************
6291 ***************************************************************************/
6292
6293
6294 #ifdef NO_RECURSE
6295 /*************************************************
6296 * Release allocated heap frames *
6297 *************************************************/
6298
6299 /* This function releases all the allocated frames. The base frame is on the
6300 machine stack, and so must not be freed.
6301
6302 Argument: the address of the base frame
6303 Returns: nothing
6304 */
6305
6306 static void
6307 release_match_heapframes (heapframe *frame_base)
6308 {
6309 heapframe *nextframe = frame_base->Xnextframe;
6310 while (nextframe != NULL)
6311 {
6312 heapframe *oldframe = nextframe;
6313 nextframe = nextframe->Xnextframe;
6314 (PUBL(stack_free))(oldframe);
6315 }
6316 }
6317 #endif
6318
6319
6320 /*************************************************
6321 * Execute a Regular Expression *
6322 *************************************************/
6323
6324 /* This function applies a compiled re to a subject string and picks out
6325 portions of the string if it matches. Two elements in the vector are set for
6326 each substring: the offsets to the start and end of the substring.
6327
6328 Arguments:
6329 argument_re points to the compiled expression
6330 extra_data points to extra data or is NULL
6331 subject points to the subject string
6332 length length of subject string (may contain binary zeros)
6333 start_offset where to start in the subject string
6334 options option bits
6335 offsets points to a vector of ints to be filled in with offsets
6336 offsetcount the number of elements in the vector
6337
6338 Returns: > 0 => success; value is the number of elements filled in
6339 = 0 => success, but offsets is not big enough
6340 -1 => failed to match
6341 < -1 => some kind of unexpected problem
6342 */
6343
6344 #if defined COMPILE_PCRE8
6345 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6346 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6347 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6348 int offsetcount)
6349 #elif defined COMPILE_PCRE16
6350 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6351 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6352 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6353 int offsetcount)
6354 #elif defined COMPILE_PCRE32
6355 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6356 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6357 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6358 int offsetcount)
6359 #endif
6360 {
6361 int rc, ocount, arg_offset_max;
6362 int newline;
6363 BOOL using_temporary_offsets = FALSE;
6364 BOOL anchored;
6365 BOOL startline;
6366 BOOL firstline;
6367 BOOL utf;
6368 BOOL has_first_char = FALSE;
6369 BOOL has_req_char = FALSE;
6370 pcre_uchar first_char = 0;
6371 pcre_uchar first_char2 = 0;
6372 pcre_uchar req_char = 0;
6373 pcre_uchar req_char2 = 0;
6374 match_data match_block;
6375 match_data *md = &match_block;
6376 const pcre_uint8 *tables;
6377 const pcre_uint8 *start_bits = NULL;
6378 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6379 PCRE_PUCHAR end_subject;
6380 PCRE_PUCHAR start_partial = NULL;
6381 PCRE_PUCHAR match_partial = NULL;
6382 PCRE_PUCHAR req_char_ptr = start_match - 1;
6383
6384 const pcre_study_data *study;
6385 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6386
6387 #ifdef NO_RECURSE
6388 heapframe frame_zero;
6389 frame_zero.Xprevframe = NULL; /* Marks the top level */
6390 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6391 md->match_frames_base = &frame_zero;
6392 #endif
6393
6394 /* Check for the special magic call that measures the size of the stack used
6395 per recursive call of match(). Without the funny casting for sizeof, a Windows
6396 compiler gave this error: "unary minus operator applied to unsigned type,
6397 result still unsigned". Hopefully the cast fixes that. */
6398
6399 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6400 start_offset == -999)
6401 #ifdef NO_RECURSE
6402 return -((int)sizeof(heapframe));
6403 #else
6404 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6405 #endif
6406
6407 /* Plausibility checks */
6408
6409 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6410 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6411 return PCRE_ERROR_NULL;
6412 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6413 if (length < 0) return PCRE_ERROR_BADLENGTH;
6414 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6415
6416 /* Check that the first field in the block is the magic number. If it is not,
6417 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6418 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6419 means that the pattern is likely compiled with different endianness. */
6420
6421 if (re->magic_number != MAGIC_NUMBER)
6422 return re->magic_number == REVERSED_MAGIC_NUMBER?
6423 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6424 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6425
6426 /* These two settings are used in the code for checking a UTF-8 string that
6427 follows immediately afterwards. Other values in the md block are used only
6428 during "normal" pcre_exec() processing, not when the JIT support is in use,
6429 so they are set up later. */
6430
6431 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6432 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6433 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6434 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6435
6436 /* Check a UTF-8 string if required. Pass back the character offset and error
6437 code for an invalid string if a results vector is available. */
6438
6439 #ifdef SUPPORT_UTF
6440 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6441 {
6442 int erroroffset;
6443 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6444 if (errorcode != 0)
6445 {
6446 if (offsetcount >= 2)
6447 {
6448 offsets[0] = erroroffset;
6449 offsets[1] = errorcode;
6450 }
6451 #if defined COMPILE_PCRE8
6452 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6453 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6454 #elif defined COMPILE_PCRE16
6455 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6456 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6457 #elif defined COMPILE_PCRE32
6458 return PCRE_ERROR_BADUTF32;
6459 #endif
6460 }
6461 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6462 /* Check that a start_offset points to the start of a UTF character. */
6463 if (start_offset > 0 && start_offset < length &&
6464 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6465 return PCRE_ERROR_BADUTF8_OFFSET;
6466 #endif
6467 }
6468 #endif
6469
6470 /* If the pattern was successfully studied with JIT support, run the JIT
6471 executable instead of the rest of this function. Most options must be set at
6472 compile time for the JIT code to be usable. Fallback to the normal code path if
6473 an unsupported flag is set. */
6474
6475 #ifdef SUPPORT_JIT
6476 if (extra_data != NULL
6477 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6478 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6479 && extra_data->executable_jit != NULL
6480 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6481 {
6482 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6483 start_offset, options, offsets, offsetcount);
6484
6485 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6486 mode is not compiled. In this case we simply fall back to the interpreter. */
6487
6488 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6489 }
6490 #endif
6491
6492 /* Carry on with non-JIT matching. This information is for finding all the
6493 numbers associated with a given name, for condition testing. */
6494
6495 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6496 md->name_count = re->name_count;
6497 md->name_entry_size = re->name_entry_size;
6498
6499 /* Fish out the optional data from the extra_data structure, first setting
6500 the default values. */
6501
6502 study = NULL;
6503 md->match_limit = MATCH_LIMIT;
6504 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6505 md->callout_data = NULL;
6506
6507 /* The table pointer is always in native byte order. */
6508
6509 tables = re->tables;
6510
6511 /* The two limit values override the defaults, whatever their value. */
6512
6513 if (extra_data != NULL)
6514 {
6515 unsigned long int flags = extra_data->flags;
6516 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6517 study = (const pcre_study_data *)extra_data->study_data;
6518 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6519 md->match_limit = extra_data->match_limit;
6520 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6521 md->match_limit_recursion = extra_data->match_limit_recursion;
6522 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6523 md->callout_data = extra_data->callout_data;
6524 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6525 }
6526
6527 /* Limits in the regex override only if they are smaller. */
6528
6529 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6530 md->match_limit = re->limit_match;
6531
6532 if ((re->flags & PCRE_RLSET) != 0 &&
6533 re->limit_recursion < md->match_limit_recursion)
6534 md->match_limit_recursion = re->limit_recursion;
6535
6536 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6537 is a feature that makes it possible to save compiled regex and re-use them
6538 in other programs later. */
6539
6540 if (tables == NULL) tables = PRIV(default_tables);
6541
6542 /* Set up other data */
6543
6544 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6545 startline = (re->flags & PCRE_STARTLINE) != 0;
6546 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6547
6548 /* The code starts after the real_pcre block and the capture name table. */
6549
6550 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6551 re->name_count * re->name_entry_size;
6552
6553 md->start_subject = (PCRE_PUCHAR)subject;
6554 md->start_offset = start_offset;
6555 md->end_subject = md->start_subject + length;
6556 end_subject = md->end_subject;
6557
6558 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6559 md->use_ucp = (re->options & PCRE_UCP) != 0;
6560 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6561 md->ignore_skip_arg = 0;
6562
6563 /* Some options are unpacked into BOOL variables in the hope that testing
6564 them will be faster than individual option bits. */
6565
6566 md->notbol = (options & PCRE_NOTBOL) != 0;
6567 md->noteol = (options & PCRE_NOTEOL) != 0;
6568 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6569 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6570
6571 md->hitend = FALSE;
6572 md->mark = md->nomatch_mark = NULL; /* In case never set */
6573
6574 md->recursive = NULL; /* No recursion at top level */
6575 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6576
6577 md->lcc = tables + lcc_offset;
6578 md->fcc = tables + fcc_offset;
6579 md->ctypes = tables + ctypes_offset;
6580
6581 /* Handle different \R options. */
6582
6583 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6584 {
6585 case 0:
6586 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6587 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6588 else
6589 #ifdef BSR_ANYCRLF
6590 md->bsr_anycrlf = TRUE;
6591 #else
6592 md->bsr_anycrlf = FALSE;
6593 #endif
6594 break;
6595
6596 case PCRE_BSR_ANYCRLF:
6597 md->bsr_anycrlf = TRUE;
6598 break;
6599
6600 case PCRE_BSR_UNICODE:
6601 md->bsr_anycrlf = FALSE;
6602 break;
6603
6604 default: return PCRE_ERROR_BADNEWLINE;
6605 }
6606
6607 /* Handle different types of newline. The three bits give eight cases. If
6608 nothing is set at run time, whatever was used at compile time applies. */
6609
6610 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6611 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6612 {
6613 case 0: newline = NEWLINE; break; /* Compile-time default */
6614 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6615 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6616 case PCRE_NEWLINE_CR+
6617 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6618 case PCRE_NEWLINE_ANY: newline = -1; break;
6619 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6620 default: return PCRE_ERROR_BADNEWLINE;
6621 }
6622
6623 if (newline == -2)
6624 {
6625 md->nltype = NLTYPE_ANYCRLF;
6626 }
6627 else if (newline < 0)
6628 {
6629 md->nltype = NLTYPE_ANY;
6630 }
6631 else
6632 {
6633 md->nltype = NLTYPE_FIXED;
6634 if (newline > 255)
6635 {
6636 md->nllen = 2;
6637 md->nl[0] = (newline >> 8) & 255;
6638 md->nl[1] = newline & 255;
6639 }
6640 else
6641 {
6642 md->nllen = 1;
6643 md->nl[0] = newline;
6644 }
6645 }
6646
6647 /* Partial matching was originally supported only for a restricted set of
6648 regexes; from release 8.00 there are no restrictions, but the bits are still
6649 defined (though never set). So there's no harm in leaving this code. */
6650
6651 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6652 return PCRE_ERROR_BADPARTIAL;
6653
6654 /* If the expression has got more back references than the offsets supplied can
6655 hold, we get a temporary chunk of working store to use during the matching.
6656 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6657 of 3. */
6658
6659 ocount = offsetcount - (offsetcount % 3);
6660 arg_offset_max = (2*ocount)/3;
6661
6662 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6663 {
6664 ocount = re->top_backref * 3 + 3;
6665 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6666 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6667 using_temporary_offsets = TRUE;
6668 DPRINTF(("Got memory to hold back references\n"));
6669 }
6670 else md->offset_vector = offsets;
6671 md->offset_end = ocount;
6672 md->offset_max = (2*ocount)/3;
6673 md->capture_last = 0;
6674
6675 /* Reset the working variable associated with each extraction. These should
6676 never be used unless previously set, but they get saved and restored, and so we
6677 initialize them to avoid reading uninitialized locations. Also, unset the
6678 offsets for the matched string. This is really just for tidiness with callouts,
6679 in case they inspect these fields. */
6680
6681 if (md->offset_vector != NULL)
6682 {
6683 register int *iptr = md->offset_vector + ocount;
6684 register int *iend = iptr - re->top_bracket;
6685 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6686 while (--iptr >= iend) *iptr = -1;
6687 md->offset_vector[0] = md->offset_vector[1] = -1;
6688 }
6689
6690 /* Set up the first character to match, if available. The first_char value is
6691 never set for an anchored regular expression, but the anchoring may be forced
6692 at run time, so we have to test for anchoring. The first char may be unset for
6693 an unanchored pattern, of course. If there's no first char and the pattern was
6694 studied, there may be a bitmap of possible first characters. */
6695
6696 if (!anchored)
6697 {
6698 if ((re->flags & PCRE_FIRSTSET) != 0)
6699 {
6700 has_first_char = TRUE;
6701 first_char = first_char2 = (pcre_uchar)(re->first_char);
6702 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6703 {
6704 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6705 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6706 if (utf && first_char > 127)
6707 first_char2 = UCD_OTHERCASE(first_char);
6708 #endif
6709 }
6710 }
6711 else
6712 if (!startline && study != NULL &&
6713 (study->flags & PCRE_STUDY_MAPPED) != 0)
6714 start_bits = study->start_bits;
6715 }
6716
6717 /* For anchored or unanchored matches, there may be a "last known required
6718 character" set. */
6719
6720 if ((re->flags & PCRE_REQCHSET) != 0)
6721 {
6722 has_req_char = TRUE;
6723 req_char = req_char2 = (pcre_uchar)(re->req_char);
6724 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6725 {
6726 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6727 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6728 if (utf && req_char > 127)
6729 req_char2 = UCD_OTHERCASE(req_char);
6730 #endif
6731 }
6732 }
6733
6734
6735 /* ==========================================================================*/
6736
6737 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6738 the loop runs just once. */
6739
6740 for(;;)
6741 {
6742 PCRE_PUCHAR save_end_subject = end_subject;
6743 PCRE_PUCHAR new_start_match;
6744
6745 /* If firstline is TRUE, the start of the match is constrained to the first
6746 line of a multiline string. That is, the match must be before or at the first
6747 newline. Implement this by temporarily adjusting end_subject so that we stop
6748 scanning at a newline. If the match fails at the newline, later code breaks
6749 this loop. */
6750
6751 if (firstline)
6752 {
6753 PCRE_PUCHAR t = start_match;
6754