/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1365 - (show annotations)
Sun Oct 6 18:33:56 2013 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 216995 byte(s)
Refactor named group handling for conditional tests.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The low 16 bits of md->capture_last: the last captured substring number */
70 #define OVFLMASK 0xffff0000 /* The top half of md->capture_last, used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit within OVFLMASK that means "capture vector overflowed" */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group; match() then adds eptr to the eptrb chain and clears the type */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1 /* Matched */
84 #define MATCH_NOMATCH 0 /* Failed to match */
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996) /* From OP_COMMIT when what follows it does not match */
95 #define MATCH_PRUNE (-995) /* From OP_PRUNE and OP_PRUNE_ARG when what follows does not match */
96 #define MATCH_SKIP (-994) /* From OP_SKIP; md->start_match_ptr holds the subject position */
97 #define MATCH_SKIP_ARG (-993) /* From OP_SKIP_ARG; md->start_match_ptr holds the skip name */
98 #define MATCH_THEN (-992) /* From OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the opcode address */
99 #define MATCH_BACKTRACK_MAX MATCH_THEN /* Upper end of the range of the 5 codes above */
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lower end of the range of the 5 codes above */
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30 /* 10 triples; this is the size of the stacksave[] array used by match() */
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
    NOTE(review): the six entries read as (0,inf) (0,inf) (1,inf) (1,inf) (0,1)
    (0,1), which presumably corresponds to the repeats * *? + +? ? ?? in that
    order - confirm against the code that indexes these tables. */
109
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions code have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case)
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start);
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
306 /* Numbers for RMATCH calls; each call to RMATCH is given a unique one, passed
307 as its "rw" argument. The heap version of RMATCH stores the number in Xwhere
    and pastes it into the L_##rw label that follows its jump to HEAP_RECURSE. When
    this list is changed, the code at HEAP_RETURN below must be updated in sync. */
308
309 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
310 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
311 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
312 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
313 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
314 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
315 RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
316
317 /* These versions of the macros use the stack, as normal. There are debugging
318 versions and production versions. Note that the "rw" argument of RMATCH isn't
319 actually used in this definition. RMATCH calls match() directly and leaves the
    result in rrc; mstart and rdepth are not macro arguments but are taken from the
    scope in which the macro is expanded, with rdepth+1 being passed on. RRETURN is
    a plain "return". The debugging versions also print the line numbers involved
    and are brace-enclosed blocks, whereas the production versions are a single
    expression and a single return with no terminating semicolon. */
320
321 #ifndef NO_RECURSE
322 #define REGISTER register
323
324 #ifdef PCRE_DEBUG
325 #define RMATCH(ra,rb,rc,rd,re,rw) \
326 { \
327 printf("match() called in line %d\n", __LINE__); \
328 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
329 printf("to line %d\n", __LINE__); \
330 }
331 #define RRETURN(ra) \
332 { \
333 printf("match() returned %d from line %d\n", ra, __LINE__); \
334 return ra; \
335 }
336 #else
337 #define RMATCH(ra,rb,rc,rd,re,rw) \
338 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
339 #define RRETURN(ra) return ra
340 #endif
341
342 #else
343
344
345 /* These versions of the macros manage a private stack on the heap. Note that
346 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347 argument of match(), which never changes.

    RMATCH uses frame->Xnextframe if there is one; otherwise it gets a new
    heapframe from PUBL(stack_malloc), does RRETURN(PCRE_ERROR_NOMEMORY) if that
    fails, and chains the new frame on. It then saves rw in the current frame's
    Xwhere, sets up the argument fields of the new frame (with Xrdepth one greater
    than the current frame's), makes the new frame current, and jumps to
    HEAP_RECURSE. The label L_##rw that follows the jump marks the place where
    execution continues after the "call".

    RRETURN makes Xprevframe the current frame. If that is not NULL it puts the
    return value in rrc and jumps to HEAP_RETURN; otherwise it does a real return.

    REGISTER is defined as empty in this version. */
348
349 #define REGISTER
350
351 #define RMATCH(ra,rb,rc,rd,re,rw)\
352 {\
353 heapframe *newframe = frame->Xnextframe;\
354 if (newframe == NULL)\
355 {\
356 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
357 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
358 newframe->Xnextframe = NULL;\
359 frame->Xnextframe = newframe;\
360 }\
361 frame->Xwhere = rw;\
362 newframe->Xeptr = ra;\
363 newframe->Xecode = rb;\
364 newframe->Xmstart = mstart;\
365 newframe->Xoffset_top = rc;\
366 newframe->Xeptrb = re;\
367 newframe->Xrdepth = frame->Xrdepth + 1;\
368 newframe->Xprevframe = frame;\
369 frame = newframe;\
370 DPRINTF(("restarting from line %d\n", __LINE__));\
371 goto HEAP_RECURSE;\
372 L_##rw:\
373 DPRINTF(("jumped back to line %d\n", __LINE__));\
374 }
375
376 #define RRETURN(ra)\
377 {\
378 heapframe *oldframe = frame;\
379 frame = oldframe->Xprevframe;\
380 if (frame != NULL)\
381 {\
382 rrc = ra;\
383 goto HEAP_RETURN;\
384 }\
385 return ra;\
386 }
387
388
389 /* Structure for remembering the local variables in a private frame. This is
    used only when NO_RECURSE is defined. Each frame holds, in fields whose names
    start with "X", the match() arguments that may change and the match() local
    variables that have to be preserved over a call to RMATCH(). The frames are
    linked in both directions so that RMATCH can re-use a following frame that
    has already been obtained, and RRETURN can get back to the preceding one. */
390
391 typedef struct heapframe {
392 struct heapframe *Xprevframe; /* The frame that RRETURN goes back to */
393 struct heapframe *Xnextframe; /* A frame that RMATCH can re-use, or NULL */
394
395 /* Function arguments that may change */
396
397 PCRE_PUCHAR Xeptr;
398 const pcre_uchar *Xecode;
399 PCRE_PUCHAR Xmstart;
400 int Xoffset_top;
401 eptrblock *Xeptrb;
402 unsigned int Xrdepth; /* RMATCH sets this to the previous frame's value + 1 */
403
404 /* Function local variables */
405
406 PCRE_PUCHAR Xcallpat;
407 #ifdef SUPPORT_UTF
408 PCRE_PUCHAR Xcharptr;
409 #endif
410 PCRE_PUCHAR Xdata;
411 PCRE_PUCHAR Xnext;
412 PCRE_PUCHAR Xpp;
413 PCRE_PUCHAR Xprev;
414 PCRE_PUCHAR Xsaved_eptr;
415
416 recursion_info Xnew_recursive;
417
418 BOOL Xcur_is_word;
419 BOOL Xcondition;
420 BOOL Xprev_is_word;
421
422 #ifdef SUPPORT_UCP
423 int Xprop_type;
424 unsigned int Xprop_value;
425 int Xprop_fail_result;
426 int Xoclength;
427 pcre_uchar Xocchars[6];
428 #endif
429
430 int Xcodelink;
431 int Xctype;
432 unsigned int Xfc;
433 int Xfi;
434 int Xlength;
435 int Xmax;
436 int Xmin;
437 unsigned int Xnumber;
438 int Xoffset;
439 unsigned int Xop;
440 pcre_int32 Xsave_capture_last;
441 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442 int Xstacksave[REC_STACK_SAVE_MAX];
443
444 eptrblock Xnewptrb;
445
446 /* Where to jump back to: the "rw" number that RMATCH stored before it jumped
    to HEAP_RECURSE */
447
448 int Xwhere;
449
450 } heapframe;
451
452 #endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
468 /* These macros pack up tests that are used for partial matching, and which
469 appear several times in the code. They do nothing unless md->partial is
470 non-zero. CHECK_PARTIAL sets the "hit end" flag (md->hitend) if the pointer is
471 at or beyond the end of the subject and also past md->start_used_ptr (i.e.
472 something has been matched). For hard partial matching (md->partial > 1), we
473 then return PCRE_ERROR_PARTIAL immediately, using RRETURN. SCHECK_PARTIAL is
    the same without the end-of-subject test; it is used when we already know we
    are past the end of the subject. Each macro expands to a bare "if" statement,
    so it has to be used as a complete statement. */
474
475 #define CHECK_PARTIAL()\
476 if (md->partial != 0 && eptr >= md->end_subject && \
477 eptr > md->start_used_ptr) \
478 { \
479 md->hitend = TRUE; \
480 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
481 }
482
483 #define SCHECK_PARTIAL()\
484 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 { \
486 md->hitend = TRUE; \
487 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
488 }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 case OP_COMMIT:
785 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
786 eptrb, RM52);
787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
788 RRETURN(MATCH_COMMIT);
789
790 case OP_PRUNE:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
792 eptrb, RM51);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 RRETURN(MATCH_PRUNE);
795
796 case OP_PRUNE_ARG:
797 md->nomatch_mark = ecode + 2;
798 md->mark = NULL; /* In case previously set by assertion */
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM56);
801 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
802 md->mark == NULL) md->mark = ecode + 2;
803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
804 RRETURN(MATCH_PRUNE);
805
806 case OP_SKIP:
807 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
808 eptrb, RM53);
809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
810 md->start_match_ptr = eptr; /* Pass back current position */
811 RRETURN(MATCH_SKIP);
812
813 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
814 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
815 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
816 that failed and any that precede it (either they also failed, or were not
817 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
818 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
819 set to the count of the one that failed. */
820
821 case OP_SKIP_ARG:
822 md->skip_arg_count++;
823 if (md->skip_arg_count <= md->ignore_skip_arg)
824 {
825 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 break;
827 }
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
829 eptrb, RM57);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831
832 /* Pass back the current skip name by overloading md->start_match_ptr and
833 returning the special MATCH_SKIP_ARG return code. This will either be
834 caught by a matching MARK, or get to the top, where it causes a rematch
835 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
836
837 md->start_match_ptr = ecode + 2;
838 RRETURN(MATCH_SKIP_ARG);
839
840 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
841 the branch in which it occurs can be determined. Overload the start of
842 match pointer to do this. */
843
844 case OP_THEN:
845 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
846 eptrb, RM54);
847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848 md->start_match_ptr = ecode;
849 RRETURN(MATCH_THEN);
850
851 case OP_THEN_ARG:
852 md->nomatch_mark = ecode + 2;
853 md->mark = NULL; /* In case previously set by assertion */
854 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
855 md, eptrb, RM58);
856 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
857 md->mark == NULL) md->mark = ecode + 2;
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 md->start_match_ptr = ecode;
860 RRETURN(MATCH_THEN);
861
862 /* Handle an atomic group that does not contain any capturing parentheses.
863 This can be handled like an assertion. Prior to 8.13, all atomic groups
864 were handled this way. In 8.13, the code was changed as below for ONCE, so
865 that backups pass through the group and thereby reset captured values.
866 However, this uses a lot more stack, so in 8.20, atomic groups that do not
867 contain any captures generate OP_ONCE_NC, which can be handled in the old,
868 less stack intensive way.
869
870 Check the alternative branches in turn - the matching won't pass the KET
871 for this kind of subpattern. If any one branch matches, we carry on as at
872 the end of a normal bracket, leaving the subject pointer, but resetting
873 the start-of-match value in case it was changed by \K. */
874
875 case OP_ONCE_NC:
876 prev = ecode;
877 saved_eptr = eptr;
878 save_mark = md->mark;
879 do
880 {
881 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
882 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
883 {
884 mstart = md->start_match_ptr;
885 break;
886 }
887 if (rrc == MATCH_THEN)
888 {
889 next = ecode + GET(ecode,1);
890 if (md->start_match_ptr < next &&
891 (*ecode == OP_ALT || *next == OP_ALT))
892 rrc = MATCH_NOMATCH;
893 }
894
895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896 ecode += GET(ecode,1);
897 md->mark = save_mark;
898 }
899 while (*ecode == OP_ALT);
900
901 /* If we hit the end of the group (which could be repeated), fail */
902
903 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
904
905 /* Continue as from after the group, updating the offsets high water
906 mark, since extracts may have been taken. */
907
908 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
909
910 offset_top = md->end_offset_top;
911 eptr = md->end_match_ptr;
912
913 /* For a non-repeating ket, just continue at this level. This also
914 happens for a repeating ket if no characters were matched in the group.
915 This is the forcible breaking of infinite loops as implemented in Perl
916 5.005. */
917
918 if (*ecode == OP_KET || eptr == saved_eptr)
919 {
920 ecode += 1+LINK_SIZE;
921 break;
922 }
923
924 /* The repeating kets try the rest of the pattern or restart from the
925 preceding bracket, in the appropriate order. The second "call" of match()
926 uses tail recursion, to avoid using another stack frame. */
927
928 if (*ecode == OP_KETRMIN)
929 {
930 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
932 ecode = prev;
933 goto TAIL_RECURSE;
934 }
935 else /* OP_KETRMAX */
936 {
937 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939 ecode += 1 + LINK_SIZE;
940 goto TAIL_RECURSE;
941 }
942 /* Control never gets here */
943
944 /* Handle a capturing bracket, other than those that are possessive with an
945 unlimited repeat. If there is space in the offset vector, save the current
946 subject position in the working slot at the top of the vector. We mustn't
947 change the current values of the data slot, because they may be set from a
948 previous iteration of this group, and be referred to by a reference inside
949 the group. A failure to match might occur after the group has succeeded,
950 if something later on doesn't match. For this reason, we need to restore
951 the working value and also the values of the final offsets, in case they
952 were set by a previous iteration of the same bracket.
953
954 If there isn't enough space in the offset vector, treat this as if it were
955 a non-capturing bracket. Don't worry about setting the flag for the error
956 case here; that is handled in the code for KET. */
957
958 case OP_CBRA:
959 case OP_SCBRA:
960 number = GET2(ecode, 1+LINK_SIZE);
961 offset = number << 1;
962
963 #ifdef PCRE_DEBUG
964 printf("start bracket %d\n", number);
965 printf("subject=");
966 pchars(eptr, 16, TRUE, md);
967 printf("\n");
968 #endif
969
970 if (offset < md->offset_max)
971 {
972 save_offset1 = md->offset_vector[offset];
973 save_offset2 = md->offset_vector[offset+1];
974 save_offset3 = md->offset_vector[md->offset_end - number];
975 save_capture_last = md->capture_last;
976 save_mark = md->mark;
977
978 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
979 md->offset_vector[md->offset_end - number] =
980 (int)(eptr - md->start_subject);
981
982 for (;;)
983 {
984 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
986 eptrb, RM1);
987 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
988
989 /* If we backed up to a THEN, check whether it is within the current
990 branch by comparing the address of the THEN that is passed back with
991 the end of the branch. If it is within the current branch, and the
992 branch is one of two or more alternatives (it either starts or ends
993 with OP_ALT), we have reached the limit of THEN's action, so convert
994 the return code to NOMATCH, which will cause normal backtracking to
995 happen from now on. Otherwise, THEN is passed back to an outer
996 alternative. This implements Perl's treatment of parenthesized groups,
997 where a group not containing | does not affect the current alternative,
998 that is, (X) is NOT the same as (X|(*F)). */
999
1000 if (rrc == MATCH_THEN)
1001 {
1002 next = ecode + GET(ecode,1);
1003 if (md->start_match_ptr < next &&
1004 (*ecode == OP_ALT || *next == OP_ALT))
1005 rrc = MATCH_NOMATCH;
1006 }
1007
1008 /* Anything other than NOMATCH is passed back. */
1009
1010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011 md->capture_last = save_capture_last;
1012 ecode += GET(ecode, 1);
1013 md->mark = save_mark;
1014 if (*ecode != OP_ALT) break;
1015 }
1016
1017 DPRINTF(("bracket %d failed\n", number));
1018 md->offset_vector[offset] = save_offset1;
1019 md->offset_vector[offset+1] = save_offset2;
1020 md->offset_vector[md->offset_end - number] = save_offset3;
1021
1022 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1023
1024 RRETURN(rrc);
1025 }
1026
1027 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1028 as a non-capturing bracket. */
1029
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032
1033 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1034
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037
1038 /* Non-capturing or atomic group, except for possessive with unlimited
1039 repeat and ONCE group with no captures. Loop for all the alternatives.
1040
1041 When we get to the final alternative within the brackets, we used to return
1042 the result of a recursive call to match() whatever happened so it was
1043 possible to reduce stack usage by turning this into a tail recursion,
1044 except in the case of a possibly empty group. However, now that there is
1045 the possibility of (*THEN) occurring in the final alternative, this
1046 optimization is no longer always possible.
1047
1048 We can optimize if we know there are no (*THEN)s in the pattern; at present
1049 this is the best that can be done.
1050
1051 MATCH_ONCE is returned when the end of an atomic group is successfully
1052 reached, but subsequent matching fails. It passes back up the tree (causing
1053 captured values to be reset) until the original atomic group level is
1054 reached. This is tested by comparing md->once_target with the start of the
1055 group. At this point, the return is converted into MATCH_NOMATCH so that
1056 previous backup points can be taken. */
1057
1058 case OP_ONCE:
1059 case OP_BRA:
1060 case OP_SBRA:
1061 DPRINTF(("start non-capturing bracket\n"));
1062
1063 for (;;)
1064 {
1065 if (op >= OP_SBRA || op == OP_ONCE)
1066 md->match_function_type = MATCH_CBEGROUP;
1067
1068 /* If this is not a possibly empty group, and there are no (*THEN)s in
1069 the pattern, and this is the final alternative, optimize as described
1070 above. */
1071
1072 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1073 {
1074 ecode += PRIV(OP_lengths)[*ecode];
1075 goto TAIL_RECURSE;
1076 }
1077
1078 /* In all other cases, we have to make another call to match(). */
1079
1080 save_mark = md->mark;
1081 save_capture_last = md->capture_last;
1082 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 RM2);
1084
1085 /* See comment in the code for capturing groups above about handling
1086 THEN. */
1087
1088 if (rrc == MATCH_THEN)
1089 {
1090 next = ecode + GET(ecode,1);
1091 if (md->start_match_ptr < next &&
1092 (*ecode == OP_ALT || *next == OP_ALT))
1093 rrc = MATCH_NOMATCH;
1094 }
1095
1096 if (rrc != MATCH_NOMATCH)
1097 {
1098 if (rrc == MATCH_ONCE)
1099 {
1100 const pcre_uchar *scode = ecode;
1101 if (*scode != OP_ONCE) /* If not at start, find it */
1102 {
1103 while (*scode == OP_ALT) scode += GET(scode, 1);
1104 scode -= GET(scode, 1);
1105 }
1106 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1107 }
1108 RRETURN(rrc);
1109 }
1110 ecode += GET(ecode, 1);
1111 md->mark = save_mark;
1112 if (*ecode != OP_ALT) break;
1113 md->capture_last = save_capture_last;
1114 }
1115
1116 RRETURN(MATCH_NOMATCH);
1117
1118 /* Handle possessive capturing brackets with an unlimited repeat. We come
1119 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1120 handled similarly to the normal case above. However, the matching is
1121 different. The end of these brackets will always be OP_KETRPOS, which
1122 returns MATCH_KETRPOS without going further in the pattern. By this means
1123 we can handle the group by iteration rather than recursion, thereby
1124 reducing the amount of stack needed. */
1125
1126 case OP_CBRAPOS:
1127 case OP_SCBRAPOS:
1128 allow_zero = FALSE;
1129
1130 POSSESSIVE_CAPTURE:
1131 number = GET2(ecode, 1+LINK_SIZE);
1132 offset = number << 1;
1133
1134 #ifdef PCRE_DEBUG
1135 printf("start possessive bracket %d\n", number);
1136 printf("subject=");
1137 pchars(eptr, 16, TRUE, md);
1138 printf("\n");
1139 #endif
1140
1141 if (offset < md->offset_max)
1142 {
1143 matched_once = FALSE;
1144 code_offset = (int)(ecode - md->start_code);
1145
1146 save_offset1 = md->offset_vector[offset];
1147 save_offset2 = md->offset_vector[offset+1];
1148 save_offset3 = md->offset_vector[md->offset_end - number];
1149 save_capture_last = md->capture_last;
1150
1151 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1152
1153 /* Each time round the loop, save the current subject position for use
1154 when the group matches. For MATCH_MATCH, the group has matched, so we
1155 restart it with a new subject starting position, remembering that we had
1156 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1157 usual. If we haven't matched any alternatives in any iteration, check to
1158 see if a previous iteration matched. If so, the group has matched;
1159 continue from afterwards. Otherwise it has failed; restore the previous
1160 capture values before returning NOMATCH. */
1161
1162 for (;;)
1163 {
1164 md->offset_vector[md->offset_end - number] =
1165 (int)(eptr - md->start_subject);
1166 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1167 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1168 eptrb, RM63);
1169 if (rrc == MATCH_KETRPOS)
1170 {
1171 offset_top = md->end_offset_top;
1172 eptr = md->end_match_ptr;
1173 ecode = md->start_code + code_offset;
1174 save_capture_last = md->capture_last;
1175 matched_once = TRUE;
1176 continue;
1177 }
1178
1179 /* See comment in the code for capturing groups above about handling
1180 THEN. */
1181
1182 if (rrc == MATCH_THEN)
1183 {
1184 next = ecode + GET(ecode,1);
1185 if (md->start_match_ptr < next &&
1186 (*ecode == OP_ALT || *next == OP_ALT))
1187 rrc = MATCH_NOMATCH;
1188 }
1189
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 md->capture_last = save_capture_last;
1192 ecode += GET(ecode, 1);
1193 if (*ecode != OP_ALT) break;
1194 }
1195
1196 if (!matched_once)
1197 {
1198 md->offset_vector[offset] = save_offset1;
1199 md->offset_vector[offset+1] = save_offset2;
1200 md->offset_vector[md->offset_end - number] = save_offset3;
1201 }
1202
1203 if (allow_zero || matched_once)
1204 {
1205 ecode += 1 + LINK_SIZE;
1206 break;
1207 }
1208
1209 RRETURN(MATCH_NOMATCH);
1210 }
1211
1212 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1213 as a non-capturing bracket. */
1214
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217
1218 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222
1223 /* Non-capturing possessive bracket with unlimited repeat. We come here
1224 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1225 without the capturing complication. It is written out separately for speed
1226 and cleanliness. */
1227
1228 case OP_BRAPOS:
1229 case OP_SBRAPOS:
1230 allow_zero = FALSE;
1231
1232 POSSESSIVE_NON_CAPTURE:
1233 matched_once = FALSE;
1234 code_offset = (int)(ecode - md->start_code);
1235 save_capture_last = md->capture_last;
1236
1237 for (;;)
1238 {
1239 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1240 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1241 eptrb, RM48);
1242 if (rrc == MATCH_KETRPOS)
1243 {
1244 offset_top = md->end_offset_top;
1245 eptr = md->end_match_ptr;
1246 ecode = md->start_code + code_offset;
1247 matched_once = TRUE;
1248 continue;
1249 }
1250
1251 /* See comment in the code for capturing groups above about handling
1252 THEN. */
1253
1254 if (rrc == MATCH_THEN)
1255 {
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1260 }
1261
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1266 }
1267
1268 if (matched_once || allow_zero)
1269 {
1270 ecode += 1 + LINK_SIZE;
1271 break;
1272 }
1273 RRETURN(MATCH_NOMATCH);
1274
1275 /* Control never reaches here. */
1276
1277 /* Conditional group: compilation checked that there are no more than two
1278 branches. If the condition is false, skipping the first branch takes us
1279 past the end of the item if there is only one branch, but that's exactly
1280 what we want. */
1281
1282 case OP_COND:
1283 case OP_SCOND:
1284
1285 /* The variable codelink will be added to ecode when the condition is
1286 false, to get to the second branch. Setting it to the offset to the ALT
1287 or KET, then incrementing ecode achieves this effect. We now have ecode
1288 pointing to the condition or callout. */
1289
1290 codelink = GET(ecode, 1); /* Offset to the second branch */
1291 ecode += 1 + LINK_SIZE; /* From this opcode */
1292
1293 /* Because of the way auto-callout works during compile, a callout item is
1294 inserted between OP_COND and an assertion condition. */
1295
1296 if (*ecode == OP_CALLOUT)
1297 {
1298 if (PUBL(callout) != NULL)
1299 {
1300 PUBL(callout_block) cb;
1301 cb.version = 2; /* Version 2 of the callout block */
1302 cb.callout_number = ecode[1];
1303 cb.offset_vector = md->offset_vector;
1304 #if defined COMPILE_PCRE8
1305 cb.subject = (PCRE_SPTR)md->start_subject;
1306 #elif defined COMPILE_PCRE16
1307 cb.subject = (PCRE_SPTR16)md->start_subject;
1308 #elif defined COMPILE_PCRE32
1309 cb.subject = (PCRE_SPTR32)md->start_subject;
1310 #endif
1311 cb.subject_length = (int)(md->end_subject - md->start_subject);
1312 cb.start_match = (int)(mstart - md->start_subject);
1313 cb.current_position = (int)(eptr - md->start_subject);
1314 cb.pattern_position = GET(ecode, 2);
1315 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1316 cb.capture_top = offset_top/2;
1317 cb.capture_last = md->capture_last & CAPLMASK;
1318 /* Internal change requires this for API compatibility. */
1319 if (cb.capture_last == 0) cb.capture_last = -1;
1320 cb.callout_data = md->callout_data;
1321 cb.mark = md->nomatch_mark;
1322 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1323 if (rrc < 0) RRETURN(rrc);
1324 }
1325
1326 /* Advance ecode past the callout, so it now points to the condition. We
1327 must adjust codelink so that the value of ecode+codelink is unchanged. */
1328
1329 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1330 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1331 }
1332
1333 /* Test the various possible conditions */
1334
1335 condition = FALSE;
1336 switch(condcode = *ecode)
1337 {
1338 case OP_RREF: /* Numbered group recursion test */
1339 if (md->recursive != NULL) /* Not recursing => FALSE */
1340 {
1341 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1342 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1343 }
1344 break;
1345
1346 case OP_DNRREF: /* Duplicate named group recursion test */
1347 if (md->recursive != NULL)
1348 {
1349 int count = GET2(ecode, 1 + IMM2_SIZE);
1350 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1351 while (count-- > 0)
1352 {
1353 unsigned int recno = GET2(slot, 0);
1354 condition = recno == md->recursive->group_num;
1355 if (condition) break;
1356 slot += md->name_entry_size;
1357 }
1358 }
1359 break;
1360
1361 case OP_CREF: /* Numbered group used test */
1362 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1363 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1364 break;
1365
1366 case OP_DNCREF: /* Duplicate named group used test */
1367 {
1368 int count = GET2(ecode, 1 + IMM2_SIZE);
1369 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1370 while (count-- > 0)
1371 {
1372 offset = GET2(slot, 0) << 1;
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 if (condition) break;
1375 slot += md->name_entry_size;
1376 }
1377 }
1378 break;
1379
1380 case OP_DEF: /* DEFINE - always false */
1381 break;
1382
1383 /* The condition is an assertion. Call match() to evaluate it - setting
1384 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1385 of an assertion. */
1386
1387 default:
1388 md->match_function_type = MATCH_CONDASSERT;
1389 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1390 if (rrc == MATCH_MATCH)
1391 {
1392 if (md->end_offset_top > offset_top)
1393 offset_top = md->end_offset_top; /* Captures may have happened */
1394 condition = TRUE;
1395
1396 /* Advance ecode past the assertion to the start of the first branch,
1397 but adjust it so that the general choosing code below works. */
1398
1399 ecode += GET(ecode, 1);
1400 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1402 }
1403
1404 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405 assertion; it is therefore treated as NOMATCH. Any other return is an
1406 error. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 break;
1413 }
1414
1415 /* Choose branch according to the condition */
1416
1417 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, we
1420 can use tail recursion to avoid using another stack frame, except when
1421 there is unlimited repeat of a possibly empty group. In the latter case, a
1422 recursive call to match() is always required, unless the second alternative
1423 doesn't exist, in which case we can just plough on. Note that, for
1424 compatibility with Perl, the | in a conditional group is NOT treated as
1425 creating two alternatives. If a THEN is encountered in the branch, it
1426 propagates out to the enclosing alternative (unless nested in a deeper set
1427 of alternatives, of course). */
1428
1429 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 goto TAIL_RECURSE;
1434 }
1435
1436 md->match_function_type = MATCH_CBEGROUP;
1437 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1438 RRETURN(rrc);
1439 }
1440
1441 /* Condition false & no alternative; continue after the group. */
1442
1443 else
1444 {
1445 }
1446 break;
1447
1448
1449 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450 to close any currently open capturing brackets. */
1451
1452 case OP_CLOSE:
1453 number = GET2(ecode, 1); /* Must be less than 65536 */
1454 offset = number << 1;
1455
1456 #ifdef PCRE_DEBUG
1457 printf("end bracket %d at *ACCEPT", number);
1458 printf("\n");
1459 #endif
1460
1461 md->capture_last = (md->capture_last & OVFLMASK) | number;
1462 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1463 {
1464 md->offset_vector[offset] =
1465 md->offset_vector[md->offset_end - number];
1466 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467 if (offset_top <= offset) offset_top = offset + 2;
1468 }
1469 ecode += 1 + IMM2_SIZE;
1470 break;
1471
1472
1473 /* End of the pattern, either real or forced. */
1474
1475 case OP_END:
1476 case OP_ACCEPT:
1477 case OP_ASSERT_ACCEPT:
1478
1479 /* If we have matched an empty string, fail if not in an assertion and not
1480 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481 is set and we have matched at the start of the subject. In both cases,
1482 backtracking will then try other alternatives, if any. */
1483
1484 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485 md->recursive == NULL &&
1486 (md->notempty ||
1487 (md->notempty_atstart &&
1488 mstart == md->start_subject + md->start_offset)))
1489 RRETURN(MATCH_NOMATCH);
1490
1491 /* Otherwise, we have a match. */
1492
1493 md->end_match_ptr = eptr; /* Record where we ended */
1494 md->end_offset_top = offset_top; /* and how many extracts were taken */
1495 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1496
1497 /* For some reason, the macros don't work properly if an expression is
1498 given as the argument to RRETURN when the heap is in use. */
1499
1500 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501 RRETURN(rrc);
1502
1503 /* Assertion brackets. Check the alternative branches in turn - the
1504 matching won't pass the KET for an assertion. If any one branch matches,
1505 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506 start of each branch to move the current point backwards, so the code at
1507 this level is identical to the lookahead case. When the assertion is part
1508 of a condition, we want to return immediately afterwards. The caller of
1509 this incarnation of the match() function will have set MATCH_CONDASSERT in
1510 md->match_function_type, and one of these opcodes will be the first opcode
1511 that is processed. We use a local variable that is preserved over calls to
1512 match() to remember this case. */
1513
1514 case OP_ASSERT:
1515 case OP_ASSERTBACK:
1516 save_mark = md->mark;
1517 if (md->match_function_type == MATCH_CONDASSERT)
1518 {
1519 condassert = TRUE;
1520 md->match_function_type = 0;
1521 }
1522 else condassert = FALSE;
1523
1524 /* Loop for each branch */
1525
1526 do
1527 {
1528 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1529
1530 /* A match means that the assertion is true; break out of the loop
1531 that matches its alternatives. */
1532
1533 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1534 {
1535 mstart = md->start_match_ptr; /* In case \K reset it */
1536 break;
1537 }
1538
1539 /* If not matched, restore the previous mark setting. */
1540
1541 md->mark = save_mark;
1542
1543 /* See comment in the code for capturing groups above about handling
1544 THEN. */
1545
1546 if (rrc == MATCH_THEN)
1547 {
1548 next = ecode + GET(ecode,1);
1549 if (md->start_match_ptr < next &&
1550 (*ecode == OP_ALT || *next == OP_ALT))
1551 rrc = MATCH_NOMATCH;
1552 }
1553
1554 /* Anything other than NOMATCH causes the entire assertion to fail,
1555 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1556 uncaptured THEN, which means they take their normal effect. This
1557 consistent approach does not always have exactly the same effect as in
1558 Perl. */
1559
1560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1561 ecode += GET(ecode, 1);
1562 }
1563 while (*ecode == OP_ALT); /* Continue for next alternative */
1564
1565 /* If we have tried all the alternative branches, the assertion has
1566 failed. If not, we broke out after a match. */
1567
1568 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1569
1570 /* If checking an assertion for a condition, return MATCH_MATCH. */
1571
1572 if (condassert) RRETURN(MATCH_MATCH);
1573
1574 /* Continue from after a successful assertion, updating the offsets high
1575 water mark, since extracts may have been taken during the assertion. */
1576
1577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1578 ecode += 1 + LINK_SIZE;
1579 offset_top = md->end_offset_top;
1580 continue;
1581
1582 /* Negative assertion: all branches must fail to match for the assertion to
1583 succeed. */
1584
1585 case OP_ASSERT_NOT:
1586 case OP_ASSERTBACK_NOT:
1587 save_mark = md->mark;
1588 if (md->match_function_type == MATCH_CONDASSERT)
1589 {
1590 condassert = TRUE;
1591 md->match_function_type = 0;
1592 }
1593 else condassert = FALSE;
1594
1595 /* Loop for each alternative branch. */
1596
1597 do
1598 {
1599 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1600 md->mark = save_mark; /* Always restore the mark setting */
1601
1602 switch(rrc)
1603 {
1604 case MATCH_MATCH: /* A successful match means */
1605 case MATCH_ACCEPT: /* the assertion has failed. */
1606 RRETURN(MATCH_NOMATCH);
1607
1608 case MATCH_NOMATCH: /* Carry on with next branch */
1609 break;
1610
1611 /* See comment in the code for capturing groups above about handling
1612 THEN. */
1613
1614 case MATCH_THEN:
1615 next = ecode + GET(ecode,1);
1616 if (md->start_match_ptr < next &&
1617 (*ecode == OP_ALT || *next == OP_ALT))
1618 {
1619 rrc = MATCH_NOMATCH;
1620 break;
1621 }
1622 /* Otherwise fall through. */
1623
1624 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1625 assertion to fail to match, without considering any more alternatives.
1626 Failing to match means the assertion is true. This is a consistent
1627 approach, but does not always have the same effect as in Perl. */
1628
1629 case MATCH_COMMIT:
1630 case MATCH_SKIP:
1631 case MATCH_SKIP_ARG:
1632 case MATCH_PRUNE:
1633 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1634 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1635
1636 /* Anything else is an error */
1637
1638 default:
1639 RRETURN(rrc);
1640 }
1641
1642 /* Continue with next branch */
1643
1644 ecode += GET(ecode,1);
1645 }
1646 while (*ecode == OP_ALT);
1647
1648 /* All branches in the assertion failed to match. */
1649
1650 NEG_ASSERT_TRUE:
1651 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1652 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1653 continue;
1654
1655 /* Move the subject pointer back. This occurs only at the start of
1656 each branch of a lookbehind assertion. If we are too close to the start to
1657 move back, this match function fails. When working with UTF-8 we move
1658 back a number of characters, not bytes. */
1659
1660 case OP_REVERSE:
1661 #ifdef SUPPORT_UTF
1662 if (utf)
1663 {
1664 i = GET(ecode, 1);
1665 while (i-- > 0)
1666 {
1667 eptr--;
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 BACKCHAR(eptr);
1670 }
1671 }
1672 else
1673 #endif
1674
1675 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1676
1677 {
1678 eptr -= GET(ecode, 1);
1679 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1680 }
1681
1682 /* Save the earliest consulted character, then skip to next op code */
1683
1684 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1685 ecode += 1 + LINK_SIZE;
1686 break;
1687
1688 /* The callout item calls an external function, if one is provided, passing
1689 details of the match so far. This is mainly for debugging, though the
1690 function is able to force a failure. */
1691
1692 case OP_CALLOUT:
1693 if (PUBL(callout) != NULL)
1694 {
1695 PUBL(callout_block) cb;
1696 cb.version = 2; /* Version 2 of the callout block */
1697 cb.callout_number = ecode[1];
1698 cb.offset_vector = md->offset_vector;
1699 #if defined COMPILE_PCRE8
1700 cb.subject = (PCRE_SPTR)md->start_subject;
1701 #elif defined COMPILE_PCRE16
1702 cb.subject = (PCRE_SPTR16)md->start_subject;
1703 #elif defined COMPILE_PCRE32
1704 cb.subject = (PCRE_SPTR32)md->start_subject;
1705 #endif
1706 cb.subject_length = (int)(md->end_subject - md->start_subject);
1707 cb.start_match = (int)(mstart - md->start_subject);
1708 cb.current_position = (int)(eptr - md->start_subject);
1709 cb.pattern_position = GET(ecode, 2);
1710 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1711 cb.capture_top = offset_top/2;
1712 cb.capture_last = md->capture_last & CAPLMASK;
1713 /* Internal change requires this for API compatibility. */
1714 if (cb.capture_last == 0) cb.capture_last = -1;
1715 cb.callout_data = md->callout_data;
1716 cb.mark = md->nomatch_mark;
1717 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1718 if (rrc < 0) RRETURN(rrc);
1719 }
1720 ecode += 2 + 2*LINK_SIZE;
1721 break;
1722
1723 /* Recursion either matches the current regex, or some subexpression. The
1724 offset data is the offset to the starting bracket from the start of the
1725 whole pattern. (This is so that it works from duplicated subpatterns.)
1726
1727 The state of the capturing groups is preserved over recursion, and
1728 re-instated afterwards. We don't know how many are started and not yet
1729 finished (offset_top records the completed total) so we just have to save
1730 all the potential data. There may be up to 65535 such values, which is too
1731 large to put on the stack, but using malloc for small numbers seems
1732 expensive. As a compromise, the stack is used when there are no more than
1733 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1734
1735 There are also other values that have to be saved. We use a chained
1736 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1737 for the original version of this logic. It has, however, been hacked around
1738 a lot, so he is not to blame for the current way it works. */
1739
1740 case OP_RECURSE:
1741 {
1742 recursion_info *ri;
1743 unsigned int recno;
1744
1745 callpat = md->start_code + GET(ecode, 1);
1746 recno = (callpat == md->start_code)? 0 :
1747 GET2(callpat, 1 + LINK_SIZE);
1748
1749 /* Check for repeating a recursion without advancing the subject pointer.
1750 This should catch convoluted mutual recursions. (Some simple cases are
1751 caught at compile time.) */
1752
1753 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1754 if (recno == ri->group_num && eptr == ri->subject_position)
1755 RRETURN(PCRE_ERROR_RECURSELOOP);
1756
1757 /* Add to "recursing stack" */
1758
1759 new_recursive.group_num = recno;
1760 new_recursive.saved_capture_last = md->capture_last;
1761 new_recursive.subject_position = eptr;
1762 new_recursive.prevrec = md->recursive;
1763 md->recursive = &new_recursive;
1764
1765 /* Where to continue from afterwards */
1766
1767 ecode += 1 + LINK_SIZE;
1768
1769 /* Now save the offset data */
1770
1771 new_recursive.saved_max = md->offset_end;
1772 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1773 new_recursive.offset_save = stacksave;
1774 else
1775 {
1776 new_recursive.offset_save =
1777 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1778 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1779 }
1780 memcpy(new_recursive.offset_save, md->offset_vector,
1781 new_recursive.saved_max * sizeof(int));
1782
1783 /* OK, now we can do the recursion. After processing each alternative,
1784 restore the offset data and the last captured value. If there were nested
1785 recursions, md->recursive might be changed, so reset it before looping.
1786 */
1787
1788 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1789 cbegroup = (*callpat >= OP_SBRA);
1790 do
1791 {
1792 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1793 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1794 md, eptrb, RM6);
1795 memcpy(md->offset_vector, new_recursive.offset_save,
1796 new_recursive.saved_max * sizeof(int));
1797 md->capture_last = new_recursive.saved_capture_last;
1798 md->recursive = new_recursive.prevrec;
1799 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1800 {
1801 DPRINTF(("Recursion matched\n"));
1802 if (new_recursive.offset_save != stacksave)
1803 (PUBL(free))(new_recursive.offset_save);
1804
1805 /* Set where we got to in the subject, and reset the start in case
1806 it was changed by \K. This *is* propagated back out of a recursion,
1807 for Perl compatibility. */
1808
1809 eptr = md->end_match_ptr;
1810 mstart = md->start_match_ptr;
1811 goto RECURSION_MATCHED; /* Exit loop; end processing */
1812 }
1813
1814 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1815 recursion; they cause a NOMATCH for the entire recursion. These codes
1816 are defined in a range that can be tested for. */
1817
1818 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1819 RRETURN(MATCH_NOMATCH);
1820
1821 /* Any return code other than NOMATCH is an error. */
1822
1823 if (rrc != MATCH_NOMATCH)
1824 {
1825 DPRINTF(("Recursion gave error %d\n", rrc));
1826 if (new_recursive.offset_save != stacksave)
1827 (PUBL(free))(new_recursive.offset_save);
1828 RRETURN(rrc);
1829 }
1830
1831 md->recursive = &new_recursive;
1832 callpat += GET(callpat, 1);
1833 }
1834 while (*callpat == OP_ALT);
1835
1836 DPRINTF(("Recursion didn't match\n"));
1837 md->recursive = new_recursive.prevrec;
1838 if (new_recursive.offset_save != stacksave)
1839 (PUBL(free))(new_recursive.offset_save);
1840 RRETURN(MATCH_NOMATCH);
1841 }
1842
1843 RECURSION_MATCHED:
1844 break;
1845
1846 /* An alternation is the end of a branch; scan along to find the end of the
1847 bracketed group and go to there. */
1848
1849 case OP_ALT:
1850 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1851 break;
1852
1853 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1854 indicating that it may occur zero times. It may repeat infinitely, or not
1855 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1856 with fixed upper repeat limits are compiled as a number of copies, with the
1857 optional ones preceded by BRAZERO or BRAMINZERO. */
1858
1859 case OP_BRAZERO:
1860 next = ecode + 1;
1861 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1863 do next += GET(next, 1); while (*next == OP_ALT);
1864 ecode = next + 1 + LINK_SIZE;
1865 break;
1866
1867 case OP_BRAMINZERO:
1868 next = ecode + 1;
1869 do next += GET(next, 1); while (*next == OP_ALT);
1870 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1872 ecode++;
1873 break;
1874
1875 case OP_SKIPZERO:
1876 next = ecode+1;
1877 do next += GET(next,1); while (*next == OP_ALT);
1878 ecode = next + 1 + LINK_SIZE;
1879 break;
1880
1881 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1882 here; just jump to the group, with allow_zero set TRUE. */
1883
1884 case OP_BRAPOSZERO:
1885 op = *(++ecode);
1886 allow_zero = TRUE;
1887 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1888 goto POSSESSIVE_NON_CAPTURE;
1889
1890 /* End of a group, repeated or non-repeating. */
1891
1892 case OP_KET:
1893 case OP_KETRMIN:
1894 case OP_KETRMAX:
1895 case OP_KETRPOS:
1896 prev = ecode - GET(ecode, 1);
1897
1898 /* If this was a group that remembered the subject start, in order to break
1899 infinite repeats of empty string matches, retrieve the subject start from
1900 the chain. Otherwise, set it NULL. */
1901
1902 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1903 {
1904 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1905 eptrb = eptrb->epb_prev; /* Backup to previous group */
1906 }
1907 else saved_eptr = NULL;
1908
1909 /* If we are at the end of an assertion group or a non-capturing atomic
1910 group, stop matching and return MATCH_MATCH, but record the current high
1911 water mark for use by positive assertions. We also need to record the match
1912 start in case it was changed by \K. */
1913
1914 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1915 *prev == OP_ONCE_NC)
1916 {
1917 md->end_match_ptr = eptr; /* For ONCE_NC */
1918 md->end_offset_top = offset_top;
1919 md->start_match_ptr = mstart;
1920 RRETURN(MATCH_MATCH); /* Sets md->mark */
1921 }
1922
1923 /* For capturing groups we have to check the group number back at the start
1924 and if necessary complete handling an extraction by setting the offsets and
1925 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1926 into group 0, so it won't be picked up here. Instead, we catch it when the
1927 OP_END is reached. Other recursion is handled here. We just have to record
1928 the current subject position and start match pointer and give a MATCH
1929 return. */
1930
1931 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1932 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1933 {
1934 number = GET2(prev, 1+LINK_SIZE);
1935 offset = number << 1;
1936
1937 #ifdef PCRE_DEBUG
1938 printf("end bracket %d", number);
1939 printf("\n");
1940 #endif
1941
1942 /* Handle a recursively called group. */
1943
1944 if (md->recursive != NULL && md->recursive->group_num == number)
1945 {
1946 md->end_match_ptr = eptr;
1947 md->start_match_ptr = mstart;
1948 RRETURN(MATCH_MATCH);
1949 }
1950
1951 /* Deal with capturing */
1952
1953 md->capture_last = (md->capture_last & OVFLMASK) | number;
1954 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1955 {
1956 /* If offset is greater than offset_top, it means that we are
1957 "skipping" a capturing group, and that group's offsets must be marked
1958 unset. In earlier versions of PCRE, all the offsets were unset at the
1959 start of matching, but this doesn't work because atomic groups and
1960 assertions can cause a value to be set that should later be unset.
1961 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1962 part of the atomic group, but this is not on the final matching path,
1963 so must be unset when 2 is set. (If there is no group 2, there is no
1964 problem, because offset_top will then be 2, indicating no capture.) */
1965
1966 if (offset > offset_top)
1967 {
1968 register int *iptr = md->offset_vector + offset_top;
1969 register int *iend = md->offset_vector + offset;
1970 while (iptr < iend) *iptr++ = -1;
1971 }
1972
1973 /* Now make the extraction */
1974
1975 md->offset_vector[offset] =
1976 md->offset_vector[md->offset_end - number];
1977 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1978 if (offset_top <= offset) offset_top = offset + 2;
1979 }
1980 }
1981
1982 /* For an ordinary non-repeating ket, just continue at this level. This
1983 also happens for a repeating ket if no characters were matched in the
1984 group. This is the forcible breaking of infinite loops as implemented in
1985 Perl 5.005. For a non-repeating atomic group that includes captures,
1986 establish a backup point by processing the rest of the pattern at a lower
1987 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1988 original OP_ONCE level, thereby bypassing intermediate backup points, but
1989 resetting any captures that happened along the way. */
1990
1991 if (*ecode == OP_KET || eptr == saved_eptr)
1992 {
1993 if (*prev == OP_ONCE)
1994 {
1995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1998 RRETURN(MATCH_ONCE);
1999 }
2000 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2001 break;
2002 }
2003
2004 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2005 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2006 at a time from the outer level, thus saving stack. */
2007
2008 if (*ecode == OP_KETRPOS)
2009 {
2010 md->end_match_ptr = eptr;
2011 md->end_offset_top = offset_top;
2012 RRETURN(MATCH_KETRPOS);
2013 }
2014
2015 /* The normal repeating kets try the rest of the pattern or restart from
2016 the preceding bracket, in the appropriate order. In the second case, we can
2017 use tail recursion to avoid using another stack frame, unless we have an
2018 atomic group or an unlimited repeat of a group that can match an empty
2019 string. */
2020
2021 if (*ecode == OP_KETRMIN)
2022 {
2023 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2025 if (*prev == OP_ONCE)
2026 {
2027 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2029 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2030 RRETURN(MATCH_ONCE);
2031 }
2032 if (*prev >= OP_SBRA) /* Could match an empty string */
2033 {
2034 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2035 RRETURN(rrc);
2036 }
2037 ecode = prev;
2038 goto TAIL_RECURSE;
2039 }
2040 else /* OP_KETRMAX */
2041 {
2042 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2043 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2045 if (*prev == OP_ONCE)
2046 {
2047 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2049 md->once_target = prev;
2050 RRETURN(MATCH_ONCE);
2051 }
2052 ecode += 1 + LINK_SIZE;
2053 goto TAIL_RECURSE;
2054 }
2055 /* Control never gets here */
2056
2057 /* Not multiline mode: start of subject assertion, unless notbol. */
2058
2059 case OP_CIRC:
2060 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2061
2062 /* Start of subject assertion */
2063
2064 case OP_SOD:
2065 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2066 ecode++;
2067 break;
2068
2069 /* Multiline mode: start of subject unless notbol, or after any newline. */
2070
2071 case OP_CIRCM:
2072 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2073 if (eptr != md->start_subject &&
2074 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2075 RRETURN(MATCH_NOMATCH);
2076 ecode++;
2077 break;
2078
2079 /* Start of match assertion */
2080
2081 case OP_SOM:
2082 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2083 ecode++;
2084 break;
2085
2086 /* Reset the start of match point */
2087
2088 case OP_SET_SOM:
2089 mstart = eptr;
2090 ecode++;
2091 break;
2092
2093 /* Multiline mode: assert before any newline, or before end of subject
2094 unless noteol is set. */
2095
2096 case OP_DOLLM:
2097 if (eptr < md->end_subject)
2098 {
2099 if (!IS_NEWLINE(eptr))
2100 {
2101 if (md->partial != 0 &&
2102 eptr + 1 >= md->end_subject &&
2103 NLBLOCK->nltype == NLTYPE_FIXED &&
2104 NLBLOCK->nllen == 2 &&
2105 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2106 {
2107 md->hitend = TRUE;
2108 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2109 }
2110 RRETURN(MATCH_NOMATCH);
2111 }
2112 }
2113 else
2114 {
2115 if (md->noteol) RRETURN(MATCH_NOMATCH);
2116 SCHECK_PARTIAL();
2117 }
2118 ecode++;
2119 break;
2120
2121 /* Not multiline mode: assert before a terminating newline or before end of
2122 subject unless noteol is set. */
2123
2124 case OP_DOLL:
2125 if (md->noteol) RRETURN(MATCH_NOMATCH);
2126 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2127
2128 /* ... else fall through for endonly */
2129
2130 /* End of subject assertion (\z) */
2131
2132 case OP_EOD:
2133 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2134 SCHECK_PARTIAL();
2135 ecode++;
2136 break;
2137
2138 /* End of subject or ending \n assertion (\Z) */
2139
2140 case OP_EODN:
2141 ASSERT_NL_OR_EOS:
2142 if (eptr < md->end_subject &&
2143 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2144 {
2145 if (md->partial != 0 &&
2146 eptr + 1 >= md->end_subject &&
2147 NLBLOCK->nltype == NLTYPE_FIXED &&
2148 NLBLOCK->nllen == 2 &&
2149 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2150 {
2151 md->hitend = TRUE;
2152 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2153 }
2154 RRETURN(MATCH_NOMATCH);
2155 }
2156
2157 /* Either at end of string or \n before end. */
2158
2159 SCHECK_PARTIAL();
2160 ecode++;
2161 break;
2162
2163 /* Word boundary assertions */
2164
2165 case OP_NOT_WORD_BOUNDARY:
2166 case OP_WORD_BOUNDARY:
2167 {
2168
2169 /* Find out if the previous and current characters are "word" characters.
2170 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2171 be "non-word" characters. Remember the earliest consulted character for
2172 partial matching. */
2173
2174 #ifdef SUPPORT_UTF
2175 if (utf)
2176 {
2177 /* Get status of previous character */
2178
2179 if (eptr == md->start_subject) prev_is_word = FALSE; else
2180 {
2181 PCRE_PUCHAR lastptr = eptr - 1;
2182 BACKCHAR(lastptr);
2183 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2184 GETCHAR(c, lastptr);
2185 #ifdef SUPPORT_UCP
2186 if (md->use_ucp)
2187 {
2188 if (c == '_') prev_is_word = TRUE; else
2189 {
2190 int cat = UCD_CATEGORY(c);
2191 prev_is_word = (cat == ucp_L || cat == ucp_N);
2192 }
2193 }
2194 else
2195 #endif
2196 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2197 }
2198
2199 /* Get status of next character */
2200
2201 if (eptr >= md->end_subject)
2202 {
2203 SCHECK_PARTIAL();
2204 cur_is_word = FALSE;
2205 }
2206 else
2207 {
2208 GETCHAR(c, eptr);
2209 #ifdef SUPPORT_UCP
2210 if (md->use_ucp)
2211 {
2212 if (c == '_') cur_is_word = TRUE; else
2213 {
2214 int cat = UCD_CATEGORY(c);
2215 cur_is_word = (cat == ucp_L || cat == ucp_N);
2216 }
2217 }
2218 else
2219 #endif
2220 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2221 }
2222 }
2223 else
2224 #endif
2225
2226 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2227 consistency with the behaviour of \w we do use it in this case. */
2228
2229 {
2230 /* Get status of previous character */
2231
2232 if (eptr == md->start_subject) prev_is_word = FALSE; else
2233 {
2234 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2235 #ifdef SUPPORT_UCP
2236 if (md->use_ucp)
2237 {
2238 c = eptr[-1];
2239 if (c == '_') prev_is_word = TRUE; else
2240 {
2241 int cat = UCD_CATEGORY(c);
2242 prev_is_word = (cat == ucp_L || cat == ucp_N);
2243 }
2244 }
2245 else
2246 #endif
2247 prev_is_word = MAX_255(eptr[-1])
2248 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2249 }
2250
2251 /* Get status of next character */
2252
2253 if (eptr >= md->end_subject)
2254 {
2255 SCHECK_PARTIAL();
2256 cur_is_word = FALSE;
2257 }
2258 else
2259 #ifdef SUPPORT_UCP
2260 if (md->use_ucp)
2261 {
2262 c = *eptr;
2263 if (c == '_') cur_is_word = TRUE; else
2264 {
2265 int cat = UCD_CATEGORY(c);
2266 cur_is_word = (cat == ucp_L || cat == ucp_N);
2267 }
2268 }
2269 else
2270 #endif
2271 cur_is_word = MAX_255(*eptr)
2272 && ((md->ctypes[*eptr] & ctype_word) != 0);
2273 }
2274
2275 /* Now see if the situation is what we want */
2276
2277 if ((*ecode++ == OP_WORD_BOUNDARY)?
2278 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2279 RRETURN(MATCH_NOMATCH);
2280 }
2281 break;
2282
2283 /* Match any single character type except newline; have to take care with
2284 CRLF newlines and partial matching. */
2285
2286 case OP_ANY:
2287 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2288 if (md->partial != 0 &&
2289 eptr + 1 >= md->end_subject &&
2290 NLBLOCK->nltype == NLTYPE_FIXED &&
2291 NLBLOCK->nllen == 2 &&
2292 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2293 {
2294 md->hitend = TRUE;
2295 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2296 }
2297
2298 /* Fall through */
2299
2300 /* Match any single character whatsoever. */
2301
2302 case OP_ALLANY:
2303 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2304 { /* not be updated before SCHECK_PARTIAL. */
2305 SCHECK_PARTIAL();
2306 RRETURN(MATCH_NOMATCH);
2307 }
2308 eptr++;
2309 #ifdef SUPPORT_UTF
2310 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2311 #endif
2312 ecode++;
2313 break;
2314
2315 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2316 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2317
2318 case OP_ANYBYTE:
2319 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2320 { /* not be updated before SCHECK_PARTIAL. */
2321 SCHECK_PARTIAL();
2322 RRETURN(MATCH_NOMATCH);
2323 }
2324 eptr++;
2325 ecode++;
2326 break;
2327
2328 case OP_NOT_DIGIT:
2329 if (eptr >= md->end_subject)
2330 {
2331 SCHECK_PARTIAL();
2332 RRETURN(MATCH_NOMATCH);
2333 }
2334 GETCHARINCTEST(c, eptr);
2335 if (
2336 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2337 c < 256 &&
2338 #endif
2339 (md->ctypes[c] & ctype_digit) != 0
2340 )
2341 RRETURN(MATCH_NOMATCH);
2342 ecode++;
2343 break;
2344
2345 case OP_DIGIT:
2346 if (eptr >= md->end_subject)
2347 {
2348 SCHECK_PARTIAL();
2349 RRETURN(MATCH_NOMATCH);
2350 }
2351 GETCHARINCTEST(c, eptr);
2352 if (
2353 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2354 c > 255 ||
2355 #endif
2356 (md->ctypes[c] & ctype_digit) == 0
2357 )
2358 RRETURN(MATCH_NOMATCH);
2359 ecode++;
2360 break;
2361
2362 case OP_NOT_WHITESPACE:
2363 if (eptr >= md->end_subject)
2364 {
2365 SCHECK_PARTIAL();
2366 RRETURN(MATCH_NOMATCH);
2367 }
2368 GETCHARINCTEST(c, eptr);
2369 if (
2370 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2371 c < 256 &&
2372 #endif
2373 (md->ctypes[c] & ctype_space) != 0
2374 )
2375 RRETURN(MATCH_NOMATCH);
2376 ecode++;
2377 break;
2378
2379 case OP_WHITESPACE:
2380 if (eptr >= md->end_subject)
2381 {
2382 SCHECK_PARTIAL();
2383 RRETURN(MATCH_NOMATCH);
2384 }
2385 GETCHARINCTEST(c, eptr);
2386 if (
2387 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2388 c > 255 ||
2389 #endif
2390 (md->ctypes[c] & ctype_space) == 0
2391 )
2392 RRETURN(MATCH_NOMATCH);
2393 ecode++;
2394 break;
2395
2396 case OP_NOT_WORDCHAR:
2397 if (eptr >= md->end_subject)
2398 {
2399 SCHECK_PARTIAL();
2400 RRETURN(MATCH_NOMATCH);
2401 }
2402 GETCHARINCTEST(c, eptr);
2403 if (
2404 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2405 c < 256 &&
2406 #endif
2407 (md->ctypes[c] & ctype_word) != 0
2408 )
2409 RRETURN(MATCH_NOMATCH);
2410 ecode++;
2411 break;
2412
2413 case OP_WORDCHAR:
2414 if (eptr >= md->end_subject)
2415 {
2416 SCHECK_PARTIAL();
2417 RRETURN(MATCH_NOMATCH);
2418 }
2419 GETCHARINCTEST(c, eptr);
2420 if (
2421 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2422 c > 255 ||
2423 #endif
2424 (md->ctypes[c] & ctype_word) == 0
2425 )
2426 RRETURN(MATCH_NOMATCH);
2427 ecode++;
2428 break;
2429
2430 case OP_ANYNL:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 RRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 switch(c)
2438 {
2439 default: RRETURN(MATCH_NOMATCH);
2440
2441 case CHAR_CR:
2442 if (eptr >= md->end_subject)
2443 {
2444 SCHECK_PARTIAL();
2445 }
2446 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2447 break;
2448
2449 case CHAR_LF:
2450 break;
2451
2452 case CHAR_VT:
2453 case CHAR_FF:
2454 case CHAR_NEL:
2455 #ifndef EBCDIC
2456 case 0x2028:
2457 case 0x2029:
2458 #endif /* Not EBCDIC */
2459 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2460 break;
2461 }
2462 ecode++;
2463 break;
2464
2465 case OP_NOT_HSPACE:
2466 if (eptr >= md->end_subject)
2467 {
2468 SCHECK_PARTIAL();
2469 RRETURN(MATCH_NOMATCH);
2470 }
2471 GETCHARINCTEST(c, eptr);
2472 switch(c)
2473 {
2474 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2475 default: break;
2476 }
2477 ecode++;
2478 break;
2479
2480 case OP_HSPACE:
2481 if (eptr >= md->end_subject)
2482 {
2483 SCHECK_PARTIAL();
2484 RRETURN(MATCH_NOMATCH);
2485 }
2486 GETCHARINCTEST(c, eptr);
2487 switch(c)
2488 {
2489 HSPACE_CASES: break; /* Byte and multibyte cases */
2490 default: RRETURN(MATCH_NOMATCH);
2491 }
2492 ecode++;
2493 break;
2494
2495 case OP_NOT_VSPACE:
2496 if (eptr >= md->end_subject)
2497 {
2498 SCHECK_PARTIAL();
2499 RRETURN(MATCH_NOMATCH);
2500 }
2501 GETCHARINCTEST(c, eptr);
2502 switch(c)
2503 {
2504 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2505 default: break;
2506 }
2507 ecode++;
2508 break;
2509
2510 case OP_VSPACE:
2511 if (eptr >= md->end_subject)
2512 {
2513 SCHECK_PARTIAL();
2514 RRETURN(MATCH_NOMATCH);
2515 }
2516 GETCHARINCTEST(c, eptr);
2517 switch(c)
2518 {
2519 VSPACE_CASES: break;
2520 default: RRETURN(MATCH_NOMATCH);
2521 }
2522 ecode++;
2523 break;
2524
2525 #ifdef SUPPORT_UCP
2526 /* Check the next character by Unicode property. We will get here only
2527 if the support is in the binary; otherwise a compile-time error occurs. */
2528
2529 case OP_PROP:
2530 case OP_NOTPROP:
2531 if (eptr >= md->end_subject)
2532 {
2533 SCHECK_PARTIAL();
2534 RRETURN(MATCH_NOMATCH);
2535 }
2536 GETCHARINCTEST(c, eptr);
2537 {
2538 const pcre_uint32 *cp;
2539 const ucd_record *prop = GET_UCD(c);
2540
2541 switch(ecode[1])
2542 {
2543 case PT_ANY:
2544 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2545 break;
2546
2547 case PT_LAMP:
2548 if ((prop->chartype == ucp_Lu ||
2549 prop->chartype == ucp_Ll ||
2550 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2551 RRETURN(MATCH_NOMATCH);
2552 break;
2553
2554 case PT_GC:
2555 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2556 RRETURN(MATCH_NOMATCH);
2557 break;
2558
2559 case PT_PC:
2560 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2561 RRETURN(MATCH_NOMATCH);
2562 break;
2563
2564 case PT_SC:
2565 if ((ecode[2] != prop->script) == (op == OP_PROP))
2566 RRETURN(MATCH_NOMATCH);
2567 break;
2568
2569 /* These are specials */
2570
2571 case PT_ALNUM:
2572 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2573 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2574 RRETURN(MATCH_NOMATCH);
2575 break;
2576
2577 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2578 which means that Perl space and POSIX space are now identical. PCRE
2579 was changed at release 8.34. */
2580
2581 case PT_SPACE: /* Perl space */
2582 case PT_PXSPACE: /* POSIX space */
2583 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2584 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2585 c == CHAR_FF || c == CHAR_CR)
2586 == (op == OP_NOTPROP))
2587 RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 case PT_WORD:
2591 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2592 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2593 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2594 RRETURN(MATCH_NOMATCH);
2595 break;
2596
2597 case PT_CLIST:
2598 cp = PRIV(ucd_caseless_sets) + ecode[2];
2599 for (;;)
2600 {
2601 if (c < *cp)
2602 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2603 if (c == *cp++)
2604 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2605 }
2606 break;
2607
2608 case PT_UCNC:
2609 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2610 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2611 c >= 0xe000) == (op == OP_NOTPROP))
2612 RRETURN(MATCH_NOMATCH);
2613 break;
2614
2615 /* This should never occur */
2616
2617 default:
2618 RRETURN(PCRE_ERROR_INTERNAL);
2619 }
2620
2621 ecode += 3;
2622 }
2623 break;
2624
2625 /* Match an extended Unicode sequence. We will get here only if the support
2626 is in the binary; otherwise a compile-time error occurs. */
2627
2628 case OP_EXTUNI:
2629 if (eptr >= md->end_subject)
2630 {
2631 SCHECK_PARTIAL();
2632 RRETURN(MATCH_NOMATCH);
2633 }
2634 else
2635 {
2636 int lgb, rgb;
2637 GETCHARINCTEST(c, eptr);
2638 lgb = UCD_GRAPHBREAK(c);
2639 while (eptr < md->end_subject)
2640 {
2641 int len = 1;
2642 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2643 rgb = UCD_GRAPHBREAK(c);
2644 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2645 lgb = rgb;
2646 eptr += len;
2647 }
2648 }
2649 CHECK_PARTIAL();
2650 ecode++;
2651 break;
2652 #endif /* SUPPORT_UCP */
2653
2654
2655 /* Match a back reference, possibly repeatedly. Look past the end of the
2656 item to see if there is repeat information following. The code is similar
2657 to that for character classes, but repeated for efficiency. Then obey
2658 similar code to character type repeats - written out again for speed.
2659 However, if the referenced string is the empty string, always treat
2660 it as matched, any number of times (otherwise there could be infinite
2661 loops). If the reference is unset, there are two possibilities:
2662
2663 (a) In the default, Perl-compatible state, set the length negative;
2664 this ensures that every attempt at a match fails. We can't just fail
2665 here, because of the possibility of quantifiers with zero minima.
2666
2667 (b) If the JavaScript compatibility flag is set, set the length to zero
2668 so that the back reference matches an empty string.
2669
2670 Otherwise, set the length to the length of what was matched by the
2671 referenced subpattern.
2672
2673 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2674 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2675 and OP_DNREFI are used. In this case we must scan the list of groups to
2676 which the name refers, and use the first one that is set. */
2677
2678 case OP_DNREF:
2679 case OP_DNREFI:
2680 caseless = op == OP_DNREFI;
2681 {
2682 int count = GET2(ecode, 1+IMM2_SIZE);
2683 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2684 ecode += 1 + 2*IMM2_SIZE;
2685
2686 while (count-- > 0)
2687 {
2688 offset = GET2(slot, 0) << 1;
2689 if (offset < offset_top && md->offset_vector[offset] >= 0) break;
2690 slot += md->name_entry_size;
2691 }
2692 if (count < 0)
2693 length = (md->jscript_compat)? 0 : -1;
2694 else
2695 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2696 }
2697 goto REF_REPEAT;
2698
2699 case OP_REF:
2700 case OP_REFI:
2701 caseless = op == OP_REFI;
2702 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2703 ecode += 1 + IMM2_SIZE;
2704 if (offset >= offset_top || md->offset_vector[offset] < 0)
2705 length = (md->jscript_compat)? 0 : -1;
2706 else
2707 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2708
2709 /* Set up for repetition, or handle the non-repeated case */
2710
2711 REF_REPEAT:
2712 switch (*ecode)
2713 {
2714 case OP_CRSTAR:
2715 case OP_CRMINSTAR:
2716 case OP_CRPLUS:
2717 case OP_CRMINPLUS:
2718 case OP_CRQUERY:
2719 case OP_CRMINQUERY:
2720 c = *ecode++ - OP_CRSTAR;
2721 minimize = (c & 1) != 0;
2722 min = rep_min[c]; /* Pick up values from tables; */
2723 max = rep_max[c]; /* zero for max => infinity */
2724 if (max == 0) max = INT_MAX;
2725 break;
2726
2727 case OP_CRRANGE:
2728 case OP_CRMINRANGE:
2729 minimize = (*ecode == OP_CRMINRANGE);
2730 min = GET2(ecode, 1);
2731 max = GET2(ecode, 1 + IMM2_SIZE);
2732 if (max == 0) max = INT_MAX;
2733 ecode += 1 + 2 * IMM2_SIZE;
2734 break;
2735
2736 default: /* No repeat follows */
2737 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2738 {
2739 if (length == -2) eptr = md->end_subject; /* Partial match */
2740 CHECK_PARTIAL();
2741 RRETURN(MATCH_NOMATCH);
2742 }
2743 eptr += length;
2744 continue; /* With the main loop */
2745 }
2746
2747 /* Handle repeated back references. If the length of the reference is
2748 zero, just continue with the main loop. If the length is negative, it
2749 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2750 zero, we can continue at the same level without recursion. For any other
2751 minimum, carrying on will result in NOMATCH. */
2752
2753 if (length == 0) continue;
2754 if (length < 0 && min == 0) continue;
2755
2756 /* First, ensure the minimum number of matches are present. We get back
2757 the length of the reference string explicitly rather than passing the
2758 address of eptr, so that eptr can be a register variable. */
2759
2760 for (i = 1; i <= min; i++)
2761 {
2762 int slength;
2763 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2764 {
2765 if (slength == -2) eptr = md->end_subject; /* Partial match */
2766 CHECK_PARTIAL();
2767 RRETURN(MATCH_NOMATCH);
2768 }
2769 eptr += slength;
2770 }
2771
2772 /* If min = max, continue at the same level without recursion.
2773 They are not both allowed to be zero. */
2774
2775 if (min == max) continue;
2776
2777 /* If minimizing, keep trying and advancing the pointer */
2778
2779 if (minimize)
2780 {
2781 for (fi = min;; fi++)
2782 {
2783 int slength;
2784 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2786 if (fi >= max) RRETURN(MATCH_NOMATCH);
2787 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2788 {
2789 if (slength == -2) eptr = md->end_subject; /* Partial match */
2790 CHECK_PARTIAL();
2791 RRETURN(MATCH_NOMATCH);
2792 }
2793 eptr += slength;
2794 }
2795 /* Control never gets here */
2796 }
2797
2798 /* If maximizing, find the longest string and work backwards */
2799
2800 else
2801 {
2802 pp = eptr;
2803 for (i = min; i < max; i++)
2804 {
2805 int slength;
2806 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2807 {
2808 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2809 the soft partial matching case. */
2810
2811 if (slength == -2 && md->partial != 0 &&
2812 md->end_subject > md->start_used_ptr)
2813 {
2814 md->hitend = TRUE;
2815 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2816 }
2817 break;
2818 }
2819 eptr += slength;
2820 }
2821
2822 while (eptr >= pp)
2823 {
2824 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2825 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2826 eptr -= length;
2827 }
2828 RRETURN(MATCH_NOMATCH);
2829 }
2830 /* Control never gets here */
2831
2832 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2833 used when all the characters in the class have values in the range 0-255,
2834 and either the matching is caseful, or the characters are in the range
2835 0-127 when UTF-8 processing is enabled. The only difference between
2836 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2837 encountered.
2838
2839 First, look past the end of the item to see if there is repeat information
2840 following. Then obey similar code to character type repeats - written out
2841 again for speed. */
2842
2843 case OP_NCLASS:
2844 case OP_CLASS:
2845 {
2846 /* The data variable is saved across frames, so the byte map needs to
2847 be stored there. */
2848 #define BYTE_MAP ((pcre_uint8 *)data)
2849 data = ecode + 1; /* Save for matching */
2850 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2851
2852 switch (*ecode)
2853 {
2854 case OP_CRSTAR:
2855 case OP_CRMINSTAR:
2856 case OP_CRPLUS:
2857 case OP_CRMINPLUS:
2858 case OP_CRQUERY:
2859 case OP_CRMINQUERY:
2860 c = *ecode++ - OP_CRSTAR;
2861 minimize = (c & 1) != 0;
2862 min = rep_min[c]; /* Pick up values from tables; */
2863 max = rep_max[c]; /* zero for max => infinity */
2864 if (max == 0) max = INT_MAX;
2865 break;
2866
2867 case OP_CRRANGE:
2868 case OP_CRMINRANGE:
2869 minimize = (*ecode == OP_CRMINRANGE);
2870 min = GET2(ecode, 1);
2871 max = GET2(ecode, 1 + IMM2_SIZE);
2872 if (max == 0) max = INT_MAX;
2873 ecode += 1 + 2 * IMM2_SIZE;
2874 break;
2875
2876 default: /* No repeat follows */
2877 min = max = 1;
2878 break;
2879 }
2880
2881 /* First, ensure the minimum number of matches are present. */
2882
2883 #ifdef SUPPORT_UTF
2884 if (utf)
2885 {
2886 for (i = 1; i <= min; i++)
2887 {
2888 if (eptr >= md->end_subject)
2889 {
2890 SCHECK_PARTIAL();
2891 RRETURN(MATCH_NOMATCH);
2892 }
2893 GETCHARINC(c, eptr);
2894 if (c > 255)
2895 {
2896 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2897 }
2898 else
2899 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2900 }
2901 }
2902 else
2903 #endif
2904 /* Not UTF mode */
2905 {
2906 for (i = 1; i <= min; i++)
2907 {
2908 if (eptr >= md->end_subject)
2909 {
2910 SCHECK_PARTIAL();
2911 RRETURN(MATCH_NOMATCH);
2912 }
2913 c = *eptr++;
2914 #ifndef COMPILE_PCRE8
2915 if (c > 255)
2916 {
2917 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2918 }
2919 else
2920 #endif
2921 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2922 }
2923 }
2924
2925 /* If max == min we can continue with the main loop without the
2926 need to recurse. */
2927
2928 if (min == max) continue;
2929
2930 /* If minimizing, keep testing the rest of the expression and advancing
2931 the pointer while it matches the class. */
2932
2933 if (minimize)
2934 {
2935 #ifdef SUPPORT_UTF
2936 if (utf)
2937 {
2938 for (fi = min;; fi++)
2939 {
2940 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2942 if (fi >= max) RRETURN(MATCH_NOMATCH);
2943 if (eptr >= md->end_subject)
2944 {
2945 SCHECK_PARTIAL();
2946 RRETURN(MATCH_NOMATCH);
2947 }
2948 GETCHARINC(c, eptr);
2949 if (c > 255)
2950 {
2951 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2952 }
2953 else
2954 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2955 }
2956 }
2957 else
2958 #endif
2959 /* Not UTF mode */
2960 {
2961 for (fi = min;; fi++)
2962 {
2963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2965 if (fi >= max) RRETURN(MATCH_NOMATCH);
2966 if (eptr >= md->end_subject)
2967 {
2968 SCHECK_PARTIAL();
2969 RRETURN(MATCH_NOMATCH);
2970 }
2971 c = *eptr++;
2972 #ifndef COMPILE_PCRE8
2973 if (c > 255)
2974 {
2975 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2976 }
2977 else
2978 #endif
2979 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2980 }
2981 }
2982 /* Control never gets here */
2983 }
2984
2985 /* If maximizing, find the longest possible run, then work backwards. */
2986
2987 else
2988 {
2989 pp = eptr;
2990
2991 #ifdef SUPPORT_UTF
2992 if (utf)
2993 {
2994 for (i = min; i < max; i++)
2995 {
2996 int len = 1;
2997 if (eptr >= md->end_subject)
2998 {
2999 SCHECK_PARTIAL();
3000 break;
3001 }
3002 GETCHARLEN(c, eptr, len);
3003 if (c > 255)
3004 {
3005 if (op == OP_CLASS) break;
3006 }
3007 else
3008 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3009 eptr += len;
3010 }
3011 for (;;)
3012 {
3013 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015 if (eptr-- == pp) break; /* Stop if tried at original pos */
3016 BACKCHAR(eptr);
3017 }
3018 }
3019 else
3020 #endif
3021 /* Not UTF mode */
3022 {
3023 for (i = min; i < max; i++)
3024 {
3025 if (eptr >= md->end_subject)
3026 {
3027 SCHECK_PARTIAL();
3028 break;
3029 }
3030 c = *eptr;
3031 #ifndef COMPILE_PCRE8
3032 if (c > 255)
3033 {
3034 if (op == OP_CLASS) break;
3035 }
3036 else
3037 #endif
3038 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3039 eptr++;
3040 }
3041 while (eptr >= pp)
3042 {
3043 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3045 eptr--;
3046 }
3047 }
3048
3049 RRETURN(MATCH_NOMATCH);
3050 }
3051 #undef BYTE_MAP
3052 }
3053 /* Control never gets here */
3054
3055
3056 /* Match an extended character class. This opcode is encountered only
3057 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3058 mode, because Unicode properties are supported in non-UTF-8 mode. */
3059
3060 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3061 case OP_XCLASS:
3062 {
3063 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3064 ecode += GET(ecode, 1); /* Advance past the item */
3065
3066 switch (*ecode)
3067 {
3068 case OP_CRSTAR:
3069 case OP_CRMINSTAR:
3070 case OP_CRPLUS:
3071 case OP_CRMINPLUS:
3072 case OP_CRQUERY:
3073 case OP_CRMINQUERY:
3074 c = *ecode++ - OP_CRSTAR;
3075 minimize = (c & 1) != 0;
3076 min = rep_min[c]; /* Pick up values from tables; */
3077 max = rep_max[c]; /* zero for max => infinity */
3078 if (max == 0) max = INT_MAX;
3079 break;
3080
3081 case OP_CRRANGE:
3082 case OP_CRMINRANGE:
3083 minimize = (*ecode == OP_CRMINRANGE);
3084 min = GET2(ecode, 1);
3085 max = GET2(ecode, 1 + IMM2_SIZE);
3086 if (max == 0) max = INT_MAX;
3087 ecode += 1 + 2 * IMM2_SIZE;
3088 break;
3089
3090 default: /* No repeat follows */
3091 min = max = 1;
3092 break;
3093 }
3094
3095 /* First, ensure the minimum number of matches are present. */
3096
3097 for (i = 1; i <= min; i++)
3098 {
3099 if (eptr >= md->end_subject)
3100 {
3101 SCHECK_PARTIAL();
3102 RRETURN(MATCH_NOMATCH);
3103 }
3104 GETCHARINCTEST(c, eptr);
3105 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3106 }
3107
3108 /* If max == min we can continue with the main loop without the
3109 need to recurse. */
3110
3111 if (min == max) continue;
3112
3113 /* If minimizing, keep testing the rest of the expression and advancing
3114 the pointer while it matches the class. */
3115
3116 if (minimize)
3117 {
3118 for (fi = min;; fi++)
3119 {
3120 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3122 if (fi >= max) RRETURN(MATCH_NOMATCH);
3123 if (eptr >= md->end_subject)
3124 {
3125 SCHECK_PARTIAL();
3126 RRETURN(MATCH_NOMATCH);
3127 }
3128 GETCHARINCTEST(c, eptr);
3129 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3130 }
3131 /* Control never gets here */
3132 }
3133
3134 /* If maximizing, find the longest possible run, then work backwards. */
3135
3136 else
3137 {
3138 pp = eptr;
3139 for (i = min; i < max; i++)
3140 {
3141 int len = 1;
3142 if (eptr >= md->end_subject)
3143 {
3144 SCHECK_PARTIAL();
3145 break;
3146 }
3147 #ifdef SUPPORT_UTF
3148 GETCHARLENTEST(c, eptr, len);
3149 #else
3150 c = *eptr;
3151 #endif
3152 if (!PRIV(xclass)(c, data, utf)) break;
3153 eptr += len;
3154 }
3155 for(;;)
3156 {
3157 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3158 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3159 if (eptr-- == pp) break; /* Stop if tried at original pos */
3160 #ifdef SUPPORT_UTF
3161 if (utf) BACKCHAR(eptr);
3162 #endif
3163 }
3164 RRETURN(MATCH_NOMATCH);
3165 }
3166
3167 /* Control never gets here */
3168 }
3169 #endif /* End of XCLASS */
3170
3171 /* Match a single character, casefully */
3172
3173 case OP_CHAR:
3174 #ifdef SUPPORT_UTF
3175 if (utf)
3176 {
3177 length = 1;
3178 ecode++;
3179 GETCHARLEN(fc, ecode, length);
3180 if (length > md->end_subject - eptr)
3181 {
3182 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3183 RRETURN(MATCH_NOMATCH);
3184 }
3185 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3186 }
3187 else
3188 #endif
3189 /* Not UTF mode */
3190 {
3191 if (md->end_subject - eptr < 1)
3192 {
3193 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3194 RRETURN(MATCH_NOMATCH);
3195 }
3196 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3197 ecode += 2;
3198 }
3199 break;
3200
3201 /* Match a single character, caselessly. If we are at the end of the
3202 subject, give up immediately. */
3203
3204 case OP_CHARI:
3205 if (eptr >= md->end_subject)
3206 {
3207 SCHECK_PARTIAL();
3208 RRETURN(MATCH_NOMATCH);
3209 }
3210
3211 #ifdef SUPPORT_UTF
3212 if (utf)
3213 {
3214 length = 1;
3215 ecode++;
3216 GETCHARLEN(fc, ecode, length);
3217
3218 /* If the pattern character's value is < 128, we have only one byte, and
3219 we know that its other case must also be one byte long, so we can use the
3220 fast lookup table. We know that there is at least one byte left in the
3221 subject. */
3222
3223 if (fc < 128)
3224 {
3225 pcre_uint32 cc = RAWUCHAR(eptr);
3226 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3227 ecode++;
3228 eptr++;
3229 }
3230
3231 /* Otherwise we must pick up the subject character. Note that we cannot
3232 use the value of "length" to check for sufficient bytes left, because the
3233 other case of the character may have more or fewer bytes. */
3234
3235 else
3236 {
3237 pcre_uint32 dc;
3238 GETCHARINC(dc, eptr);
3239 ecode += length;
3240
3241 /* If we have Unicode property support, we can use it to test the other
3242 case of the character, if there is one. */
3243
3244 if (fc != dc)
3245 {
3246 #ifdef SUPPORT_UCP
3247 if (dc != UCD_OTHERCASE(fc))
3248 #endif
3249 RRETURN(MATCH_NOMATCH);
3250 }
3251 }
3252 }
3253 else
3254 #endif /* SUPPORT_UTF */
3255
3256 /* Not UTF mode */
3257 {
3258 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3259 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3260 eptr++;
3261 ecode += 2;
3262 }
3263 break;
3264
3265 /* Match a single character repeatedly. */
3266
3267 case OP_EXACT:
3268 case OP_EXACTI:
3269 min = max = GET2(ecode, 1);
3270 ecode += 1 + IMM2_SIZE;
3271 goto REPEATCHAR;
3272
3273 case OP_POSUPTO:
3274 case OP_POSUPTOI:
3275 possessive = TRUE;
3276 /* Fall through */
3277
3278 case OP_UPTO:
3279 case OP_UPTOI:
3280 case OP_MINUPTO:
3281 case OP_MINUPTOI:
3282 min = 0;
3283 max = GET2(ecode, 1);
3284 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3285 ecode += 1 + IMM2_SIZE;
3286 goto REPEATCHAR;
3287
3288 case OP_POSSTAR:
3289 case OP_POSSTARI:
3290 possessive = TRUE;
3291 min = 0;
3292 max = INT_MAX;
3293 ecode++;
3294 goto REPEATCHAR;
3295
3296 case OP_POSPLUS:
3297 case OP_POSPLUSI:
3298 possessive = TRUE;
3299 min = 1;
3300 max = INT_MAX;
3301 ecode++;
3302 goto REPEATCHAR;
3303
3304 case OP_POSQUERY:
3305 case OP_POSQUERYI:
3306 possessive = TRUE;
3307 min = 0;
3308 max = 1;
3309 ecode++;
3310 goto REPEATCHAR;
3311
3312 case OP_STAR:
3313 case OP_STARI:
3314 case OP_MINSTAR:
3315 case OP_MINSTARI:
3316 case OP_PLUS:
3317 case OP_PLUSI:
3318 case OP_MINPLUS:
3319 case OP_MINPLUSI:
3320 case OP_QUERY:
3321 case OP_QUERYI:
3322 case OP_MINQUERY:
3323 case OP_MINQUERYI:
3324 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3325 minimize = (c & 1) != 0;
3326 min = rep_min[c]; /* Pick up values from tables; */
3327 max = rep_max[c]; /* zero for max => infinity */
3328 if (max == 0) max = INT_MAX;
3329
3330 /* Common code for all repeated single-character matches. We first check
3331 for the minimum number of characters. If the minimum equals the maximum, we
3332 are done. Otherwise, if minimizing, check the rest of the pattern for a
3333 match; if there isn't one, advance up to the maximum, one character at a
3334 time.
3335
3336 If maximizing, advance up to the maximum number of matching characters,
3337 until eptr is past the end of the maximum run. If possessive, we are
3338 then done (no backing up). Otherwise, match at this position; anything
3339 other than no match is immediately returned. For nomatch, back up one
3340 character, unless we are matching \R and the last thing matched was
3341 \r\n, in which case, back up two bytes. When we reach the first optional
3342 character position, we can save stack by doing a tail recurse.
3343
3344 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3345 for speed. */
3346
3347 REPEATCHAR:
3348 #ifdef SUPPORT_UTF
3349 if (utf)
3350 {
3351 length = 1;
3352 charptr = ecode;
3353 GETCHARLEN(fc, ecode, length);
3354 ecode += length;
3355
3356 /* Handle multibyte character matching specially here. There is
3357 support for caseless matching if UCP support is present. */
3358
3359 if (length > 1)
3360 {
3361 #ifdef SUPPORT_UCP
3362 pcre_uint32 othercase;
3363 if (op >= OP_STARI && /* Caseless */
3364 (othercase = UCD_OTHERCASE(fc)) != fc)
3365 oclength = PRIV(ord2utf)(othercase, occhars);
3366 else oclength = 0;
3367 #endif /* SUPPORT_UCP */
3368
3369 for (i = 1; i <= min; i++)
3370 {
3371 if (eptr <= md->end_subject - length &&
3372 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3373 #ifdef SUPPORT_UCP
3374 else if (oclength > 0 &&
3375 eptr <= md->end_subject - oclength &&
3376 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3377 #endif /* SUPPORT_UCP */
3378 else
3379 {
3380 CHECK_PARTIAL();
3381 RRETURN(MATCH_NOMATCH);
3382 }
3383 }
3384
3385 if (min == max) continue;
3386
3387 if (minimize)
3388 {
3389 for (fi = min;; fi++)
3390 {
3391 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3393 if (fi >= max) RRETURN(MATCH_NOMATCH);
3394 if (eptr <= md->end_subject - length &&
3395 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3396 #ifdef SUPPORT_UCP
3397 else if (oclength > 0 &&
3398 eptr <= md->end_subject - oclength &&
3399 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3400 #endif /* SUPPORT_UCP */
3401 else
3402 {
3403 CHECK_PARTIAL();
3404 RRETURN(MATCH_NOMATCH);
3405 }
3406 }
3407 /* Control never gets here */
3408 }
3409
3410 else /* Maximize */
3411 {
3412 pp = eptr;
3413 for (i = min; i < max; i++)
3414 {
3415 if (eptr <= md->end_subject - length &&
3416 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3417 #ifdef SUPPORT_UCP
3418 else if (oclength > 0 &&
3419 eptr <= md->end_subject - oclength &&
3420 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3421 #endif /* SUPPORT_UCP */
3422 else
3423 {
3424 CHECK_PARTIAL();
3425 break;
3426 }
3427 }
3428
3429 if (possessive) continue; /* No backtracking */
3430 for(;;)
3431 {
3432 if (eptr == pp) goto TAIL_RECURSE;
3433 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3434 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3435 #ifdef SUPPORT_UCP
3436 eptr--;
3437 BACKCHAR(eptr);
3438 #else /* without SUPPORT_UCP */
3439 eptr -= length;
3440 #endif /* SUPPORT_UCP */
3441 }
3442 }
3443 /* Control never gets here */
3444 }
3445
3446 /* If the length of a UTF-8 character is 1, we fall through here, and
3447 obey the code as for non-UTF-8 characters below, though in this case the
3448 value of fc will always be < 128. */
3449 }
3450 else
3451 #endif /* SUPPORT_UTF */
3452 /* When not in UTF-8 mode, load a single-byte character. */
3453 fc = *ecode++;
3454
3455 /* The value of fc at this point is always one character, though we may
3456 or may not be in UTF mode. The code is duplicated for the caseless and
3457 caseful cases, for speed, since matching characters is likely to be quite
3458 common. First, ensure the minimum number of matches are present. If min =
3459 max, continue at the same level without recursing. Otherwise, if
3460 minimizing, keep trying the rest of the expression and advancing one
3461 matching character if failing, up to the maximum. Alternatively, if
3462 maximizing, find the maximum number of characters and work backwards. */
3463
3464 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3465 max, (char *)eptr));
3466
3467 if (op >= OP_STARI) /* Caseless */
3468 {
3469 #ifdef COMPILE_PCRE8
3470 /* fc must be < 128 if UTF is enabled. */
3471 foc = md->fcc[fc];
3472 #else
3473 #ifdef SUPPORT_UTF
3474 #ifdef SUPPORT_UCP
3475 if (utf && fc > 127)
3476 foc = UCD_OTHERCASE(fc);
3477 #else
3478 if (utf && fc > 127)
3479 foc = fc;
3480 #endif /* SUPPORT_UCP */
3481 else
3482 #endif /* SUPPORT_UTF */
3483 foc = TABLE_GET(fc, md->fcc, fc);
3484 #endif /* COMPILE_PCRE8 */
3485
3486 for (i = 1; i <= min; i++)
3487 {
3488 pcre_uint32 cc; /* Faster than pcre_uchar */
3489 if (eptr >= md->end_subject)
3490 {
3491 SCHECK_PARTIAL();
3492 RRETURN(MATCH_NOMATCH);
3493 }
3494 cc = RAWUCHARTEST(eptr);
3495 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3496 eptr++;
3497 }
3498 if (min == max) continue;
3499 if (minimize)
3500 {
3501 for (fi = min;; fi++)
3502 {
3503 pcre_uint32 cc; /* Faster than pcre_uchar */
3504 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3505 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3506 if (fi >= max) RRETURN(MATCH_NOMATCH);
3507 if (eptr >= md->end_subject)
3508 {
3509 SCHECK_PARTIAL();
3510 RRETURN(MATCH_NOMATCH);
3511 }
3512 cc = RAWUCHARTEST(eptr);
3513 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3514 eptr++;
3515 }
3516 /* Control never gets here */
3517 }
3518 else /* Maximize */
3519 {
3520 pp = eptr;
3521 for (i = min; i < max; i++)
3522 {
3523 pcre_uint32 cc; /* Faster than pcre_uchar */
3524 if (eptr >= md->end_subject)
3525 {
3526 SCHECK_PARTIAL();
3527 break;
3528 }
3529 cc = RAWUCHARTEST(eptr);
3530 if (fc != cc && foc != cc) break;
3531 eptr++;
3532 }
3533 if (possessive) continue; /* No backtracking */
3534 for (;;)
3535 {
3536 if (eptr == pp) goto TAIL_RECURSE;
3537 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3538 eptr--;
3539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3540 }
3541 /* Control never gets here */
3542 }
3543 }
3544
3545 /* Caseful comparisons (includes all multi-byte characters) */
3546
3547 else
3548 {
3549 for (i = 1; i <= min; i++)
3550 {
3551 if (eptr >= md->end_subject)
3552 {
3553 SCHECK_PARTIAL();
3554 RRETURN(MATCH_NOMATCH);
3555 }
3556 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3557 }
3558
3559 if (min == max) continue;
3560
3561 if (minimize)
3562 {
3563 for (fi = min;; fi++)
3564 {
3565 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3566 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3567 if (fi >= max) RRETURN(MATCH_NOMATCH);
3568 if (eptr >= md->end_subject)
3569 {
3570 SCHECK_PARTIAL();
3571 RRETURN(MATCH_NOMATCH);
3572 }
3573 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3574 }
3575 /* Control never gets here */
3576 }
3577 else /* Maximize */
3578 {
3579 pp = eptr;
3580 for (i = min; i < max; i++)
3581 {
3582 if (eptr >= md->end_subject)
3583 {
3584 SCHECK_PARTIAL();
3585 break;
3586 }
3587 if (fc != RAWUCHARTEST(eptr)) break;
3588 eptr++;
3589 }
3590 if (possessive) continue; /* No backtracking */
3591 for (;;)
3592 {
3593 if (eptr == pp) goto TAIL_RECURSE;
3594 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3595 eptr--;
3596 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3597 }
3598 /* Control never gets here */
3599 }
3600 }
3601 /* Control never gets here */
3602
3603 /* Match a negated single one-byte character. The character we are
3604 checking can be multibyte. */
3605
3606 case OP_NOT:
3607 case OP_NOTI:
3608 if (eptr >= md->end_subject)
3609 {
3610 SCHECK_PARTIAL();
3611 RRETURN(MATCH_NOMATCH);
3612 }
3613 #ifdef SUPPORT_UTF
3614 if (utf)
3615 {
3616 register pcre_uint32 ch, och;
3617
3618 ecode++;
3619 GETCHARINC(ch, ecode);
3620 GETCHARINC(c, eptr);
3621
3622 if (op == OP_NOT)
3623 {
3624 if (ch == c) RRETURN(MATCH_NOMATCH);
3625 }
3626 else
3627 {
3628 #ifdef SUPPORT_UCP
3629 if (ch > 127)
3630 och = UCD_OTHERCASE(ch);
3631 #else
3632 if (ch > 127)
3633 och = ch;
3634 #endif /* SUPPORT_UCP */
3635 else
3636 och = TABLE_GET(ch, md->fcc, ch);
3637 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3638 }
3639 }
3640 else
3641 #endif
3642 {
3643 register pcre_uint32 ch = ecode[1];
3644 c = *eptr++;
3645 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3646 RRETURN(MATCH_NOMATCH);
3647 ecode += 2;
3648 }
3649 break;
3650
3651 /* Match a negated single one-byte character repeatedly. This is almost a
3652 repeat of the code for a repeated single character, but I haven't found a
3653 nice way of commoning these up that doesn't require a test of the
3654 positive/negative option for each character match. Maybe that wouldn't add
3655 very much to the time taken, but character matching *is* what this is all
3656 about... */
3657
3658 case OP_NOTEXACT:
3659 case OP_NOTEXACTI:
3660 min = max = GET2(ecode, 1);
3661 ecode += 1 + IMM2_SIZE;
3662 goto REPEATNOTCHAR;
3663
3664 case OP_NOTUPTO:
3665 case OP_NOTUPTOI:
3666 case OP_NOTMINUPTO:
3667 case OP_NOTMINUPTOI:
3668 min = 0;
3669 max = GET2(ecode, 1);
3670 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3671 ecode += 1 + IMM2_SIZE;
3672 goto REPEATNOTCHAR;
3673
3674 case OP_NOTPOSSTAR:
3675 case OP_NOTPOSSTARI:
3676 possessive = TRUE;
3677 min = 0;
3678 max = INT_MAX;
3679 ecode++;
3680 goto REPEATNOTCHAR;
3681
3682 case OP_NOTPOSPLUS:
3683 case OP_NOTPOSPLUSI:
3684 possessive = TRUE;
3685 min = 1;
3686 max = INT_MAX;
3687 ecode++;
3688 goto REPEATNOTCHAR;
3689
3690 case OP_NOTPOSQUERY:
3691 case OP_NOTPOSQUERYI:
3692 possessive = TRUE;
3693 min = 0;
3694 max = 1;
3695 ecode++;
3696 goto REPEATNOTCHAR;
3697
3698 case OP_NOTPOSUPTO:
3699 case OP_NOTPOSUPTOI:
3700 possessive = TRUE;
3701 min = 0;
3702 max = GET2(ecode, 1);
3703 ecode += 1 + IMM2_SIZE;
3704 goto REPEATNOTCHAR;
3705
3706 case OP_NOTSTAR:
3707 case OP_NOTSTARI:
3708 case OP_NOTMINSTAR:
3709 case OP_NOTMINSTARI:
3710 case OP_NOTPLUS:
3711 case OP_NOTPLUSI:
3712 case OP_NOTMINPLUS:
3713 case OP_NOTMINPLUSI:
3714 case OP_NOTQUERY:
3715 case OP_NOTQUERYI:
3716 case OP_NOTMINQUERY:
3717 case OP_NOTMINQUERYI:
3718 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3719 minimize = (c & 1) != 0;
3720 min = rep_min[c]; /* Pick up values from tables; */
3721 max = rep_max[c]; /* zero for max => infinity */
3722 if (max == 0) max = INT_MAX;
3723
3724 /* Common code for all repeated single-byte matches. */
3725
3726 REPEATNOTCHAR:
3727 GETCHARINCTEST(fc, ecode);
3728
3729 /* The code is duplicated for the caseless and caseful cases, for speed,
3730 since matching characters is likely to be quite common. First, ensure the
3731 minimum number of matches are present. If min = max, continue at the same
3732 level without recursing. Otherwise, if minimizing, keep trying the rest of
3733 the expression and advancing one matching character if failing, up to the
3734 maximum. Alternatively, if maximizing, find the maximum number of
3735 characters and work backwards. */
3736
3737 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3738 max, (char *)eptr));
3739
3740 if (op >= OP_NOTSTARI) /* Caseless */
3741 {
3742 #ifdef SUPPORT_UTF
3743 #ifdef SUPPORT_UCP
3744 if (utf && fc > 127)
3745 foc = UCD_OTHERCASE(fc);
3746 #else
3747 if (utf && fc > 127)
3748 foc = fc;
3749 #endif /* SUPPORT_UCP */
3750 else
3751 #endif /* SUPPORT_UTF */
3752 foc = TABLE_GET(fc, md->fcc, fc);
3753
3754 #ifdef SUPPORT_UTF
3755 if (utf)
3756 {
3757 register pcre_uint32 d;
3758 for (i = 1; i <= min; i++)
3759 {
3760 if (eptr >= md->end_subject)
3761 {
3762 SCHECK_PARTIAL();
3763 RRETURN(MATCH_NOMATCH);
3764 }
3765 GETCHARINC(d, eptr);
3766 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3767 }
3768 }
3769 else
3770 #endif /* SUPPORT_UTF */
3771 /* Not UTF mode */
3772 {
3773 for (i = 1; i <= min; i++)
3774 {
3775 if (eptr >= md->end_subject)
3776 {
3777 SCHECK_PARTIAL();
3778 RRETURN(MATCH_NOMATCH);
3779 }
3780 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3781 eptr++;
3782 }
3783 }
3784
3785 if (min == max) continue;
3786
3787 if (minimize)
3788 {
3789 #ifdef SUPPORT_UTF
3790 if (utf)
3791 {
3792 register pcre_uint32 d;
3793 for (fi = min;; fi++)
3794 {
3795 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3797 if (fi >= max) RRETURN(MATCH_NOMATCH);
3798 if (eptr >= md->end_subject)
3799 {
3800 SCHECK_PARTIAL();
3801 RRETURN(MATCH_NOMATCH);
3802 }
3803 GETCHARINC(d, eptr);
3804 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3805 }
3806 }
3807 else
3808 #endif /*SUPPORT_UTF */
3809 /* Not UTF mode */
3810 {
3811 for (fi = min;; fi++)
3812 {
3813 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3815 if (fi >= max) RRETURN(MATCH_NOMATCH);
3816 if (eptr >= md->end_subject)
3817 {
3818 SCHECK_PARTIAL();
3819 RRETURN(MATCH_NOMATCH);
3820 }
3821 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3822 eptr++;
3823 }
3824 }
3825 /* Control never gets here */
3826 }
3827
3828 /* Maximize case */
3829
3830 else
3831 {
3832 pp = eptr;
3833
3834 #ifdef SUPPORT_UTF
3835 if (utf)
3836 {
3837 register pcre_uint32 d;
3838 for (i = min; i < max; i++)
3839 {
3840 int len = 1;
3841 if (eptr >= md->end_subject)
3842 {
3843 SCHECK_PARTIAL();
3844 break;
3845 }
3846 GETCHARLEN(d, eptr, len);
3847 if (fc == d || (unsigned int)foc == d) break;
3848 eptr += len;
3849 }
3850 if (possessive) continue; /* No backtracking */
3851 for(;;)
3852 {
3853 if (eptr == pp) goto TAIL_RECURSE;
3854 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3855 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3856 eptr--;
3857 BACKCHAR(eptr);
3858 }
3859 }
3860 else
3861 #endif /* SUPPORT_UTF */
3862 /* Not UTF mode */
3863 {
3864 for (i = min; i < max; i++)
3865 {
3866 if (eptr >= md->end_subject)
3867 {
3868 SCHECK_PARTIAL();
3869 break;
3870 }
3871 if (fc == *eptr || foc == *eptr) break;
3872 eptr++;
3873 }
3874 if (possessive) continue; /* No backtracking */
3875 for (;;)
3876 {
3877 if (eptr == pp) goto TAIL_RECURSE;
3878 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3880 eptr--;
3881 }
3882 }
3883 /* Control never gets here */
3884 }
3885 }
3886
3887 /* Caseful comparisons */
3888
3889 else
3890 {
3891 #ifdef SUPPORT_UTF
3892 if (utf)
3893 {
3894 register pcre_uint32 d;
3895 for (i = 1; i <= min; i++)
3896 {
3897 if (eptr >= md->end_subject)
3898 {
3899 SCHECK_PARTIAL();
3900 RRETURN(MATCH_NOMATCH);
3901 }
3902 GETCHARINC(d, eptr);
3903 if (fc == d) RRETURN(MATCH_NOMATCH);
3904 }
3905 }
3906 else
3907 #endif
3908 /* Not UTF mode */
3909 {
3910 for (i = 1; i <= min; i++)
3911 {
3912 if (eptr >= md->end_subject)
3913 {
3914 SCHECK_PARTIAL();
3915 RRETURN(MATCH_NOMATCH);
3916 }
3917 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3918 }
3919 }
3920
3921 if (min == max) continue;
3922
3923 if (minimize)
3924 {
3925 #ifdef SUPPORT_UTF
3926 if (utf)
3927 {
3928 register pcre_uint32 d;
3929 for (fi = min;; fi++)
3930 {
3931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3933 if (fi >= max) RRETURN(MATCH_NOMATCH);
3934 if (eptr >= md->end_subject)
3935 {
3936 SCHECK_PARTIAL();
3937 RRETURN(MATCH_NOMATCH);
3938 }
3939 GETCHARINC(d, eptr);
3940 if (fc == d) RRETURN(MATCH_NOMATCH);
3941 }
3942 }
3943 else
3944 #endif
3945 /* Not UTF mode */
3946 {
3947 for (fi = min;; fi++)
3948 {
3949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3951 if (fi >= max) RRETURN(MATCH_NOMATCH);
3952 if (eptr >= md->end_subject)
3953 {
3954 SCHECK_PARTIAL();
3955 RRETURN(MATCH_NOMATCH);
3956 }
3957 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3958 }
3959 }
3960 /* Control never gets here */
3961 }
3962
3963 /* Maximize case */
3964
3965 else
3966 {
3967 pp = eptr;
3968
3969 #ifdef SUPPORT_UTF
3970 if (utf)
3971 {
3972 register pcre_uint32 d;
3973 for (i = min; i < max; i++)
3974 {
3975 int len = 1;
3976 if (eptr >= md->end_subject)
3977 {
3978 SCHECK_PARTIAL();
3979 break;
3980 }
3981 GETCHARLEN(d, eptr, len);
3982 if (fc == d) break;
3983 eptr += len;
3984 }
3985 if (possessive) continue; /* No backtracking */
3986 for(;;)
3987 {
3988 if (eptr == pp) goto TAIL_RECURSE;
3989 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3991 eptr--;
3992 BACKCHAR(eptr);
3993 }
3994 }
3995 else
3996 #endif
3997 /* Not UTF mode */
3998 {
3999 for (i = min; i < max; i++)
4000 {
4001 if (eptr >= md->end_subject)
4002 {
4003 SCHECK_PARTIAL();
4004 break;
4005 }
4006 if (fc == *eptr) break;
4007 eptr++;
4008 }
4009 if (possessive) continue; /* No backtracking */
4010 for (;;)
4011 {
4012 if (eptr == pp) goto TAIL_RECURSE;
4013 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4015 eptr--;
4016 }
4017 }
4018 /* Control never gets here */
4019 }
4020 }
4021 /* Control never gets here */
4022
4023 /* Match a single character type repeatedly; several different opcodes
4024 share code. This is very similar to the code for single characters, but we
4025 repeat it in the interests of efficiency. */
4026
4027 case OP_TYPEEXACT:
4028 min = max = GET2(ecode, 1);
4029 minimize = TRUE;
4030 ecode += 1 + IMM2_SIZE;
4031 goto REPEATTYPE;
4032
4033 case OP_TYPEUPTO:
4034 case OP_TYPEMINUPTO:
4035 min = 0;
4036 max = GET2(ecode, 1);
4037 minimize = *ecode == OP_TYPEMINUPTO;
4038 ecode += 1 + IMM2_SIZE;
4039 goto REPEATTYPE;
4040
4041 case OP_TYPEPOSSTAR:
4042 possessive = TRUE;
4043 min = 0;
4044 max = INT_MAX;
4045 ecode++;
4046 goto REPEATTYPE;
4047
4048 case OP_TYPEPOSPLUS:
4049 possessive = TRUE;
4050 min = 1;
4051 max = INT_MAX;
4052 ecode++;
4053 goto REPEATTYPE;
4054
4055 case OP_TYPEPOSQUERY:
4056 possessive = TRUE;
4057 min = 0;
4058 max = 1;
4059 ecode++;
4060 goto REPEATTYPE;
4061
4062 case OP_TYPEPOSUPTO:
4063 possessive = TRUE;
4064 min = 0;
4065 max = GET2(ecode, 1);
4066 ecode += 1 + IMM2_SIZE;
4067 goto REPEATTYPE;
4068
4069 case OP_TYPESTAR:
4070 case OP_TYPEMINSTAR:
4071 case OP_TYPEPLUS:
4072 case OP_TYPEMINPLUS:
4073 case OP_TYPEQUERY:
4074 case OP_TYPEMINQUERY:
4075 c = *ecode++ - OP_TYPESTAR;
4076 minimize = (c & 1) != 0;
4077 min = rep_min[c]; /* Pick up values from tables; */
4078 max = rep_max[c]; /* zero for max => infinity */
4079 if (max == 0) max = INT_MAX;
4080
4081 /* Common code for all repeated single character type matches. Note that
4082 in UTF-8 mode, '.' matches a character of any length, but for the other
4083 character types, the valid characters are all one-byte long. */
4084
4085 REPEATTYPE:
4086 ctype = *ecode++; /* Code for the character type */
4087
4088 #ifdef SUPPORT_UCP
4089 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4090 {
4091 prop_fail_result = ctype == OP_NOTPROP;
4092 prop_type = *ecode++;
4093 prop_value = *ecode++;
4094 }
4095 else prop_type = -1;
4096 #endif
4097
4098 /* First, ensure the minimum number of matches are present. Use inline
4099 code for maximizing the speed, and do the type test once at the start
4100 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4101 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4102 and single-bytes. */
4103
4104 if (min > 0)
4105 {
4106 #ifdef SUPPORT_UCP
4107 if (prop_type >= 0)
4108 {
4109 switch(prop_type)
4110 {
4111 case PT_ANY:
4112 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4113 for (i = 1; i <= min; i++)
4114 {
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 GETCHARINCTEST(c, eptr);
4121 }
4122 break;
4123
4124 case PT_LAMP:
4125 for (i = 1; i <= min; i++)
4126 {
4127 int chartype;
4128 if (eptr >= md->end_subject)
4129 {
4130 SCHECK_PARTIAL();
4131 RRETURN(MATCH_NOMATCH);
4132 }
4133 GETCHARINCTEST(c, eptr);
4134 chartype = UCD_CHARTYPE(c);
4135 if ((chartype == ucp_Lu ||
4136 chartype == ucp_Ll ||
4137 chartype == ucp_Lt) == prop_fail_result)
4138 RRETURN(MATCH_NOMATCH);
4139 }
4140 break;
4141
4142 case PT_GC:
4143 for (i = 1; i <= min; i++)
4144 {
4145 if (eptr >= md->end_subject)
4146 {
4147 SCHECK_PARTIAL();
4148 RRETURN(MATCH_NOMATCH);
4149 }
4150 GETCHARINCTEST(c, eptr);
4151 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4152 RRETURN(MATCH_NOMATCH);
4153 }
4154 break;
4155
4156 case PT_PC:
4157 for (i = 1; i <= min; i++)
4158 {
4159 if (eptr >= md->end_subject)
4160 {
4161 SCHECK_PARTIAL();
4162 RRETURN(MATCH_NOMATCH);
4163 }
4164 GETCHARINCTEST(c, eptr);
4165 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4166 RRETURN(MATCH_NOMATCH);
4167 }
4168 break;
4169
4170 case PT_SC:
4171 for (i = 1; i <= min; i++)
4172 {
4173 if (eptr >= md->end_subject)
4174 {
4175 SCHECK_PARTIAL();
4176 RRETURN(MATCH_NOMATCH);
4177 }
4178 GETCHARINCTEST(c, eptr);
4179 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4180 RRETURN(MATCH_NOMATCH);
4181 }
4182 break;
4183
4184 case PT_ALNUM:
4185 for (i = 1; i <= min; i++)
4186 {
4187 int category;
4188 if (eptr >= md->end_subject)
4189 {
4190 SCHECK_PARTIAL();
4191 RRETURN(MATCH_NOMATCH);
4192 }
4193 GETCHARINCTEST(c, eptr);
4194 category = UCD_CATEGORY(c);
4195 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4196 RRETURN(MATCH_NOMATCH);
4197 }
4198 break;
4199
4200 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4201 which means that Perl space and POSIX space are now identical. PCRE
4202 was changed at release 8.34. */
4203
4204 case PT_SPACE: /* Perl space */
4205 case PT_PXSPACE: /* POSIX space */
4206 for (i = 1; i <= min; i++)
4207 {
4208 if (eptr >= md->end_subject)
4209 {
4210 SCHECK_PARTIAL();
4211 RRETURN(MATCH_NOMATCH);
4212 }
4213 GETCHARINCTEST(c, eptr);
4214 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4215 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4216 == prop_fail_result)
4217 RRETURN(MATCH_NOMATCH);
4218 }
4219 break;
4220
4221 case PT_WORD:
4222 for (i = 1; i <= min; i++)
4223 {
4224 int category;
4225 if (eptr >= md->end_subject)
4226 {
4227 SCHECK_PARTIAL();
4228 RRETURN(MATCH_NOMATCH);
4229 }
4230 GETCHARINCTEST(c, eptr);
4231 category = UCD_CATEGORY(c);
4232 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4233 == prop_fail_result)
4234 RRETURN(MATCH_NOMATCH);
4235 }
4236 break;
4237
4238 case PT_CLIST:
4239 for (i = 1; i <= min; i++)
4240 {
4241 const pcre_uint32 *cp;
4242 if (eptr >= md->end_subject)
4243 {
4244 SCHECK_PARTIAL();
4245 RRETURN(MATCH_NOMATCH);
4246 }
4247 GETCHARINCTEST(c, eptr);
4248 cp = PRIV(ucd_caseless_sets) + prop_value;
4249 for (;;)
4250 {
4251 if (c < *cp)
4252 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4253 if (c == *cp++)
4254 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4255 }
4256 }
4257 break;
4258
4259 case PT_UCNC:
4260 for (i = 1; i <= min; i++)
4261 {
4262 if (eptr >= md->end_subject)
4263 {
4264 SCHECK_PARTIAL();
4265 RRETURN(MATCH_NOMATCH);
4266 }
4267 GETCHARINCTEST(c, eptr);
4268 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4269 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4270 c >= 0xe000) == prop_fail_result)
4271 RRETURN(MATCH_NOMATCH);
4272 }
4273 break;
4274
4275 /* This should not occur */
4276
4277 default:
4278 RRETURN(PCRE_ERROR_INTERNAL);
4279 }
4280 }
4281
4282 /* Match extended Unicode sequences. We will get here only if the
4283 support is in the binary; otherwise a compile-time error occurs. */
4284
4285 else if (ctype == OP_EXTUNI)
4286 {
4287 for (i = 1; i <= min; i++)
4288 {
4289 if (eptr >= md->end_subject)
4290 {
4291 SCHECK_PARTIAL();
4292 RRETURN(MATCH_NOMATCH);
4293 }
4294 else
4295 {
4296 int lgb, rgb;
4297 GETCHARINCTEST(c, eptr);
4298 lgb = UCD_GRAPHBREAK(c);
4299 while (eptr < md->end_subject)
4300 {
4301 int len = 1;
4302 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4303 rgb = UCD_GRAPHBREAK(c);
4304 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4305 lgb = rgb;
4306 eptr += len;
4307 }
4308 }
4309 CHECK_PARTIAL();
4310 }
4311 }
4312
4313 else
4314 #endif /* SUPPORT_UCP */
4315
4316 /* Handle all other cases when the coding is UTF-8 */
4317
4318 #ifdef SUPPORT_UTF
4319 if (utf) switch(ctype)
4320 {
4321 case OP_ANY:
4322 for (i = 1; i <= min; i++)
4323 {
4324 if (eptr >= md->end_subject)
4325 {
4326 SCHECK_PARTIAL();
4327 RRETURN(MATCH_NOMATCH);
4328 }
4329 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4330 if (md->partial != 0 &&
4331 eptr + 1 >= md->end_subject &&
4332 NLBLOCK->nltype == NLTYPE_FIXED &&
4333 NLBLOCK->nllen == 2 &&
4334 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4335 {
4336 md->hitend = TRUE;
4337 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4338 }
4339 eptr++;
4340 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4341 }
4342 break;
4343
4344 case OP_ALLANY:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 RRETURN(MATCH_NOMATCH);
4351 }
4352 eptr++;
4353 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4354 }
4355 break;
4356
4357 case OP_ANYBYTE:
4358 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4359 eptr += min;
4360 break;
4361
4362 case OP_ANYNL:
4363 for (i = 1; i <= min; i++)
4364 {
4365 if (eptr >= md->end_subject)
4366 {
4367 SCHECK_PARTIAL();
4368 RRETURN(MATCH_NOMATCH);
4369 }
4370 GETCHARINC(c, eptr);
4371 switch(c)
4372 {
4373 default: RRETURN(MATCH_NOMATCH);
4374
4375 case CHAR_CR:
4376 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4377 break;
4378
4379 case CHAR_LF:
4380 break;
4381
4382 case CHAR_VT:
4383 case CHAR_FF:
4384 case CHAR_NEL:
4385 #ifndef EBCDIC
4386 case 0x2028:
4387 case 0x2029:
4388 #endif /* Not EBCDIC */
4389 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4390 break;
4391 }
4392 }
4393 break;
4394
4395 case OP_NOT_HSPACE:
4396 for (i = 1; i <= min; i++)
4397 {
4398 if (eptr >= md->end_subject)
4399 {
4400 SCHECK_PARTIAL();
4401 RRETURN(MATCH_NOMATCH);
4402 }
4403 GETCHARINC(c, eptr);
4404 switch(c)
4405 {
4406 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4407 default: break;
4408 }
4409 }
4410 break;
4411
4412 case OP_HSPACE:
4413 for (i = 1; i <= min; i++)
4414 {
4415 if (eptr >= md->end_subject)
4416 {
4417 SCHECK_PARTIAL();
4418 RRETURN(MATCH_NOMATCH);
4419 }
4420 GETCHARINC(c, eptr);
4421 switch(c)
4422 {
4423 HSPACE_CASES: break; /* Byte and multibyte cases */
4424 default: RRETURN(MATCH_NOMATCH);
4425 }
4426 }
4427 break;
4428
4429 case OP_NOT_VSPACE:
4430 for (i = 1; i <= min; i++)
4431 {
4432 if (eptr >= md->end_subject)
4433 {
4434 SCHECK_PARTIAL();
4435 RRETURN(MATCH_NOMATCH);
4436 }
4437 GETCHARINC(c, eptr);
4438 switch(c)
4439 {
4440 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4441 default: break;
4442 }
4443 }
4444 break;
4445
4446 case OP_VSPACE:
4447 for (i = 1; i <= min; i++)
4448 {
4449 if (eptr >= md->end_subject)
4450 {
4451 SCHECK_PARTIAL();
4452 RRETURN(MATCH_NOMATCH);
4453 }
4454 GETCHARINC(c, eptr);
4455 switch(c)
4456 {
4457 VSPACE_CASES: break;
4458 default: RRETURN(MATCH_NOMATCH);
4459 }
4460 }
4461 break;
4462
4463 case OP_NOT_DIGIT:
4464 for (i = 1; i <= min; i++)
4465 {
4466 if (eptr >= md->end_subject)
4467 {
4468 SCHECK_PARTIAL();
4469 RRETURN(MATCH_NOMATCH);
4470 }
4471 GETCHARINC(c, eptr);
4472 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4473 RRETURN(MATCH_NOMATCH);
4474 }
4475 break;
4476
4477 case OP_DIGIT:
4478 for (i = 1; i <= min; i++)
4479 {
4480 pcre_uint32 cc;
4481 if (eptr >= md->end_subject)
4482 {
4483 SCHECK_PARTIAL();
4484 RRETURN(MATCH_NOMATCH);
4485 }
4486 cc = RAWUCHAR(eptr);
4487 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4488 RRETURN(MATCH_NOMATCH);
4489 eptr++;
4490 /* No need to skip more bytes - we know it's a 1-byte character */
4491 }
4492 break;
4493
4494 case OP_NOT_WHITESPACE:
4495 for (i = 1; i <= min; i++)
4496 {
4497 pcre_uint32 cc;
4498 if (eptr >= md->end_subject)
4499 {
4500 SCHECK_PARTIAL();
4501 RRETURN(MATCH_NOMATCH);
4502 }
4503 cc = RAWUCHAR(eptr);
4504 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4505 RRETURN(MATCH_NOMATCH);
4506 eptr++;
4507 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4508 }
4509 break;
4510
4511 case OP_WHITESPACE:
4512 for (i = 1; i <= min; i++)
4513 {
4514 pcre_uint32 cc;
4515 if (eptr >= md->end_subject)
4516 {
4517 SCHECK_PARTIAL();
4518 RRETURN(MATCH_NOMATCH);
4519 }
4520 cc = RAWUCHAR(eptr);
4521 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4522 RRETURN(MATCH_NOMATCH);
4523 eptr++;
4524 /* No need to skip more bytes - we know it's a 1-byte character */
4525 }
4526 break;
4527
4528 case OP_NOT_WORDCHAR:
4529 for (i = 1; i <= min; i++)
4530 {
4531 pcre_uint32 cc;
4532 if (eptr >= md->end_subject)
4533 {
4534 SCHECK_PARTIAL();
4535 RRETURN(MATCH_NOMATCH);
4536 }
4537 cc = RAWUCHAR(eptr);
4538 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4539 RRETURN(MATCH_NOMATCH);
4540 eptr++;
4541 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4542 }
4543 break;
4544
4545 case OP_WORDCHAR:
4546 for (i = 1; i <= min; i++)
4547 {
4548 pcre_uint32 cc;
4549 if (eptr >= md->end_subject)
4550 {
4551 SCHECK_PARTIAL();
4552 RRETURN(MATCH_NOMATCH);
4553 }
4554 cc = RAWUCHAR(eptr);
4555 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4556 RRETURN(MATCH_NOMATCH);
4557 eptr++;
4558 /* No need to skip more bytes - we know it's a 1-byte character */
4559 }
4560 break;
4561
4562 default:
4563 RRETURN(PCRE_ERROR_INTERNAL);
4564 } /* End switch(ctype) */
4565
4566 else
4567 #endif /* SUPPORT_UTF */
4568
4569 /* Code for the non-UTF-8 case for minimum matching of operators other
4570 than OP_PROP and OP_NOTPROP. */
4571
4572 switch(ctype)
4573 {
4574 case OP_ANY:
4575 for (i = 1; i <= min; i++)
4576 {
4577 if (eptr >= md->end_subject)
4578 {
4579 SCHECK_PARTIAL();
4580 RRETURN(MATCH_NOMATCH);
4581 }
4582 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4583 if (md->partial != 0 &&
4584 eptr + 1 >= md->end_subject &&
4585 NLBLOCK->nltype == NLTYPE_FIXED &&
4586 NLBLOCK->nllen == 2 &&
4587 *eptr == NLBLOCK->nl[0])
4588 {
4589 md->hitend = TRUE;
4590 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4591 }
4592 eptr++;
4593 }
4594 break;
4595
4596 case OP_ALLANY:
4597 if (eptr > md->end_subject - min)
4598 {
4599 SCHECK_PARTIAL();
4600 RRETURN(MATCH_NOMATCH);
4601 }
4602 eptr += min;
4603 break;
4604
4605 case OP_ANYBYTE:
4606 if (eptr > md->end_subject - min)
4607 {
4608 SCHECK_PARTIAL();
4609 RRETURN(MATCH_NOMATCH);
4610 }
4611 eptr += min;
4612 break;
4613
4614 case OP_ANYNL:
4615 for (i = 1; i <= min; i++)
4616 {
4617 if (eptr >= md->end_subject)
4618 {
4619 SCHECK_PARTIAL();
4620 RRETURN(MATCH_NOMATCH);
4621 }
4622 switch(*eptr++)
4623 {
4624 default: RRETURN(MATCH_NOMATCH);
4625
4626 case CHAR_CR:
4627 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4628 break;
4629
4630 case CHAR_LF:
4631 break;
4632
4633 case CHAR_VT:
4634 case CHAR_FF:
4635 case CHAR_NEL:
4636 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4637 case 0x2028:
4638 case 0x2029:
4639 #endif
4640 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4641 break;
4642 }
4643 }
4644 break;
4645
4646 case OP_NOT_HSPACE:
4647 for (i = 1; i <= min; i++)
4648 {
4649 if (eptr >= md->end_subject)
4650 {
4651 SCHECK_PARTIAL();
4652 RRETURN(MATCH_NOMATCH);
4653 }
4654 switch(*eptr++)
4655 {
4656 default: break;
4657 HSPACE_BYTE_CASES:
4658 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4659 HSPACE_MULTIBYTE_CASES:
4660 #endif
4661 RRETURN(MATCH_NOMATCH);
4662 }
4663 }
4664 break;
4665
4666 case OP_HSPACE:
4667 for (i = 1; i <= min; i++)
4668 {
4669 if (eptr >= md->end_subject)
4670 {
4671 SCHECK_PARTIAL();
4672 RRETURN(MATCH_NOMATCH);
4673 }
4674 switch(*eptr++)
4675 {
4676 default: RRETURN(MATCH_NOMATCH);
4677 HSPACE_BYTE_CASES:
4678 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4679 HSPACE_MULTIBYTE_CASES:
4680 #endif
4681 break;
4682 }
4683 }
4684 break;
4685
4686 case OP_NOT_VSPACE:
4687 for (i = 1; i <= min; i++)
4688 {
4689 if (eptr >= md->end_subject)
4690 {
4691 SCHECK_PARTIAL();
4692 RRETURN(MATCH_NOMATCH);
4693 }
4694 switch(*eptr++)
4695 {
4696 VSPACE_BYTE_CASES:
4697 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4698 VSPACE_MULTIBYTE_CASES:
4699 #endif
4700 RRETURN(MATCH_NOMATCH);
4701 default: break;
4702 }
4703 }
4704 break;
4705
4706 case OP_VSPACE:
4707 for (i = 1; i <= min; i++)
4708 {
4709 if (eptr >= md->end_subject)
4710 {
4711 SCHECK_PARTIAL();
4712 RRETURN(MATCH_NOMATCH);
4713 }
4714 switch(*eptr++)
4715 {
4716 default: RRETURN(MATCH_NOMATCH);
4717 VSPACE_BYTE_CASES:
4718 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4719 VSPACE_MULTIBYTE_CASES:
4720 #endif
4721 break;
4722 }
4723 }
4724 break;
4725
4726 case OP_NOT_DIGIT:
4727 for (i = 1; i <= min; i++)
4728 {
4729 if (eptr >= md->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4733 }
4734 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4735 RRETURN(MATCH_NOMATCH);
4736 eptr++;
4737 }
4738 break;
4739
4740 case OP_DIGIT:
4741 for (i = 1; i <= min; i++)
4742 {
4743 if (eptr >= md->end_subject)
4744 {
4745 SCHECK_PARTIAL();
4746 RRETURN(MATCH_NOMATCH);
4747 }
4748 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4749 RRETURN(MATCH_NOMATCH);
4750 eptr++;
4751 }
4752 break;
4753
4754 case OP_NOT_WHITESPACE:
4755 for (i = 1; i <= min; i++)
4756 {
4757 if (eptr >= md->end_subject)
4758 {
4759 SCHECK_PARTIAL();
4760 RRETURN(MATCH_NOMATCH);
4761 }
4762 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4763 RRETURN(MATCH_NOMATCH);
4764 eptr++;
4765 }
4766 break;
4767
4768 case OP_WHITESPACE:
4769 for (i = 1; i <= min; i++)
4770 {
4771 if (eptr >= md->end_subject)
4772 {
4773 SCHECK_PARTIAL();
4774 RRETURN(MATCH_NOMATCH);
4775 }
4776 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4777 RRETURN(MATCH_NOMATCH);
4778 eptr++;
4779 }
4780 break;
4781
4782 case OP_NOT_WORDCHAR:
4783 for (i = 1; i <= min; i++)
4784 {
4785 if (eptr >= md->end_subject)
4786 {
4787 SCHECK_PARTIAL();
4788 RRETURN(MATCH_NOMATCH);
4789 }
4790 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4791 RRETURN(MATCH_NOMATCH);
4792 eptr++;
4793 }
4794 break;
4795
4796 case OP_WORDCHAR:
4797 for (i = 1; i <= min; i++)
4798 {
4799 if (eptr >= md->end_subject)
4800 {
4801 SCHECK_PARTIAL();
4802 RRETURN(MATCH_NOMATCH);
4803 }
4804 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4805 RRETURN(MATCH_NOMATCH);
4806 eptr++;
4807 }
4808 break;
4809
4810 default:
4811 RRETURN(PCRE_ERROR_INTERNAL);
4812 }
4813 }
4814
4815 /* If min = max, continue at the same level without recursing */
4816
4817 if (min == max) continue;
4818
4819 /* If minimizing, we have to test the rest of the pattern before each
4820 subsequent match. Again, separate the UTF-8 case for speed, and also
4821 separate the UCP cases. */
4822
4823 if (minimize)
4824 {
4825 #ifdef SUPPORT_UCP
4826 if (prop_type >= 0)
4827 {
4828 switch(prop_type)
4829 {
4830 case PT_ANY:
4831 for (fi = min;; fi++)
4832 {
4833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4835 if (fi >= max) RRETURN(MATCH_NOMATCH);
4836 if (eptr >= md->end_subject)
4837 {
4838 SCHECK_PARTIAL();
4839 RRETURN(MATCH_NOMATCH);
4840 }
4841 GETCHARINCTEST(c, eptr);
4842 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4843 }
4844 /* Control never gets here */
4845
4846 case PT_LAMP:
4847 for (fi = min;; fi++)
4848 {
4849 int chartype;
4850 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4852 if (fi >= max) RRETURN(MATCH_NOMATCH);
4853 if (eptr >= md->end_subject)
4854 {
4855 SCHECK_PARTIAL();
4856 RRETURN(MATCH_NOMATCH);
4857 }
4858 GETCHARINCTEST(c, eptr);
4859 chartype = UCD_CHARTYPE(c);
4860 if ((chartype == ucp_Lu ||
4861 chartype == ucp_Ll ||
4862 chartype == ucp_Lt) == prop_fail_result)
4863 RRETURN(MATCH_NOMATCH);
4864 }
4865 /* Control never gets here */
4866
4867 case PT_GC:
4868 for (fi = min;; fi++)
4869 {
4870 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4872 if (fi >= max) RRETURN(MATCH_NOMATCH);
4873 if (eptr >= md->end_subject)
4874 {
4875 SCHECK_PARTIAL();
4876 RRETURN(MATCH_NOMATCH);
4877 }
4878 GETCHARINCTEST(c, eptr);
4879 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4880 RRETURN(MATCH_NOMATCH);
4881 }
4882 /* Control never gets here */
4883
4884 case PT_PC:
4885 for (fi = min;; fi++)
4886 {
4887 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4889 if (fi >= max) RRETURN(MATCH_NOMATCH);
4890 if (eptr >= md->end_subject)
4891 {
4892 SCHECK_PARTIAL();
4893 RRETURN(MATCH_NOMATCH);
4894 }
4895 GETCHARINCTEST(c, eptr);
4896 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4897 RRETURN(MATCH_NOMATCH);
4898 }
4899 /* Control never gets here */
4900
4901 case PT_SC:
4902 for (fi = min;; fi++)
4903 {
4904 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4906 if (fi >= max) RRETURN(MATCH_NOMATCH);
4907 if (eptr >= md->end_subject)
4908 {
4909 SCHECK_PARTIAL();
4910 RRETURN(MATCH_NOMATCH);
4911 }
4912 GETCHARINCTEST(c, eptr);
4913 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4914 RRETURN(MATCH_NOMATCH);
4915 }
4916 /* Control never gets here */
4917
4918 case PT_ALNUM:
4919 for (fi = min;; fi++)
4920 {
4921 int category;
4922 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4923 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4924 if (fi >= max) RRETURN(MATCH_NOMATCH);
4925 if (eptr >= md->end_subject)
4926 {
4927 SCHECK_PARTIAL();
4928 RRETURN(MATCH_NOMATCH);
4929 }
4930 GETCHARINCTEST(c, eptr);
4931 category = UCD_CATEGORY(c);
4932 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4933 RRETURN(MATCH_NOMATCH);
4934 }
4935 /* Control never gets here */
4936
4937 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4938 which means that Perl space and POSIX space are now identical. PCRE
4939 was changed at release 8.34. */
4940
4941 case PT_SPACE: /* Perl space */
4942 case PT_PXSPACE: /* POSIX space */
4943 for (fi = min;; fi++)
4944 {
4945 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4947 if (fi >= max) RRETURN(MATCH_NOMATCH);
4948 if (eptr >= md->end_subject)
4949 {
4950 SCHECK_PARTIAL();
4951 RRETURN(MATCH_NOMATCH);
4952 }
4953 GETCHARINCTEST(c, eptr);
4954 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4955 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4956 == prop_fail_result)
4957 RRETURN(MATCH_NOMATCH);
4958 }
4959 /* Control never gets here */
4960
4961 case PT_WORD:
4962 for (fi = min;; fi++)
4963 {
4964 int category;
4965 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4967 if (fi >= max) RRETURN(MATCH_NOMATCH);
4968 if (eptr >= md->end_subject)
4969 {
4970 SCHECK_PARTIAL();
4971 RRETURN(MATCH_NOMATCH);
4972 }
4973 GETCHARINCTEST(c, eptr);
4974 category = UCD_CATEGORY(c);
4975 if ((category == ucp_L ||
4976 category == ucp_N ||
4977 c == CHAR_UNDERSCORE)
4978 == prop_fail_result)
4979 RRETURN(MATCH_NOMATCH);
4980 }
4981 /* Control never gets here */
4982
4983 case PT_CLIST:
4984 for (fi = min;; fi++)
4985 {
4986 const pcre_uint32 *cp;
4987 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
4988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4989 if (fi >= max) RRETURN(MATCH_NOMATCH);
4990 if (eptr >= md->end_subject)
4991 {
4992 SCHECK_PARTIAL();
4993 RRETURN(MATCH_NOMATCH);
4994 }
4995 GETCHARINCTEST(c, eptr);
4996 cp = PRIV(ucd_caseless_sets) + prop_value;
4997 for (;;)
4998 {
4999 if (c < *cp)
5000 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5001 if (c == *cp++)
5002 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5003 }
5004 }
5005 /* Control never gets here */
5006
5007 case PT_UCNC:
5008 for (fi = min;; fi++)
5009 {
5010 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5011 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5012 if (fi >= max) RRETURN(MATCH_NOMATCH);
5013 if (eptr >= md->end_subject)
5014 {
5015 SCHECK_PARTIAL();
5016 RRETURN(MATCH_NOMATCH);
5017 }
5018 GETCHARINCTEST(c, eptr);
5019 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5020 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5021 c >= 0xe000) == prop_fail_result)
5022 RRETURN(MATCH_NOMATCH);
5023 }
5024 /* Control never gets here */
5025
5026 /* This should never occur */
5027 default:
5028 RRETURN(PCRE_ERROR_INTERNAL);
5029 }
5030 }
5031
5032 /* Match extended Unicode sequences. We will get here only if the
5033 support is in the binary; otherwise a compile-time error occurs. */
5034
5035 else if (ctype == OP_EXTUNI)
5036 {
5037 for (fi = min;; fi++)
5038 {
5039 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5041 if (fi >= max) RRETURN(MATCH_NOMATCH);
5042 if (eptr >= md->end_subject)
5043 {
5044 SCHECK_PARTIAL();
5045 RRETURN(MATCH_NOMATCH);
5046 }
5047 else
5048 {
5049 int lgb, rgb;
5050 GETCHARINCTEST(c, eptr);
5051 lgb = UCD_GRAPHBREAK(c);
5052 while (eptr < md->end_subject)
5053 {
5054 int len = 1;
5055 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5056 rgb = UCD_GRAPHBREAK(c);
5057 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5058 lgb = rgb;
5059 eptr += len;
5060 }
5061 }
5062 CHECK_PARTIAL();
5063 }
5064 }
5065 else
5066 #endif /* SUPPORT_UCP */
5067
5068 #ifdef SUPPORT_UTF
5069 if (utf)
5070 {
5071 for (fi = min;; fi++)
5072 {
5073 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5075 if (fi >= max) RRETURN(MATCH_NOMATCH);
5076 if (eptr >= md->end_subject)
5077 {
5078 SCHECK_PARTIAL();
5079 RRETURN(MATCH_NOMATCH);
5080 }
5081 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5082 RRETURN(MATCH_NOMATCH);
5083 GETCHARINC(c, eptr);
5084 switch(ctype)
5085 {
5086 case OP_ANY: /* This is the non-NL case */
5087 if (md->partial != 0 && /* Take care with CRLF partial */
5088 eptr >= md->end_subject &&
5089 NLBLOCK->nltype == NLTYPE_FIXED &&
5090 NLBLOCK->nllen == 2 &&
5091 c == NLBLOCK->nl[0])
5092 {
5093 md->hitend = TRUE;
5094 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5095 }
5096 break;
5097
5098 case OP_ALLANY:
5099 case OP_ANYBYTE:
5100 break;
5101
5102 case OP_ANYNL:
5103 switch(c)
5104 {
5105 default: RRETURN(MATCH_NOMATCH);
5106 case CHAR_CR:
5107 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5108 break;
5109
5110 case CHAR_LF:
5111 break;
5112
5113 case CHAR_VT:
5114 case CHAR_FF:
5115 case CHAR_NEL:
5116 #ifndef EBCDIC
5117 case 0x2028:
5118 case 0x2029:
5119 #endif /* Not EBCDIC */
5120 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5121 break;
5122 }
5123 break;
5124
5125 case OP_NOT_HSPACE:
5126 switch(c)
5127 {
5128 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5129 default: break;
5130 }
5131 break;
5132
5133 case OP_HSPACE:
5134 switch(c)
5135 {
5136 HSPACE_CASES: break;
5137 default: RRETURN(MATCH_NOMATCH);
5138 }
5139 break;
5140
5141 case OP_NOT_VSPACE:
5142 switch(c)
5143 {
5144 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5145 default: break;
5146 }
5147 break;
5148
5149 case OP_VSPACE:
5150 switch(c)
5151 {
5152 VSPACE_CASES: break;
5153 default: RRETURN(MATCH_NOMATCH);
5154 }
5155 break;
5156
5157 case OP_NOT_DIGIT:
5158 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5159 RRETURN(MATCH_NOMATCH);
5160 break;
5161
5162 case OP_DIGIT:
5163 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5164 RRETURN(MATCH_NOMATCH);
5165 break;
5166
5167 case OP_NOT_WHITESPACE:
5168 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5169 RRETURN(MATCH_NOMATCH);
5170 break;
5171
5172 case OP_WHITESPACE:
5173 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5174 RRETURN(MATCH_NOMATCH);
5175 break;
5176
5177 case OP_NOT_WORDCHAR:
5178 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5179 RRETURN(MATCH_NOMATCH);
5180 break;
5181
5182 case OP_WORDCHAR:
5183 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5184 RRETURN(MATCH_NOMATCH);
5185 break;
5186
5187 default:
5188 RRETURN(PCRE_ERROR_INTERNAL);
5189 }
5190 }
5191 }
5192 else
5193 #endif
5194 /* Not UTF mode */
5195 {
5196 for (fi = min;; fi++)
5197 {
5198 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5199 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5200 if (fi >= max) RRETURN(MATCH_NOMATCH);
5201 if (eptr >= md->end_subject)
5202 {
5203 SCHECK_PARTIAL();
5204 RRETURN(MATCH_NOMATCH);
5205 }
5206 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5207 RRETURN(MATCH_NOMATCH);
5208 c = *eptr++;
5209 switch(ctype)
5210 {
5211 case OP_ANY: /* This is the non-NL case */
5212 if (md->partial != 0 && /* Take care with CRLF partial */
5213 eptr >= md->end_subject &&
5214 NLBLOCK->nltype == NLTYPE_FIXED &&
5215 NLBLOCK->nllen == 2 &&
5216 c == NLBLOCK->nl[0])
5217 {
5218 md->hitend = TRUE;
5219 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5220 }
5221 break;
5222
5223 case OP_ALLANY:
5224 case OP_ANYBYTE:
5225 break;
5226
5227 case OP_ANYNL:
5228 switch(c)
5229 {
5230 default: RRETURN(MATCH_NOMATCH);
5231 case CHAR_CR:
5232 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5233 break;
5234
5235 case CHAR_LF:
5236 break;
5237
5238 case CHAR_VT:
5239 case CHAR_FF:
5240 case CHAR_NEL:
5241 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5242 case 0x2028:
5243 case 0x2029:
5244 #endif
5245 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5246 break;
5247 }
5248 break;
5249
5250 case OP_NOT_HSPACE:
5251 switch(c)
5252 {
5253 default: break;
5254 HSPACE_BYTE_CASES:
5255 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5256 HSPACE_MULTIBYTE_CASES:
5257 #endif
5258 RRETURN(MATCH_NOMATCH);
5259 }
5260 break;
5261
5262 case OP_HSPACE:
5263 switch(c)
5264 {
5265 default: RRETURN(MATCH_NOMATCH);
5266 HSPACE_BYTE_CASES:
5267 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5268 HSPACE_MULTIBYTE_CASES:
5269 #endif
5270 break;
5271 }
5272 break;
5273
5274 case OP_NOT_VSPACE:
5275 switch(c)
5276 {
5277 default: break;
5278 VSPACE_BYTE_CASES:
5279 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5280 VSPACE_MULTIBYTE_CASES:
5281 #endif
5282 RRETURN(MATCH_NOMATCH);
5283 }
5284 break;
5285
5286 case OP_VSPACE:
5287 switch(c)
5288 {
5289 default: RRETURN(MATCH_NOMATCH);
5290 VSPACE_BYTE_CASES:
5291 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5292 VSPACE_MULTIBYTE_CASES:
5293 #endif
5294 break;
5295 }
5296 break;
5297
5298 case OP_NOT_DIGIT:
5299 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5300 break;
5301
5302 case OP_DIGIT:
5303 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5304 break;
5305
5306 case OP_NOT_WHITESPACE:
5307 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5308 break;
5309
5310 case OP_WHITESPACE:
5311 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5312 break;
5313
5314 case OP_NOT_WORDCHAR:
5315 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5316 break;
5317
5318 case OP_WORDCHAR:
5319 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5320 break;
5321
5322 default:
5323 RRETURN(PCRE_ERROR_INTERNAL);
5324 }
5325 }
5326 }
5327 /* Control never gets here */
5328 }
5329
5330 /* If maximizing, it is worth using inline code for speed, doing the type
5331 test once at the start (i.e. keep it out of the loop). Again, keep the
5332 UTF-8 and UCP stuff separate. */
5333
5334 else
5335 {
5336 pp = eptr; /* Remember where we started */
5337
5338 #ifdef SUPPORT_UCP
5339 if (prop_type >= 0)
5340 {
5341 switch(prop_type)
5342 {
5343 case PT_ANY:
5344 for (i = min; i < max; i++)
5345 {
5346 int len = 1;
5347 if (eptr >= md->end_subject)
5348 {
5349 SCHECK_PARTIAL();
5350 break;
5351 }
5352 GETCHARLENTEST(c, eptr, len);
5353 if (prop_fail_result) break;
5354 eptr+= len;
5355 }
5356 break;
5357
5358 case PT_LAMP:
5359 for (i = min; i < max; i++)
5360 {
5361 int chartype;
5362 int len = 1;
5363 if (eptr >= md->end_subject)
5364 {
5365 SCHECK_PARTIAL();
5366 break;
5367 }
5368 GETCHARLENTEST(c, eptr, len);
5369 chartype = UCD_CHARTYPE(c);
5370 if ((chartype == ucp_Lu ||
5371 chartype == ucp_Ll ||
5372 chartype == ucp_Lt) == prop_fail_result)
5373 break;
5374 eptr+= len;
5375 }
5376 break;
5377
5378 case PT_GC:
5379 for (i = min; i < max; i++)
5380 {
5381 int len = 1;
5382 if (eptr >= md->end_subject)
5383 {
5384 SCHECK_PARTIAL();
5385 break;
5386 }
5387 GETCHARLENTEST(c, eptr, len);
5388 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5389 eptr+= len;
5390 }
5391 break;
5392
5393 case PT_PC:
5394 for (i = min; i < max; i++)
5395 {
5396 int len = 1;
5397 if (eptr >= md->end_subject)
5398 {
5399 SCHECK_PARTIAL();
5400 break;
5401 }
5402 GETCHARLENTEST(c, eptr, len);
5403 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5404 eptr+= len;
5405 }
5406 break;
5407
5408 case PT_SC:
5409 for (i = min; i < max; i++)
5410 {
5411 int len = 1;
5412 if (eptr >= md->end_subject)
5413 {
5414 SCHECK_PARTIAL();
5415 break;
5416 }
5417 GETCHARLENTEST(c, eptr, len);
5418 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5419 eptr+= len;
5420 }
5421 break;
5422
5423 case PT_ALNUM:
5424 for (i = min; i < max; i++)
5425 {
5426 int category;
5427 int len = 1;
5428 if (eptr >= md->end_subject)
5429 {
5430 SCHECK_PARTIAL();
5431 break;
5432 }
5433 GETCHARLENTEST(c, eptr, len);
5434 category = UCD_CATEGORY(c);
5435 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5436 break;
5437 eptr+= len;
5438 }
5439 break;
5440
5441 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5442 which means that Perl space and POSIX space are now identical. PCRE
5443 was changed at release 8.34. */
5444
5445 case PT_SPACE: /* Perl space */
5446 case PT_PXSPACE: /* POSIX space */
5447 for (i = min; i < max; i++)
5448 {
5449 int len = 1;
5450 if (eptr >= md->end_subject)
5451 {
5452 SCHECK_PARTIAL();
5453 break;
5454 }
5455 GETCHARLENTEST(c, eptr, len);
5456 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5457 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5458 == prop_fail_result)
5459 break;
5460 eptr+= len;
5461 }
5462 break;
5463
5464 case PT_WORD:
5465 for (i = min; i < max; i++)
5466 {
5467 int category;
5468 int len = 1;
5469 if (eptr >= md->end_subject)
5470 {
5471 SCHECK_PARTIAL();
5472 break;
5473 }
5474 GETCHARLENTEST(c, eptr, len);
5475 category = UCD_CATEGORY(c);
5476 if ((category == ucp_L || category == ucp_N ||
5477 c == CHAR_UNDERSCORE) == prop_fail_result)
5478 break;
5479 eptr+= len;
5480 }
5481 break;
5482
5483 case PT_CLIST:
5484 for (i = min; i < max; i++)
5485 {
5486 const pcre_uint32 *cp;
5487 int len = 1;
5488 if (eptr >= md->end_subject)
5489 {
5490 SCHECK_PARTIAL();
5491 break;
5492 }
5493 GETCHARLENTEST(c, eptr, len);
5494 cp = PRIV(ucd_caseless_sets) + prop_value;
5495 for (;;)
5496 {
5497 if (c < *cp)
5498 { if (prop_fail_result) break; else goto GOT_MAX; }
5499 if (c == *cp++)
5500 { if (prop_fail_result) goto GOT_MAX; else break; }
5501 }
5502 eptr += len;
5503 }
5504 GOT_MAX:
5505 break;
5506
5507 case PT_UCNC:
5508 for (i = min; i < max; i++)
5509 {
5510 int len = 1;
5511 if (eptr >= md->end_subject)
5512 {
5513 SCHECK_PARTIAL();
5514 break;
5515 }
5516 GETCHARLENTEST(c, eptr, len);
5517 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5518 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5519 c >= 0xe000) == prop_fail_result)
5520 break;
5521 eptr += len;
5522 }
5523 break;
5524
5525 default:
5526 RRETURN(PCRE_ERROR_INTERNAL);
5527 }
5528
5529 /* eptr is now past the end of the maximum run */
5530
5531 if (possessive) continue; /* No backtracking */
5532 for(;;)
5533 {
5534 if (eptr == pp) goto TAIL_RECURSE;
5535 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5536 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5537 eptr--;
5538 if (utf) BACKCHAR(eptr);
5539 }
5540 }
5541
5542 /* Match extended Unicode grapheme clusters. We will get here only if the
5543 support is in the binary; otherwise a compile-time error occurs. */
5544
5545 else if (ctype == OP_EXTUNI)
5546 {
5547 for (i = min; i < max; i++)
5548 {
5549 if (eptr >= md->end_subject)
5550 {
5551 SCHECK_PARTIAL();
5552 break;
5553 }
5554 else
5555 {
5556 int lgb, rgb;
5557 GETCHARINCTEST(c, eptr);
5558 lgb = UCD_GRAPHBREAK(c);
5559 while (eptr < md->end_subject)
5560 {
5561 int len = 1;
5562 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5563 rgb = UCD_GRAPHBREAK(c);
5564 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5565 lgb = rgb;
5566 eptr += len;
5567 }
5568 }
5569 CHECK_PARTIAL();
5570 }
5571
5572 /* eptr is now past the end of the maximum run */
5573
5574 if (possessive) continue; /* No backtracking */
5575
5576 for(;;)
5577 {
5578 int lgb, rgb;
5579 PCRE_PUCHAR fptr;
5580
5581 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5582 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5583 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5584
5585 /* Backtracking over an extended grapheme cluster involves inspecting
5586 the previous two characters (if present) to see if a break is
5587 permitted between them. */
5588
5589 eptr--;
5590 if (!utf) c = *eptr; else
5591 {
5592 BACKCHAR(eptr);
5593 GETCHAR(c, eptr);
5594 }
5595 rgb = UCD_GRAPHBREAK(c);
5596
5597 for (;;)
5598 {
5599 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5600 fptr = eptr - 1;
5601 if (!utf) c = *fptr; else
5602 {
5603 BACKCHAR(fptr);
5604 GETCHAR(c, fptr);
5605 }
5606 lgb = UCD_GRAPHBREAK(c);
5607 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5608 eptr = fptr;
5609 rgb = lgb;
5610 }
5611 }
5612 }
5613
5614 else
5615 #endif /* SUPPORT_UCP */
5616
5617 #ifdef SUPPORT_UTF
5618 if (utf)
5619 {
5620 switch(ctype)
5621 {
5622 case OP_ANY:
5623 if (max < INT_MAX)
5624 {
5625 for (i = min; i < max; i++)
5626 {
5627 if (eptr >= md->end_subject)
5628 {
5629 SCHECK_PARTIAL();
5630 break;
5631 }
5632 if (IS_NEWLINE(eptr)) break;
5633 if (md->partial != 0 && /* Take care with CRLF partial */
5634 eptr + 1 >= md->end_subject &&
5635 NLBLOCK->nltype == NLTYPE_FIXED &&
5636 NLBLOCK->nllen == 2 &&
5637 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5638 {
5639 md->hitend = TRUE;
5640 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5641 }
5642 eptr++;
5643 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5644 }
5645 }
5646
5647 /* Handle unlimited UTF-8 repeat */
5648
5649 else
5650 {
5651 for (i = min; i < max; i++)
5652 {
5653 if (eptr >= md->end_subject)
5654 {
5655 SCHECK_PARTIAL();
5656 break;
5657 }
5658 if (IS_NEWLINE(eptr)) break;
5659 if (md->partial != 0 && /* Take care with CRLF partial */
5660 eptr + 1 >= md->end_subject &&
5661 NLBLOCK->nltype == NLTYPE_FIXED &&
5662 NLBLOCK->nllen == 2 &&
5663 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5664 {
5665 md->hitend = TRUE;
5666 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5667 }
5668 eptr++;
5669 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5670 }
5671 }
5672 break;
5673
5674 case OP_ALLANY:
5675 if (max < INT_MAX)
5676 {
5677 for (i = min; i < max; i++)
5678 {
5679 if (eptr >= md->end_subject)
5680 {
5681 SCHECK_PARTIAL();
5682 break;
5683 }
5684 eptr++;
5685 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5686 }
5687 }
5688 else
5689 {
5690 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5691 SCHECK_PARTIAL();
5692 }
5693 break;
5694
5695 /* The byte case is the same as non-UTF8 */
5696
5697 case OP_ANYBYTE:
5698 c = max - min;
5699 if (c > (unsigned int)(md->end_subject - eptr))
5700 {
5701 eptr = md->end_subject;
5702 SCHECK_PARTIAL();
5703 }
5704 else eptr += c;
5705 break;
5706
5707 case OP_ANYNL:
5708 for (i = min; i < max; i++)
5709 {
5710 int len = 1;
5711 if (eptr >= md->end_subject)
5712 {
5713 SCHECK_PARTIAL();
5714 break;
5715 }
5716 GETCHARLEN(c, eptr, len);
5717 if (c == CHAR_CR)
5718 {
5719 if (++eptr >= md->end_subject) break;
5720 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5721 }
5722 else
5723 {
5724 if (c != CHAR_LF &&
5725 (md->bsr_anycrlf ||
5726 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5727 #ifndef EBCDIC
5728 && c != 0x2028 && c != 0x2029
5729 #endif /* Not EBCDIC */
5730 )))
5731 break;
5732 eptr += len;
5733 }
5734 }
5735 break;
5736
5737 case OP_NOT_HSPACE:
5738 case OP_HSPACE:
5739 for (i = min; i < max; i++)
5740 {
5741 BOOL gotspace;
5742 int len = 1;
5743 if (eptr >= md->end_subject)
5744 {
5745 SCHECK_PARTIAL();
5746 break;
5747 }
5748 GETCHARLEN(c, eptr, len);
5749 switch(c)
5750 {
5751 HSPACE_CASES: gotspace = TRUE; break;
5752 default: gotspace = FALSE; break;
5753 }
5754 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5755 eptr += len;
5756 }
5757 break;
5758
5759 case OP_NOT_VSPACE:
5760 case OP_VSPACE:
5761 for (i = min; i < max; i++)
5762 {
5763 BOOL gotspace;
5764 int len = 1;
5765 if (eptr >= md->end_subject)
5766 {
5767 SCHECK_PARTIAL();
5768 break;
5769 }
5770 GETCHARLEN(c, eptr, len);
5771 switch(c)
5772 {
5773 VSPACE_CASES: gotspace = TRUE; break;
5774 default: gotspace = FALSE; break;
5775 }
5776 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5777 eptr += len;
5778 }
5779 break;
5780
5781 case OP_NOT_DIGIT:
5782 for (i = min; i < max; i++)
5783 {
5784 int len = 1;
5785 if (eptr >= md->end_subject)
5786 {
5787 SCHECK_PARTIAL();
5788 break;
5789 }
5790 GETCHARLEN(c, eptr, len);
5791 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5792 eptr+= len;
5793 }
5794 break;
5795
5796 case OP_DIGIT:
5797 for (i = min; i < max; i++)
5798 {
5799 int len = 1;
5800 if (eptr >= md->end_subject)
5801 {
5802 SCHECK_PARTIAL();
5803 break;
5804 }
5805 GETCHARLEN(c, eptr, len);
5806 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5807 eptr+= len;
5808 }
5809 break;
5810
5811 case OP_NOT_WHITESPACE:
5812 for (i = min; i < max; i++)
5813 {
5814 int len = 1;
5815 if (eptr >= md->end_subject)
5816 {
5817 SCHECK_PARTIAL();
5818 break;
5819 }
5820 GETCHARLEN(c, eptr, len);
5821 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5822 eptr+= len;
5823 }
5824 break;
5825
5826 case OP_WHITESPACE:
5827 for (i = min; i < max; i++)
5828 {
5829 int len = 1;
5830 if (eptr >= md->end_subject)
5831 {
5832 SCHECK_PARTIAL();
5833 break;
5834 }
5835 GETCHARLEN(c, eptr, len);
5836 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5837 eptr+= len;
5838 }
5839 break;
5840
5841 case OP_NOT_WORDCHAR:
5842 for (i = min; i < max; i++)
5843 {
5844 int len = 1;
5845 if (eptr >= md->end_subject)
5846 {
5847 SCHECK_PARTIAL();
5848 break;
5849 }
5850 GETCHARLEN(c, eptr, len);
5851 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5852 eptr+= len;
5853 }
5854 break;
5855
5856 case OP_WORDCHAR:
5857 for (i = min; i < max; i++)
5858 {
5859 int len = 1;
5860 if (eptr >= md->end_subject)
5861 {
5862 SCHECK_PARTIAL();
5863 break;
5864 }
5865 GETCHARLEN(c, eptr, len);
5866 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5867 eptr+= len;
5868 }
5869 break;
5870
5871 default:
5872 RRETURN(PCRE_ERROR_INTERNAL);
5873 }
5874
5875 if (possessive) continue; /* No backtracking */
5876 for(;;)
5877 {
5878 if (eptr == pp) goto TAIL_RECURSE;
5879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5881 eptr--;
5882 BACKCHAR(eptr);
5883 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5884 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5885 }
5886 }
5887 else
5888 #endif /* SUPPORT_UTF */
5889 /* Not UTF mode */
5890 {
5891 switch(ctype)
5892 {
5893 case OP_ANY:
5894 for (i = min; i < max; i++)
5895 {
5896 if (eptr >= md->end_subject)
5897 {
5898 SCHECK_PARTIAL();
5899 break;
5900 }
5901 if (IS_NEWLINE(eptr)) break;
5902 if (md->partial != 0 && /* Take care with CRLF partial */
5903 eptr + 1 >= md->end_subject &&
5904 NLBLOCK->nltype == NLTYPE_FIXED &&
5905 NLBLOCK->nllen == 2 &&
5906 *eptr == NLBLOCK->nl[0])
5907 {
5908 md->hitend = TRUE;
5909 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5910 }
5911 eptr++;
5912 }
5913 break;
5914
5915 case OP_ALLANY:
5916 case OP_ANYBYTE:
5917 c = max - min;
5918 if (c > (unsigned int)(md->end_subject - eptr))
5919 {
5920 eptr = md->end_subject;
5921 SCHECK_PARTIAL();
5922 }
5923 else eptr += c;
5924 break;
5925
5926 case OP_ANYNL:
5927 for (i = min; i < max; i++)
5928 {
5929 if (eptr >= md->end_subject)
5930 {
5931 SCHECK_PARTIAL();
5932 break;
5933 }
5934 c = *eptr;
5935 if (c == CHAR_CR)
5936 {
5937 if (++eptr >= md->end_subject) break;
5938 if (*eptr == CHAR_LF) eptr++;
5939 }
5940 else
5941 {
5942 if (c != CHAR_LF && (md->bsr_anycrlf ||
5943 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5944 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5945 && c != 0x2028 && c != 0x2029
5946 #endif
5947 ))) break;
5948 eptr++;
5949 }
5950 }
5951 break;
5952
5953 case OP_NOT_HSPACE:
5954 for (i = min; i < max; i++)
5955 {
5956 if (eptr >= md->end_subject)
5957 {
5958 SCHECK_PARTIAL();
5959 break;
5960 }
5961 switch(*eptr)
5962 {
5963 default: eptr++; break;
5964 HSPACE_BYTE_CASES:
5965 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5966 HSPACE_MULTIBYTE_CASES:
5967 #endif
5968 goto ENDLOOP00;
5969 }
5970 }
5971 ENDLOOP00:
5972 break;
5973
5974 case OP_HSPACE:
5975 for (i = min; i < max; i++)
5976 {
5977 if (eptr >= md->end_subject)
5978 {
5979 SCHECK_PARTIAL();
5980 break;
5981 }
5982 switch(*eptr)
5983 {
5984 default: goto ENDLOOP01;
5985 HSPACE_BYTE_CASES:
5986 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5987 HSPACE_MULTIBYTE_CASES:
5988 #endif
5989 eptr++; break;
5990 }
5991 }
5992 ENDLOOP01:
5993 break;
5994
5995 case OP_NOT_VSPACE:
5996 for (i = min; i < max; i++)
5997 {
5998 if (eptr >= md->end_subject)
5999 {
6000 SCHECK_PARTIAL();
6001 break;
6002 }
6003 switch(*eptr)
6004 {
6005 default: eptr++; break;
6006 VSPACE_BYTE_CASES:
6007 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6008 VSPACE_MULTIBYTE_CASES:
6009 #endif
6010 goto ENDLOOP02;
6011 }
6012 }
6013 ENDLOOP02:
6014 break;
6015
6016 case OP_VSPACE:
6017 for (i = min; i < max; i++)
6018 {
6019 if (eptr >= md->end_subject)
6020 {
6021 SCHECK_PARTIAL();
6022 break;
6023 }
6024 switch(*eptr)
6025 {
6026 default: goto ENDLOOP03;
6027 VSPACE_BYTE_CASES:
6028 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6029 VSPACE_MULTIBYTE_CASES:
6030 #endif
6031 eptr++; break;
6032 }
6033 }
6034 ENDLOOP03:
6035 break;
6036
6037 case OP_NOT_DIGIT:
6038 for (i = min; i < max; i++)
6039 {
6040 if (eptr >= md->end_subject)
6041 {
6042 SCHECK_PARTIAL();
6043 break;
6044 }
6045 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6046 eptr++;
6047 }
6048 break;
6049
6050 case OP_DIGIT:
6051 for (i = min; i < max; i++)
6052 {
6053 if (eptr >= md->end_subject)
6054 {
6055 SCHECK_PARTIAL();
6056 break;
6057 }
6058 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6059 eptr++;
6060 }
6061 break;
6062
6063 case OP_NOT_WHITESPACE:
6064 for (i = min; i < max; i++)
6065 {
6066 if (eptr >= md->end_subject)
6067 {
6068 SCHECK_PARTIAL();
6069 break;
6070 }
6071 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6072 eptr++;
6073 }
6074 break;
6075
6076 case OP_WHITESPACE:
6077 for (i = min; i < max; i++)
6078 {
6079 if (eptr >= md->end_subject)
6080 {
6081 SCHECK_PARTIAL();
6082 break;
6083 }
6084 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6085 eptr++;
6086 }
6087 break;
6088
6089 case OP_NOT_WORDCHAR:
6090 for (i = min; i < max; i++)
6091 {
6092 if (eptr >= md->end_subject)
6093 {
6094 SCHECK_PARTIAL();
6095 break;
6096 }
6097 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6098 eptr++;
6099 }
6100 break;
6101
6102 case OP_WORDCHAR:
6103 for (i = min; i < max; i++)
6104 {
6105 if (eptr >= md->end_subject)
6106 {
6107 SCHECK_PARTIAL();
6108 break;
6109 }
6110 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6111 eptr++;
6112 }
6113 break;
6114
6115 default:
6116 RRETURN(PCRE_ERROR_INTERNAL);
6117 }
6118
6119 if (possessive) continue; /* No backtracking */
6120 for (;;)
6121 {
6122 if (eptr == pp) goto TAIL_RECURSE;
6123 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6124 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6125 eptr--;
6126 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6127 eptr[-1] == CHAR_CR) eptr--;
6128 }
6129 }
6130
6131 /* Control never gets here */
6132 }
6133
6134 /* There's been some horrible disaster. Arrival here can only mean there is
6135 something seriously wrong in the code above or the OP_xxx definitions. */
6136
6137 default:
6138 DPRINTF(("Unknown opcode %d\n", *ecode));
6139 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6140 }
6141
6142 /* Do not stick any code in here without much thought; it is assumed
6143 that "continue" in the code above comes out to here to repeat the main
6144 loop. */
6145
6146 } /* End of main loop */
6147 /* Control never reaches here */
6148
6149
6150 /* When compiling to use the heap rather than the stack for recursive calls to
6151 match(), the RRETURN() macro jumps here. The number that is saved in
6152 frame->Xwhere indicates which label we actually want to return to. */
6153
6154 #ifdef NO_RECURSE
6155 #define LBL(val) case val: goto L_RM##val;
6156 HEAP_RETURN:
6157 switch (frame->Xwhere)
6158 {
6159 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6160 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6161 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6162 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6163 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6164 LBL(65) LBL(66)
6165 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6166 LBL(21)
6167 #endif
6168 #ifdef SUPPORT_UTF
6169 LBL(16) LBL(18) LBL(20)
6170 LBL(22) LBL(23) LBL(28) LBL(30)
6171 LBL(32) LBL(34) LBL(42) LBL(46)
6172 #ifdef SUPPORT_UCP
6173 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6174 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6175 #endif /* SUPPORT_UCP */
6176 #endif /* SUPPORT_UTF */
6177 default:
6178 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6179 return PCRE_ERROR_INTERNAL;
6180 }
6181 #undef LBL
6182 #endif /* NO_RECURSE */
6183 }
6184
6185
6186 /***************************************************************************
6187 ****************************************************************************
6188 RECURSION IN THE match() FUNCTION
6189
6190 Undefine all the macros that were defined above to handle this. */
6191
6192 #ifdef NO_RECURSE /* The names below are undefined only in the NO_RECURSE build */
6193 #undef eptr
6194 #undef ecode
6195 #undef mstart
6196 #undef offset_top
6197 #undef eptrb
6198 #undef flags
6199
6200 #undef callpat
6201 #undef charptr
6202 #undef data
6203 #undef next
6204 #undef pp
6205 #undef prev
6206 #undef saved_eptr
6207
6208 #undef new_recursive
6209
6210 #undef cur_is_word
6211 #undef condition
6212 #undef prev_is_word
6213
6214 #undef ctype
6215 #undef length
6216 #undef max
6217 #undef min
6218 #undef number
6219 #undef offset
6220 #undef op
6221 #undef save_capture_last
6222 #undef save_offset1
6223 #undef save_offset2
6224 #undef save_offset3
6225 #undef stacksave
6226
6227 #undef newptrb
6228
6229 #endif /* NO_RECURSE */
6230
6231 /* These two are defined as macros in both cases */
6232
6233 #undef fc
6234 #undef fi
6235
6236 /***************************************************************************
6237 ***************************************************************************/
6238
6239
6240 #ifdef NO_RECURSE
6241 /*************************************************
6242 * Release allocated heap frames *
6243 *************************************************/
6244
6245 /* This function releases all the allocated frames. The base frame is on the
6246 machine stack, and so must not be freed.
6247
6248 Argument: the address of the base frame
6249 Returns: nothing
6250 */
6251
6252 static void
6253 release_match_heapframes (heapframe *frame_base)
6254 {
6255 heapframe *doomed, *following;
6256 /* Walk the chain that hangs off the base frame; the base itself is kept. */
6257 for (doomed = frame_base->Xnextframe; doomed != NULL; doomed = following)
6258   {
6259   following = doomed->Xnextframe;  /* Fetch the link before freeing */
6260   (PUBL(stack_free))(doomed);
6261   }
6262 }
6263 #endif
6264
6265
6266 /*************************************************
6267 * Execute a Regular Expression *
6268 *************************************************/
6269
6270 /* This function applies a compiled re to a subject string and picks out
6271 portions of the string if it matches. Two elements in the vector are set for
6272 each substring: the offsets to the start and end of the substring.
6273
6274 Arguments:
6275 argument_re points to the compiled expression
6276 extra_data points to extra data or is NULL
6277 subject points to the subject string
6278 length length of subject string (may contain binary zeros)
6279 start_offset where to start in the subject string
6280 options option bits
6281 offsets points to a vector of ints to be filled in with offsets
6282 offsetcount the number of elements in the vector
6283
6284 Returns: > 0 => success; value is the number of elements filled in
6285 = 0 => success, but offsets is not big enough
6286 -1 => failed to match
6287 < -1 => some kind of unexpected problem
6288 */
6289
6290 #if defined COMPILE_PCRE8
6291 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6292 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6293 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6294 int offsetcount)
6295 #elif defined COMPILE_PCRE16
6296 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6297 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6298 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6299 int offsetcount)
6300 #elif defined COMPILE_PCRE32
6301 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6302 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6303 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6304 int offsetcount)
6305 #endif
6306 {
6307 int rc, ocount, arg_offset_max;
6308 int newline;
6309 BOOL using_temporary_offsets = FALSE;
6310 BOOL anchored;
6311 BOOL startline;
6312 BOOL firstline;
6313 BOOL utf;
6314 BOOL has_first_char = FALSE;
6315 BOOL has_req_char = FALSE;
6316 pcre_uchar first_char = 0;
6317 pcre_uchar first_char2 = 0;
6318 pcre_uchar req_char = 0;
6319 pcre_uchar req_char2 = 0;
6320 match_data match_block;
6321 match_data *md = &match_block;
6322 const pcre_uint8 *tables;
6323 const pcre_uint8 *start_bits = NULL;
6324 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6325 PCRE_PUCHAR end_subject;
6326 PCRE_PUCHAR start_partial = NULL;
6327 PCRE_PUCHAR match_partial = NULL;
6328 PCRE_PUCHAR req_char_ptr = start_match - 1;
6329
6330 const pcre_study_data *study;
6331 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6332
6333 #ifdef NO_RECURSE
6334 heapframe frame_zero;
6335 frame_zero.Xprevframe = NULL; /* Marks the top level */
6336 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6337 md->match_frames_base = &frame_zero;
6338 #endif
6339
6340 /* Check for the special magic call that measures the size of the stack used
6341 per recursive call of match(). Without the funny casting for sizeof, a Windows
6342 compiler gave this error: "unary minus operator applied to unsigned type,
6343 result still unsigned". Hopefully the cast fixes that. */
6344
6345 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6346 start_offset == -999)
6347 #ifdef NO_RECURSE
6348 return -((int)sizeof(heapframe));
6349 #else
6350 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6351 #endif
6352
6353 /* Plausibility checks */
6354
6355 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6356 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6357 return PCRE_ERROR_NULL;
6358 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6359 if (length < 0) return PCRE_ERROR_BADLENGTH;
6360 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6361
6362 /* Check that the first field in the block is the magic number. If it is not,
6363 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6364 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6365 means that the pattern is likely compiled with different endianness. */
6366
6367 if (re->magic_number != MAGIC_NUMBER)
6368 return re->magic_number == REVERSED_MAGIC_NUMBER?
6369 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6370 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6371
6372 /* These two settings are used in the code for checking a UTF-8 string that
6373 follows immediately afterwards. Other values in the md block are used only
6374 during "normal" pcre_exec() processing, not when the JIT support is in use,
6375 so they are set up later. */
6376
6377 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6378 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6379 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6380 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6381
6382 /* Check a UTF-8 string if required. Pass back the character offset and error
6383 code for an invalid string if a results vector is available. */
6384
6385 #ifdef SUPPORT_UTF
6386 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6387 {
6388 int erroroffset;
6389 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6390 if (errorcode != 0)
6391 {
6392 if (offsetcount >= 2)
6393 {
6394 offsets[0] = erroroffset;
6395 offsets[1] = errorcode;
6396 }
6397 #if defined COMPILE_PCRE8
6398 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6399 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6400 #elif defined COMPILE_PCRE16
6401 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6402 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6403 #elif defined COMPILE_PCRE32
6404 return PCRE_ERROR_BADUTF32;
6405 #endif
6406 }
6407 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6408 /* Check that a start_offset points to the start of a UTF character. */
6409 if (start_offset > 0 && start_offset < length &&
6410 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6411 return PCRE_ERROR_BADUTF8_OFFSET;
6412 #endif
6413 }
6414 #endif
6415
6416 /* If the pattern was successfully studied with JIT support, run the JIT
6417 executable instead of the rest of this function. Most options must be set at
6418 compile time for the JIT code to be usable. Fall back to the normal code path if
6419 an unsupported flag is set. */
6420
6421 #ifdef SUPPORT_JIT
6422 if (extra_data != NULL
6423 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6424 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6425 && extra_data->executable_jit != NULL
6426 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6427 {
6428 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6429 start_offset, options, offsets, offsetcount);
6430
6431 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6432 mode is not compiled. In this case we simply fall back to the interpreter. */
6433
6434 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6435 }
6436 #endif
6437
6438 /* Carry on with non-JIT matching. This information is for finding all the
6439 numbers associated with a given name, for condition testing. */
6440
6441 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6442 md->name_count = re->name_count;
6443 md->name_entry_size = re->name_entry_size;
6444
6445 /* Fish out the optional data from the extra_data structure, first setting
6446 the default values. */
6447
6448 study = NULL;
6449 md->match_limit = MATCH_LIMIT;
6450 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6451 md->callout_data = NULL;
6452
6453 /* The table pointer is always in native byte order. */
6454
6455 tables = re->tables;
6456
6457 /* The two limit values override the defaults, whatever their value. */
6458
6459 if (extra_data != NULL)
6460 {
6461 register unsigned int flags = extra_data->flags;
6462 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6463 study = (const pcre_study_data *)extra_data->study_data;
6464 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6465 md->match_limit = extra_data->match_limit;
6466 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6467 md->match_limit_recursion = extra_data->match_limit_recursion;
6468 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6469 md->callout_data = extra_data->callout_data;
6470 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6471 }
6472
6473 /* Limits in the regex override only if they are smaller. */
6474
6475 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6476 md->match_limit = re->limit_match;
6477
6478 if ((re->flags & PCRE_RLSET) != 0 &&
6479 re->limit_recursion < md->match_limit_recursion)
6480 md->match_limit_recursion = re->limit_recursion;
6481
6482 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6483 is a feature that makes it possible to save compiled regex and re-use them
6484 in other programs later. */
6485
6486 if (tables == NULL) tables = PRIV(default_tables);
6487
6488 /* Set up other data */
6489
6490 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6491 startline = (re->flags & PCRE_STARTLINE) != 0;
6492 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6493
6494 /* The code starts after the real_pcre block and the capture name table. */
6495
6496 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6497 re->name_count * re->name_entry_size;
6498
6499 md->start_subject = (PCRE_PUCHAR)subject;
6500 md->start_offset = start_offset;
6501 md->end_subject = md->start_subject + length;
6502 end_subject = md->end_subject;
6503
6504 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6505 md->use_ucp = (re->options & PCRE_UCP) != 0;
6506 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6507 md->ignore_skip_arg = 0;
6508
6509 /* Some options are unpacked into BOOL variables in the hope that testing
6510 them will be faster than individual option bits. */
6511
6512 md->notbol = (options & PCRE_NOTBOL) != 0;
6513 md->noteol = (options & PCRE_NOTEOL) != 0;
6514 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6515 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6516
6517 md->hitend = FALSE;
6518 md->mark = md->nomatch_mark = NULL; /* In case never set */
6519
6520 md->recursive = NULL; /* No recursion at top level */
6521 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6522
6523 md->lcc = tables + lcc_offset;
6524 md->fcc = tables + fcc_offset;
6525 md->ctypes = tables + ctypes_offset;
6526
6527 /* Handle different \R options. */
6528
6529 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6530 {
6531 case 0:
6532 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6533 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6534 else
6535 #ifdef BSR_ANYCRLF
6536 md->bsr_anycrlf = TRUE;
6537 #else
6538 md->bsr_anycrlf = FALSE;
6539 #endif
6540 break;
6541
6542 case PCRE_BSR_ANYCRLF:
6543 md->bsr_anycrlf = TRUE;
6544 break;
6545
6546 case PCRE_BSR_UNICODE:
6547 md->bsr_anycrlf = FALSE;
6548 break;
6549
6550 default: return PCRE_ERROR_BADNEWLINE;
6551 }
6552
6553 /* Handle different types of newline. The three bits give eight cases. If
6554 nothing is set at run time, whatever was used at compile time applies. */
6555
6556 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6557 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6558 {
6559 case 0: newline = NEWLINE; break; /* Compile-time default */
6560 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6561 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6562 case PCRE_NEWLINE_CR+
6563 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6564 case PCRE_NEWLINE_ANY: newline = -1; break;
6565 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6566 default: return PCRE_ERROR_BADNEWLINE;
6567 }
6568
6569 if (newline == -2)
6570 {
6571 md->nltype = NLTYPE_ANYCRLF;
6572 }
6573 else if (newline < 0)
6574 {
6575 md->nltype = NLTYPE_ANY;
6576 }
6577 else
6578 {
6579 md->nltype = NLTYPE_FIXED;
6580 if (newline > 255)
6581 {
6582 md->nllen = 2;
6583 md->nl[0] = (newline >> 8) & 255;
6584 md->nl[1] = newline & 255;
6585 }
6586 else
6587 {
6588 md->nllen = 1;
6589 md->nl[0] = newline;
6590 }
6591 }
6592
6593 /* Partial matching was originally supported only for a restricted set of
6594 regexes; from release 8.00 there are no restrictions, but the bits are still
6595 defined (though never set). So there's no harm in leaving this code. */
6596
6597 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6598 return PCRE_ERROR_BADPARTIAL;
6599
6600 /* If the expression has got more back references than the offsets supplied can
6601 hold, we get a temporary chunk of working store to use during the matching.
6602 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6603 of 3. */
6604
6605 ocount = offsetcount - (offsetcount % 3);
6606 arg_offset_max = (2*ocount)/3;
6607
6608 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6609 {
6610 ocount = re->top_backref * 3 + 3;
6611 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6612 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6613 using_temporary_offsets = TRUE;
6614 DPRINTF(("Got memory to hold back references\n"));
6615 }
6616 else md->offset_vector = offsets;
6617 md->offset_end = ocount;
6618 md->offset_max = (2*ocount)/3;
6619 md->capture_last = 0;
6620
6621 /* Reset the working variable associated with each extraction. These should
6622 never be used unless previously set, but they get saved and restored, and so we
6623 initialize them to avoid reading uninitialized locations. Also, unset the
6624 offsets for the matched string. This is really just for tidiness with callouts,
6625 in case they inspect these fields. */
6626
6627 if (md->offset_vector != NULL)
6628 {
6629 register int *iptr = md->offset_vector + ocount;
6630 register int *iend = iptr - re->top_bracket;
6631 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6632 while (--iptr >= iend) *iptr = -1;
6633 md->offset_vector[0] = md->offset_vector[1] = -1;
6634 }
6635
6636 /* Set up the first character to match, if available. The first_char value is
6637 never set for an anchored regular expression, but the anchoring may be forced
6638 at run time, so we have to test for anchoring. The first char may be unset for
6639 an unanchored pattern, of course. If there's no first char and the pattern was
6640 studied, there may be a bitmap of possible first characters. */
6641
6642 if (!anchored)
6643 {
6644 if ((re->flags & PCRE_FIRSTSET) != 0)
6645 {
6646 has_first_char = TRUE;
6647 first_char = first_char2 = (pcre_uchar)(re->first_char);
6648 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6649 {
6650 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6651 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6652 if (utf && first_char > 127)
6653 first_char2 = UCD_OTHERCASE(first_char);
6654 #endif
6655 }
6656 }
6657 else
6658 if (!startline && study != NULL &&
6659 (study->flags & PCRE_STUDY_MAPPED) != 0)
6660 start_bits = study->start_bits;
6661 }
6662
6663 /* For anchored or unanchored matches, there may be a "last known required
6664 character" set. */
6665
6666 if ((re->flags & PCRE_REQCHSET) != 0)
6667 {
6668 has_req_char = TRUE;
6669 req_char = req_char2 = (pcre_uchar)(re->req_char);
6670 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6671 {
6672 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6673 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6674 if (utf && req_char > 127)
6675 req_char2 = UCD_OTHERCASE(req_char);
6676 #endif
6677 }
6678 }
6679
6680
6681 /* ==========================================================================*/
6682
6683 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6684 the loop runs just once. */
6685
6686 for(;;)
6687 {
6688 PCRE_PUCHAR save_end_subject = end_subject;
6689 PCRE_PUCHAR new_start_match;
6690
6691 /* If firstline is TRUE, the start of the match is constrained to the first
6692 line of a multiline string. That is, the match must be before or at the first
6693 newline. Implement this by temporarily adjusting end_subject so that we stop
6694 scanning at a newline. If the match fails at the newline, later code breaks
6695 this loop. */
6696
6697 if (firstline)
6698 {
6699 PCRE_PUCHAR t = start_match;
6700 #ifdef SUPPORT_UTF
6701 if (utf)
6702 {
6703 while (t < md->end_subject && !IS_NEWLINE(t))
6704 {
6705 t++;
6706 ACROSSCHAR(t < end_subject, *t, t++);
6707 }
6708 }
6709 else
6710 #endif
6711 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6712 end_subject = t;
6713 }
6714
6715 /* There are some optimizations that avoid running the match if a known
6716 starting point is not found, or if a known later character is not present.
6717 However, there is an option that disables these, for testing and for ensuring
6718 that all callouts do actually occur. The option can be set in the regex by
6719 (*NO_START_OPT) or passed in match-time options. */
6720
6721 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6722 {
6723 /* Advance to a unique first char if there is one. */
6724
6725 if (has_first_char)
6726 {
6727 pcre_uchar smc;
6728
6729 if (first_char != first_char2)
6730 while (start_match < end_subject &&
6731 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6732 start_match++;
6733 else
6734 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6735 start_match++;
6736 }
6737
6738 /* Or to just after a linebreak for a multiline match */
6739
6740 else if (startline)
6741 {
6742 if (start_match > md->start_subject + start_offset)
6743 {
6744 #ifdef SUPPORT_UTF
6745 if (utf)
6746 {
6747 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6748 {
6749 start_match++;
6750 ACROSSCHAR(start_match < end_subject, *start_match,
6751 start_match++);
6752 }