/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1379 - (show annotations)
Mon Oct 14 13:54:07 2013 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 218028 byte(s)
More auto-possessification additions, using possessive class repeats. These are 
not yet used for explicit possessification.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
/* md->capture_last packs two things into one word: the number of the last
captured substring in the low 16 bits (it can never exceed 65535), and a flag
in the upper half meaning "capture vector overflowed". Keeping both in a single
field means that the overflow flag is saved and restored automatically whenever
the last capture number is, which avoids (a) a further variable that would have
enlarged the stack frame (a big NO-NO in PCRE) and (b) a second set of
save/restore instructions. These masks pick the two parts apart. */

#define CAPLMASK    0x0000ffff    /* The bits used for last_capture */
#define OVFLMASK    0xffff0000    /* The bits used for the overflow flag */
#define OVFLBIT     0x00010000    /* The bit that is set for overflow */

/* Values stored in md->match_function_type to request one of two special
kinds of match() call. A field in md is used, rather than an extra argument or
local, because stack usage is to be discouraged. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special results that are private to match(). They are made very negative so
that they cannot collide with the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
#define MATCH_ONCE         (-997)

/* The following five values must stay adjacent and in this order, because a
test for "any one of them" is done as a range check using the MIN and MAX
names defined after them. */

#define MATCH_COMMIT       (-996)  /* Given by OP_COMMIT */
#define MATCH_PRUNE        (-995)  /* Given by OP_PRUNE and OP_PRUNE_ARG */
#define MATCH_SKIP         (-994)  /* Given by OP_SKIP, or OP_MARK on a name match */
#define MATCH_SKIP_ARG     (-993)  /* Given by OP_SKIP_ARG */
#define MATCH_THEN         (-992)  /* Given by OP_THEN and OP_THEN_ARG */
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT

/* The largest number of offset-vector ints that are saved on the stack for a
recursive call; a bigger vector is saved using malloc instead. Keep this a
multiple of 3, because the offset vector's length is always a multiple of 3. */

#define REC_STACK_SAVE_MAX 30
107
/* Parallel lookup tables holding the minimum and maximum repeat counts for the
common repeats. Each table has 11 entries. In rep_max, an entry of 0 means that
there is no upper limit (infinity). */

static const char rep_min[] = {
  0, 0,     /* entries 0, 1 */
  1, 1,     /* entries 2, 3 */
  0, 0,     /* entries 4, 5 */
  0, 0,     /* entries 6, 7 */
  0, 1,     /* entries 8, 9 */
  0,        /* entry 10 */
};

static const char rep_max[] = {
  0, 0,     /* entries 0, 1 */
  0, 0,     /* entries 2, 3 */
  1, 1,     /* entries 4, 5 */
  0, 0,     /* entries 6, 7 */
  0, 0,     /* entries 8, 9 */
  1,        /* entry 10 */
};
112
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of characters in printable format: a printable character
is written as itself, and anything else as a \x{..} hexadecimal escape. If
is_subject is TRUE, printing stops at the end of the subject even when fewer
than "length" characters remain. A zero or negative length prints nothing.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block; it must always be valid, because
                md->utf is read unconditionally (md->end_subject is read only
                if is_subject is TRUE)

Returns:     nothing
*/

static void
pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
{
pcre_uint32 c;
BOOL utf = md->utf;   /* Not used directly below; presumably consulted by
                         RAWUCHARINCTEST() in some builds - TODO confirm */

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = RAWUCHARINCTEST(p);

  /* isprint() is defined only for arguments that are representable as an
  unsigned char (or are EOF). A code unit can be wider than that when
  pcre_uchar is wider than 8 bits, so check the range before calling it, and
  show anything larger in hexadecimal. */

  if (c < 256 && isprint((int)c))
    printf("%c", (char)c);
  else
    printf("\\x{%02x}", (unsigned int)c);
  }
}
#endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions code have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case)
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start);
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
/* Identifying numbers for the RMATCH calls, counting up from 1; each call of
RMATCH passes one of them as its "rw" argument. Whenever this list is changed,
the code at HEAP_RETURN below must be updated in sync. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65,    RM66, RM67, RM68
};
316
/* Stack-based versions of the recursion macros, used when NO_RECURSE is not
defined. RMATCH is a genuine recursive call of match(), one level deeper
(rdepth+1), whose result is left in rrc; RRETURN is an ordinary return. When
PCRE_DEBUG is defined, both also trace what they are doing using printf().
Note that the "rw" argument of RMATCH isn't actually used in these
definitions. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* Heap-based versions of the macros, which manage a private stack of frames
instead of recursing. Note that the "rd" argument of RMATCH isn't actually
used in this definition. It's the md argument of match(), which never changes.

RMATCH takes the frame that follows the current one, obtaining it from
stack_malloc (and giving PCRE_ERROR_NOMEMORY if that fails) when no such frame
is linked yet. It records "rw" in the current frame, copies the new argument
values into the next frame, makes that frame current, and jumps to
HEAP_RECURSE. The label L_<rw> marks the place to which control is expected to
come back.

RRETURN steps back to the previous frame. If there is one, the result is put
in rrc and control goes to HEAP_RETURN; if there is none, this is a real
return from match(). */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* The frame in which match() keeps its variables when NO_RECURSE is defined.
Each X-prefixed member stands in for the variable of the same name without the
X; match() reaches the members through macros (eptr is frame->Xeptr, and so
on). */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame that RRETURN goes back to */
  struct heapframe *Xnextframe;   /* Following frame, reused by RMATCH if set */

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  unsigned int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  unsigned int Xnumber;
  int Xoffset;
  unsigned int Xop;
  pcre_int32 Xsave_capture_last;
  int Xsave_offset1;
  int Xsave_offset2;
  int Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to: RMATCH stores its "rw" argument here */

  int Xwhere;

} heapframe;

#endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
/* Two macros that package the partial-matching tests, which are needed at
several places in the code. CHECK_PARTIAL() sets the "hit end" flag when
partial matching is enabled, the subject pointer has reached the end of the
subject, and it is also past md->start_used_ptr (i.e. something has been
matched). For hard partial matching (md->partial greater than 1) it then
returns PCRE_ERROR_PARTIAL immediately. SCHECK_PARTIAL() is the same except
that it does not test for the end of the subject; it is for use when the
pointer is already known to be past the end. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 case OP_COMMIT:
785 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
786 eptrb, RM52);
787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
788 RRETURN(MATCH_COMMIT);
789
790 case OP_PRUNE:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
792 eptrb, RM51);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 RRETURN(MATCH_PRUNE);
795
796 case OP_PRUNE_ARG:
797 md->nomatch_mark = ecode + 2;
798 md->mark = NULL; /* In case previously set by assertion */
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM56);
801 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
802 md->mark == NULL) md->mark = ecode + 2;
803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
804 RRETURN(MATCH_PRUNE);
805
806 case OP_SKIP:
807 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
808 eptrb, RM53);
809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
810 md->start_match_ptr = eptr; /* Pass back current position */
811 RRETURN(MATCH_SKIP);
812
813 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
814 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
815 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
816 that failed and any that precede it (either they also failed, or were not
817 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
818 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
819 set to the count of the one that failed. */
820
821 case OP_SKIP_ARG:
822 md->skip_arg_count++;
823 if (md->skip_arg_count <= md->ignore_skip_arg)
824 {
825 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 break;
827 }
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
829 eptrb, RM57);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831
832 /* Pass back the current skip name by overloading md->start_match_ptr and
833 returning the special MATCH_SKIP_ARG return code. This will either be
834 caught by a matching MARK, or get to the top, where it causes a rematch
835 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
836
837 md->start_match_ptr = ecode + 2;
838 RRETURN(MATCH_SKIP_ARG);
839
840 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
841 the branch in which it occurs can be determined. Overload the start of
842 match pointer to do this. */
843
844 case OP_THEN:
845 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
846 eptrb, RM54);
847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848 md->start_match_ptr = ecode;
849 RRETURN(MATCH_THEN);
850
851 case OP_THEN_ARG:
852 md->nomatch_mark = ecode + 2;
853 md->mark = NULL; /* In case previously set by assertion */
854 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
855 md, eptrb, RM58);
856 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
857 md->mark == NULL) md->mark = ecode + 2;
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 md->start_match_ptr = ecode;
860 RRETURN(MATCH_THEN);
861
862 /* Handle an atomic group that does not contain any capturing parentheses.
863 This can be handled like an assertion. Prior to 8.13, all atomic groups
864 were handled this way. In 8.13, the code was changed as below for ONCE, so
865 that backups pass through the group and thereby reset captured values.
866 However, this uses a lot more stack, so in 8.20, atomic groups that do not
867 contain any captures generate OP_ONCE_NC, which can be handled in the old,
868 less stack intensive way.
869
870 Check the alternative branches in turn - the matching won't pass the KET
871 for this kind of subpattern. If any one branch matches, we carry on as at
872 the end of a normal bracket, leaving the subject pointer, but resetting
873 the start-of-match value in case it was changed by \K. */
874
875 case OP_ONCE_NC:
876 prev = ecode;
877 saved_eptr = eptr;
878 save_mark = md->mark;
879 do
880 {
881 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
882 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
883 {
884 mstart = md->start_match_ptr;
885 break;
886 }
887 if (rrc == MATCH_THEN)
888 {
889 next = ecode + GET(ecode,1);
890 if (md->start_match_ptr < next &&
891 (*ecode == OP_ALT || *next == OP_ALT))
892 rrc = MATCH_NOMATCH;
893 }
894
895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896 ecode += GET(ecode,1);
897 md->mark = save_mark;
898 }
899 while (*ecode == OP_ALT);
900
901 /* If we hit the end of the group (which could be repeated), fail */
902
903 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
904
905 /* Continue as from after the group, updating the offsets high water
906 mark, since extracts may have been taken. */
907
908 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
909
910 offset_top = md->end_offset_top;
911 eptr = md->end_match_ptr;
912
913 /* For a non-repeating ket, just continue at this level. This also
914 happens for a repeating ket if no characters were matched in the group.
915 This is the forcible breaking of infinite loops as implemented in Perl
916 5.005. */
917
918 if (*ecode == OP_KET || eptr == saved_eptr)
919 {
920 ecode += 1+LINK_SIZE;
921 break;
922 }
923
924 /* The repeating kets try the rest of the pattern or restart from the
925 preceding bracket, in the appropriate order. The second "call" of match()
926 uses tail recursion, to avoid using another stack frame. */
927
928 if (*ecode == OP_KETRMIN)
929 {
930 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
932 ecode = prev;
933 goto TAIL_RECURSE;
934 }
935 else /* OP_KETRMAX */
936 {
937 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939 ecode += 1 + LINK_SIZE;
940 goto TAIL_RECURSE;
941 }
942 /* Control never gets here */
943
944 /* Handle a capturing bracket, other than those that are possessive with an
945 unlimited repeat. If there is space in the offset vector, save the current
946 subject position in the working slot at the top of the vector. We mustn't
947 change the current values of the data slot, because they may be set from a
948 previous iteration of this group, and be referred to by a reference inside
949 the group. A failure to match might occur after the group has succeeded,
950 if something later on doesn't match. For this reason, we need to restore
951 the working value and also the values of the final offsets, in case they
952 were set by a previous iteration of the same bracket.
953
954 If there isn't enough space in the offset vector, treat this as if it were
955 a non-capturing bracket. Don't worry about setting the flag for the error
956 case here; that is handled in the code for KET. */
957
958 case OP_CBRA:
959 case OP_SCBRA:
960 number = GET2(ecode, 1+LINK_SIZE);
961 offset = number << 1;
962
963 #ifdef PCRE_DEBUG
964 printf("start bracket %d\n", number);
965 printf("subject=");
966 pchars(eptr, 16, TRUE, md);
967 printf("\n");
968 #endif
969
970 if (offset < md->offset_max)
971 {
972 save_offset1 = md->offset_vector[offset];
973 save_offset2 = md->offset_vector[offset+1];
974 save_offset3 = md->offset_vector[md->offset_end - number];
975 save_capture_last = md->capture_last;
976 save_mark = md->mark;
977
978 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
979 md->offset_vector[md->offset_end - number] =
980 (int)(eptr - md->start_subject);
981
982 for (;;)
983 {
984 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
986 eptrb, RM1);
987 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
988
989 /* If we backed up to a THEN, check whether it is within the current
990 branch by comparing the address of the THEN that is passed back with
991 the end of the branch. If it is within the current branch, and the
992 branch is one of two or more alternatives (it either starts or ends
993 with OP_ALT), we have reached the limit of THEN's action, so convert
994 the return code to NOMATCH, which will cause normal backtracking to
995 happen from now on. Otherwise, THEN is passed back to an outer
996 alternative. This implements Perl's treatment of parenthesized groups,
997 where a group not containing | does not affect the current alternative,
998 that is, (X) is NOT the same as (X|(*F)). */
999
1000 if (rrc == MATCH_THEN)
1001 {
1002 next = ecode + GET(ecode,1);
1003 if (md->start_match_ptr < next &&
1004 (*ecode == OP_ALT || *next == OP_ALT))
1005 rrc = MATCH_NOMATCH;
1006 }
1007
1008 /* Anything other than NOMATCH is passed back. */
1009
1010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011 md->capture_last = save_capture_last;
1012 ecode += GET(ecode, 1);
1013 md->mark = save_mark;
1014 if (*ecode != OP_ALT) break;
1015 }
1016
1017 DPRINTF(("bracket %d failed\n", number));
1018 md->offset_vector[offset] = save_offset1;
1019 md->offset_vector[offset+1] = save_offset2;
1020 md->offset_vector[md->offset_end - number] = save_offset3;
1021
1022 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1023
1024 RRETURN(rrc);
1025 }
1026
1027 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1028 as a non-capturing bracket. */
1029
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032
1033 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1034
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037
1038 /* Non-capturing or atomic group, except for possessive with unlimited
1039 repeat and ONCE group with no captures. Loop for all the alternatives.
1040
1041 When we get to the final alternative within the brackets, we used to return
1042 the result of a recursive call to match() whatever happened so it was
1043 possible to reduce stack usage by turning this into a tail recursion,
1044 except in the case of a possibly empty group. However, now that there is
1045 the possibility of (*THEN) occurring in the final alternative, this
1046 optimization is no longer always possible.
1047
1048 We can optimize if we know there are no (*THEN)s in the pattern; at present
1049 this is the best that can be done.
1050
1051 MATCH_ONCE is returned when the end of an atomic group is successfully
1052 reached, but subsequent matching fails. It passes back up the tree (causing
1053 captured values to be reset) until the original atomic group level is
1054 reached. This is tested by comparing md->once_target with the start of the
1055 group. At this point, the return is converted into MATCH_NOMATCH so that
1056 previous backup points can be taken. */
1057
1058 case OP_ONCE:
1059 case OP_BRA:
1060 case OP_SBRA:
1061 DPRINTF(("start non-capturing bracket\n"));
1062
1063 for (;;)
1064 {
1065 if (op >= OP_SBRA || op == OP_ONCE)
1066 md->match_function_type = MATCH_CBEGROUP;
1067
1068 /* If this is not a possibly empty group, and there are no (*THEN)s in
1069 the pattern, and this is the final alternative, optimize as described
1070 above. */
1071
1072 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1073 {
1074 ecode += PRIV(OP_lengths)[*ecode];
1075 goto TAIL_RECURSE;
1076 }
1077
1078 /* In all other cases, we have to make another call to match(). */
1079
1080 save_mark = md->mark;
1081 save_capture_last = md->capture_last;
1082 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 RM2);
1084
1085 /* See comment in the code for capturing groups above about handling
1086 THEN. */
1087
1088 if (rrc == MATCH_THEN)
1089 {
1090 next = ecode + GET(ecode,1);
1091 if (md->start_match_ptr < next &&
1092 (*ecode == OP_ALT || *next == OP_ALT))
1093 rrc = MATCH_NOMATCH;
1094 }
1095
1096 if (rrc != MATCH_NOMATCH)
1097 {
1098 if (rrc == MATCH_ONCE)
1099 {
1100 const pcre_uchar *scode = ecode;
1101 if (*scode != OP_ONCE) /* If not at start, find it */
1102 {
1103 while (*scode == OP_ALT) scode += GET(scode, 1);
1104 scode -= GET(scode, 1);
1105 }
1106 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1107 }
1108 RRETURN(rrc);
1109 }
1110 ecode += GET(ecode, 1);
1111 md->mark = save_mark;
1112 if (*ecode != OP_ALT) break;
1113 md->capture_last = save_capture_last;
1114 }
1115
1116 RRETURN(MATCH_NOMATCH);
1117
1118 /* Handle possessive capturing brackets with an unlimited repeat. We come
1119 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1120 handled similarly to the normal case above. However, the matching is
1121 different. The end of these brackets will always be OP_KETRPOS, which
1122 returns MATCH_KETRPOS without going further in the pattern. By this means
1123 we can handle the group by iteration rather than recursion, thereby
1124 reducing the amount of stack needed. */
1125
1126 case OP_CBRAPOS:
1127 case OP_SCBRAPOS:
1128 allow_zero = FALSE;
1129
1130 POSSESSIVE_CAPTURE:
1131 number = GET2(ecode, 1+LINK_SIZE);
1132 offset = number << 1;
1133
1134 #ifdef PCRE_DEBUG
1135 printf("start possessive bracket %d\n", number);
1136 printf("subject=");
1137 pchars(eptr, 16, TRUE, md);
1138 printf("\n");
1139 #endif
1140
1141 if (offset < md->offset_max)
1142 {
1143 matched_once = FALSE;
1144 code_offset = (int)(ecode - md->start_code);
1145
1146 save_offset1 = md->offset_vector[offset];
1147 save_offset2 = md->offset_vector[offset+1];
1148 save_offset3 = md->offset_vector[md->offset_end - number];
1149 save_capture_last = md->capture_last;
1150
1151 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1152
1153 /* Each time round the loop, save the current subject position for use
1154 when the group matches. For MATCH_MATCH, the group has matched, so we
1155 restart it with a new subject starting position, remembering that we had
1156 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1157 usual. If we haven't matched any alternatives in any iteration, check to
1158 see if a previous iteration matched. If so, the group has matched;
1159 continue from afterwards. Otherwise it has failed; restore the previous
1160 capture values before returning NOMATCH. */
1161
1162 for (;;)
1163 {
1164 md->offset_vector[md->offset_end - number] =
1165 (int)(eptr - md->start_subject);
1166 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1167 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1168 eptrb, RM63);
1169 if (rrc == MATCH_KETRPOS)
1170 {
1171 offset_top = md->end_offset_top;
1172 eptr = md->end_match_ptr;
1173 ecode = md->start_code + code_offset;
1174 save_capture_last = md->capture_last;
1175 matched_once = TRUE;
1176 continue;
1177 }
1178
1179 /* See comment in the code for capturing groups above about handling
1180 THEN. */
1181
1182 if (rrc == MATCH_THEN)
1183 {
1184 next = ecode + GET(ecode,1);
1185 if (md->start_match_ptr < next &&
1186 (*ecode == OP_ALT || *next == OP_ALT))
1187 rrc = MATCH_NOMATCH;
1188 }
1189
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 md->capture_last = save_capture_last;
1192 ecode += GET(ecode, 1);
1193 if (*ecode != OP_ALT) break;
1194 }
1195
1196 if (!matched_once)
1197 {
1198 md->offset_vector[offset] = save_offset1;
1199 md->offset_vector[offset+1] = save_offset2;
1200 md->offset_vector[md->offset_end - number] = save_offset3;
1201 }
1202
1203 if (allow_zero || matched_once)
1204 {
1205 ecode += 1 + LINK_SIZE;
1206 break;
1207 }
1208
1209 RRETURN(MATCH_NOMATCH);
1210 }
1211
1212 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1213 as a non-capturing bracket. */
1214
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217
1218 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222
1223 /* Non-capturing possessive bracket with unlimited repeat. We come here
1224 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1225 without the capturing complication. It is written out separately for speed
1226 and cleanliness. */
1227
1228 case OP_BRAPOS:
1229 case OP_SBRAPOS:
1230 allow_zero = FALSE;
1231
1232 POSSESSIVE_NON_CAPTURE:
1233 matched_once = FALSE;
1234 code_offset = (int)(ecode - md->start_code);
1235 save_capture_last = md->capture_last;
1236
1237 for (;;)
1238 {
1239 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1240 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1241 eptrb, RM48);
1242 if (rrc == MATCH_KETRPOS)
1243 {
1244 offset_top = md->end_offset_top;
1245 eptr = md->end_match_ptr;
1246 ecode = md->start_code + code_offset;
1247 matched_once = TRUE;
1248 continue;
1249 }
1250
1251 /* See comment in the code for capturing groups above about handling
1252 THEN. */
1253
1254 if (rrc == MATCH_THEN)
1255 {
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1260 }
1261
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1266 }
1267
1268 if (matched_once || allow_zero)
1269 {
1270 ecode += 1 + LINK_SIZE;
1271 break;
1272 }
1273 RRETURN(MATCH_NOMATCH);
1274
1275 /* Control never reaches here. */
1276
1277 /* Conditional group: compilation checked that there are no more than two
1278 branches. If the condition is false, skipping the first branch takes us
1279 past the end of the item if there is only one branch, but that's exactly
1280 what we want. */
1281
1282 case OP_COND:
1283 case OP_SCOND:
1284
1285 /* The variable codelink will be added to ecode when the condition is
1286 false, to get to the second branch. Setting it to the offset to the ALT
1287 or KET, then incrementing ecode achieves this effect. We now have ecode
1288 pointing to the condition or callout. */
1289
1290 codelink = GET(ecode, 1); /* Offset to the second branch */
1291 ecode += 1 + LINK_SIZE; /* From this opcode */
1292
1293 /* Because of the way auto-callout works during compile, a callout item is
1294 inserted between OP_COND and an assertion condition. */
1295
1296 if (*ecode == OP_CALLOUT)
1297 {
1298 if (PUBL(callout) != NULL)
1299 {
1300 PUBL(callout_block) cb;
1301 cb.version = 2; /* Version 1 of the callout block */
1302 cb.callout_number = ecode[1];
1303 cb.offset_vector = md->offset_vector;
1304 #if defined COMPILE_PCRE8
1305 cb.subject = (PCRE_SPTR)md->start_subject;
1306 #elif defined COMPILE_PCRE16
1307 cb.subject = (PCRE_SPTR16)md->start_subject;
1308 #elif defined COMPILE_PCRE32
1309 cb.subject = (PCRE_SPTR32)md->start_subject;
1310 #endif
1311 cb.subject_length = (int)(md->end_subject - md->start_subject);
1312 cb.start_match = (int)(mstart - md->start_subject);
1313 cb.current_position = (int)(eptr - md->start_subject);
1314 cb.pattern_position = GET(ecode, 2);
1315 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1316 cb.capture_top = offset_top/2;
1317 cb.capture_last = md->capture_last & CAPLMASK;
1318 /* Internal change requires this for API compatibility. */
1319 if (cb.capture_last == 0) cb.capture_last = -1;
1320 cb.callout_data = md->callout_data;
1321 cb.mark = md->nomatch_mark;
1322 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1323 if (rrc < 0) RRETURN(rrc);
1324 }
1325
1326 /* Advance ecode past the callout, so it now points to the condition. We
1327 must adjust codelink so that the value of ecode+codelink is unchanged. */
1328
1329 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1330 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1331 }
1332
1333 /* Test the various possible conditions */
1334
1335 condition = FALSE;
1336 switch(condcode = *ecode)
1337 {
1338 case OP_RREF: /* Numbered group recursion test */
1339 if (md->recursive != NULL) /* Not recursing => FALSE */
1340 {
1341 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1342 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1343 }
1344 break;
1345
1346 case OP_DNRREF: /* Duplicate named group recursion test */
1347 if (md->recursive != NULL)
1348 {
1349 int count = GET2(ecode, 1 + IMM2_SIZE);
1350 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1351 while (count-- > 0)
1352 {
1353 unsigned int recno = GET2(slot, 0);
1354 condition = recno == md->recursive->group_num;
1355 if (condition) break;
1356 slot += md->name_entry_size;
1357 }
1358 }
1359 break;
1360
1361 case OP_CREF: /* Numbered group used test */
1362 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1363 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1364 break;
1365
1366 case OP_DNCREF: /* Duplicate named group used test */
1367 {
1368 int count = GET2(ecode, 1 + IMM2_SIZE);
1369 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1370 while (count-- > 0)
1371 {
1372 offset = GET2(slot, 0) << 1;
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 if (condition) break;
1375 slot += md->name_entry_size;
1376 }
1377 }
1378 break;
1379
1380 case OP_DEF: /* DEFINE - always false */
1381 break;
1382
1383 /* The condition is an assertion. Call match() to evaluate it - setting
1384 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1385 of an assertion. */
1386
1387 default:
1388 md->match_function_type = MATCH_CONDASSERT;
1389 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1390 if (rrc == MATCH_MATCH)
1391 {
1392 if (md->end_offset_top > offset_top)
1393 offset_top = md->end_offset_top; /* Captures may have happened */
1394 condition = TRUE;
1395
1396 /* Advance ecode past the assertion to the start of the first branch,
1397 but adjust it so that the general choosing code below works. */
1398
1399 ecode += GET(ecode, 1);
1400 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1402 }
1403
1404 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405 assertion; it is therefore treated as NOMATCH. Any other return is an
1406 error. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 break;
1413 }
1414
1415 /* Choose branch according to the condition */
1416
1417 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, we
1420 can use tail recursion to avoid using another stack frame, except when
1421 there is unlimited repeat of a possibly empty group. In the latter case, a
1422 recursive call to match() is always required, unless the second alternative
1423 doesn't exist, in which case we can just plough on. Note that, for
1424 compatibility with Perl, the | in a conditional group is NOT treated as
1425 creating two alternatives. If a THEN is encountered in the branch, it
1426 propagates out to the enclosing alternative (unless nested in a deeper set
1427 of alternatives, of course). */
1428
1429 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 goto TAIL_RECURSE;
1434 }
1435
1436 md->match_function_type = MATCH_CBEGROUP;
1437 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1438 RRETURN(rrc);
1439 }
1440
1441 /* Condition false & no alternative; continue after the group. */
1442
1443 else
1444 {
1445 }
1446 break;
1447
1448
1449 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450 to close any currently open capturing brackets. */
1451
1452 case OP_CLOSE:
1453 number = GET2(ecode, 1); /* Must be less than 65536 */
1454 offset = number << 1;
1455
1456 #ifdef PCRE_DEBUG
1457 printf("end bracket %d at *ACCEPT", number);
1458 printf("\n");
1459 #endif
1460
1461 md->capture_last = (md->capture_last & OVFLMASK) | number;
1462 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1463 {
1464 md->offset_vector[offset] =
1465 md->offset_vector[md->offset_end - number];
1466 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467 if (offset_top <= offset) offset_top = offset + 2;
1468 }
1469 ecode += 1 + IMM2_SIZE;
1470 break;
1471
1472
1473 /* End of the pattern, either real or forced. */
1474
1475 case OP_END:
1476 case OP_ACCEPT:
1477 case OP_ASSERT_ACCEPT:
1478
1479 /* If we have matched an empty string, fail if not in an assertion and not
1480 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481 is set and we have matched at the start of the subject. In both cases,
1482 backtracking will then try other alternatives, if any. */
1483
1484 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485 md->recursive == NULL &&
1486 (md->notempty ||
1487 (md->notempty_atstart &&
1488 mstart == md->start_subject + md->start_offset)))
1489 RRETURN(MATCH_NOMATCH);
1490
1491 /* Otherwise, we have a match. */
1492
1493 md->end_match_ptr = eptr; /* Record where we ended */
1494 md->end_offset_top = offset_top; /* and how many extracts were taken */
1495 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1496
1497 /* For some reason, the macros don't work properly if an expression is
1498 given as the argument to RRETURN when the heap is in use. */
1499
1500 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501 RRETURN(rrc);
1502
1503 /* Assertion brackets. Check the alternative branches in turn - the
1504 matching won't pass the KET for an assertion. If any one branch matches,
1505 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506 start of each branch to move the current point backwards, so the code at
1507 this level is identical to the lookahead case. When the assertion is part
1508 of a condition, we want to return immediately afterwards. The caller of
1509 this incarnation of the match() function will have set MATCH_CONDASSERT in
1510 md->match_function_type, and one of these opcodes will be the first opcode
1511 that is processed. We use a local variable that is preserved over calls to
1512 match() to remember this case. */
1513
1514 case OP_ASSERT:
1515 case OP_ASSERTBACK:
1516 save_mark = md->mark;
1517 if (md->match_function_type == MATCH_CONDASSERT)
1518 {
1519 condassert = TRUE;
1520 md->match_function_type = 0;
1521 }
1522 else condassert = FALSE;
1523
1524 /* Loop for each branch */
1525
1526 do
1527 {
1528 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1529
1530 /* A match means that the assertion is true; break out of the loop
1531 that matches its alternatives. */
1532
1533 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1534 {
1535 mstart = md->start_match_ptr; /* In case \K reset it */
1536 break;
1537 }
1538
1539 /* If not matched, restore the previous mark setting. */
1540
1541 md->mark = save_mark;
1542
1543 /* See comment in the code for capturing groups above about handling
1544 THEN. */
1545
1546 if (rrc == MATCH_THEN)
1547 {
1548 next = ecode + GET(ecode,1);
1549 if (md->start_match_ptr < next &&
1550 (*ecode == OP_ALT || *next == OP_ALT))
1551 rrc = MATCH_NOMATCH;
1552 }
1553
1554 /* Anything other than NOMATCH causes the entire assertion to fail,
1555 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1556 uncaptured THEN, which means they take their normal effect. This
1557 consistent approach does not always have exactly the same effect as in
1558 Perl. */
1559
1560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1561 ecode += GET(ecode, 1);
1562 }
1563 while (*ecode == OP_ALT); /* Continue for next alternative */
1564
1565 /* If we have tried all the alternative branches, the assertion has
1566 failed. If not, we broke out after a match. */
1567
1568 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1569
1570 /* If checking an assertion for a condition, return MATCH_MATCH. */
1571
1572 if (condassert) RRETURN(MATCH_MATCH);
1573
1574 /* Continue from after a successful assertion, updating the offsets high
1575 water mark, since extracts may have been taken during the assertion. */
1576
1577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1578 ecode += 1 + LINK_SIZE;
1579 offset_top = md->end_offset_top;
1580 continue;
1581
1582 /* Negative assertion: all branches must fail to match for the assertion to
1583 succeed. */
1584
1585 case OP_ASSERT_NOT:
1586 case OP_ASSERTBACK_NOT:
1587 save_mark = md->mark;
1588 if (md->match_function_type == MATCH_CONDASSERT)
1589 {
1590 condassert = TRUE;
1591 md->match_function_type = 0;
1592 }
1593 else condassert = FALSE;
1594
1595 /* Loop for each alternative branch. */
1596
1597 do
1598 {
1599 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1600 md->mark = save_mark; /* Always restore the mark setting */
1601
1602 switch(rrc)
1603 {
1604 case MATCH_MATCH: /* A successful match means */
1605 case MATCH_ACCEPT: /* the assertion has failed. */
1606 RRETURN(MATCH_NOMATCH);
1607
1608 case MATCH_NOMATCH: /* Carry on with next branch */
1609 break;
1610
1611 /* See comment in the code for capturing groups above about handling
1612 THEN. */
1613
1614 case MATCH_THEN:
1615 next = ecode + GET(ecode,1);
1616 if (md->start_match_ptr < next &&
1617 (*ecode == OP_ALT || *next == OP_ALT))
1618 {
1619 rrc = MATCH_NOMATCH;
1620 break;
1621 }
1622 /* Otherwise fall through. */
1623
1624 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1625 assertion to fail to match, without considering any more alternatives.
1626 Failing to match means the assertion is true. This is a consistent
1627 approach, but does not always have the same effect as in Perl. */
1628
1629 case MATCH_COMMIT:
1630 case MATCH_SKIP:
1631 case MATCH_SKIP_ARG:
1632 case MATCH_PRUNE:
1633 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1634 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1635
1636 /* Anything else is an error */
1637
1638 default:
1639 RRETURN(rrc);
1640 }
1641
1642 /* Continue with next branch */
1643
1644 ecode += GET(ecode,1);
1645 }
1646 while (*ecode == OP_ALT);
1647
1648 /* All branches in the assertion failed to match. */
1649
1650 NEG_ASSERT_TRUE:
1651 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1652 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1653 continue;
1654
1655 /* Move the subject pointer back. This occurs only at the start of
1656 each branch of a lookbehind assertion. If we are too close to the start to
1657 move back, this match function fails. When working with UTF-8 we move
1658 back a number of characters, not bytes. */
1659
1660 case OP_REVERSE:
1661 #ifdef SUPPORT_UTF
1662 if (utf)
1663 {
1664 i = GET(ecode, 1);
1665 while (i-- > 0)
1666 {
1667 eptr--;
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 BACKCHAR(eptr);
1670 }
1671 }
1672 else
1673 #endif
1674
1675 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1676
1677 {
1678 eptr -= GET(ecode, 1);
1679 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1680 }
1681
1682 /* Save the earliest consulted character, then skip to next op code */
1683
1684 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1685 ecode += 1 + LINK_SIZE;
1686 break;
1687
1688 /* The callout item calls an external function, if one is provided, passing
1689 details of the match so far. This is mainly for debugging, though the
1690 function is able to force a failure. */
1691
1692 case OP_CALLOUT:
1693 if (PUBL(callout) != NULL)
1694 {
1695 PUBL(callout_block) cb;
1696 cb.version = 2; /* Version 1 of the callout block */
1697 cb.callout_number = ecode[1];
1698 cb.offset_vector = md->offset_vector;
1699 #if defined COMPILE_PCRE8
1700 cb.subject = (PCRE_SPTR)md->start_subject;
1701 #elif defined COMPILE_PCRE16
1702 cb.subject = (PCRE_SPTR16)md->start_subject;
1703 #elif defined COMPILE_PCRE32
1704 cb.subject = (PCRE_SPTR32)md->start_subject;
1705 #endif
1706 cb.subject_length = (int)(md->end_subject - md->start_subject);
1707 cb.start_match = (int)(mstart - md->start_subject);
1708 cb.current_position = (int)(eptr - md->start_subject);
1709 cb.pattern_position = GET(ecode, 2);
1710 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1711 cb.capture_top = offset_top/2;
1712 cb.capture_last = md->capture_last & CAPLMASK;
1713 /* Internal change requires this for API compatibility. */
1714 if (cb.capture_last == 0) cb.capture_last = -1;
1715 cb.callout_data = md->callout_data;
1716 cb.mark = md->nomatch_mark;
1717 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1718 if (rrc < 0) RRETURN(rrc);
1719 }
1720 ecode += 2 + 2*LINK_SIZE;
1721 break;
1722
1723 /* Recursion either matches the current regex, or some subexpression. The
1724 offset data is the offset to the starting bracket from the start of the
1725 whole pattern. (This is so that it works from duplicated subpatterns.)
1726
1727 The state of the capturing groups is preserved over recursion, and
1728 re-instated afterwards. We don't know how many are started and not yet
1729 finished (offset_top records the completed total) so we just have to save
1730 all the potential data. There may be up to 65535 such values, which is too
1731 large to put on the stack, but using malloc for small numbers seems
1732 expensive. As a compromise, the stack is used when there are no more than
1733 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1734
1735 There are also other values that have to be saved. We use a chained
1736 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1737 for the original version of this logic. It has, however, been hacked around
1738 a lot, so he is not to blame for the current way it works. */
1739
1740 case OP_RECURSE:
1741 {
1742 recursion_info *ri;
1743 unsigned int recno;
1744
1745 callpat = md->start_code + GET(ecode, 1);
1746 recno = (callpat == md->start_code)? 0 :
1747 GET2(callpat, 1 + LINK_SIZE);
1748
1749 /* Check for repeating a recursion without advancing the subject pointer.
1750 This should catch convoluted mutual recursions. (Some simple cases are
1751 caught at compile time.) */
1752
1753 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1754 if (recno == ri->group_num && eptr == ri->subject_position)
1755 RRETURN(PCRE_ERROR_RECURSELOOP);
1756
1757 /* Add to "recursing stack" */
1758
1759 new_recursive.group_num = recno;
1760 new_recursive.saved_capture_last = md->capture_last;
1761 new_recursive.subject_position = eptr;
1762 new_recursive.prevrec = md->recursive;
1763 md->recursive = &new_recursive;
1764
1765 /* Where to continue from afterwards */
1766
1767 ecode += 1 + LINK_SIZE;
1768
1769 /* Now save the offset data */
1770
1771 new_recursive.saved_max = md->offset_end;
1772 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1773 new_recursive.offset_save = stacksave;
1774 else
1775 {
1776 new_recursive.offset_save =
1777 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1778 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1779 }
1780 memcpy(new_recursive.offset_save, md->offset_vector,
1781 new_recursive.saved_max * sizeof(int));
1782
1783 /* OK, now we can do the recursion. After processing each alternative,
1784 restore the offset data and the last captured value. If there were nested
1785 recursions, md->recursive might be changed, so reset it before looping.
1786 */
1787
1788 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1789 cbegroup = (*callpat >= OP_SBRA);
1790 do
1791 {
1792 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1793 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1794 md, eptrb, RM6);
1795 memcpy(md->offset_vector, new_recursive.offset_save,
1796 new_recursive.saved_max * sizeof(int));
1797 md->capture_last = new_recursive.saved_capture_last;
1798 md->recursive = new_recursive.prevrec;
1799 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1800 {
1801 DPRINTF(("Recursion matched\n"));
1802 if (new_recursive.offset_save != stacksave)
1803 (PUBL(free))(new_recursive.offset_save);
1804
1805 /* Set where we got to in the subject, and reset the start in case
1806 it was changed by \K. This *is* propagated back out of a recursion,
1807 for Perl compatibility. */
1808
1809 eptr = md->end_match_ptr;
1810 mstart = md->start_match_ptr;
1811 goto RECURSION_MATCHED; /* Exit loop; end processing */
1812 }
1813
1814 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1815 recursion; they cause a NOMATCH for the entire recursion. These codes
1816 are defined in a range that can be tested for. */
1817
1818 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1819 RRETURN(MATCH_NOMATCH);
1820
1821 /* Any return code other than NOMATCH is an error. */
1822
1823 if (rrc != MATCH_NOMATCH)
1824 {
1825 DPRINTF(("Recursion gave error %d\n", rrc));
1826 if (new_recursive.offset_save != stacksave)
1827 (PUBL(free))(new_recursive.offset_save);
1828 RRETURN(rrc);
1829 }
1830
1831 md->recursive = &new_recursive;
1832 callpat += GET(callpat, 1);
1833 }
1834 while (*callpat == OP_ALT);
1835
1836 DPRINTF(("Recursion didn't match\n"));
1837 md->recursive = new_recursive.prevrec;
1838 if (new_recursive.offset_save != stacksave)
1839 (PUBL(free))(new_recursive.offset_save);
1840 RRETURN(MATCH_NOMATCH);
1841 }
1842
1843 RECURSION_MATCHED:
1844 break;
1845
1846 /* An alternation is the end of a branch; scan along to find the end of the
1847 bracketed group and go to there. */
1848
1849 case OP_ALT:
1850 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1851 break;
1852
1853 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1854 indicating that it may occur zero times. It may repeat infinitely, or not
1855 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1856 with fixed upper repeat limits are compiled as a number of copies, with the
1857 optional ones preceded by BRAZERO or BRAMINZERO. */
1858
1859 case OP_BRAZERO:
1860 next = ecode + 1;
1861 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1863 do next += GET(next, 1); while (*next == OP_ALT);
1864 ecode = next + 1 + LINK_SIZE;
1865 break;
1866
1867 case OP_BRAMINZERO:
1868 next = ecode + 1;
1869 do next += GET(next, 1); while (*next == OP_ALT);
1870 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1872 ecode++;
1873 break;
1874
1875 case OP_SKIPZERO:
1876 next = ecode+1;
1877 do next += GET(next,1); while (*next == OP_ALT);
1878 ecode = next + 1 + LINK_SIZE;
1879 break;
1880
1881 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1882 here; just jump to the group, with allow_zero set TRUE. */
1883
1884 case OP_BRAPOSZERO:
1885 op = *(++ecode);
1886 allow_zero = TRUE;
1887 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1888 goto POSSESSIVE_NON_CAPTURE;
1889
1890 /* End of a group, repeated or non-repeating. */
1891
1892 case OP_KET:
1893 case OP_KETRMIN:
1894 case OP_KETRMAX:
1895 case OP_KETRPOS:
1896 prev = ecode - GET(ecode, 1);
1897
1898 /* If this was a group that remembered the subject start, in order to break
1899 infinite repeats of empty string matches, retrieve the subject start from
1900 the chain. Otherwise, set it NULL. */
1901
1902 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1903 {
1904 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1905 eptrb = eptrb->epb_prev; /* Backup to previous group */
1906 }
1907 else saved_eptr = NULL;
1908
1909 /* If we are at the end of an assertion group or a non-capturing atomic
1910 group, stop matching and return MATCH_MATCH, but record the current high
1911 water mark for use by positive assertions. We also need to record the match
1912 start in case it was changed by \K. */
1913
1914 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1915 *prev == OP_ONCE_NC)
1916 {
1917 md->end_match_ptr = eptr; /* For ONCE_NC */
1918 md->end_offset_top = offset_top;
1919 md->start_match_ptr = mstart;
1920 RRETURN(MATCH_MATCH); /* Sets md->mark */
1921 }
1922
1923 /* For capturing groups we have to check the group number back at the start
1924 and if necessary complete handling an extraction by setting the offsets and
1925 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1926 into group 0, so it won't be picked up here. Instead, we catch it when the
1927 OP_END is reached. Other recursion is handled here. We just have to record
1928 the current subject position and start match pointer and give a MATCH
1929 return. */
1930
1931 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1932 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1933 {
1934 number = GET2(prev, 1+LINK_SIZE);
1935 offset = number << 1;
1936
1937 #ifdef PCRE_DEBUG
1938 printf("end bracket %d", number);
1939 printf("\n");
1940 #endif
1941
1942 /* Handle a recursively called group. */
1943
1944 if (md->recursive != NULL && md->recursive->group_num == number)
1945 {
1946 md->end_match_ptr = eptr;
1947 md->start_match_ptr = mstart;
1948 RRETURN(MATCH_MATCH);
1949 }
1950
1951 /* Deal with capturing */
1952
1953 md->capture_last = (md->capture_last & OVFLMASK) | number;
1954 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1955 {
1956 /* If offset is greater than offset_top, it means that we are
1957 "skipping" a capturing group, and that group's offsets must be marked
1958 unset. In earlier versions of PCRE, all the offsets were unset at the
1959 start of matching, but this doesn't work because atomic groups and
1960 assertions can cause a value to be set that should later be unset.
1961 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1962 part of the atomic group, but this is not on the final matching path,
1963 so must be unset when 2 is set. (If there is no group 2, there is no
1964 problem, because offset_top will then be 2, indicating no capture.) */
1965
1966 if (offset > offset_top)
1967 {
1968 register int *iptr = md->offset_vector + offset_top;
1969 register int *iend = md->offset_vector + offset;
1970 while (iptr < iend) *iptr++ = -1;
1971 }
1972
1973 /* Now make the extraction */
1974
1975 md->offset_vector[offset] =
1976 md->offset_vector[md->offset_end - number];
1977 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1978 if (offset_top <= offset) offset_top = offset + 2;
1979 }
1980 }
1981
1982 /* For an ordinary non-repeating ket, just continue at this level. This
1983 also happens for a repeating ket if no characters were matched in the
1984 group. This is the forcible breaking of infinite loops as implemented in
1985 Perl 5.005. For a non-repeating atomic group that includes captures,
1986 establish a backup point by processing the rest of the pattern at a lower
1987 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1988 original OP_ONCE level, thereby bypassing intermediate backup points, but
1989 resetting any captures that happened along the way. */
1990
1991 if (*ecode == OP_KET || eptr == saved_eptr)
1992 {
1993 if (*prev == OP_ONCE)
1994 {
1995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1998 RRETURN(MATCH_ONCE);
1999 }
2000 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2001 break;
2002 }
2003
2004 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2005 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2006 at a time from the outer level, thus saving stack. */
2007
2008 if (*ecode == OP_KETRPOS)
2009 {
2010 md->end_match_ptr = eptr;
2011 md->end_offset_top = offset_top;
2012 RRETURN(MATCH_KETRPOS);
2013 }
2014
2015 /* The normal repeating kets try the rest of the pattern or restart from
2016 the preceding bracket, in the appropriate order. In the second case, we can
2017 use tail recursion to avoid using another stack frame, unless we have an
2018 atomic group or an unlimited repeat of a group that can match an empty
2019 string. */
2020
2021 if (*ecode == OP_KETRMIN)
2022 {
2023 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2025 if (*prev == OP_ONCE)
2026 {
2027 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2029 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2030 RRETURN(MATCH_ONCE);
2031 }
2032 if (*prev >= OP_SBRA) /* Could match an empty string */
2033 {
2034 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2035 RRETURN(rrc);
2036 }
2037 ecode = prev;
2038 goto TAIL_RECURSE;
2039 }
2040 else /* OP_KETRMAX */
2041 {
2042 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2043 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2045 if (*prev == OP_ONCE)
2046 {
2047 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2049 md->once_target = prev;
2050 RRETURN(MATCH_ONCE);
2051 }
2052 ecode += 1 + LINK_SIZE;
2053 goto TAIL_RECURSE;
2054 }
2055 /* Control never gets here */
2056
2057 /* Not multiline mode: start of subject assertion, unless notbol. */
2058
2059 case OP_CIRC:
2060 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2061
2062 /* Start of subject assertion */
2063
2064 case OP_SOD:
2065 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2066 ecode++;
2067 break;
2068
2069 /* Multiline mode: start of subject unless notbol, or after any newline. */
2070
2071 case OP_CIRCM:
2072 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2073 if (eptr != md->start_subject &&
2074 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2075 RRETURN(MATCH_NOMATCH);
2076 ecode++;
2077 break;
2078
2079 /* Start of match assertion */
2080
2081 case OP_SOM:
2082 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2083 ecode++;
2084 break;
2085
2086 /* Reset the start of match point */
2087
2088 case OP_SET_SOM:
2089 mstart = eptr;
2090 ecode++;
2091 break;
2092
2093 /* Multiline mode: assert before any newline, or before end of subject
2094 unless noteol is set. */
2095
2096 case OP_DOLLM:
2097 if (eptr < md->end_subject)
2098 {
2099 if (!IS_NEWLINE(eptr))
2100 {
2101 if (md->partial != 0 &&
2102 eptr + 1 >= md->end_subject &&
2103 NLBLOCK->nltype == NLTYPE_FIXED &&
2104 NLBLOCK->nllen == 2 &&
2105 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2106 {
2107 md->hitend = TRUE;
2108 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2109 }
2110 RRETURN(MATCH_NOMATCH);
2111 }
2112 }
2113 else
2114 {
2115 if (md->noteol) RRETURN(MATCH_NOMATCH);
2116 SCHECK_PARTIAL();
2117 }
2118 ecode++;
2119 break;
2120
2121 /* Not multiline mode: assert before a terminating newline or before end of
2122 subject unless noteol is set. */
2123
2124 case OP_DOLL:
2125 if (md->noteol) RRETURN(MATCH_NOMATCH);
2126 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2127
2128 /* ... else fall through for endonly */
2129
2130 /* End of subject assertion (\z) */
2131
2132 case OP_EOD:
2133 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2134 SCHECK_PARTIAL();
2135 ecode++;
2136 break;
2137
2138 /* End of subject or ending \n assertion (\Z) */
2139
2140 case OP_EODN:
2141 ASSERT_NL_OR_EOS:
2142 if (eptr < md->end_subject &&
2143 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2144 {
2145 if (md->partial != 0 &&
2146 eptr + 1 >= md->end_subject &&
2147 NLBLOCK->nltype == NLTYPE_FIXED &&
2148 NLBLOCK->nllen == 2 &&
2149 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2150 {
2151 md->hitend = TRUE;
2152 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2153 }
2154 RRETURN(MATCH_NOMATCH);
2155 }
2156
2157 /* Either at end of string or \n before end. */
2158
2159 SCHECK_PARTIAL();
2160 ecode++;
2161 break;
2162
2163 /* Word boundary assertions */
2164
2165 case OP_NOT_WORD_BOUNDARY:
2166 case OP_WORD_BOUNDARY:
2167 {
2168
2169 /* Find out if the previous and current characters are "word" characters.
2170 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2171 be "non-word" characters. Remember the earliest consulted character for
2172 partial matching. */
2173
2174 #ifdef SUPPORT_UTF
2175 if (utf)
2176 {
2177 /* Get status of previous character */
2178
2179 if (eptr == md->start_subject) prev_is_word = FALSE; else
2180 {
2181 PCRE_PUCHAR lastptr = eptr - 1;
2182 BACKCHAR(lastptr);
2183 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2184 GETCHAR(c, lastptr);
2185 #ifdef SUPPORT_UCP
2186 if (md->use_ucp)
2187 {
2188 if (c == '_') prev_is_word = TRUE; else
2189 {
2190 int cat = UCD_CATEGORY(c);
2191 prev_is_word = (cat == ucp_L || cat == ucp_N);
2192 }
2193 }
2194 else
2195 #endif
2196 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2197 }
2198
2199 /* Get status of next character */
2200
2201 if (eptr >= md->end_subject)
2202 {
2203 SCHECK_PARTIAL();
2204 cur_is_word = FALSE;
2205 }
2206 else
2207 {
2208 GETCHAR(c, eptr);
2209 #ifdef SUPPORT_UCP
2210 if (md->use_ucp)
2211 {
2212 if (c == '_') cur_is_word = TRUE; else
2213 {
2214 int cat = UCD_CATEGORY(c);
2215 cur_is_word = (cat == ucp_L || cat == ucp_N);
2216 }
2217 }
2218 else
2219 #endif
2220 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2221 }
2222 }
2223 else
2224 #endif
2225
2226 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2227 consistency with the behaviour of \w we do use it in this case. */
2228
2229 {
2230 /* Get status of previous character */
2231
2232 if (eptr == md->start_subject) prev_is_word = FALSE; else
2233 {
2234 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2235 #ifdef SUPPORT_UCP
2236 if (md->use_ucp)
2237 {
2238 c = eptr[-1];
2239 if (c == '_') prev_is_word = TRUE; else
2240 {
2241 int cat = UCD_CATEGORY(c);
2242 prev_is_word = (cat == ucp_L || cat == ucp_N);
2243 }
2244 }
2245 else
2246 #endif
2247 prev_is_word = MAX_255(eptr[-1])
2248 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2249 }
2250
2251 /* Get status of next character */
2252
2253 if (eptr >= md->end_subject)
2254 {
2255 SCHECK_PARTIAL();
2256 cur_is_word = FALSE;
2257 }
2258 else
2259 #ifdef SUPPORT_UCP
2260 if (md->use_ucp)
2261 {
2262 c = *eptr;
2263 if (c == '_') cur_is_word = TRUE; else
2264 {
2265 int cat = UCD_CATEGORY(c);
2266 cur_is_word = (cat == ucp_L || cat == ucp_N);
2267 }
2268 }
2269 else
2270 #endif
2271 cur_is_word = MAX_255(*eptr)
2272 && ((md->ctypes[*eptr] & ctype_word) != 0);
2273 }
2274
2275 /* Now see if the situation is what we want */
2276
2277 if ((*ecode++ == OP_WORD_BOUNDARY)?
2278 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2279 RRETURN(MATCH_NOMATCH);
2280 }
2281 break;
2282
2283 /* Match any single character type except newline; have to take care with
2284 CRLF newlines and partial matching. */
2285
2286 case OP_ANY:
2287 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2288 if (md->partial != 0 &&
2289 eptr + 1 >= md->end_subject &&
2290 NLBLOCK->nltype == NLTYPE_FIXED &&
2291 NLBLOCK->nllen == 2 &&
2292 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2293 {
2294 md->hitend = TRUE;
2295 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2296 }
2297
2298 /* Fall through */
2299
2300 /* Match any single character whatsoever. */
2301
2302 case OP_ALLANY:
2303 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2304 { /* not be updated before SCHECK_PARTIAL. */
2305 SCHECK_PARTIAL();
2306 RRETURN(MATCH_NOMATCH);
2307 }
2308 eptr++;
2309 #ifdef SUPPORT_UTF
2310 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2311 #endif
2312 ecode++;
2313 break;
2314
2315 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2316 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2317
2318 case OP_ANYBYTE:
2319 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2320 { /* not be updated before SCHECK_PARTIAL. */
2321 SCHECK_PARTIAL();
2322 RRETURN(MATCH_NOMATCH);
2323 }
2324 eptr++;
2325 ecode++;
2326 break;
2327
2328 case OP_NOT_DIGIT:
2329 if (eptr >= md->end_subject)
2330 {
2331 SCHECK_PARTIAL();
2332 RRETURN(MATCH_NOMATCH);
2333 }
2334 GETCHARINCTEST(c, eptr);
2335 if (
2336 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2337 c < 256 &&
2338 #endif
2339 (md->ctypes[c] & ctype_digit) != 0
2340 )
2341 RRETURN(MATCH_NOMATCH);
2342 ecode++;
2343 break;
2344
2345 case OP_DIGIT:
2346 if (eptr >= md->end_subject)
2347 {
2348 SCHECK_PARTIAL();
2349 RRETURN(MATCH_NOMATCH);
2350 }
2351 GETCHARINCTEST(c, eptr);
2352 if (
2353 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2354 c > 255 ||
2355 #endif
2356 (md->ctypes[c] & ctype_digit) == 0
2357 )
2358 RRETURN(MATCH_NOMATCH);
2359 ecode++;
2360 break;
2361
2362 case OP_NOT_WHITESPACE:
2363 if (eptr >= md->end_subject)
2364 {
2365 SCHECK_PARTIAL();
2366 RRETURN(MATCH_NOMATCH);
2367 }
2368 GETCHARINCTEST(c, eptr);
2369 if (
2370 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2371 c < 256 &&
2372 #endif
2373 (md->ctypes[c] & ctype_space) != 0
2374 )
2375 RRETURN(MATCH_NOMATCH);
2376 ecode++;
2377 break;
2378
2379 case OP_WHITESPACE:
2380 if (eptr >= md->end_subject)
2381 {
2382 SCHECK_PARTIAL();
2383 RRETURN(MATCH_NOMATCH);
2384 }
2385 GETCHARINCTEST(c, eptr);
2386 if (
2387 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2388 c > 255 ||
2389 #endif
2390 (md->ctypes[c] & ctype_space) == 0
2391 )
2392 RRETURN(MATCH_NOMATCH);
2393 ecode++;
2394 break;
2395
2396 case OP_NOT_WORDCHAR:
2397 if (eptr >= md->end_subject)
2398 {
2399 SCHECK_PARTIAL();
2400 RRETURN(MATCH_NOMATCH);
2401 }
2402 GETCHARINCTEST(c, eptr);
2403 if (
2404 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2405 c < 256 &&
2406 #endif
2407 (md->ctypes[c] & ctype_word) != 0
2408 )
2409 RRETURN(MATCH_NOMATCH);
2410 ecode++;
2411 break;
2412
2413 case OP_WORDCHAR:
2414 if (eptr >= md->end_subject)
2415 {
2416 SCHECK_PARTIAL();
2417 RRETURN(MATCH_NOMATCH);
2418 }
2419 GETCHARINCTEST(c, eptr);
2420 if (
2421 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2422 c > 255 ||
2423 #endif
2424 (md->ctypes[c] & ctype_word) == 0
2425 )
2426 RRETURN(MATCH_NOMATCH);
2427 ecode++;
2428 break;
2429
2430 case OP_ANYNL:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 RRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 switch(c)
2438 {
2439 default: RRETURN(MATCH_NOMATCH);
2440
2441 case CHAR_CR:
2442 if (eptr >= md->end_subject)
2443 {
2444 SCHECK_PARTIAL();
2445 }
2446 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2447 break;
2448
2449 case CHAR_LF:
2450 break;
2451
2452 case CHAR_VT:
2453 case CHAR_FF:
2454 case CHAR_NEL:
2455 #ifndef EBCDIC
2456 case 0x2028:
2457 case 0x2029:
2458 #endif /* Not EBCDIC */
2459 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2460 break;
2461 }
2462 ecode++;
2463 break;
2464
2465 case OP_NOT_HSPACE:
2466 if (eptr >= md->end_subject)
2467 {
2468 SCHECK_PARTIAL();
2469 RRETURN(MATCH_NOMATCH);
2470 }
2471 GETCHARINCTEST(c, eptr);
2472 switch(c)
2473 {
2474 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2475 default: break;
2476 }
2477 ecode++;
2478 break;
2479
2480 case OP_HSPACE:
2481 if (eptr >= md->end_subject)
2482 {
2483 SCHECK_PARTIAL();
2484 RRETURN(MATCH_NOMATCH);
2485 }
2486 GETCHARINCTEST(c, eptr);
2487 switch(c)
2488 {
2489 HSPACE_CASES: break; /* Byte and multibyte cases */
2490 default: RRETURN(MATCH_NOMATCH);
2491 }
2492 ecode++;
2493 break;
2494
2495 case OP_NOT_VSPACE:
2496 if (eptr >= md->end_subject)
2497 {
2498 SCHECK_PARTIAL();
2499 RRETURN(MATCH_NOMATCH);
2500 }
2501 GETCHARINCTEST(c, eptr);
2502 switch(c)
2503 {
2504 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2505 default: break;
2506 }
2507 ecode++;
2508 break;
2509
2510 case OP_VSPACE:
2511 if (eptr >= md->end_subject)
2512 {
2513 SCHECK_PARTIAL();
2514 RRETURN(MATCH_NOMATCH);
2515 }
2516 GETCHARINCTEST(c, eptr);
2517 switch(c)
2518 {
2519 VSPACE_CASES: break;
2520 default: RRETURN(MATCH_NOMATCH);
2521 }
2522 ecode++;
2523 break;
2524
2525 #ifdef SUPPORT_UCP
2526 /* Check the next character by Unicode property. We will get here only
2527 if the support is in the binary; otherwise a compile-time error occurs. */
2528
2529 case OP_PROP:
2530 case OP_NOTPROP:
2531 if (eptr >= md->end_subject)
2532 {
2533 SCHECK_PARTIAL();
2534 RRETURN(MATCH_NOMATCH);
2535 }
2536 GETCHARINCTEST(c, eptr);
2537 {
2538 const pcre_uint32 *cp;
2539 const ucd_record *prop = GET_UCD(c);
2540
2541 switch(ecode[1])
2542 {
2543 case PT_ANY:
2544 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2545 break;
2546
2547 case PT_LAMP:
2548 if ((prop->chartype == ucp_Lu ||
2549 prop->chartype == ucp_Ll ||
2550 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2551 RRETURN(MATCH_NOMATCH);
2552 break;
2553
2554 case PT_GC:
2555 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2556 RRETURN(MATCH_NOMATCH);
2557 break;
2558
2559 case PT_PC:
2560 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2561 RRETURN(MATCH_NOMATCH);
2562 break;
2563
2564 case PT_SC:
2565 if ((ecode[2] != prop->script) == (op == OP_PROP))
2566 RRETURN(MATCH_NOMATCH);
2567 break;
2568
2569 /* These are specials */
2570
2571 case PT_ALNUM:
2572 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2573 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2574 RRETURN(MATCH_NOMATCH);
2575 break;
2576
2577 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2578 which means that Perl space and POSIX space are now identical. PCRE
2579 was changed at release 8.34. */
2580
2581 case PT_SPACE: /* Perl space */
2582 case PT_PXSPACE: /* POSIX space */
2583 switch(c)
2584 {
2585 HSPACE_CASES:
2586 VSPACE_CASES:
2587 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 default:
2591 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2592 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2593 break;
2594 }
2595 break;
2596
2597 case PT_WORD:
2598 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2599 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2600 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2601 RRETURN(MATCH_NOMATCH);
2602 break;
2603
2604 case PT_CLIST:
2605 cp = PRIV(ucd_caseless_sets) + ecode[2];
2606 for (;;)
2607 {
2608 if (c < *cp)
2609 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2610 if (c == *cp++)
2611 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2612 }
2613 break;
2614
2615 case PT_UCNC:
2616 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2617 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2618 c >= 0xe000) == (op == OP_NOTPROP))
2619 RRETURN(MATCH_NOMATCH);
2620 break;
2621
2622 /* This should never occur */
2623
2624 default:
2625 RRETURN(PCRE_ERROR_INTERNAL);
2626 }
2627
2628 ecode += 3;
2629 }
2630 break;
2631
2632 /* Match an extended Unicode sequence. We will get here only if the support
2633 is in the binary; otherwise a compile-time error occurs. */
2634
2635 case OP_EXTUNI:
2636 if (eptr >= md->end_subject)
2637 {
2638 SCHECK_PARTIAL();
2639 RRETURN(MATCH_NOMATCH);
2640 }
2641 else
2642 {
2643 int lgb, rgb;
2644 GETCHARINCTEST(c, eptr);
2645 lgb = UCD_GRAPHBREAK(c);
2646 while (eptr < md->end_subject)
2647 {
2648 int len = 1;
2649 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2650 rgb = UCD_GRAPHBREAK(c);
2651 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2652 lgb = rgb;
2653 eptr += len;
2654 }
2655 }
2656 CHECK_PARTIAL();
2657 ecode++;
2658 break;
2659 #endif /* SUPPORT_UCP */
2660
2661
2662 /* Match a back reference, possibly repeatedly. Look past the end of the
2663 item to see if there is repeat information following. The code is similar
2664 to that for character classes, but repeated for efficiency. Then obey
2665 similar code to character type repeats - written out again for speed.
2666 However, if the referenced string is the empty string, always treat
2667 it as matched, any number of times (otherwise there could be infinite
2668 loops). If the reference is unset, there are two possibilities:
2669
2670 (a) In the default, Perl-compatible state, set the length negative;
2671 this ensures that every attempt at a match fails. We can't just fail
2672 here, because of the possibility of quantifiers with zero minima.
2673
2674 (b) If the JavaScript compatibility flag is set, set the length to zero
2675 so that the back reference matches an empty string.
2676
2677 Otherwise, set the length to the length of what was matched by the
2678 referenced subpattern.
2679
2680 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2681 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2682 and OP_DNREFI are used. In this case we must scan the list of groups to
2683 which the name refers, and use the first one that is set. */
2684
2685 case OP_DNREF:
2686 case OP_DNREFI:
2687 caseless = op == OP_DNREFI;
2688 {
2689 int count = GET2(ecode, 1+IMM2_SIZE);
2690 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2691 ecode += 1 + 2*IMM2_SIZE;
2692
2693 while (count-- > 0)
2694 {
2695 offset = GET2(slot, 0) << 1;
2696 if (offset < offset_top && md->offset_vector[offset] >= 0) break;
2697 slot += md->name_entry_size;
2698 }
2699 if (count < 0)
2700 length = (md->jscript_compat)? 0 : -1;
2701 else
2702 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2703 }
2704 goto REF_REPEAT;
2705
2706 case OP_REF:
2707 case OP_REFI:
2708 caseless = op == OP_REFI;
2709 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2710 ecode += 1 + IMM2_SIZE;
2711 if (offset >= offset_top || md->offset_vector[offset] < 0)
2712 length = (md->jscript_compat)? 0 : -1;
2713 else
2714 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2715
2716 /* Set up for repetition, or handle the non-repeated case */
2717
2718 REF_REPEAT:
2719 switch (*ecode)
2720 {
2721 case OP_CRSTAR:
2722 case OP_CRMINSTAR:
2723 case OP_CRPLUS:
2724 case OP_CRMINPLUS:
2725 case OP_CRQUERY:
2726 case OP_CRMINQUERY:
2727 c = *ecode++ - OP_CRSTAR;
2728 minimize = (c & 1) != 0;
2729 min = rep_min[c]; /* Pick up values from tables; */
2730 max = rep_max[c]; /* zero for max => infinity */
2731 if (max == 0) max = INT_MAX;
2732 break;
2733
2734 case OP_CRRANGE:
2735 case OP_CRMINRANGE:
2736 minimize = (*ecode == OP_CRMINRANGE);
2737 min = GET2(ecode, 1);
2738 max = GET2(ecode, 1 + IMM2_SIZE);
2739 if (max == 0) max = INT_MAX;
2740 ecode += 1 + 2 * IMM2_SIZE;
2741 break;
2742
2743 default: /* No repeat follows */
2744 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2745 {
2746 if (length == -2) eptr = md->end_subject; /* Partial match */
2747 CHECK_PARTIAL();
2748 RRETURN(MATCH_NOMATCH);
2749 }
2750 eptr += length;
2751 continue; /* With the main loop */
2752 }
2753
2754 /* Handle repeated back references. If the length of the reference is
2755 zero, just continue with the main loop. If the length is negative, it
2756 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2757 zero, we can continue at the same level without recursion. For any other
2758 minimum, carrying on will result in NOMATCH. */
2759
2760 if (length == 0) continue;
2761 if (length < 0 && min == 0) continue;
2762
2763 /* First, ensure the minimum number of matches are present. We get back
2764 the length of the reference string explicitly rather than passing the
2765 address of eptr, so that eptr can be a register variable. */
2766
2767 for (i = 1; i <= min; i++)
2768 {
2769 int slength;
2770 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2771 {
2772 if (slength == -2) eptr = md->end_subject; /* Partial match */
2773 CHECK_PARTIAL();
2774 RRETURN(MATCH_NOMATCH);
2775 }
2776 eptr += slength;
2777 }
2778
2779 /* If min = max, continue at the same level without recursion.
2780 They are not both allowed to be zero. */
2781
2782 if (min == max) continue;
2783
2784 /* If minimizing, keep trying and advancing the pointer */
2785
2786 if (minimize)
2787 {
2788 for (fi = min;; fi++)
2789 {
2790 int slength;
2791 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2793 if (fi >= max) RRETURN(MATCH_NOMATCH);
2794 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2795 {
2796 if (slength == -2) eptr = md->end_subject; /* Partial match */
2797 CHECK_PARTIAL();
2798 RRETURN(MATCH_NOMATCH);
2799 }
2800 eptr += slength;
2801 }
2802 /* Control never gets here */
2803 }
2804
2805 /* If maximizing, find the longest string and work backwards */
2806
2807 else
2808 {
2809 pp = eptr;
2810 for (i = min; i < max; i++)
2811 {
2812 int slength;
2813 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2814 {
2815 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2816 the soft partial matching case. */
2817
2818 if (slength == -2 && md->partial != 0 &&
2819 md->end_subject > md->start_used_ptr)
2820 {
2821 md->hitend = TRUE;
2822 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2823 }
2824 break;
2825 }
2826 eptr += slength;
2827 }
2828
2829 while (eptr >= pp)
2830 {
2831 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2833 eptr -= length;
2834 }
2835 RRETURN(MATCH_NOMATCH);
2836 }
2837 /* Control never gets here */
2838
2839 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2840 used when all the characters in the class have values in the range 0-255,
2841 and either the matching is caseful, or the characters are in the range
2842 0-127 when UTF-8 processing is enabled. The only difference between
2843 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2844 encountered.
2845
2846 First, look past the end of the item to see if there is repeat information
2847 following. Then obey similar code to character type repeats - written out
2848 again for speed. */
2849
2850 case OP_NCLASS:
2851 case OP_CLASS:
2852 {
2853 /* The data variable is saved across frames, so the byte map needs to
2854 be stored there. */
2855 #define BYTE_MAP ((pcre_uint8 *)data)
2856 data = ecode + 1; /* Save for matching */
2857 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2858
2859 switch (*ecode)
2860 {
2861 case OP_CRSTAR:
2862 case OP_CRMINSTAR:
2863 case OP_CRPLUS:
2864 case OP_CRMINPLUS:
2865 case OP_CRQUERY:
2866 case OP_CRMINQUERY:
2867 case OP_CRPOSSTAR:
2868 case OP_CRPOSPLUS:
2869 case OP_CRPOSQUERY:
2870 c = *ecode++ - OP_CRSTAR;
2871 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2872 else possessive = TRUE;
2873 min = rep_min[c]; /* Pick up values from tables; */
2874 max = rep_max[c]; /* zero for max => infinity */
2875 if (max == 0) max = INT_MAX;
2876 break;
2877
2878 case OP_CRRANGE:
2879 case OP_CRMINRANGE:
2880 case OP_CRPOSRANGE:
2881 minimize = (*ecode == OP_CRMINRANGE);
2882 possessive = (*ecode == OP_CRPOSRANGE);
2883 min = GET2(ecode, 1);
2884 max = GET2(ecode, 1 + IMM2_SIZE);
2885 if (max == 0) max = INT_MAX;
2886 ecode += 1 + 2 * IMM2_SIZE;
2887 break;
2888
2889 default: /* No repeat follows */
2890 min = max = 1;
2891 break;
2892 }
2893
2894 /* First, ensure the minimum number of matches are present. */
2895
2896 #ifdef SUPPORT_UTF
2897 if (utf)
2898 {
2899 for (i = 1; i <= min; i++)
2900 {
2901 if (eptr >= md->end_subject)
2902 {
2903 SCHECK_PARTIAL();
2904 RRETURN(MATCH_NOMATCH);
2905 }
2906 GETCHARINC(c, eptr);
2907 if (c > 255)
2908 {
2909 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2910 }
2911 else
2912 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2913 }
2914 }
2915 else
2916 #endif
2917 /* Not UTF mode */
2918 {
2919 for (i = 1; i <= min; i++)
2920 {
2921 if (eptr >= md->end_subject)
2922 {
2923 SCHECK_PARTIAL();
2924 RRETURN(MATCH_NOMATCH);
2925 }
2926 c = *eptr++;
2927 #ifndef COMPILE_PCRE8
2928 if (c > 255)
2929 {
2930 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2931 }
2932 else
2933 #endif
2934 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2935 }
2936 }
2937
2938 /* If max == min we can continue with the main loop without the
2939 need to recurse. */
2940
2941 if (min == max) continue;
2942
2943 /* If minimizing, keep testing the rest of the expression and advancing
2944 the pointer while it matches the class. */
2945
2946 if (minimize)
2947 {
2948 #ifdef SUPPORT_UTF
2949 if (utf)
2950 {
2951 for (fi = min;; fi++)
2952 {
2953 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2955 if (fi >= max) RRETURN(MATCH_NOMATCH);
2956 if (eptr >= md->end_subject)
2957 {
2958 SCHECK_PARTIAL();
2959 RRETURN(MATCH_NOMATCH);
2960 }
2961 GETCHARINC(c, eptr);
2962 if (c > 255)
2963 {
2964 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2965 }
2966 else
2967 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2968 }
2969 }
2970 else
2971 #endif
2972 /* Not UTF mode */
2973 {
2974 for (fi = min;; fi++)
2975 {
2976 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2978 if (fi >= max) RRETURN(MATCH_NOMATCH);
2979 if (eptr >= md->end_subject)
2980 {
2981 SCHECK_PARTIAL();
2982 RRETURN(MATCH_NOMATCH);
2983 }
2984 c = *eptr++;
2985 #ifndef COMPILE_PCRE8
2986 if (c > 255)
2987 {
2988 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2989 }
2990 else
2991 #endif
2992 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2993 }
2994 }
2995 /* Control never gets here */
2996 }
2997
2998 /* If maximizing, find the longest possible run, then work backwards. */
2999
3000 else
3001 {
3002 pp = eptr;
3003
3004 #ifdef SUPPORT_UTF
3005 if (utf)
3006 {
3007 for (i = min; i < max; i++)
3008 {
3009 int len = 1;
3010 if (eptr >= md->end_subject)
3011 {
3012 SCHECK_PARTIAL();
3013 break;
3014 }
3015 GETCHARLEN(c, eptr, len);
3016 if (c > 255)
3017 {
3018 if (op == OP_CLASS) break;
3019 }
3020 else
3021 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3022 eptr += len;
3023 }
3024
3025 if (possessive) continue; /* No backtracking */
3026
3027 for (;;)
3028 {
3029 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3030 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3031 if (eptr-- == pp) break; /* Stop if tried at original pos */
3032 BACKCHAR(eptr);
3033 }
3034 }
3035 else
3036 #endif
3037 /* Not UTF mode */
3038 {
3039 for (i = min; i < max; i++)
3040 {
3041 if (eptr >= md->end_subject)
3042 {
3043 SCHECK_PARTIAL();
3044 break;
3045 }
3046 c = *eptr;
3047 #ifndef COMPILE_PCRE8
3048 if (c > 255)
3049 {
3050 if (op == OP_CLASS) break;
3051 }
3052 else
3053 #endif
3054 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3055 eptr++;
3056 }
3057
3058 if (possessive) continue; /* No backtracking */
3059
3060 while (eptr >= pp)
3061 {
3062 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3063 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3064 eptr--;
3065 }
3066 }
3067
3068 RRETURN(MATCH_NOMATCH);
3069 }
3070 #undef BYTE_MAP
3071 }
3072 /* Control never gets here */
3073
3074
3075 /* Match an extended character class. This opcode is encountered only
3076 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3077 mode, because Unicode properties are supported in non-UTF-8 mode. */
3078
3079 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3080 case OP_XCLASS:
3081 {
3082 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3083 ecode += GET(ecode, 1); /* Advance past the item */
3084
3085 switch (*ecode)
3086 {
3087 case OP_CRSTAR:
3088 case OP_CRMINSTAR:
3089 case OP_CRPLUS:
3090 case OP_CRMINPLUS:
3091 case OP_CRQUERY:
3092 case OP_CRMINQUERY:
3093 case OP_CRPOSSTAR:
3094 case OP_CRPOSPLUS:
3095 case OP_CRPOSQUERY:
3096 c = *ecode++ - OP_CRSTAR;
3097 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3098 else possessive = TRUE;
3099 min = rep_min[c]; /* Pick up values from tables; */
3100 max = rep_max[c]; /* zero for max => infinity */
3101 if (max == 0) max = INT_MAX;
3102 break;
3103
3104 case OP_CRRANGE:
3105 case OP_CRMINRANGE:
3106 case OP_CRPOSRANGE:
3107 minimize = (*ecode == OP_CRMINRANGE);
3108 possessive = (*ecode == OP_CRPOSRANGE);
3109 min = GET2(ecode, 1);
3110 max = GET2(ecode, 1 + IMM2_SIZE);
3111 if (max == 0) max = INT_MAX;
3112 ecode += 1 + 2 * IMM2_SIZE;
3113 break;
3114
3115 default: /* No repeat follows */
3116 min = max = 1;
3117 break;
3118 }
3119
3120 /* First, ensure the minimum number of matches are present. */
3121
3122 for (i = 1; i <= min; i++)
3123 {
3124 if (eptr >= md->end_subject)
3125 {
3126 SCHECK_PARTIAL();
3127 RRETURN(MATCH_NOMATCH);
3128 }
3129 GETCHARINCTEST(c, eptr);
3130 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3131 }
3132
3133 /* If max == min we can continue with the main loop without the
3134 need to recurse. */
3135
3136 if (min == max) continue;
3137
3138 /* If minimizing, keep testing the rest of the expression and advancing
3139 the pointer while it matches the class. */
3140
3141 if (minimize)
3142 {
3143 for (fi = min;; fi++)
3144 {
3145 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3146 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3147 if (fi >= max) RRETURN(MATCH_NOMATCH);
3148 if (eptr >= md->end_subject)
3149 {
3150 SCHECK_PARTIAL();
3151 RRETURN(MATCH_NOMATCH);
3152 }
3153 GETCHARINCTEST(c, eptr);
3154 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3155 }
3156 /* Control never gets here */
3157 }
3158
3159 /* If maximizing, find the longest possible run, then work backwards. */
3160
3161 else
3162 {
3163 pp = eptr;
3164 for (i = min; i < max; i++)
3165 {
3166 int len = 1;
3167 if (eptr >= md->end_subject)
3168 {
3169 SCHECK_PARTIAL();
3170 break;
3171 }
3172 #ifdef SUPPORT_UTF
3173 GETCHARLENTEST(c, eptr, len);
3174 #else
3175 c = *eptr;
3176 #endif
3177 if (!PRIV(xclass)(c, data, utf)) break;
3178 eptr += len;
3179 }
3180
3181 if (possessive) continue; /* No backtracking */
3182
3183 for(;;)
3184 {
3185 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3186 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3187 if (eptr-- == pp) break; /* Stop if tried at original pos */
3188 #ifdef SUPPORT_UTF
3189 if (utf) BACKCHAR(eptr);
3190 #endif
3191 }
3192 RRETURN(MATCH_NOMATCH);
3193 }
3194
3195 /* Control never gets here */
3196 }
3197 #endif /* End of XCLASS */
3198
3199 /* Match a single character, casefully */
3200
3201 case OP_CHAR:
3202 #ifdef SUPPORT_UTF
3203 if (utf)
3204 {
3205 length = 1;
3206 ecode++;
3207 GETCHARLEN(fc, ecode, length);
3208 if (length > md->end_subject - eptr)
3209 {
3210 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3211 RRETURN(MATCH_NOMATCH);
3212 }
3213 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3214 }
3215 else
3216 #endif
3217 /* Not UTF mode */
3218 {
3219 if (md->end_subject - eptr < 1)
3220 {
3221 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3222 RRETURN(MATCH_NOMATCH);
3223 }
3224 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3225 ecode += 2;
3226 }
3227 break;
3228
3229 /* Match a single character, caselessly. If we are at the end of the
3230 subject, give up immediately. */
3231
3232 case OP_CHARI:
3233 if (eptr >= md->end_subject)
3234 {
3235 SCHECK_PARTIAL();
3236 RRETURN(MATCH_NOMATCH);
3237 }
3238
3239 #ifdef SUPPORT_UTF
3240 if (utf)
3241 {
3242 length = 1;
3243 ecode++;
3244 GETCHARLEN(fc, ecode, length);
3245
3246 /* If the pattern character's value is < 128, we have only one byte, and
3247 we know that its other case must also be one byte long, so we can use the
3248 fast lookup table. We know that there is at least one byte left in the
3249 subject. */
3250
3251 if (fc < 128)
3252 {
3253 pcre_uint32 cc = RAWUCHAR(eptr);
3254 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3255 ecode++;
3256 eptr++;
3257 }
3258
3259 /* Otherwise we must pick up the subject character. Note that we cannot
3260 use the value of "length" to check for sufficient bytes left, because the
3261 other case of the character may have more or fewer bytes. */
3262
3263 else
3264 {
3265 pcre_uint32 dc;
3266 GETCHARINC(dc, eptr);
3267 ecode += length;
3268
3269 /* If we have Unicode property support, we can use it to test the other
3270 case of the character, if there is one. */
3271
3272 if (fc != dc)
3273 {
3274 #ifdef SUPPORT_UCP
3275 if (dc != UCD_OTHERCASE(fc))
3276 #endif
3277 RRETURN(MATCH_NOMATCH);
3278 }
3279 }
3280 }
3281 else
3282 #endif /* SUPPORT_UTF */
3283
3284 /* Not UTF mode */
3285 {
3286 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3287 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3288 eptr++;
3289 ecode += 2;
3290 }
3291 break;
3292
3293 /* Match a single character repeatedly. */
3294
3295 case OP_EXACT:
3296 case OP_EXACTI:
3297 min = max = GET2(ecode, 1);
3298 ecode += 1 + IMM2_SIZE;
3299 goto REPEATCHAR;
3300
3301 case OP_POSUPTO:
3302 case OP_POSUPTOI:
3303 possessive = TRUE;
3304 /* Fall through */
3305
3306 case OP_UPTO:
3307 case OP_UPTOI:
3308 case OP_MINUPTO:
3309 case OP_MINUPTOI:
3310 min = 0;
3311 max = GET2(ecode, 1);
3312 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3313 ecode += 1 + IMM2_SIZE;
3314 goto REPEATCHAR;
3315
3316 case OP_POSSTAR:
3317 case OP_POSSTARI:
3318 possessive = TRUE;
3319 min = 0;
3320 max = INT_MAX;
3321 ecode++;
3322 goto REPEATCHAR;
3323
3324 case OP_POSPLUS:
3325 case OP_POSPLUSI:
3326 possessive = TRUE;
3327 min = 1;
3328 max = INT_MAX;
3329 ecode++;
3330 goto REPEATCHAR;
3331
3332 case OP_POSQUERY:
3333 case OP_POSQUERYI:
3334 possessive = TRUE;
3335 min = 0;
3336 max = 1;
3337 ecode++;
3338 goto REPEATCHAR;
3339
3340 case OP_STAR:
3341 case OP_STARI:
3342 case OP_MINSTAR:
3343 case OP_MINSTARI:
3344 case OP_PLUS:
3345 case OP_PLUSI:
3346 case OP_MINPLUS:
3347 case OP_MINPLUSI:
3348 case OP_QUERY:
3349 case OP_QUERYI:
3350 case OP_MINQUERY:
3351 case OP_MINQUERYI:
3352 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3353 minimize = (c & 1) != 0;
3354 min = rep_min[c]; /* Pick up values from tables; */
3355 max = rep_max[c]; /* zero for max => infinity */
3356 if (max == 0) max = INT_MAX;
3357
3358 /* Common code for all repeated single-character matches. We first check
3359 for the minimum number of characters. If the minimum equals the maximum, we
3360 are done. Otherwise, if minimizing, check the rest of the pattern for a
3361 match; if there isn't one, advance up to the maximum, one character at a
3362 time.
3363
3364 If maximizing, advance up to the maximum number of matching characters,
3365 until eptr is past the end of the maximum run. If possessive, we are
3366 then done (no backing up). Otherwise, match at this position; anything
3367 other than no match is immediately returned. For nomatch, back up one
3368 character, unless we are matching \R and the last thing matched was
3369 \r\n, in which case, back up two bytes. When we reach the first optional
3370 character position, we can save stack by doing a tail recurse.
3371
3372 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3373 for speed. */
3374
3375 REPEATCHAR:
3376 #ifdef SUPPORT_UTF
3377 if (utf)
3378 {
3379 length = 1;
3380 charptr = ecode;
3381 GETCHARLEN(fc, ecode, length);
3382 ecode += length;
3383
3384 /* Handle multibyte character matching specially here. There is
3385 support for caseless matching if UCP support is present. */
3386
3387 if (length > 1)
3388 {
3389 #ifdef SUPPORT_UCP
3390 pcre_uint32 othercase;
3391 if (op >= OP_STARI && /* Caseless */
3392 (othercase = UCD_OTHERCASE(fc)) != fc)
3393 oclength = PRIV(ord2utf)(othercase, occhars);
3394 else oclength = 0;
3395 #endif /* SUPPORT_UCP */
3396
3397 for (i = 1; i <= min; i++)
3398 {
3399 if (eptr <= md->end_subject - length &&
3400 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3401 #ifdef SUPPORT_UCP
3402 else if (oclength > 0 &&
3403 eptr <= md->end_subject - oclength &&
3404 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3405 #endif /* SUPPORT_UCP */
3406 else
3407 {
3408 CHECK_PARTIAL();
3409 RRETURN(MATCH_NOMATCH);
3410 }
3411 }
3412
3413 if (min == max) continue;
3414
3415 if (minimize)
3416 {
3417 for (fi = min;; fi++)
3418 {
3419 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3420 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3421 if (fi >= max) RRETURN(MATCH_NOMATCH);
3422 if (eptr <= md->end_subject - length &&
3423 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3424 #ifdef SUPPORT_UCP
3425 else if (oclength > 0 &&
3426 eptr <= md->end_subject - oclength &&
3427 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3428 #endif /* SUPPORT_UCP */
3429 else
3430 {
3431 CHECK_PARTIAL();
3432 RRETURN(MATCH_NOMATCH);
3433 }
3434 }
3435 /* Control never gets here */
3436 }
3437
3438 else /* Maximize */
3439 {
3440 pp = eptr;
3441 for (i = min; i < max; i++)
3442 {
3443 if (eptr <= md->end_subject - length &&
3444 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3445 #ifdef SUPPORT_UCP
3446 else if (oclength > 0 &&
3447 eptr <= md->end_subject - oclength &&
3448 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3449 #endif /* SUPPORT_UCP */
3450 else
3451 {
3452 CHECK_PARTIAL();
3453 break;
3454 }
3455 }
3456
3457 if (possessive) continue; /* No backtracking */
3458 for(;;)
3459 {
3460 if (eptr == pp) goto TAIL_RECURSE;
3461 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3462 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3463 #ifdef SUPPORT_UCP
3464 eptr--;
3465 BACKCHAR(eptr);
3466 #else /* without SUPPORT_UCP */
3467 eptr -= length;
3468 #endif /* SUPPORT_UCP */
3469 }
3470 }
3471 /* Control never gets here */
3472 }
3473
3474 /* If the length of a UTF-8 character is 1, we fall through here, and
3475 obey the code as for non-UTF-8 characters below, though in this case the
3476 value of fc will always be < 128. */
3477 }
3478 else
3479 #endif /* SUPPORT_UTF */
3480 /* When not in UTF-8 mode, load a single-byte character. */
3481 fc = *ecode++;
3482
3483 /* The value of fc at this point is always one character, though we may
3484 or may not be in UTF mode. The code is duplicated for the caseless and
3485 caseful cases, for speed, since matching characters is likely to be quite
3486 common. First, ensure the minimum number of matches are present. If min =
3487 max, continue at the same level without recursing. Otherwise, if
3488 minimizing, keep trying the rest of the expression and advancing one
3489 matching character if failing, up to the maximum. Alternatively, if
3490 maximizing, find the maximum number of characters and work backwards. */
3491
3492 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3493 max, (char *)eptr));
3494
3495 if (op >= OP_STARI) /* Caseless */
3496 {
3497 #ifdef COMPILE_PCRE8
3498 /* fc must be < 128 if UTF is enabled. */
3499 foc = md->fcc[fc];
3500 #else
3501 #ifdef SUPPORT_UTF
3502 #ifdef SUPPORT_UCP
3503 if (utf && fc > 127)
3504 foc = UCD_OTHERCASE(fc);
3505 #else
3506 if (utf && fc > 127)
3507 foc = fc;
3508 #endif /* SUPPORT_UCP */
3509 else
3510 #endif /* SUPPORT_UTF */
3511 foc = TABLE_GET(fc, md->fcc, fc);
3512 #endif /* COMPILE_PCRE8 */
3513
3514 for (i = 1; i <= min; i++)
3515 {
3516 pcre_uint32 cc; /* Faster than pcre_uchar */
3517 if (eptr >= md->end_subject)
3518 {
3519 SCHECK_PARTIAL();
3520 RRETURN(MATCH_NOMATCH);
3521 }
3522 cc = RAWUCHARTEST(eptr);
3523 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3524 eptr++;
3525 }
3526 if (min == max) continue;
3527 if (minimize)
3528 {
3529 for (fi = min;; fi++)
3530 {
3531 pcre_uint32 cc; /* Faster than pcre_uchar */
3532 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3533 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3534 if (fi >= max) RRETURN(MATCH_NOMATCH);
3535 if (eptr >= md->end_subject)
3536 {
3537 SCHECK_PARTIAL();
3538 RRETURN(MATCH_NOMATCH);
3539 }
3540 cc = RAWUCHARTEST(eptr);
3541 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3542 eptr++;
3543 }
3544 /* Control never gets here */
3545 }
3546 else /* Maximize */
3547 {
3548 pp = eptr;
3549 for (i = min; i < max; i++)
3550 {
3551 pcre_uint32 cc; /* Faster than pcre_uchar */
3552 if (eptr >= md->end_subject)
3553 {
3554 SCHECK_PARTIAL();
3555 break;
3556 }
3557 cc = RAWUCHARTEST(eptr);
3558 if (fc != cc && foc != cc) break;
3559 eptr++;
3560 }
3561 if (possessive) continue; /* No backtracking */
3562 for (;;)
3563 {
3564 if (eptr == pp) goto TAIL_RECURSE;
3565 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3566 eptr--;
3567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3568 }
3569 /* Control never gets here */
3570 }
3571 }
3572
3573 /* Caseful comparisons (includes all multi-byte characters) */
3574
3575 else
3576 {
3577 for (i = 1; i <= min; i++)
3578 {
3579 if (eptr >= md->end_subject)
3580 {
3581 SCHECK_PARTIAL();
3582 RRETURN(MATCH_NOMATCH);
3583 }
3584 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3585 }
3586
3587 if (min == max) continue;
3588
3589 if (minimize)
3590 {
3591 for (fi = min;; fi++)
3592 {
3593 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3594 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3595 if (fi >= max) RRETURN(MATCH_NOMATCH);
3596 if (eptr >= md->end_subject)
3597 {
3598 SCHECK_PARTIAL();
3599 RRETURN(MATCH_NOMATCH);
3600 }
3601 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3602 }
3603 /* Control never gets here */
3604 }
3605 else /* Maximize */
3606 {
3607 pp = eptr;
3608 for (i = min; i < max; i++)
3609 {
3610 if (eptr >= md->end_subject)
3611 {
3612 SCHECK_PARTIAL();
3613 break;
3614 }
3615 if (fc != RAWUCHARTEST(eptr)) break;
3616 eptr++;
3617 }
3618 if (possessive) continue; /* No backtracking */
3619 for (;;)
3620 {
3621 if (eptr == pp) goto TAIL_RECURSE;
3622 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3623 eptr--;
3624 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3625 }
3626 /* Control never gets here */
3627 }
3628 }
3629 /* Control never gets here */
3630
3631 /* Match a negated single character. The character we are
3632 checking can be multibyte. */
3633
3634 case OP_NOT:
3635 case OP_NOTI:
3636 if (eptr >= md->end_subject)
3637 {
3638 SCHECK_PARTIAL();
3639 RRETURN(MATCH_NOMATCH);
3640 }
3641 #ifdef SUPPORT_UTF
3642 if (utf)
3643 {
3644 register pcre_uint32 ch, och;
3645
3646 ecode++;
3647 GETCHARINC(ch, ecode);
3648 GETCHARINC(c, eptr);
3649
3650 if (op == OP_NOT)
3651 {
3652 if (ch == c) RRETURN(MATCH_NOMATCH);
3653 }
3654 else
3655 {
3656 #ifdef SUPPORT_UCP
3657 if (ch > 127)
3658 och = UCD_OTHERCASE(ch);
3659 #else
3660 if (ch > 127)
3661 och = ch;
3662 #endif /* SUPPORT_UCP */
3663 else
3664 och = TABLE_GET(ch, md->fcc, ch);
3665 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3666 }
3667 }
3668 else
3669 #endif
3670 {
3671 register pcre_uint32 ch = ecode[1];
3672 c = *eptr++;
3673 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3674 RRETURN(MATCH_NOMATCH);
3675 ecode += 2;
3676 }
3677 break;
3678
3679 /* Match a negated single one-byte character repeatedly. This is almost a
3680 repeat of the code for a repeated single character, but I haven't found a
3681 nice way of commoning these up that doesn't require a test of the
3682 positive/negative option for each character match. Maybe that wouldn't add
3683 very much to the time taken, but character matching *is* what this is all
3684 about... */
3685
3686 case OP_NOTEXACT:
3687 case OP_NOTEXACTI:
3688 min = max = GET2(ecode, 1);
3689 ecode += 1 + IMM2_SIZE;
3690 goto REPEATNOTCHAR;
3691
3692 case OP_NOTUPTO:
3693 case OP_NOTUPTOI:
3694 case OP_NOTMINUPTO:
3695 case OP_NOTMINUPTOI:
3696 min = 0;
3697 max = GET2(ecode, 1);
3698 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3699 ecode += 1 + IMM2_SIZE;
3700 goto REPEATNOTCHAR;
3701
3702 case OP_NOTPOSSTAR:
3703 case OP_NOTPOSSTARI:
3704 possessive = TRUE;
3705 min = 0;
3706 max = INT_MAX;
3707 ecode++;
3708 goto REPEATNOTCHAR;
3709
3710 case OP_NOTPOSPLUS:
3711 case OP_NOTPOSPLUSI:
3712 possessive = TRUE;
3713 min = 1;
3714 max = INT_MAX;
3715 ecode++;
3716 goto REPEATNOTCHAR;
3717
3718 case OP_NOTPOSQUERY:
3719 case OP_NOTPOSQUERYI:
3720 possessive = TRUE;
3721 min = 0;
3722 max = 1;
3723 ecode++;
3724 goto REPEATNOTCHAR;
3725
3726 case OP_NOTPOSUPTO:
3727 case OP_NOTPOSUPTOI:
3728 possessive = TRUE;
3729 min = 0;
3730 max = GET2(ecode, 1);
3731 ecode += 1 + IMM2_SIZE;
3732 goto REPEATNOTCHAR;
3733
3734 case OP_NOTSTAR:
3735 case OP_NOTSTARI:
3736 case OP_NOTMINSTAR:
3737 case OP_NOTMINSTARI:
3738 case OP_NOTPLUS:
3739 case OP_NOTPLUSI:
3740 case OP_NOTMINPLUS:
3741 case OP_NOTMINPLUSI:
3742 case OP_NOTQUERY:
3743 case OP_NOTQUERYI:
3744 case OP_NOTMINQUERY:
3745 case OP_NOTMINQUERYI:
3746 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3747 minimize = (c & 1) != 0;
3748 min = rep_min[c]; /* Pick up values from tables; */
3749 max = rep_max[c]; /* zero for max => infinity */
3750 if (max == 0) max = INT_MAX;
3751
3752 /* Common code for all repeated single-byte matches. */
3753
3754 REPEATNOTCHAR:
3755 GETCHARINCTEST(fc, ecode);
3756
3757 /* The code is duplicated for the caseless and caseful cases, for speed,
3758 since matching characters is likely to be quite common. First, ensure the
3759 minimum number of matches are present. If min = max, continue at the same
3760 level without recursing. Otherwise, if minimizing, keep trying the rest of
3761 the expression and advancing one matching character if failing, up to the
3762 maximum. Alternatively, if maximizing, find the maximum number of
3763 characters and work backwards. */
3764
3765 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3766 max, (char *)eptr));
3767
3768 if (op >= OP_NOTSTARI) /* Caseless */
3769 {
3770 #ifdef SUPPORT_UTF
3771 #ifdef SUPPORT_UCP
3772 if (utf && fc > 127)
3773 foc = UCD_OTHERCASE(fc);
3774 #else
3775 if (utf && fc > 127)
3776 foc = fc;
3777 #endif /* SUPPORT_UCP */
3778 else
3779 #endif /* SUPPORT_UTF */
3780 foc = TABLE_GET(fc, md->fcc, fc);
3781
3782 #ifdef SUPPORT_UTF
3783 if (utf)
3784 {
3785 register pcre_uint32 d;
3786 for (i = 1; i <= min; i++)
3787 {
3788 if (eptr >= md->end_subject)
3789 {
3790 SCHECK_PARTIAL();
3791 RRETURN(MATCH_NOMATCH);
3792 }
3793 GETCHARINC(d, eptr);
3794 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3795 }
3796 }
3797 else
3798 #endif /* SUPPORT_UTF */
3799 /* Not UTF mode */
3800 {
3801 for (i = 1; i <= min; i++)
3802 {
3803 if (eptr >= md->end_subject)
3804 {
3805 SCHECK_PARTIAL();
3806 RRETURN(MATCH_NOMATCH);
3807 }
3808 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3809 eptr++;
3810 }
3811 }
3812
3813 if (min == max) continue;
3814
3815 if (minimize)
3816 {
3817 #ifdef SUPPORT_UTF
3818 if (utf)
3819 {
3820 register pcre_uint32 d;
3821 for (fi = min;; fi++)
3822 {
3823 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3825 if (fi >= max) RRETURN(MATCH_NOMATCH);
3826 if (eptr >= md->end_subject)
3827 {
3828 SCHECK_PARTIAL();
3829 RRETURN(MATCH_NOMATCH);
3830 }
3831 GETCHARINC(d, eptr);
3832 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3833 }
3834 }
3835 else
3836 #endif /*SUPPORT_UTF */
3837 /* Not UTF mode */
3838 {
3839 for (fi = min;; fi++)
3840 {
3841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3843 if (fi >= max) RRETURN(MATCH_NOMATCH);
3844 if (eptr >= md->end_subject)
3845 {
3846 SCHECK_PARTIAL();
3847 RRETURN(MATCH_NOMATCH);
3848 }
3849 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3850 eptr++;
3851 }
3852 }
3853 /* Control never gets here */
3854 }
3855
3856 /* Maximize case */
3857
3858 else
3859 {
3860 pp = eptr;
3861
3862 #ifdef SUPPORT_UTF
3863 if (utf)
3864 {
3865 register pcre_uint32 d;
3866 for (i = min; i < max; i++)
3867 {
3868 int len = 1;
3869 if (eptr >= md->end_subject)
3870 {
3871 SCHECK_PARTIAL();
3872 break;
3873 }
3874 GETCHARLEN(d, eptr, len);
3875 if (fc == d || (unsigned int)foc == d) break;
3876 eptr += len;
3877 }
3878 if (possessive) continue; /* No backtracking */
3879 for(;;)
3880 {
3881 if (eptr == pp) goto TAIL_RECURSE;
3882 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3884 eptr--;
3885 BACKCHAR(eptr);
3886 }
3887 }
3888 else
3889 #endif /* SUPPORT_UTF */
3890 /* Not UTF mode */
3891 {
3892 for (i = min; i < max; i++)
3893 {
3894 if (eptr >= md->end_subject)
3895 {
3896 SCHECK_PARTIAL();
3897 break;
3898 }
3899 if (fc == *eptr || foc == *eptr) break;
3900 eptr++;
3901 }
3902 if (possessive) continue; /* No backtracking */
3903 for (;;)
3904 {
3905 if (eptr == pp) goto TAIL_RECURSE;
3906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3908 eptr--;
3909 }
3910 }
3911 /* Control never gets here */
3912 }
3913 }
3914
3915 /* Caseful comparisons */
3916
3917 else
3918 {
3919 #ifdef SUPPORT_UTF
3920 if (utf)
3921 {
3922 register pcre_uint32 d;
3923 for (i = 1; i <= min; i++)
3924 {
3925 if (eptr >= md->end_subject)
3926 {
3927 SCHECK_PARTIAL();
3928 RRETURN(MATCH_NOMATCH);
3929 }
3930 GETCHARINC(d, eptr);
3931 if (fc == d) RRETURN(MATCH_NOMATCH);
3932 }
3933 }
3934 else
3935 #endif
3936 /* Not UTF mode */
3937 {
3938 for (i = 1; i <= min; i++)
3939 {
3940 if (eptr >= md->end_subject)
3941 {
3942 SCHECK_PARTIAL();
3943 RRETURN(MATCH_NOMATCH);
3944 }
3945 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3946 }
3947 }
3948
3949 if (min == max) continue;
3950
3951 if (minimize)
3952 {
3953 #ifdef SUPPORT_UTF
3954 if (utf)
3955 {
3956 register pcre_uint32 d;
3957 for (fi = min;; fi++)
3958 {
3959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3961 if (fi >= max) RRETURN(MATCH_NOMATCH);
3962 if (eptr >= md->end_subject)
3963 {
3964 SCHECK_PARTIAL();
3965 RRETURN(MATCH_NOMATCH);
3966 }
3967 GETCHARINC(d, eptr);
3968 if (fc == d) RRETURN(MATCH_NOMATCH);
3969 }
3970 }
3971 else
3972 #endif
3973 /* Not UTF mode */
3974 {
3975 for (fi = min;; fi++)
3976 {
3977 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3979 if (fi >= max) RRETURN(MATCH_NOMATCH);
3980 if (eptr >= md->end_subject)
3981 {
3982 SCHECK_PARTIAL();
3983 RRETURN(MATCH_NOMATCH);
3984 }
3985 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3986 }
3987 }
3988 /* Control never gets here */
3989 }
3990
3991 /* Maximize case */
3992
3993 else
3994 {
3995 pp = eptr;
3996
3997 #ifdef SUPPORT_UTF
3998 if (utf)
3999 {
4000 register pcre_uint32 d;
4001 for (i = min; i < max; i++)
4002 {
4003 int len = 1;
4004 if (eptr >= md->end_subject)
4005 {
4006 SCHECK_PARTIAL();
4007 break;
4008 }
4009 GETCHARLEN(d, eptr, len);
4010 if (fc == d) break;
4011 eptr += len;
4012 }
4013 if (possessive) continue; /* No backtracking */
4014 for(;;)
4015 {
4016 if (eptr == pp) goto TAIL_RECURSE;
4017 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4019 eptr--;
4020 BACKCHAR(eptr);
4021 }
4022 }
4023 else
4024 #endif
4025 /* Not UTF mode */
4026 {
4027 for (i = min; i < max; i++)
4028 {
4029 if (eptr >= md->end_subject)
4030 {
4031 SCHECK_PARTIAL();
4032 break;
4033 }
4034 if (fc == *eptr) break;
4035 eptr++;
4036 }
4037 if (possessive) continue; /* No backtracking */
4038 for (;;)
4039 {
4040 if (eptr == pp) goto TAIL_RECURSE;
4041 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4043 eptr--;
4044 }
4045 }
4046 /* Control never gets here */
4047 }
4048 }
4049 /* Control never gets here */
4050
4051 /* Match a single character type repeatedly; several different opcodes
4052 share code. This is very similar to the code for single characters, but we
4053 repeat it in the interests of efficiency. */
4054
4055 case OP_TYPEEXACT:
4056 min = max = GET2(ecode, 1);
4057 minimize = TRUE;
4058 ecode += 1 + IMM2_SIZE;
4059 goto REPEATTYPE;
4060
4061 case OP_TYPEUPTO:
4062 case OP_TYPEMINUPTO:
4063 min = 0;
4064 max = GET2(ecode, 1);
4065 minimize = *ecode == OP_TYPEMINUPTO;
4066 ecode += 1 + IMM2_SIZE;
4067 goto REPEATTYPE;
4068
4069 case OP_TYPEPOSSTAR:
4070 possessive = TRUE;
4071 min = 0;
4072 max = INT_MAX;
4073 ecode++;
4074 goto REPEATTYPE;
4075
4076 case OP_TYPEPOSPLUS:
4077 possessive = TRUE;
4078 min = 1;
4079 max = INT_MAX;
4080 ecode++;
4081 goto REPEATTYPE;
4082
4083 case OP_TYPEPOSQUERY:
4084 possessive = TRUE;
4085 min = 0;
4086 max = 1;
4087 ecode++;
4088 goto REPEATTYPE;
4089
4090 case OP_TYPEPOSUPTO:
4091 possessive = TRUE;
4092 min = 0;
4093 max = GET2(ecode, 1);
4094 ecode += 1 + IMM2_SIZE;
4095 goto REPEATTYPE;
4096
4097 case OP_TYPESTAR:
4098 case OP_TYPEMINSTAR:
4099 case OP_TYPEPLUS:
4100 case OP_TYPEMINPLUS:
4101 case OP_TYPEQUERY:
4102 case OP_TYPEMINQUERY:
4103 c = *ecode++ - OP_TYPESTAR;
4104 minimize = (c & 1) != 0;
4105 min = rep_min[c]; /* Pick up values from tables; */
4106 max = rep_max[c]; /* zero for max => infinity */
4107 if (max == 0) max = INT_MAX;
4108
4109 /* Common code for all repeated single character type matches. Note that
4110 in UTF-8 mode, '.' matches a character of any length, but for the other
4111 character types, the valid characters are all one-byte long. */
4112
4113 REPEATTYPE:
4114 ctype = *ecode++; /* Code for the character type */
4115
4116 #ifdef SUPPORT_UCP
4117 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4118 {
4119 prop_fail_result = ctype == OP_NOTPROP;
4120 prop_type = *ecode++;
4121 prop_value = *ecode++;
4122 }
4123 else prop_type = -1;
4124 #endif
4125
4126 /* First, ensure the minimum number of matches are present. Use inline
4127 code for maximizing the speed, and do the type test once at the start
4128 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4129 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4130 and single-bytes. */
4131
4132 if (min > 0)
4133 {
4134 #ifdef SUPPORT_UCP
4135 if (prop_type >= 0)
4136 {
4137 switch(prop_type)
4138 {
4139 case PT_ANY:
4140 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4141 for (i = 1; i <= min; i++)
4142 {
4143 if (eptr >= md->end_subject)
4144 {
4145 SCHECK_PARTIAL();
4146 RRETURN(MATCH_NOMATCH);
4147 }
4148 GETCHARINCTEST(c, eptr);
4149 }
4150 break;
4151
4152 case PT_LAMP:
4153 for (i = 1; i <= min; i++)
4154 {
4155 int chartype;
4156 if (eptr >= md->end_subject)
4157 {
4158 SCHECK_PARTIAL();
4159 RRETURN(MATCH_NOMATCH);
4160 }
4161 GETCHARINCTEST(c, eptr);
4162 chartype = UCD_CHARTYPE(c);
4163 if ((chartype == ucp_Lu ||
4164 chartype == ucp_Ll ||
4165 chartype == ucp_Lt) == prop_fail_result)
4166 RRETURN(MATCH_NOMATCH);
4167 }
4168 break;
4169
4170 case PT_GC:
4171 for (i = 1; i <= min; i++)
4172 {
4173 if (eptr >= md->end_subject)
4174 {
4175 SCHECK_PARTIAL();
4176 RRETURN(MATCH_NOMATCH);
4177 }
4178 GETCHARINCTEST(c, eptr);
4179 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4180 RRETURN(MATCH_NOMATCH);
4181 }
4182 break;
4183
4184 case PT_PC:
4185 for (i = 1; i <= min; i++)
4186 {
4187 if (eptr >= md->end_subject)
4188 {
4189 SCHECK_PARTIAL();
4190 RRETURN(MATCH_NOMATCH);
4191 }
4192 GETCHARINCTEST(c, eptr);
4193 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4194 RRETURN(MATCH_NOMATCH);
4195 }
4196 break;
4197
4198 case PT_SC:
4199 for (i = 1; i <= min; i++)
4200 {
4201 if (eptr >= md->end_subject)
4202 {
4203 SCHECK_PARTIAL();
4204 RRETURN(MATCH_NOMATCH);
4205 }
4206 GETCHARINCTEST(c, eptr);
4207 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4208 RRETURN(MATCH_NOMATCH);
4209 }
4210 break;
4211
4212 case PT_ALNUM:
4213 for (i = 1; i <= min; i++)
4214 {
4215 int category;
4216 if (eptr >= md->end_subject)
4217 {
4218 SCHECK_PARTIAL();
4219 RRETURN(MATCH_NOMATCH);
4220 }
4221 GETCHARINCTEST(c, eptr);
4222 category = UCD_CATEGORY(c);
4223 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4224 RRETURN(MATCH_NOMATCH);
4225 }
4226 break;
4227
4228 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4229 which means that Perl space and POSIX space are now identical. PCRE
4230 was changed at release 8.34. */
4231
4232 case PT_SPACE: /* Perl space */
4233 case PT_PXSPACE: /* POSIX space */
4234 for (i = 1; i <= min; i++)
4235 {
4236 if (eptr >= md->end_subject)
4237 {
4238 SCHECK_PARTIAL();
4239 RRETURN(MATCH_NOMATCH);
4240 }
4241 GETCHARINCTEST(c, eptr);
4242 switch(c)
4243 {
4244 HSPACE_CASES:
4245 VSPACE_CASES:
4246 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4247 break;
4248
4249 default:
4250 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4251 RRETURN(MATCH_NOMATCH);
4252 break;
4253 }
4254 }
4255 break;
4256
4257 case PT_WORD:
4258 for (i = 1; i <= min; i++)
4259 {
4260 int category;
4261 if (eptr >= md->end_subject)
4262 {
4263 SCHECK_PARTIAL();
4264 RRETURN(MATCH_NOMATCH);
4265 }
4266 GETCHARINCTEST(c, eptr);
4267 category = UCD_CATEGORY(c);
4268 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4269 == prop_fail_result)
4270 RRETURN(MATCH_NOMATCH);
4271 }
4272 break;
4273
4274 case PT_CLIST:
4275 for (i = 1; i <= min; i++)
4276 {
4277 const pcre_uint32 *cp;
4278 if (eptr >= md->end_subject)
4279 {
4280 SCHECK_PARTIAL();
4281 RRETURN(MATCH_NOMATCH);
4282 }
4283 GETCHARINCTEST(c, eptr);
4284 cp = PRIV(ucd_caseless_sets) + prop_value;
4285 for (;;)
4286 {
4287 if (c < *cp)
4288 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4289 if (c == *cp++)
4290 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4291 }
4292 }
4293 break;
4294
4295 case PT_UCNC:
4296 for (i = 1; i <= min; i++)
4297 {
4298 if (eptr >= md->end_subject)
4299 {
4300 SCHECK_PARTIAL();
4301 RRETURN(MATCH_NOMATCH);
4302 }
4303 GETCHARINCTEST(c, eptr);
4304 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4305 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4306 c >= 0xe000) == prop_fail_result)
4307 RRETURN(MATCH_NOMATCH);
4308 }
4309 break;
4310
4311 /* This should not occur */
4312
4313 default:
4314 RRETURN(PCRE_ERROR_INTERNAL);
4315 }
4316 }
4317
4318 /* Match extended Unicode sequences. We will get here only if the
4319 support is in the binary; otherwise a compile-time error occurs. */
4320
4321 else if (ctype == OP_EXTUNI)
4322 {
4323 for (i = 1; i <= min; i++)
4324 {
4325 if (eptr >= md->end_subject)
4326 {
4327 SCHECK_PARTIAL();
4328 RRETURN(MATCH_NOMATCH);
4329 }
4330 else
4331 {
4332 int lgb, rgb;
4333 GETCHARINCTEST(c, eptr);
4334 lgb = UCD_GRAPHBREAK(c);
4335 while (eptr < md->end_subject)
4336 {
4337 int len = 1;
4338 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4339 rgb = UCD_GRAPHBREAK(c);
4340 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4341 lgb = rgb;
4342 eptr += len;
4343 }
4344 }
4345 CHECK_PARTIAL();
4346 }
4347 }
4348
4349 else
4350 #endif /* SUPPORT_UCP */
4351
4352 /* Handle all other cases when the coding is UTF-8 */
4353
4354 #ifdef SUPPORT_UTF
4355 if (utf) switch(ctype)
4356 {
4357 case OP_ANY:
4358 for (i = 1; i <= min; i++)
4359 {
4360 if (eptr >= md->end_subject)
4361 {
4362 SCHECK_PARTIAL();
4363 RRETURN(MATCH_NOMATCH);
4364 }
4365 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4366 if (md->partial != 0 &&
4367 eptr + 1 >= md->end_subject &&
4368 NLBLOCK->nltype == NLTYPE_FIXED &&
4369 NLBLOCK->nllen == 2 &&
4370 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4371 {
4372 md->hitend = TRUE;
4373 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4374 }
4375 eptr++;
4376 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4377 }
4378 break;
4379
4380 case OP_ALLANY:
4381 for (i = 1; i <= min; i++)
4382 {
4383 if (eptr >= md->end_subject)
4384 {
4385 SCHECK_PARTIAL();
4386 RRETURN(MATCH_NOMATCH);
4387 }
4388 eptr++;
4389 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4390 }
4391 break;
4392
4393 case OP_ANYBYTE:
4394 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4395 eptr += min;
4396 break;
4397
4398 case OP_ANYNL:
4399 for (i = 1; i <= min; i++)
4400 {
4401 if (eptr >= md->end_subject)
4402 {
4403 SCHECK_PARTIAL();
4404 RRETURN(MATCH_NOMATCH);
4405 }
4406 GETCHARINC(c, eptr);
4407 switch(c)
4408 {
4409 default: RRETURN(MATCH_NOMATCH);
4410
4411 case CHAR_CR:
4412 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4413 break;
4414
4415 case CHAR_LF:
4416 break;
4417
4418 case CHAR_VT:
4419 case CHAR_FF:
4420 case CHAR_NEL:
4421 #ifndef EBCDIC
4422 case 0x2028:
4423 case 0x2029:
4424 #endif /* Not EBCDIC */
4425 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4426 break;
4427 }
4428 }
4429 break;
4430
4431 case OP_NOT_HSPACE:
4432 for (i = 1; i <= min; i++)
4433 {
4434 if (eptr >= md->end_subject)
4435 {
4436 SCHECK_PARTIAL();
4437 RRETURN(MATCH_NOMATCH);
4438 }
4439 GETCHARINC(c, eptr);
4440 switch(c)
4441 {
4442 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4443 default: break;
4444 }
4445 }
4446 break;
4447
4448 case OP_HSPACE:
4449 for (i = 1; i <= min; i++)
4450 {
4451 if (eptr >= md->end_subject)
4452 {
4453 SCHECK_PARTIAL();
4454 RRETURN(MATCH_NOMATCH);
4455 }
4456 GETCHARINC(c, eptr);
4457 switch(c)
4458 {
4459 HSPACE_CASES: break; /* Byte and multibyte cases */
4460 default: RRETURN(MATCH_NOMATCH);
4461 }
4462 }
4463 break;
4464
4465 case OP_NOT_VSPACE:
4466 for (i = 1; i <= min; i++)
4467 {
4468 if (eptr >= md->end_subject)
4469 {
4470 SCHECK_PARTIAL();
4471 RRETURN(MATCH_NOMATCH);
4472 }
4473 GETCHARINC(c, eptr);
4474 switch(c)
4475 {
4476 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4477 default: break;
4478 }
4479 }
4480 break;
4481
4482 case OP_VSPACE:
4483 for (i = 1; i <= min; i++)
4484 {
4485 if (eptr >= md->end_subject)
4486 {
4487 SCHECK_PARTIAL();
4488 RRETURN(MATCH_NOMATCH);
4489 }
4490 GETCHARINC(c, eptr);
4491 switch(c)
4492 {
4493 VSPACE_CASES: break;
4494 default: RRETURN(MATCH_NOMATCH);
4495 }
4496 }
4497 break;
4498
4499 case OP_NOT_DIGIT:
4500 for (i = 1; i <= min; i++)
4501 {
4502 if (eptr >= md->end_subject)
4503 {
4504 SCHECK_PARTIAL();
4505 RRETURN(MATCH_NOMATCH);
4506 }
4507 GETCHARINC(c, eptr);
4508 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4509 RRETURN(MATCH_NOMATCH);
4510 }
4511 break;
4512
4513 case OP_DIGIT:
4514 for (i = 1; i <= min; i++)
4515 {
4516 pcre_uint32 cc;
4517 if (eptr >= md->end_subject)
4518 {
4519 SCHECK_PARTIAL();
4520 RRETURN(MATCH_NOMATCH);
4521 }
4522 cc = RAWUCHAR(eptr);
4523 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4524 RRETURN(MATCH_NOMATCH);
4525 eptr++;
4526 /* No need to skip more bytes - we know it's a 1-byte character */
4527 }
4528 break;
4529
4530 case OP_NOT_WHITESPACE:
4531 for (i = 1; i <= min; i++)
4532 {
4533 pcre_uint32 cc;
4534 if (eptr >= md->end_subject)
4535 {
4536 SCHECK_PARTIAL();
4537 RRETURN(MATCH_NOMATCH);
4538 }
4539 cc = RAWUCHAR(eptr);
4540 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4541 RRETURN(MATCH_NOMATCH);
4542 eptr++;
4543 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4544 }
4545 break;
4546
4547 case OP_WHITESPACE:
4548 for (i = 1; i <= min; i++)
4549 {
4550 pcre_uint32 cc;
4551 if (eptr >= md->end_subject)
4552 {
4553 SCHECK_PARTIAL();
4554 RRETURN(MATCH_NOMATCH);
4555 }
4556 cc = RAWUCHAR(eptr);
4557 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4558 RRETURN(MATCH_NOMATCH);
4559 eptr++;
4560 /* No need to skip more bytes - we know it's a 1-byte character */
4561 }
4562 break;
4563
4564 case OP_NOT_WORDCHAR:
4565 for (i = 1; i <= min; i++)
4566 {
4567 pcre_uint32 cc;
4568 if (eptr >= md->end_subject)
4569 {
4570 SCHECK_PARTIAL();
4571 RRETURN(MATCH_NOMATCH);
4572 }
4573 cc = RAWUCHAR(eptr);
4574 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4575 RRETURN(MATCH_NOMATCH);
4576 eptr++;
4577 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4578 }
4579 break;
4580
4581 case OP_WORDCHAR:
4582 for (i = 1; i <= min; i++)
4583 {
4584 pcre_uint32 cc;
4585 if (eptr >= md->end_subject)
4586 {
4587 SCHECK_PARTIAL();
4588 RRETURN(MATCH_NOMATCH);
4589 }
4590 cc = RAWUCHAR(eptr);
4591 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4592 RRETURN(MATCH_NOMATCH);
4593 eptr++;
4594 /* No need to skip more bytes - we know it's a 1-byte character */
4595 }
4596 break;
4597
4598 default:
4599 RRETURN(PCRE_ERROR_INTERNAL);
4600 } /* End switch(ctype) */
4601
4602 else
4603 #endif /* SUPPORT_UTF */
4604
4605 /* Code for the non-UTF-8 case for minimum matching of operators other
4606 than OP_PROP and OP_NOTPROP. */
4607
4608 switch(ctype)
4609 {
4610 case OP_ANY:
4611 for (i = 1; i <= min; i++)
4612 {
4613 if (eptr >= md->end_subject)
4614 {
4615 SCHECK_PARTIAL();
4616 RRETURN(MATCH_NOMATCH);
4617 }
4618 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4619 if (md->partial != 0 &&
4620 eptr + 1 >= md->end_subject &&
4621 NLBLOCK->nltype == NLTYPE_FIXED &&
4622 NLBLOCK->nllen == 2 &&
4623 *eptr == NLBLOCK->nl[0])
4624 {
4625 md->hitend = TRUE;
4626 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4627 }
4628 eptr++;
4629 }
4630 break;
4631
4632 case OP_ALLANY:
4633 if (eptr > md->end_subject - min)
4634 {
4635 SCHECK_PARTIAL();
4636 RRETURN(MATCH_NOMATCH);
4637 }
4638 eptr += min;
4639 break;
4640
4641 case OP_ANYBYTE:
4642 if (eptr > md->end_subject - min)
4643 {
4644 SCHECK_PARTIAL();
4645 RRETURN(MATCH_NOMATCH);
4646 }
4647 eptr += min;
4648 break;
4649
4650 case OP_ANYNL:
4651 for (i = 1; i <= min; i++)
4652 {
4653 if (eptr >= md->end_subject)
4654 {
4655 SCHECK_PARTIAL();
4656 RRETURN(MATCH_NOMATCH);
4657 }
4658 switch(*eptr++)
4659 {
4660 default: RRETURN(MATCH_NOMATCH);
4661
4662 case CHAR_CR:
4663 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4664 break;
4665
4666 case CHAR_LF:
4667 break;
4668
4669 case CHAR_VT:
4670 case CHAR_FF:
4671 case CHAR_NEL:
4672 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4673 case 0x2028:
4674 case 0x2029:
4675 #endif
4676 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4677 break;
4678 }
4679 }
4680 break;
4681
4682 case OP_NOT_HSPACE:
4683 for (i = 1; i <= min; i++)
4684 {
4685 if (eptr >= md->end_subject)
4686 {
4687 SCHECK_PARTIAL();
4688 RRETURN(MATCH_NOMATCH);
4689 }
4690 switch(*eptr++)
4691 {
4692 default: break;
4693 HSPACE_BYTE_CASES:
4694 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4695 HSPACE_MULTIBYTE_CASES:
4696 #endif
4697 RRETURN(MATCH_NOMATCH);
4698 }
4699 }
4700 break;
4701
4702 case OP_HSPACE:
4703 for (i = 1; i <= min; i++)
4704 {
4705 if (eptr >= md->end_subject)
4706 {
4707 SCHECK_PARTIAL();
4708 RRETURN(MATCH_NOMATCH);
4709 }
4710 switch(*eptr++)
4711 {
4712 default: RRETURN(MATCH_NOMATCH);
4713 HSPACE_BYTE_CASES:
4714 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4715 HSPACE_MULTIBYTE_CASES:
4716 #endif
4717 break;
4718 }
4719 }
4720 break;
4721
4722 case OP_NOT_VSPACE:
4723 for (i = 1; i <= min; i++)
4724 {
4725 if (eptr >= md->end_subject)
4726 {
4727 SCHECK_PARTIAL();
4728 RRETURN(MATCH_NOMATCH);
4729 }
4730 switch(*eptr++)
4731 {
4732 VSPACE_BYTE_CASES:
4733 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4734 VSPACE_MULTIBYTE_CASES:
4735 #endif
4736 RRETURN(MATCH_NOMATCH);
4737 default: break;
4738 }
4739 }
4740 break;
4741
4742 case OP_VSPACE:
4743 for (i = 1; i <= min; i++)
4744 {
4745 if (eptr >= md->end_subject)
4746 {
4747 SCHECK_PARTIAL();
4748 RRETURN(MATCH_NOMATCH);
4749 }
4750 switch(*eptr++)
4751 {
4752 default: RRETURN(MATCH_NOMATCH);
4753 VSPACE_BYTE_CASES:
4754 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4755 VSPACE_MULTIBYTE_CASES:
4756 #endif
4757 break;
4758 }
4759 }
4760 break;
4761
4762 case OP_NOT_DIGIT:
4763 for (i = 1; i <= min; i++)
4764 {
4765 if (eptr >= md->end_subject)
4766 {
4767 SCHECK_PARTIAL();
4768 RRETURN(MATCH_NOMATCH);
4769 }
4770 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4771 RRETURN(MATCH_NOMATCH);
4772 eptr++;
4773 }
4774 break;
4775
4776 case OP_DIGIT:
4777 for (i = 1; i <= min; i++)
4778 {
4779 if (eptr >= md->end_subject)
4780 {
4781 SCHECK_PARTIAL();
4782 RRETURN(MATCH_NOMATCH);
4783 }
4784 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4785 RRETURN(MATCH_NOMATCH);
4786 eptr++;
4787 }
4788 break;
4789
4790 case OP_NOT_WHITESPACE:
4791 for (i = 1; i <= min; i++)
4792 {
4793 if (eptr >= md->end_subject)
4794 {
4795 SCHECK_PARTIAL();
4796 RRETURN(MATCH_NOMATCH);
4797 }
4798 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4799 RRETURN(MATCH_NOMATCH);
4800 eptr++;
4801 }
4802 break;
4803
4804 case OP_WHITESPACE:
4805 for (i = 1; i <= min; i++)
4806 {
4807 if (eptr >= md->end_subject)
4808 {
4809 SCHECK_PARTIAL();
4810 RRETURN(MATCH_NOMATCH);
4811 }
4812 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4813 RRETURN(MATCH_NOMATCH);
4814 eptr++;
4815 }
4816 break;
4817
4818 case OP_NOT_WORDCHAR:
4819 for (i = 1; i <= min; i++)
4820 {
4821 if (eptr >= md->end_subject)
4822 {
4823 SCHECK_PARTIAL();
4824 RRETURN(MATCH_NOMATCH);
4825 }
4826 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4827 RRETURN(MATCH_NOMATCH);
4828 eptr++;
4829 }
4830 break;
4831
4832 case OP_WORDCHAR:
4833 for (i = 1; i <= min; i++)
4834 {
4835 if (eptr >= md->end_subject)
4836 {
4837 SCHECK_PARTIAL();
4838 RRETURN(MATCH_NOMATCH);
4839 }
4840 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4841 RRETURN(MATCH_NOMATCH);
4842 eptr++;
4843 }
4844 break;
4845
4846 default:
4847 RRETURN(PCRE_ERROR_INTERNAL);
4848 }
4849 }
4850
4851 /* If min = max, continue at the same level without recursing */
4852
4853 if (min == max) continue;
4854
4855 /* If minimizing, we have to test the rest of the pattern before each
4856 subsequent match. Again, separate the UTF-8 case for speed, and also
4857 separate the UCP cases. */
4858
4859 if (minimize)
4860 {
4861 #ifdef SUPPORT_UCP
4862 if (prop_type >= 0)
4863 {
4864 switch(prop_type)
4865 {
4866 case PT_ANY:
4867 for (fi = min;; fi++)
4868 {
4869 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4870 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4871 if (fi >= max) RRETURN(MATCH_NOMATCH);
4872 if (eptr >= md->end_subject)
4873 {
4874 SCHECK_PARTIAL();
4875 RRETURN(MATCH_NOMATCH);
4876 }
4877 GETCHARINCTEST(c, eptr);
4878 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4879 }
4880 /* Control never gets here */
4881
4882 case PT_LAMP:
4883 for (fi = min;; fi++)
4884 {
4885 int chartype;
4886 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4887 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4888 if (fi >= max) RRETURN(MATCH_NOMATCH);
4889 if (eptr >= md->end_subject)
4890 {
4891 SCHECK_PARTIAL();
4892 RRETURN(MATCH_NOMATCH);
4893 }
4894 GETCHARINCTEST(c, eptr);
4895 chartype = UCD_CHARTYPE(c);
4896 if ((chartype == ucp_Lu ||
4897 chartype == ucp_Ll ||
4898 chartype == ucp_Lt) == prop_fail_result)
4899 RRETURN(MATCH_NOMATCH);
4900 }
4901 /* Control never gets here */
4902
4903 case PT_GC:
4904 for (fi = min;; fi++)
4905 {
4906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4908 if (fi >= max) RRETURN(MATCH_NOMATCH);
4909 if (eptr >= md->end_subject)
4910 {
4911 SCHECK_PARTIAL();
4912 RRETURN(MATCH_NOMATCH);
4913 }
4914 GETCHARINCTEST(c, eptr);
4915 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4916 RRETURN(MATCH_NOMATCH);
4917 }
4918 /* Control never gets here */
4919
4920 case PT_PC:
4921 for (fi = min;; fi++)
4922 {
4923 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4925 if (fi >= max) RRETURN(MATCH_NOMATCH);
4926 if (eptr >= md->end_subject)
4927 {
4928 SCHECK_PARTIAL();
4929 RRETURN(MATCH_NOMATCH);
4930 }
4931 GETCHARINCTEST(c, eptr);
4932 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4933 RRETURN(MATCH_NOMATCH);
4934 }
4935 /* Control never gets here */
4936
4937 case PT_SC:
4938 for (fi = min;; fi++)
4939 {
4940 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4942 if (fi >= max) RRETURN(MATCH_NOMATCH);
4943 if (eptr >= md->end_subject)
4944 {
4945 SCHECK_PARTIAL();
4946 RRETURN(MATCH_NOMATCH);
4947 }
4948 GETCHARINCTEST(c, eptr);
4949 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4950 RRETURN(MATCH_NOMATCH);
4951 }
4952 /* Control never gets here */
4953
4954 case PT_ALNUM:
4955 for (fi = min;; fi++)
4956 {
4957 int category;
4958 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4959 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4960 if (fi >= max) RRETURN(MATCH_NOMATCH);
4961 if (eptr >= md->end_subject)
4962 {
4963 SCHECK_PARTIAL();
4964 RRETURN(MATCH_NOMATCH);
4965 }
4966 GETCHARINCTEST(c, eptr);
4967 category = UCD_CATEGORY(c);
4968 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4969 RRETURN(MATCH_NOMATCH);
4970 }
4971 /* Control never gets here */
4972
4973 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4974 which means that Perl space and POSIX space are now identical. PCRE
4975 was changed at release 8.34. */
4976
4977 case PT_SPACE: /* Perl space */
4978 case PT_PXSPACE: /* POSIX space */
4979 for (fi = min;; fi++)
4980 {
4981 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4983 if (fi >= max) RRETURN(MATCH_NOMATCH);
4984 if (eptr >= md->end_subject)
4985 {
4986 SCHECK_PARTIAL();
4987 RRETURN(MATCH_NOMATCH);
4988 }
4989 GETCHARINCTEST(c, eptr);
4990 switch(c)
4991 {
4992 HSPACE_CASES:
4993 VSPACE_CASES:
4994 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4995 break;
4996
4997 default:
4998 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4999 RRETURN(MATCH_NOMATCH);
5000 break;
5001 }
5002 }
5003 /* Control never gets here */
5004
5005 case PT_WORD:
5006 for (fi = min;; fi++)
5007 {
5008 int category;
5009 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5011 if (fi >= max) RRETURN(MATCH_NOMATCH);
5012 if (eptr >= md->end_subject)
5013 {
5014 SCHECK_PARTIAL();
5015 RRETURN(MATCH_NOMATCH);
5016 }
5017 GETCHARINCTEST(c, eptr);
5018 category = UCD_CATEGORY(c);
5019 if ((category == ucp_L ||
5020 category == ucp_N ||
5021 c == CHAR_UNDERSCORE)
5022 == prop_fail_result)
5023 RRETURN(MATCH_NOMATCH);
5024 }
5025 /* Control never gets here */
5026
5027 case PT_CLIST:
5028 for (fi = min;; fi++)
5029 {
5030 const pcre_uint32 *cp;
5031 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5033 if (fi >= max) RRETURN(MATCH_NOMATCH);
5034 if (eptr >= md->end_subject)
5035 {
5036 SCHECK_PARTIAL();
5037 RRETURN(MATCH_NOMATCH);
5038 }
5039 GETCHARINCTEST(c, eptr);
5040 cp = PRIV(ucd_caseless_sets) + prop_value;
5041 for (;;)
5042 {
5043 if (c < *cp)
5044 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5045 if (c == *cp++)
5046 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5047 }
5048 }
5049 /* Control never gets here */
5050
5051 case PT_UCNC:
5052 for (fi = min;; fi++)
5053 {
5054 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5055 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5056 if (fi >= max) RRETURN(MATCH_NOMATCH);
5057 if (eptr >= md->end_subject)
5058 {
5059 SCHECK_PARTIAL();
5060 RRETURN(MATCH_NOMATCH);
5061 }
5062 GETCHARINCTEST(c, eptr);
5063 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5064 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5065 c >= 0xe000) == prop_fail_result)
5066 RRETURN(MATCH_NOMATCH);
5067 }
5068 /* Control never gets here */
5069
5070 /* This should never occur */
5071 default:
5072 RRETURN(PCRE_ERROR_INTERNAL);
5073 }
5074 }
5075
5076 /* Match extended Unicode sequences. We will get here only if the
5077 support is in the binary; otherwise a compile-time error occurs. */
5078
5079 else if (ctype == OP_EXTUNI)
5080 {
5081 for (fi = min;; fi++)
5082 {
5083 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5084 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5085 if (fi >= max) RRETURN(MATCH_NOMATCH);
5086 if (eptr >= md->end_subject)
5087 {
5088 SCHECK_PARTIAL();
5089 RRETURN(MATCH_NOMATCH);
5090 }
5091 else
5092 {
5093 int lgb, rgb;
5094 GETCHARINCTEST(c, eptr);
5095 lgb = UCD_GRAPHBREAK(c);
5096 while (eptr < md->end_subject)
5097 {
5098 int len = 1;
5099 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5100 rgb = UCD_GRAPHBREAK(c);
5101 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5102 lgb = rgb;
5103 eptr += len;
5104 }
5105 }
5106 CHECK_PARTIAL();
5107 }
5108 }
5109 else
5110 #endif /* SUPPORT_UCP */
5111
5112 #ifdef SUPPORT_UTF
5113 if (utf)
5114 {
5115 for (fi = min;; fi++)
5116 {
5117 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5118 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5119 if (fi >= max) RRETURN(MATCH_NOMATCH);
5120 if (eptr >= md->end_subject)
5121 {
5122 SCHECK_PARTIAL();
5123 RRETURN(MATCH_NOMATCH);
5124 }
5125 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5126 RRETURN(MATCH_NOMATCH);
5127 GETCHARINC(c, eptr);
5128 switch(ctype)
5129 {
5130 case OP_ANY: /* This is the non-NL case */
5131 if (md->partial != 0 && /* Take care with CRLF partial */
5132 eptr >= md->end_subject &&
5133 NLBLOCK->nltype == NLTYPE_FIXED &&
5134 NLBLOCK->nllen == 2 &&
5135 c == NLBLOCK->nl[0])
5136 {
5137 md->hitend = TRUE;
5138 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5139 }
5140 break;
5141
5142 case OP_ALLANY:
5143 case OP_ANYBYTE:
5144 break;
5145
5146 case OP_ANYNL:
5147 switch(c)
5148 {
5149 default: RRETURN(MATCH_NOMATCH);
5150 case CHAR_CR:
5151 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5152 break;
5153
5154 case CHAR_LF:
5155 break;
5156
5157 case CHAR_VT:
5158 case CHAR_FF:
5159 case CHAR_NEL:
5160 #ifndef EBCDIC
5161 case 0x2028:
5162 case 0x2029:
5163 #endif /* Not EBCDIC */
5164 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5165 break;
5166 }
5167 break;
5168
5169 case OP_NOT_HSPACE:
5170 switch(c)
5171 {
5172 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5173 default: break;
5174 }
5175 break;
5176
5177 case OP_HSPACE:
5178 switch(c)
5179 {
5180 HSPACE_CASES: break;
5181 default: RRETURN(MATCH_NOMATCH);
5182 }
5183 break;
5184
5185 case OP_NOT_VSPACE:
5186 switch(c)
5187 {
5188 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5189 default: break;
5190 }
5191 break;
5192
5193 case OP_VSPACE:
5194 switch(c)
5195 {
5196 VSPACE_CASES: break;
5197 default: RRETURN(MATCH_NOMATCH);
5198 }
5199 break;
5200
5201 case OP_NOT_DIGIT:
5202 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5203 RRETURN(MATCH_NOMATCH);
5204 break;
5205
5206 case OP_DIGIT:
5207 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5208 RRETURN(MATCH_NOMATCH);
5209 break;
5210
5211 case OP_NOT_WHITESPACE:
5212 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5213 RRETURN(MATCH_NOMATCH);
5214 break;
5215
5216 case OP_WHITESPACE:
5217 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5218 RRETURN(MATCH_NOMATCH);
5219 break;
5220
5221 case OP_NOT_WORDCHAR:
5222 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5223 RRETURN(MATCH_NOMATCH);
5224 break;
5225
5226 case OP_WORDCHAR:
5227 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5228 RRETURN(MATCH_NOMATCH);
5229 break;
5230
5231 default:
5232 RRETURN(PCRE_ERROR_INTERNAL);
5233 }
5234 }
5235 }
5236 else
5237 #endif
5238 /* Not UTF mode */
5239 {
5240 for (fi = min;; fi++)
5241 {
5242 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5244 if (fi >= max) RRETURN(MATCH_NOMATCH);
5245 if (eptr >= md->end_subject)
5246 {
5247 SCHECK_PARTIAL();
5248 RRETURN(MATCH_NOMATCH);
5249 }
5250 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5251 RRETURN(MATCH_NOMATCH);
5252 c = *eptr++;
5253 switch(ctype)
5254 {
5255 case OP_ANY: /* This is the non-NL case */
5256 if (md->partial != 0 && /* Take care with CRLF partial */
5257 eptr >= md->end_subject &&
5258 NLBLOCK->nltype == NLTYPE_FIXED &&
5259 NLBLOCK->nllen == 2 &&
5260 c == NLBLOCK->nl[0])
5261 {
5262 md->hitend = TRUE;
5263 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5264 }
5265 break;
5266
5267 case OP_ALLANY:
5268 case OP_ANYBYTE:
5269 break;
5270
5271 case OP_ANYNL:
5272 switch(c)
5273 {
5274 default: RRETURN(MATCH_NOMATCH);
5275 case CHAR_CR:
5276 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5277 break;
5278
5279 case CHAR_LF:
5280 break;
5281
5282 case CHAR_VT:
5283 case CHAR_FF:
5284 case CHAR_NEL:
5285 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5286 case 0x2028:
5287 case 0x2029:
5288 #endif
5289 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5290 break;
5291 }
5292 break;
5293
5294 case OP_NOT_HSPACE:
5295 switch(c)
5296 {
5297 default: break;
5298 HSPACE_BYTE_CASES:
5299 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5300 HSPACE_MULTIBYTE_CASES:
5301 #endif
5302 RRETURN(MATCH_NOMATCH);
5303 }
5304 break;
5305
5306 case OP_HSPACE:
5307 switch(c)
5308 {
5309 default: RRETURN(MATCH_NOMATCH);
5310 HSPACE_BYTE_CASES:
5311 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5312 HSPACE_MULTIBYTE_CASES:
5313 #endif
5314 break;
5315 }
5316 break;
5317
5318 case OP_NOT_VSPACE:
5319 switch(c)
5320 {
5321 default: break;
5322 VSPACE_BYTE_CASES:
5323 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5324 VSPACE_MULTIBYTE_CASES:
5325 #endif
5326 RRETURN(MATCH_NOMATCH);
5327 }
5328 break;
5329
5330 case OP_VSPACE:
5331 switch(c)
5332 {
5333 default: RRETURN(MATCH_NOMATCH);
5334 VSPACE_BYTE_CASES:
5335 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5336 VSPACE_MULTIBYTE_CASES:
5337 #endif
5338 break;
5339 }
5340 break;
5341
5342 case OP_NOT_DIGIT:
5343 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5344 break;
5345
5346 case OP_DIGIT:
5347 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5348 break;
5349
5350 case OP_NOT_WHITESPACE:
5351 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5352 break;
5353
5354 case OP_WHITESPACE:
5355 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5356 break;
5357
5358 case OP_NOT_WORDCHAR:
5359 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5360 break;
5361
5362 case OP_WORDCHAR:
5363 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5364 break;
5365
5366 default:
5367 RRETURN(PCRE_ERROR_INTERNAL);
5368 }
5369 }
5370 }
5371 /* Control never gets here */
5372 }
5373
5374 /* If maximizing, it is worth using inline code for speed, doing the type
5375 test once at the start (i.e. keep it out of the loop). Again, keep the
5376 UTF-8 and UCP stuff separate. */
5377
5378 else
5379 {
5380 pp = eptr; /* Remember where we started */
5381
5382 #ifdef SUPPORT_UCP
5383 if (prop_type >= 0)
5384 {
5385 switch(prop_type)
5386 {
5387 case PT_ANY:
5388 for (i = min; i < max; i++)
5389 {
5390 int len = 1;
5391 if (eptr >= md->end_subject)
5392 {
5393 SCHECK_PARTIAL();
5394 break;
5395 }
5396 GETCHARLENTEST(c, eptr, len);
5397 if (prop_fail_result) break;
5398 eptr+= len;
5399 }
5400 break;
5401
5402 case PT_LAMP:
5403 for (i = min; i < max; i++)
5404 {
5405 int chartype;
5406 int len = 1;
5407 if (eptr >= md->end_subject)
5408 {
5409 SCHECK_PARTIAL();
5410 break;
5411 }
5412 GETCHARLENTEST(c, eptr, len);
5413 chartype = UCD_CHARTYPE(c);
5414 if ((chartype == ucp_Lu ||
5415 chartype == ucp_Ll ||
5416 chartype == ucp_Lt) == prop_fail_result)
5417 break;
5418 eptr+= len;
5419 }
5420 break;
5421
5422 case PT_GC:
5423 for (i = min; i < max; i++)
5424 {
5425 int len = 1;
5426 if (eptr >= md->end_subject)
5427 {
5428 SCHECK_PARTIAL();
5429 break;
5430 }
5431 GETCHARLENTEST(c, eptr, len);
5432 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5433 eptr+= len;
5434 }
5435 break;
5436
5437 case PT_PC:
5438 for (i = min; i < max; i++)
5439 {
5440 int len = 1;
5441 if (eptr >= md->end_subject)
5442 {
5443 SCHECK_PARTIAL();
5444 break;
5445 }
5446 GETCHARLENTEST(c, eptr, len);
5447 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5448 eptr+= len;
5449 }
5450 break;
5451
5452 case PT_SC:
5453 for (i = min; i < max; i++)
5454 {
5455 int len = 1;
5456 if (eptr >= md->end_subject)
5457 {
5458 SCHECK_PARTIAL();
5459 break;
5460 }
5461 GETCHARLENTEST(c, eptr, len);
5462 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5463 eptr+= len;
5464 }
5465 break;
5466
5467 case PT_ALNUM:
5468 for (i = min; i < max; i++)
5469 {
5470 int category;
5471 int len = 1;
5472 if (eptr >= md->end_subject)
5473 {
5474 SCHECK_PARTIAL();
5475 break;
5476 }
5477 GETCHARLENTEST(c, eptr, len);
5478 category = UCD_CATEGORY(c);
5479 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5480 break;
5481 eptr+= len;
5482 }
5483 break;
5484
5485 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5486 which means that Perl space and POSIX space are now identical. PCRE
5487 was changed at release 8.34. */
5488
5489 case PT_SPACE: /* Perl space */
5490 case PT_PXSPACE: /* POSIX space */
5491 for (i = min; i < max; i++)
5492 {
5493 int len = 1;
5494 if (eptr >= md->end_subject)
5495 {
5496 SCHECK_PARTIAL();
5497 break;
5498 }
5499 GETCHARLENTEST(c, eptr, len);
5500 switch(c)
5501 {
5502 HSPACE_CASES:
5503 VSPACE_CASES:
5504 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5505 break;
5506
5507 default:
5508 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5509 goto ENDLOOP99; /* Break the loop */
5510 break;
5511 }
5512 eptr+= len;
5513 }
5514 ENDLOOP99:
5515 break;
5516
5517 case PT_WORD:
5518 for (i = min; i < max; i++)
5519 {
5520 int category;
5521 int len = 1;
5522 if (eptr >= md->end_subject)
5523 {
5524 SCHECK_PARTIAL();
5525 break;
5526 }
5527 GETCHARLENTEST(c, eptr, len);
5528 category = UCD_CATEGORY(c);
5529 if ((category == ucp_L || category == ucp_N ||
5530 c == CHAR_UNDERSCORE) == prop_fail_result)
5531 break;
5532 eptr+= len;
5533 }
5534 break;
5535
5536 case PT_CLIST:
5537 for (i = min; i < max; i++)
5538 {
5539 const pcre_uint32 *cp;
5540 int len = 1;
5541 if (eptr >= md->end_subject)
5542 {
5543 SCHECK_PARTIAL();
5544 break;
5545 }
5546 GETCHARLENTEST(c, eptr, len);
5547 cp = PRIV(ucd_caseless_sets) + prop_value;
5548 for (;;)
5549 {
5550 if (c < *cp)
5551 { if (prop_fail_result) break; else goto GOT_MAX; }
5552 if (c == *cp++)
5553 { if (prop_fail_result) goto GOT_MAX; else break; }
5554 }
5555 eptr += len;
5556 }
5557 GOT_MAX:
5558 break;
5559
5560 case PT_UCNC:
5561 for (i = min; i < max; i++)
5562 {
5563 int len = 1;
5564 if (eptr >= md->end_subject)
5565 {
5566 SCHECK_PARTIAL();
5567 break;
5568 }
5569 GETCHARLENTEST(c, eptr, len);
5570 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5571 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5572 c >= 0xe000) == prop_fail_result)
5573 break;
5574 eptr += len;
5575 }
5576 break;
5577
5578 default:
5579 RRETURN(PCRE_ERROR_INTERNAL);
5580 }
5581
5582 /* eptr is now past the end of the maximum run */
5583
5584 if (possessive) continue; /* No backtracking */
5585 for(;;)
5586 {
5587 if (eptr == pp) goto TAIL_RECURSE;
5588 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5590 eptr--;
5591 if (utf) BACKCHAR(eptr);
5592 }
5593 }
5594
5595 /* Match extended Unicode grapheme clusters. We will get here only if the
5596 support is in the binary; otherwise a compile-time error occurs. */
5597
5598 else if (ctype == OP_EXTUNI)
5599 {
5600 for (i = min; i < max; i++)
5601 {
5602 if (eptr >= md->end_subject)
5603 {
5604 SCHECK_PARTIAL();
5605 break;
5606 }
5607 else
5608 {
5609 int lgb, rgb;
5610 GETCHARINCTEST(c, eptr);
5611 lgb = UCD_GRAPHBREAK(c);
5612 while (eptr < md->end_subject)
5613 {
5614 int len = 1;
5615 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5616 rgb = UCD_GRAPHBREAK(c);
5617 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5618 lgb = rgb;
5619 eptr += len;
5620 }
5621 }
5622 CHECK_PARTIAL();
5623 }
5624
5625 /* eptr is now past the end of the maximum run */
5626
5627 if (possessive) continue; /* No backtracking */
5628
5629 for(;;)
5630 {
5631 int lgb, rgb;
5632 PCRE_PUCHAR fptr;
5633
5634 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5635 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5637
5638 /* Backtracking over an extended grapheme cluster involves inspecting
5639 the previous two characters (if present) to see if a break is
5640 permitted between them. */
5641
5642 eptr--;
5643 if (!utf) c = *eptr; else
5644 {
5645 BACKCHAR(eptr);
5646 GETCHAR(c, eptr);
5647 }
5648 rgb = UCD_GRAPHBREAK(c);
5649
5650 for (;;)
5651 {
5652 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5653 fptr = eptr - 1;
5654 if (!utf) c = *fptr; else
5655 {
5656 BACKCHAR(fptr);
5657 GETCHAR(c, fptr);
5658 }
5659 lgb = UCD_GRAPHBREAK(c);
5660 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5661 eptr = fptr;
5662 rgb = lgb;
5663 }
5664 }
5665 }
5666
5667 else
5668 #endif /* SUPPORT_UCP */
5669
5670 #ifdef SUPPORT_UTF
5671 if (utf)
5672 {
5673 switch(ctype)
5674 {
5675 case OP_ANY:
5676 if (max < INT_MAX)
5677 {
5678 for (i = min; i < max; i++)
5679 {
5680 if (eptr >= md->end_subject)
5681 {
5682 SCHECK_PARTIAL();
5683 break;
5684 }
5685 if (IS_NEWLINE(eptr)) break;
5686 if (md->partial != 0 && /* Take care with CRLF partial */
5687 eptr + 1 >= md->end_subject &&
5688 NLBLOCK->nltype == NLTYPE_FIXED &&
5689 NLBLOCK->nllen == 2 &&
5690 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5691 {
5692 md->hitend = TRUE;
5693 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5694 }
5695 eptr++;
5696 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5697 }
5698 }
5699
5700 /* Handle unlimited UTF-8 repeat */
5701
5702 else
5703 {
5704 for (i = min; i < max; i++)
5705 {
5706 if (eptr >= md->end_subject)
5707 {
5708 SCHECK_PARTIAL();
5709 break;
5710 }
5711 if (IS_NEWLINE(eptr)) break;
5712 if (md->partial != 0 && /* Take care with CRLF partial */
5713 eptr + 1 >= md->end_subject &&
5714 NLBLOCK->nltype == NLTYPE_FIXED &&
5715 NLBLOCK->nllen == 2 &&
5716 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5717 {
5718 md->hitend = TRUE;
5719 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5720 }
5721 eptr++;
5722 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5723 }
5724 }
5725 break;
5726
5727 case OP_ALLANY:
5728 if (max < INT_MAX)
5729 {
5730 for (i = min; i < max; i++)
5731 {
5732 if (eptr >= md->end_subject)
5733 {
5734 SCHECK_PARTIAL();
5735 break;
5736 }
5737 eptr++;
5738 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5739 }
5740 }
5741 else
5742 {
5743 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5744 SCHECK_PARTIAL();
5745 }
5746 break;
5747
5748 /* The byte case is the same as non-UTF8 */
5749
5750 case OP_ANYBYTE:
5751 c = max - min;
5752 if (c > (unsigned int)(md->end_subject - eptr))
5753 {
5754 eptr = md->end_subject;
5755 SCHECK_PARTIAL();
5756 }
5757 else eptr += c;
5758 break;
5759
5760 case OP_ANYNL:
5761 for (i = min; i < max; i++)
5762 {
5763 int len = 1;
5764 if (eptr >= md->end_subject)
5765 {
5766 SCHECK_PARTIAL();
5767 break;
5768 }
5769 GETCHARLEN(c, eptr, len);
5770 if (c == CHAR_CR)
5771 {
5772 if (++eptr >= md->end_subject) break;
5773 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5774 }
5775 else
5776 {
5777 if (c != CHAR_LF &&
5778 (md->bsr_anycrlf ||
5779 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5780 #ifndef EBCDIC
5781 && c != 0x2028 && c != 0x2029
5782 #endif /* Not EBCDIC */
5783 )))
5784 break;
5785 eptr += len;
5786 }
5787 }
5788 break;
5789
5790 case OP_NOT_HSPACE:
5791 case OP_HSPACE:
5792 for (i = min; i < max; i++)
5793 {
5794 BOOL gotspace;
5795 int len = 1;
5796 if (eptr >= md->end_subject)
5797 {
5798 SCHECK_PARTIAL();
5799 break;
5800 }
5801 GETCHARLEN(c, eptr, len);
5802 switch(c)
5803 {
5804 HSPACE_CASES: gotspace = TRUE; break;
5805 default: gotspace = FALSE; break;
5806 }
5807 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5808 eptr += len;
5809 }
5810 break;
5811
5812 case OP_NOT_VSPACE:
5813 case OP_VSPACE:
5814 for (i = min; i < max; i++)
5815 {
5816 BOOL gotspace;
5817 int len = 1;
5818 if (eptr >= md->end_subject)
5819 {
5820 SCHECK_PARTIAL();
5821 break;
5822 }
5823 GETCHARLEN(c, eptr, len);
5824 switch(c)
5825 {
5826 VSPACE_CASES: gotspace = TRUE; break;
5827 default: gotspace = FALSE; break;
5828 }
5829 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5830 eptr += len;
5831 }
5832 break;
5833
5834 case OP_NOT_DIGIT:
5835 for (i = min; i < max; i++)
5836 {
5837 int len = 1;
5838 if (eptr >= md->end_subject)
5839 {
5840 SCHECK_PARTIAL();
5841 break;
5842 }
5843 GETCHARLEN(c, eptr, len);
5844 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5845 eptr+= len;
5846 }
5847 break;
5848
5849 case OP_DIGIT:
5850 for (i = min; i < max; i++)
5851 {
5852 int len = 1;
5853 if (eptr >= md->end_subject)
5854 {
5855 SCHECK_PARTIAL();
5856 break;
5857 }
5858 GETCHARLEN(c, eptr, len);
5859 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5860 eptr+= len;
5861 }
5862 break;
5863
5864 case OP_NOT_WHITESPACE:
5865 for (i = min; i < max; i++)
5866 {
5867 int len = 1;
5868 if (eptr >= md->end_subject)
5869 {
5870 SCHECK_PARTIAL();
5871 break;
5872 }
5873 GETCHARLEN(c, eptr, len);
5874 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5875 eptr+= len;
5876 }
5877 break;
5878
5879 case OP_WHITESPACE:
5880 for (i = min; i < max; i++)
5881 {
5882 int len = 1;
5883 if (eptr >= md->end_subject)
5884 {
5885 SCHECK_PARTIAL();
5886 break;
5887 }
5888 GETCHARLEN(c, eptr, len);
5889 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5890 eptr+= len;
5891 }
5892 break;
5893
5894 case OP_NOT_WORDCHAR:
5895 for (i = min; i < max; i++)
5896 {
5897 int len = 1;
5898 if (eptr >= md->end_subject)
5899 {
5900 SCHECK_PARTIAL();
5901 break;
5902 }
5903 GETCHARLEN(c, eptr, len);
5904 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5905 eptr+= len;
5906 }
5907 break;
5908
5909 case OP_WORDCHAR:
5910 for (i = min; i < max; i++)
5911 {
5912 int len = 1;
5913 if (eptr >= md->end_subject)
5914 {
5915 SCHECK_PARTIAL();
5916 break;
5917 }
5918 GETCHARLEN(c, eptr, len);
5919 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5920 eptr+= len;
5921 }
5922 break;
5923
5924 default:
5925 RRETURN(PCRE_ERROR_INTERNAL);
5926 }
5927
5928 if (possessive) continue; /* No backtracking */
5929 for(;;)
5930 {
5931 if (eptr == pp) goto TAIL_RECURSE;
5932 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5934 eptr--;
5935 BACKCHAR(eptr);
5936 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5937 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5938 }
5939 }
5940 else
5941 #endif /* SUPPORT_UTF */
5942 /* Not UTF mode */
5943 {
5944 switch(ctype)
5945 {
5946 case OP_ANY:
5947 for (i = min; i < max; i++)
5948 {
5949 if (eptr >= md->end_subject)
5950 {
5951 SCHECK_PARTIAL();
5952 break;
5953 }
5954 if (IS_NEWLINE(eptr)) break;
5955 if (md->partial != 0 && /* Take care with CRLF partial */
5956 eptr + 1 >= md->end_subject &&
5957 NLBLOCK->nltype == NLTYPE_FIXED &&
5958 NLBLOCK->nllen == 2 &&
5959 *eptr == NLBLOCK->nl[0])
5960 {
5961 md->hitend = TRUE;
5962 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5963 }
5964 eptr++;
5965 }
5966 break;
5967
5968 case OP_ALLANY:
5969 case OP_ANYBYTE:
5970 c = max - min;
5971 if (c > (unsigned int)(md->end_subject - eptr))
5972 {
5973 eptr = md->end_subject;
5974 SCHECK_PARTIAL();
5975 }
5976 else eptr += c;
5977 break;
5978
5979 case OP_ANYNL:
5980 for (i = min; i < max; i++)
5981 {
5982 if (eptr >= md->end_subject)
5983 {
5984 SCHECK_PARTIAL();
5985 break;
5986 }
5987 c = *eptr;
5988 if (c == CHAR_CR)
5989 {
5990 if (++eptr >= md->end_subject) break;
5991 if (*eptr == CHAR_LF) eptr++;
5992 }
5993 else
5994 {
5995 if (c != CHAR_LF && (md->bsr_anycrlf ||
5996 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5997 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5998 && c != 0x2028 && c != 0x2029
5999 #endif
6000 ))) break;
6001 eptr++;
6002 }
6003 }
6004 break;
6005
6006 case OP_NOT_HSPACE:
6007 for (i = min; i < max; i++)
6008 {
6009 if (eptr >= md->end_subject)
6010 {
6011 SCHECK_PARTIAL();
6012 break;
6013 }
6014 switch(*eptr)
6015 {
6016 default: eptr++; break;
6017 HSPACE_BYTE_CASES:
6018 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6019 HSPACE_MULTIBYTE_CASES:
6020 #endif
6021 goto ENDLOOP00;
6022 }
6023 }
6024 ENDLOOP00:
6025 break;
6026
6027 case OP_HSPACE:
6028 for (i = min; i < max; i++)
6029 {
6030 if (eptr >= md->end_subject)
6031 {
6032 SCHECK_PARTIAL();
6033 break;
6034 }
6035 switch(*eptr)
6036 {
6037 default: goto ENDLOOP01;
6038 HSPACE_BYTE_CASES:
6039 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6040 HSPACE_MULTIBYTE_CASES:
6041 #endif
6042 eptr++; break;
6043 }
6044 }
6045 ENDLOOP01:
6046 break;
6047
6048 case OP_NOT_VSPACE:
6049 for (i = min; i < max; i++)
6050 {
6051 if (eptr >= md->end_subject)
6052 {
6053 SCHECK_PARTIAL();
6054 break;
6055 }
6056 switch(*eptr)
6057 {
6058 default: eptr++; break;
6059 VSPACE_BYTE_CASES:
6060 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6061 VSPACE_MULTIBYTE_CASES:
6062 #endif
6063 goto ENDLOOP02;
6064 }
6065 }
6066 ENDLOOP02:
6067 break;
6068
6069 case OP_VSPACE:
6070 for (i = min; i < max; i++)
6071 {
6072 if (eptr >= md->end_subject)
6073 {
6074 SCHECK_PARTIAL();
6075 break;
6076 }
6077 switch(*eptr)
6078 {
6079 default: goto ENDLOOP03;
6080 VSPACE_BYTE_CASES:
6081 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6082 VSPACE_MULTIBYTE_CASES:
6083 #endif
6084 eptr++; break;
6085 }
6086 }
6087 ENDLOOP03:
6088 break;
6089
6090 case OP_NOT_DIGIT:
6091 for (i = min; i < max; i++)
6092 {
6093 if (eptr >= md->end_subject)
6094 {
6095 SCHECK_PARTIAL();
6096 break;
6097 }
6098 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6099 eptr++;
6100 }
6101 break;
6102
6103 case OP_DIGIT:
6104 for (i = min; i < max; i++)
6105 {
6106 if (eptr >= md->end_subject)
6107 {
6108 SCHECK_PARTIAL();
6109 break;
6110 }
6111 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6112 eptr++;
6113 }
6114 break;
6115
6116 case OP_NOT_WHITESPACE:
6117 for (i = min; i < max; i++)
6118 {
6119 if (eptr >= md->end_subject)
6120 {
6121 SCHECK_PARTIAL();
6122 break;
6123 }
6124 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6125 eptr++;
6126 }
6127 break;
6128
6129 case OP_WHITESPACE:
6130 for (i = min; i < max; i++)
6131 {
6132 if (eptr >= md->end_subject)
6133 {
6134 SCHECK_PARTIAL();
6135 break;
6136 }
6137 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6138 eptr++;
6139 }
6140 break;
6141
6142 case OP_NOT_WORDCHAR:
6143 for (i = min; i < max; i++)
6144 {
6145 if (eptr >= md->end_subject)
6146 {
6147 SCHECK_PARTIAL();
6148 break;
6149 }
6150 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6151 eptr++;
6152 }
6153 break;
6154
6155 case OP_WORDCHAR:
6156 for (i = min; i < max; i++)
6157 {
6158 if (eptr >= md->end_subject)
6159 {
6160 SCHECK_PARTIAL();
6161 break;
6162 }
6163 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6164 eptr++;
6165 }
6166 break;
6167
6168 default:
6169 RRETURN(PCRE_ERROR_INTERNAL);
6170 }
6171
6172 if (possessive) continue; /* No backtracking */
6173 for (;;)
6174 {
6175 if (eptr == pp) goto TAIL_RECURSE;
6176 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6177 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6178 eptr--;
6179 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6180 eptr[-1] == CHAR_CR) eptr--;
6181 }
6182 }
6183
6184 /* Control never gets here */
6185 }
6186
6187 /* There's been some horrible disaster. Arrival here can only mean there is
6188 something seriously wrong in the code above or the OP_xxx definitions. */
6189
6190 default:
6191 DPRINTF(("Unknown opcode %d\n", *ecode));
6192 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6193 }
6194
6195 /* Do not stick any code in here without much thought; it is assumed
6196 that "continue" in the code above comes out to here to repeat the main
6197 loop. */
6198
6199 } /* End of main loop */
6200 /* Control never reaches here */
6201
6202
6203 /* When compiling to use the heap rather than the stack for recursive calls to
6204 match(), the RRETURN() macro jumps here. The number that is saved in
6205 frame->Xwhere indicates which label we actually want to return to. */
6206
6207 #ifdef NO_RECURSE
6208 #define LBL(val) case val: goto L_RM##val;
6209 HEAP_RETURN:
6210 switch (frame->Xwhere)
6211 {
6212 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6213 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6214 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6215 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6216 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6217 LBL(65) LBL(66)
6218 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6219 LBL(21)
6220 #endif
6221 #ifdef SUPPORT_UTF
6222 LBL(16) LBL(18) LBL(20)
6223 LBL(22) LBL(23) LBL(28) LBL(30)
6224 LBL(32) LBL(34) LBL(42) LBL(46)
6225 #ifdef SUPPORT_UCP
6226 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6227 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6228 #endif /* SUPPORT_UCP */
6229 #endif /* SUPPORT_UTF */
6230 default:
6231 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6232 return PCRE_ERROR_INTERNAL;
6233 }
6234 #undef LBL
6235 #endif /* NO_RECURSE */
6236 }
6237
6238
6239 /***************************************************************************
6240 ****************************************************************************
6241 RECURSION IN THE match() FUNCTION
6242
6243 Undefine all the macros that were defined above to handle this. */
6244
6245 #ifdef NO_RECURSE /* These names are macros only in the NO_RECURSE build */
6246 #undef eptr
6247 #undef ecode
6248 #undef mstart
6249 #undef offset_top
6250 #undef eptrb
6251 #undef flags
6252
6253 #undef callpat
6254 #undef charptr
6255 #undef data
6256 #undef next
6257 #undef pp
6258 #undef prev
6259 #undef saved_eptr
6260
6261 #undef new_recursive
6262
6263 #undef cur_is_word
6264 #undef condition
6265 #undef prev_is_word
6266
6267 #undef ctype
6268 #undef length
6269 #undef max
6270 #undef min
6271 #undef number
6272 #undef offset
6273 #undef op
6274 #undef save_capture_last
6275 #undef save_offset1
6276 #undef save_offset2
6277 #undef save_offset3
6278 #undef stacksave
6279
6280 #undef newptrb
6281
6282 #endif /* NO_RECURSE */
6283
6284 /* These two are defined as macros in both cases, that is, with and without
NO_RECURSE, so they are undefined outside the conditional section. */
6285
6286 #undef fc
6287 #undef fi
6288
6289 /***************************************************************************
6290 ***************************************************************************/
6291
6292
6293 #ifdef NO_RECURSE
6294 /*************************************************
6295 * Release allocated heap frames *
6296 *************************************************/
6297
6298 /* This function releases all the allocated frames. The base frame is on the
6299 machine stack, and so must not be freed.
6300
6301 Argument: the address of the base frame
6302 Returns: nothing
6303 */
6304
6305 static void
6306 release_match_heapframes (heapframe *frame_base)
6307 {
6308 heapframe *nextframe = frame_base->Xnextframe;
6309 while (nextframe != NULL)
6310 {
6311 heapframe *oldframe = nextframe;
6312 nextframe = nextframe->Xnextframe;
6313 (PUBL(stack_free))(oldframe);
6314 }
6315 }
6316 #endif
6317
6318
6319 /*************************************************
6320 * Execute a Regular Expression *
6321 *************************************************/
6322
6323 /* This function applies a compiled re to a subject string and picks out
6324 portions of the string if it matches. Two elements in the vector are set for
6325 each substring: the offsets to the start and end of the substring.
6326
6327 Arguments:
6328 argument_re points to the compiled expression
6329 extra_data points to extra data or is NULL
6330 subject points to the subject string
6331 length length of subject string (may contain binary zeros)
6332 start_offset where to start in the subject string
6333 options option bits
6334 offsets points to a vector of ints to be filled in with offsets
6335 offsetcount the number of elements in the vector
6336
6337 Returns: > 0 => success; value is the number of elements filled in
6338 = 0 => success, but offsets is not big enough
6339 -1 => failed to match
6340 < -1 => some kind of unexpected problem
6341 */
6342
6343 #if defined COMPILE_PCRE8
6344 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6345 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6346 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6347 int offsetcount)
6348 #elif defined COMPILE_PCRE16
6349 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6350 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6351 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6352 int offsetcount)
6353 #elif defined COMPILE_PCRE32
6354 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6355 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6356 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6357 int offsetcount)
6358 #endif
6359 {
6360 int rc, ocount, arg_offset_max;
6361 int newline;
6362 BOOL using_temporary_offsets = FALSE;
6363 BOOL anchored;
6364 BOOL startline;
6365 BOOL firstline;
6366 BOOL utf;
6367 BOOL has_first_char = FALSE;
6368 BOOL has_req_char = FALSE;
6369 pcre_uchar first_char = 0;
6370 pcre_uchar first_char2 = 0;
6371 pcre_uchar req_char = 0;
6372 pcre_uchar req_char2 = 0;
6373 match_data match_block;
6374 match_data *md = &match_block;
6375 const pcre_uint8 *tables;
6376 const pcre_uint8 *start_bits = NULL;
6377 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6378 PCRE_PUCHAR end_subject;
6379 PCRE_PUCHAR start_partial = NULL;
6380 PCRE_PUCHAR match_partial = NULL;
6381 PCRE_PUCHAR req_char_ptr = start_match - 1;
6382
6383 const pcre_study_data *study;
6384 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6385
6386 #ifdef NO_RECURSE
6387 heapframe frame_zero;
6388 frame_zero.Xprevframe = NULL; /* Marks the top level */
6389 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6390 md->match_frames_base = &frame_zero;
6391 #endif
6392
6393 /* Check for the special magic call that measures the size of the stack used
6394 per recursive call of match(). Without the funny casting for sizeof, a Windows
6395 compiler gave this error: "unary minus operator applied to unsigned type,
6396 result still unsigned". Hopefully the cast fixes that. */
6397
6398 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6399 start_offset == -999)
6400 #ifdef NO_RECURSE
6401 return -((int)sizeof(heapframe));
6402 #else
6403 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6404 #endif
6405
6406 /* Plausibility checks */
6407
6408 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6409 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6410 return PCRE_ERROR_NULL;
6411 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6412 if (length < 0) return PCRE_ERROR_BADLENGTH;
6413 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6414
6415 /* Check that the first field in the block is the magic number. If it is not,
6416 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6417 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6418 means that the pattern is likely compiled with different endianness. */
6419
6420 if (re->magic_number != MAGIC_NUMBER)
6421 return re->magic_number == REVERSED_MAGIC_NUMBER?
6422 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6423 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6424
6425 /* These two settings are used in the code for checking a UTF-8 string that
6426 follows immediately afterwards. Other values in the md block are used only
6427 during "normal" pcre_exec() processing, not when the JIT support is in use,
6428 so they are set up later. */
6429
6430 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6431 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6432 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6433 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6434
6435 /* Check a UTF-8 string if required. Pass back the character offset and error
6436 code for an invalid string if a results vector is available. */
6437
6438 #ifdef SUPPORT_UTF
6439 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6440 {
6441 int erroroffset;
6442 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6443 if (errorcode != 0)
6444 {
6445 if (offsetcount >= 2)
6446 {
6447 offsets[0] = erroroffset;
6448 offsets[1] = errorcode;
6449 }
6450 #if defined COMPILE_PCRE8
6451 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6452 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6453 #elif defined COMPILE_PCRE16
6454 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6455 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6456 #elif defined COMPILE_PCRE32
6457 return PCRE_ERROR_BADUTF32;
6458 #endif
6459 }
6460 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6461 /* Check that a start_offset points to the start of a UTF character. */
6462 if (start_offset > 0 && start_offset < length &&
6463 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6464 return PCRE_ERROR_BADUTF8_OFFSET;
6465 #endif
6466 }
6467 #endif
6468
6469 /* If the pattern was successfully studied with JIT support, run the JIT
6470 executable instead of the rest of this function. Most options must be set at
6471 compile time for the JIT code to be usable. Fall back to the normal code path if
6472 an unsupported flag is set. */
6473
6474 #ifdef SUPPORT_JIT
6475 if (extra_data != NULL
6476 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6477 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6478 && extra_data->executable_jit != NULL
6479 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6480 {
6481 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6482 start_offset, options, offsets, offsetcount);
6483
6484 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6485 mode is not compiled. In this case we simply fall back to the interpreter. */
6486
6487 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6488 }
6489 #endif
6490
6491 /* Carry on with non-JIT matching. This information is for finding all the
6492 numbers associated with a given name, for condition testing. */
6493
6494 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6495 md->name_count = re->name_count;
6496 md->name_entry_size = re->name_entry_size;
6497
6498 /* Fish out the optional data from the extra_data structure, first setting
6499 the default values. */
6500
6501 study = NULL;
6502 md->match_limit = MATCH_LIMIT;
6503 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6504 md->callout_data = NULL;
6505
6506 /* The table pointer is always in native byte order. */
6507
6508 tables = re->tables;
6509
6510 /* The two limit values override the defaults, whatever their value. */
6511
6512 if (extra_data != NULL)
6513 {
6514 register unsigned int flags = extra_data->flags;
6515 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6516 study = (const pcre_study_data *)extra_data->study_data;
6517 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6518 md->match_limit = extra_data->match_limit;
6519 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6520 md->match_limit_recursion = extra_data->match_limit_recursion;
6521 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6522 md->callout_data = extra_data->callout_data;
6523 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6524 }
6525
6526 /* Limits in the regex override only if they are smaller. */
6527
6528 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6529 md->match_limit = re->limit_match;
6530
6531 if ((re->flags & PCRE_RLSET) != 0 &&
6532 re->limit_recursion < md->match_limit_recursion)
6533 md->match_limit_recursion = re->limit_recursion;
6534
6535 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6536 is a feature that makes it possible to save compiled regex and re-use them
6537 in other programs later. */
6538
6539 if (tables == NULL) tables = PRIV(default_tables);
6540
6541 /* Set up other data */
6542
6543 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6544 startline = (re->flags & PCRE_STARTLINE) != 0;
6545 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6546
6547 /* The code starts after the real_pcre block and the capture name table. */
6548
6549 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6550 re->name_count * re->name_entry_size;
6551
6552 md->start_subject = (PCRE_PUCHAR)subject;
6553 md->start_offset = start_offset;
6554 md->end_subject = md->start_subject + length;
6555 end_subject = md->end_subject;
6556
6557 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6558 md->use_ucp = (re->options & PCRE_UCP) != 0;
6559 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6560 md->ignore_skip_arg = 0;
6561
6562 /* Some options are unpacked into BOOL variables in the hope that testing
6563 them will be faster than individual option bits. */
6564
6565 md->notbol = (options & PCRE_NOTBOL) != 0;
6566 md->noteol = (options & PCRE_NOTEOL) != 0;
6567 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6568 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6569
6570 md->hitend = FALSE;
6571 md->mark = md->nomatch_mark = NULL; /* In case never set */
6572
6573 md->recursive = NULL; /* No recursion at top level */
6574 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6575
6576 md->lcc = tables + lcc_offset;
6577 md->fcc = tables + fcc_offset;
6578 md->ctypes = tables + ctypes_offset;
6579
6580 /* Handle different \R options. */
6581
6582 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6583 {
6584 case 0:
6585 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6586 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6587 else
6588 #ifdef BSR_ANYCRLF
6589 md->bsr_anycrlf = TRUE;
6590 #else
6591 md->bsr_anycrlf = FALSE;
6592 #endif
6593 break;
6594
6595 case PCRE_BSR_ANYCRLF:
6596 md->bsr_anycrlf = TRUE;
6597 break;
6598
6599 case PCRE_BSR_UNICODE:
6600 md->bsr_anycrlf = FALSE;
6601 break;
6602
6603 default: return PCRE_ERROR_BADNEWLINE;
6604 }
6605
6606 /* Handle different types of newline. The three bits give eight cases. If
6607 nothing is set at run time, whatever was used at compile time applies. */
6608
6609 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6610 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6611 {
6612 case 0: newline = NEWLINE; break; /* Compile-time default */
6613 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6614 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6615 case PCRE_NEWLINE_CR+
6616 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6617 case PCRE_NEWLINE_ANY: newline = -1; break;
6618 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6619 default: return PCRE_ERROR_BADNEWLINE;
6620 }
6621
6622 if (newline == -2)
6623 {
6624 md->nltype = NLTYPE_ANYCRLF;
6625 }
6626 else if (newline < 0)
6627 {
6628 md->nltype = NLTYPE_ANY;
6629 }
6630 else
6631 {
6632 md->nltype = NLTYPE_FIXED;
6633 if (newline > 255)
6634 {
6635 md->nllen = 2;
6636 md->nl[0] = (newline >> 8) & 255;
6637 md->nl[1] = newline & 255;
6638 }
6639 else
6640 {
6641 md->nllen = 1;
6642 md->nl[0] = newline;
6643 }
6644 }
6645
6646 /* Partial matching was originally supported only for a restricted set of
6647 regexes; from release 8.00 there are no restrictions, but the bits are still
6648 defined (though never set). So there's no harm in leaving this code. */
6649
6650 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6651 return PCRE_ERROR_BADPARTIAL;
6652
6653 /* If the expression has got more back references than the offsets supplied can
6654 hold, we get a temporary chunk of working store to use during the matching.
6655 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6656 of 3. */
6657
6658 ocount = offsetcount - (offsetcount % 3);
6659 arg_offset_max = (2*ocount)/3;
6660
6661 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6662 {
6663 ocount = re->top_backref * 3 + 3;
6664 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6665 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6666 using_temporary_offsets = TRUE;
6667 DPRINTF(("Got memory to hold back references\n"));
6668 }
6669 else md->offset_vector = offsets;
6670 md->offset_end = ocount;
6671 md->offset_max = (2*ocount)/3;
6672 md->capture_last = 0;
6673
6674 /* Reset the working variable associated with each extraction. These should
6675 never be used unless previously set, but they get saved and restored, and so we
6676 initialize them to avoid reading uninitialized locations. Also, unset the
6677 offsets for the matched string. This is really just for tidiness with callouts,
6678 in case they inspect these fields. */
6679
6680 if (md->offset_vector != NULL)
6681 {
6682 register int *iptr = md->offset_vector + ocount;
6683 register int *iend = iptr - re->top_bracket;
6684 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6685 while (--iptr >= iend) *iptr = -1;
6686 md->offset_vector[0] = md->offset_vector[1] = -1;
6687 }
6688
6689 /* Set up the first character to match, if available. The first_char value is
6690 never set for an anchored regular expression, but the anchoring may be forced
6691 at run time, so we have to test for anchoring. The first char may be unset for
6692 an unanchored pattern, of course. If there's no first char and the pattern was
6693 studied, there may be a bitmap of possible first characters. */
6694
6695 if (!anchored)
6696 {
6697 if ((re->flags & PCRE_FIRSTSET) != 0)
6698 {
6699 has_first_char = TRUE;
6700 first_char = first_char2 = (pcre_uchar)(re->first_char);
6701 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6702 {
6703 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6704 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6705 if (utf && first_char > 127)
6706 first_char2 = UCD_OTHERCASE(first_char);
6707 #endif
6708 }
6709 }
6710 else
6711 if (!startline && study != NULL &&
6712 (study->flags & PCRE_STUDY_MAPPED) != 0)
6713 start_bits = study->start_bits;
6714 }
6715
6716 /* For anchored or unanchored matches, there may be a "last known required
6717 character" set. */
6718
6719 if ((re->flags & PCRE_REQCHSET) != 0)
6720 {
6721 has_req_char = TRUE;
6722 req_char = req_char2 = (pcre_uchar)(re->req_char);
6723 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6724 {
6725 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6726 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6727 if (utf && req_char > 127)
6728 req_char2 = UCD_OTHERCASE(req_char);
6729 #endif
6730 }
6731 }
6732
6733
6734 /* ==========================================================================*/
6735
6736 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6737 the loop runs just once. */
6738
6739 for(;;)
6740 {
6741 PCRE_PUCHAR save_end_subject = end_subject;
6742 PCRE_PUCHAR new_start_match;
6743
6744 /* If firstline is TRUE, the start of the match is constrained to the first
6745 line of a multiline string. That is, the match must be before or at the first
6746 newline. Implement this by temporarily adjusting end_subject so that we stop
6747 scanning at a newline. If the match fails at the newline, later code breaks
6748 this loop. */
6749
6750 if (firstline)
6751 {
6752 PCRE_PUCHAR t = start_match;
6753 #ifdef SUPPORT_UTF
6754 if (utf)
6755 {
6756 while (t < md->end_subject && !IS_NE