/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1364 - (show annotations)
Sat Oct 5 15:45:11 2013 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 219344 byte(s)
Error occurred while calculating annotation data.
Add VT to the set of characters recognized as white space.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
#define MATCH_NOMATCH        0
#define MATCH_MATCH          1

/* Results that are only ever seen inside this module. They are chosen to be
far more negative than any externally defined error code so the two ranges
cannot collide. */

#define MATCH_ACCEPT         (-999)
#define MATCH_KETRPOS        (-998)
#define MATCH_ONCE           (-997)

/* The five backtracking-control results below must remain consecutive and in
this order: code that wants to know "is this any of them?" tests against the
range MATCH_BACKTRACK_MIN .. MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT         (-996)
#define MATCH_PRUNE          (-995)
#define MATCH_SKIP           (-994)
#define MATCH_SKIP_ARG       (-993)
#define MATCH_THEN           (-992)

#define MATCH_BACKTRACK_MIN  MATCH_COMMIT
#define MATCH_BACKTRACK_MAX  MATCH_THEN
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
109
/* Two parallel six-entry tables giving the minimum and maximum count for each
of the common repeat forms; a maximum of 0 stands for "no upper limit".
NOTE(review): the entries presumably correspond to the repeat opcodes in the
order *, *?, +, +?, ?, ?? - confirm against the code that indexes them. */

static const char rep_min[] = {
  0,
  0,
  1,
  1,
  0,
  0
};

static const char rep_max[] = {
  0,
  0,
  0,
  0,
  1,
  1
};
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions code have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case)
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start);
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
306 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
307 below must be updated in sync. */
308
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65,    RM66, RM67, RM68
};
316
317 /* These versions of the macros use the stack, as normal. There are debugging
318 versions and production versions. Note that the "rw" argument of RMATCH isn't
319 actually used in this definition. */
320
321 #ifndef NO_RECURSE
322 #define REGISTER register
323
/* Production forms: RMATCH is an ordinary recursive call of match() whose
result is left in rrc, and RRETURN is an ordinary return. The call-site
number (last argument of RMATCH) is not used here. */

#ifndef PCRE_DEBUG
#define RMATCH(r_eptr,r_ecode,r_otop,r_md,r_eptrb,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_otop,r_md,r_eptrb,rdepth+1)
#define RRETURN(r_value) return r_value

/* Debugging forms: the same actions, bracketed by trace output that reports
the source line of the call or return. */

#else
#define RMATCH(r_eptr,r_ecode,r_otop,r_md,r_eptrb,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_otop,r_md,r_eptrb,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d\n", r_value, __LINE__); \
  return r_value; \
  }
#endif
341
342 #else
343
344
345 /* These versions of the macros manage a private stack on the heap. Note that
346 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347 argument of match(), which never changes. */
348
349 #define REGISTER
350
/* RMATCH, heap-frame form. Take the frame already chained after the current
one, or allocate and chain a new one if there is none (giving
PCRE_ERROR_NOMEMORY via RRETURN if allocation fails). Record the call-site
number in the current frame, fill in the new frame's argument fields, make it
current, and jump to HEAP_RECURSE. The label L_<call-site> marks where
execution resumes afterwards. */

#define RMATCH(r_eptr,r_ecode,r_otop,r_md,r_eptrb,r_where)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    frame->Xnextframe = newframe;\
    newframe->Xnextframe = NULL;\
    }\
  frame->Xwhere = r_where;\
  newframe->Xprevframe = frame;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xeptr = r_eptr;\
  newframe->Xecode = r_ecode;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = r_otop;\
  newframe->Xeptrb = r_eptrb;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##r_where:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN, heap-frame form. Step back to the previous frame. If there is
none, this was the outermost level, so really return from the function;
otherwise leave the result in rrc and jump to HEAP_RETURN. Note that the
result expression is evaluated after the frame pointer has been moved. */

#define RRETURN(r_value)\
  {\
  frame = frame->Xprevframe;\
  if (frame == NULL) return r_value;\
  rrc = r_value;\
  goto HEAP_RETURN;\
  }
387
388
389 /* Structure for remembering the local variables in a private frame */
390
391 typedef struct heapframe {
392 struct heapframe *Xprevframe;
393 struct heapframe *Xnextframe;
394
395 /* Function arguments that may change */
396
397 PCRE_PUCHAR Xeptr;
398 const pcre_uchar *Xecode;
399 PCRE_PUCHAR Xmstart;
400 int Xoffset_top;
401 eptrblock *Xeptrb;
402 unsigned int Xrdepth;
403
404 /* Function local variables */
405
406 PCRE_PUCHAR Xcallpat;
407 #ifdef SUPPORT_UTF
408 PCRE_PUCHAR Xcharptr;
409 #endif
410 PCRE_PUCHAR Xdata;
411 PCRE_PUCHAR Xnext;
412 PCRE_PUCHAR Xpp;
413 PCRE_PUCHAR Xprev;
414 PCRE_PUCHAR Xsaved_eptr;
415
416 recursion_info Xnew_recursive;
417
418 BOOL Xcur_is_word;
419 BOOL Xcondition;
420 BOOL Xprev_is_word;
421
422 #ifdef SUPPORT_UCP
423 int Xprop_type;
424 unsigned int Xprop_value;
425 int Xprop_fail_result;
426 int Xoclength;
427 pcre_uchar Xocchars[6];
428 #endif
429
430 int Xcodelink;
431 int Xctype;
432 unsigned int Xfc;
433 int Xfi;
434 int Xlength;
435 int Xmax;
436 int Xmin;
437 unsigned int Xnumber;
438 int Xoffset;
439 unsigned int Xop;
440 pcre_int32 Xsave_capture_last;
441 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442 int Xstacksave[REC_STACK_SAVE_MAX];
443
444 eptrblock Xnewptrb;
445
446 /* Where to jump back to */
447
448 int Xwhere;
449
450 } heapframe;
451
452 #endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
468 /* These macros pack up tests that are used for partial matching, and which
469 appear several times in the code. We set the "hit end" flag if the pointer is
470 at the end of the subject and also past the start of the subject (i.e.
471 something has been matched). For hard partial matching, we then return
472 immediately. The second one is used when we already know we are past the end of
473 the subject. */
474
/* SCHECK_PARTIAL is for places where the subject pointer is already known to
be at or beyond the end of the subject. If partial matching is enabled and
the pointer is past md->start_used_ptr, note that the end was hit; for hard
partial matching (md->partial > 1) give PCRE_ERROR_PARTIAL straight away. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* CHECK_PARTIAL is the general form: it does the same thing, but only after
first establishing that the subject pointer has reached the end of the
subject. */

#define CHECK_PARTIAL()\
  if (eptr >= md->end_subject) \
    { \
    SCHECK_PARTIAL() \
    }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 case OP_COMMIT:
785 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
786 eptrb, RM52);
787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
788 RRETURN(MATCH_COMMIT);
789
790 case OP_PRUNE:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
792 eptrb, RM51);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 RRETURN(MATCH_PRUNE);
795
796 case OP_PRUNE_ARG:
797 md->nomatch_mark = ecode + 2;
798 md->mark = NULL; /* In case previously set by assertion */
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM56);
801 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
802 md->mark == NULL) md->mark = ecode + 2;
803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
804 RRETURN(MATCH_PRUNE);
805
806 case OP_SKIP:
807 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
808 eptrb, RM53);
809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
810 md->start_match_ptr = eptr; /* Pass back current position */
811 RRETURN(MATCH_SKIP);
812
813 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
814 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
815 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
816 that failed and any that precede it (either they also failed, or were not
817 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
818 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
819 set to the count of the one that failed. */
820
821 case OP_SKIP_ARG:
822 md->skip_arg_count++;
823 if (md->skip_arg_count <= md->ignore_skip_arg)
824 {
825 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 break;
827 }
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
829 eptrb, RM57);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831
832 /* Pass back the current skip name by overloading md->start_match_ptr and
833 returning the special MATCH_SKIP_ARG return code. This will either be
834 caught by a matching MARK, or get to the top, where it causes a rematch
835 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
836
837 md->start_match_ptr = ecode + 2;
838 RRETURN(MATCH_SKIP_ARG);
839
840 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
841 the branch in which it occurs can be determined. Overload the start of
842 match pointer to do this. */
843
844 case OP_THEN:
845 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
846 eptrb, RM54);
847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848 md->start_match_ptr = ecode;
849 RRETURN(MATCH_THEN);
850
851 case OP_THEN_ARG:
852 md->nomatch_mark = ecode + 2;
853 md->mark = NULL; /* In case previously set by assertion */
854 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
855 md, eptrb, RM58);
856 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
857 md->mark == NULL) md->mark = ecode + 2;
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 md->start_match_ptr = ecode;
860 RRETURN(MATCH_THEN);
861
862 /* Handle an atomic group that does not contain any capturing parentheses.
863 This can be handled like an assertion. Prior to 8.13, all atomic groups
864 were handled this way. In 8.13, the code was changed as below for ONCE, so
865 that backups pass through the group and thereby reset captured values.
866 However, this uses a lot more stack, so in 8.20, atomic groups that do not
867 contain any captures generate OP_ONCE_NC, which can be handled in the old,
868 less stack intensive way.
869
870 Check the alternative branches in turn - the matching won't pass the KET
871 for this kind of subpattern. If any one branch matches, we carry on as at
872 the end of a normal bracket, leaving the subject pointer, but resetting
873 the start-of-match value in case it was changed by \K. */
874
875 case OP_ONCE_NC:
876 prev = ecode;
877 saved_eptr = eptr;
878 save_mark = md->mark;
879 do
880 {
881 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
882 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
883 {
884 mstart = md->start_match_ptr;
885 break;
886 }
887 if (rrc == MATCH_THEN)
888 {
889 next = ecode + GET(ecode,1);
890 if (md->start_match_ptr < next &&
891 (*ecode == OP_ALT || *next == OP_ALT))
892 rrc = MATCH_NOMATCH;
893 }
894
895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896 ecode += GET(ecode,1);
897 md->mark = save_mark;
898 }
899 while (*ecode == OP_ALT);
900
901 /* If hit the end of the group (which could be repeated), fail */
902
903 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
904
905 /* Continue as from after the group, updating the offsets high water
906 mark, since extracts may have been taken. */
907
908 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
909
910 offset_top = md->end_offset_top;
911 eptr = md->end_match_ptr;
912
913 /* For a non-repeating ket, just continue at this level. This also
914 happens for a repeating ket if no characters were matched in the group.
915 This is the forcible breaking of infinite loops as implemented in Perl
916 5.005. */
917
918 if (*ecode == OP_KET || eptr == saved_eptr)
919 {
920 ecode += 1+LINK_SIZE;
921 break;
922 }
923
924 /* The repeating kets try the rest of the pattern or restart from the
925 preceding bracket, in the appropriate order. The second "call" of match()
926 uses tail recursion, to avoid using another stack frame. */
927
928 if (*ecode == OP_KETRMIN)
929 {
930 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
932 ecode = prev;
933 goto TAIL_RECURSE;
934 }
935 else /* OP_KETRMAX */
936 {
937 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939 ecode += 1 + LINK_SIZE;
940 goto TAIL_RECURSE;
941 }
942 /* Control never gets here */
943
944 /* Handle a capturing bracket, other than those that are possessive with an
945 unlimited repeat. If there is space in the offset vector, save the current
946 subject position in the working slot at the top of the vector. We mustn't
947 change the current values of the data slot, because they may be set from a
948 previous iteration of this group, and be referred to by a reference inside
949 the group. A failure to match might occur after the group has succeeded,
950 if something later on doesn't match. For this reason, we need to restore
951 the working value and also the values of the final offsets, in case they
952 were set by a previous iteration of the same bracket.
953
954 If there isn't enough space in the offset vector, treat this as if it were
955 a non-capturing bracket. Don't worry about setting the flag for the error
956 case here; that is handled in the code for KET. */
957
958 case OP_CBRA:
959 case OP_SCBRA:
960 number = GET2(ecode, 1+LINK_SIZE);
961 offset = number << 1;
962
963 #ifdef PCRE_DEBUG
964 printf("start bracket %d\n", number);
965 printf("subject=");
966 pchars(eptr, 16, TRUE, md);
967 printf("\n");
968 #endif
969
970 if (offset < md->offset_max)
971 {
972 save_offset1 = md->offset_vector[offset];
973 save_offset2 = md->offset_vector[offset+1];
974 save_offset3 = md->offset_vector[md->offset_end - number];
975 save_capture_last = md->capture_last;
976 save_mark = md->mark;
977
978 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
979 md->offset_vector[md->offset_end - number] =
980 (int)(eptr - md->start_subject);
981
982 for (;;)
983 {
984 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
986 eptrb, RM1);
987 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
988
989 /* If we backed up to a THEN, check whether it is within the current
990 branch by comparing the address of the THEN that is passed back with
991 the end of the branch. If it is within the current branch, and the
992 branch is one of two or more alternatives (it either starts or ends
993 with OP_ALT), we have reached the limit of THEN's action, so convert
994 the return code to NOMATCH, which will cause normal backtracking to
995 happen from now on. Otherwise, THEN is passed back to an outer
996 alternative. This implements Perl's treatment of parenthesized groups,
997 where a group not containing | does not affect the current alternative,
998 that is, (X) is NOT the same as (X|(*F)). */
999
1000 if (rrc == MATCH_THEN)
1001 {
1002 next = ecode + GET(ecode,1);
1003 if (md->start_match_ptr < next &&
1004 (*ecode == OP_ALT || *next == OP_ALT))
1005 rrc = MATCH_NOMATCH;
1006 }
1007
1008 /* Anything other than NOMATCH is passed back. */
1009
1010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011 md->capture_last = save_capture_last;
1012 ecode += GET(ecode, 1);
1013 md->mark = save_mark;
1014 if (*ecode != OP_ALT) break;
1015 }
1016
1017 DPRINTF(("bracket %d failed\n", number));
1018 md->offset_vector[offset] = save_offset1;
1019 md->offset_vector[offset+1] = save_offset2;
1020 md->offset_vector[md->offset_end - number] = save_offset3;
1021
1022 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1023
1024 RRETURN(rrc);
1025 }
1026
1027 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1028 as a non-capturing bracket. */
1029
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032
1033 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1034
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037
1038 /* Non-capturing or atomic group, except for possessive with unlimited
1039 repeat and ONCE group with no captures. Loop for all the alternatives.
1040
1041 When we get to the final alternative within the brackets, we used to return
1042 the result of a recursive call to match() whatever happened so it was
1043 possible to reduce stack usage by turning this into a tail recursion,
1044 except in the case of a possibly empty group. However, now that there is
1045 the possibility of (*THEN) occurring in the final alternative, this
1046 optimization is no longer always possible.
1047
1048 We can optimize if we know there are no (*THEN)s in the pattern; at present
1049 this is the best that can be done.
1050
1051 MATCH_ONCE is returned when the end of an atomic group is successfully
1052 reached, but subsequent matching fails. It passes back up the tree (causing
1053 captured values to be reset) until the original atomic group level is
1054 reached. This is tested by comparing md->once_target with the start of the
1055 group. At this point, the return is converted into MATCH_NOMATCH so that
1056 previous backup points can be taken. */
1057
1058 case OP_ONCE:
1059 case OP_BRA:
1060 case OP_SBRA:
1061 DPRINTF(("start non-capturing bracket\n"));
1062
1063 for (;;)
1064 {
1065 if (op >= OP_SBRA || op == OP_ONCE)
1066 md->match_function_type = MATCH_CBEGROUP;
1067
1068 /* If this is not a possibly empty group, and there are no (*THEN)s in
1069 the pattern, and this is the final alternative, optimize as described
1070 above. */
1071
1072 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1073 {
1074 ecode += PRIV(OP_lengths)[*ecode];
1075 goto TAIL_RECURSE;
1076 }
1077
1078 /* In all other cases, we have to make another call to match(). */
1079
1080 save_mark = md->mark;
1081 save_capture_last = md->capture_last;
1082 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 RM2);
1084
1085 /* See comment in the code for capturing groups above about handling
1086 THEN. */
1087
1088 if (rrc == MATCH_THEN)
1089 {
1090 next = ecode + GET(ecode,1);
1091 if (md->start_match_ptr < next &&
1092 (*ecode == OP_ALT || *next == OP_ALT))
1093 rrc = MATCH_NOMATCH;
1094 }
1095
1096 if (rrc != MATCH_NOMATCH)
1097 {
1098 if (rrc == MATCH_ONCE)
1099 {
1100 const pcre_uchar *scode = ecode;
1101 if (*scode != OP_ONCE) /* If not at start, find it */
1102 {
1103 while (*scode == OP_ALT) scode += GET(scode, 1);
1104 scode -= GET(scode, 1);
1105 }
1106 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1107 }
1108 RRETURN(rrc);
1109 }
1110 ecode += GET(ecode, 1);
1111 md->mark = save_mark;
1112 if (*ecode != OP_ALT) break;
1113 md->capture_last = save_capture_last;
1114 }
1115
1116 RRETURN(MATCH_NOMATCH);
1117
1118 /* Handle possessive capturing brackets with an unlimited repeat. We come
1119 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1120 handled similarly to the normal case above. However, the matching is
1121 different. The end of these brackets will always be OP_KETRPOS, which
1122 returns MATCH_KETRPOS without going further in the pattern. By this means
1123 we can handle the group by iteration rather than recursion, thereby
1124 reducing the amount of stack needed. */
1125
1126 case OP_CBRAPOS:
1127 case OP_SCBRAPOS:
1128 allow_zero = FALSE;
1129
1130 POSSESSIVE_CAPTURE:
1131 number = GET2(ecode, 1+LINK_SIZE);
1132 offset = number << 1;
1133
1134 #ifdef PCRE_DEBUG
1135 printf("start possessive bracket %d\n", number);
1136 printf("subject=");
1137 pchars(eptr, 16, TRUE, md);
1138 printf("\n");
1139 #endif
1140
1141 if (offset < md->offset_max)
1142 {
1143 matched_once = FALSE;
1144 code_offset = (int)(ecode - md->start_code);
1145
1146 save_offset1 = md->offset_vector[offset];
1147 save_offset2 = md->offset_vector[offset+1];
1148 save_offset3 = md->offset_vector[md->offset_end - number];
1149 save_capture_last = md->capture_last;
1150
1151 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1152
1153 /* Each time round the loop, save the current subject position for use
1154 when the group matches. For MATCH_MATCH, the group has matched, so we
1155 restart it with a new subject starting position, remembering that we had
1156 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1157 usual. If we haven't matched any alternatives in any iteration, check to
1158 see if a previous iteration matched. If so, the group has matched;
1159 continue from afterwards. Otherwise it has failed; restore the previous
1160 capture values before returning NOMATCH. */
1161
1162 for (;;)
1163 {
1164 md->offset_vector[md->offset_end - number] =
1165 (int)(eptr - md->start_subject);
1166 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1167 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1168 eptrb, RM63);
1169 if (rrc == MATCH_KETRPOS)
1170 {
1171 offset_top = md->end_offset_top;
1172 eptr = md->end_match_ptr;
1173 ecode = md->start_code + code_offset;
1174 save_capture_last = md->capture_last;
1175 matched_once = TRUE;
1176 continue;
1177 }
1178
1179 /* See comment in the code for capturing groups above about handling
1180 THEN. */
1181
1182 if (rrc == MATCH_THEN)
1183 {
1184 next = ecode + GET(ecode,1);
1185 if (md->start_match_ptr < next &&
1186 (*ecode == OP_ALT || *next == OP_ALT))
1187 rrc = MATCH_NOMATCH;
1188 }
1189
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 md->capture_last = save_capture_last;
1192 ecode += GET(ecode, 1);
1193 if (*ecode != OP_ALT) break;
1194 }
1195
1196 if (!matched_once)
1197 {
1198 md->offset_vector[offset] = save_offset1;
1199 md->offset_vector[offset+1] = save_offset2;
1200 md->offset_vector[md->offset_end - number] = save_offset3;
1201 }
1202
1203 if (allow_zero || matched_once)
1204 {
1205 ecode += 1 + LINK_SIZE;
1206 break;
1207 }
1208
1209 RRETURN(MATCH_NOMATCH);
1210 }
1211
1212 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1213 as a non-capturing bracket. */
1214
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217
1218 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222
1223 /* Non-capturing possessive bracket with unlimited repeat. We come here
1224 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1225 without the capturing complication. It is written out separately for speed
1226 and cleanliness. */
1227
1228 case OP_BRAPOS:
1229 case OP_SBRAPOS:
1230 allow_zero = FALSE;
1231
1232 POSSESSIVE_NON_CAPTURE:
1233 matched_once = FALSE;
1234 code_offset = (int)(ecode - md->start_code);
1235 save_capture_last = md->capture_last;
1236
1237 for (;;)
1238 {
1239 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1240 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1241 eptrb, RM48);
1242 if (rrc == MATCH_KETRPOS)
1243 {
1244 offset_top = md->end_offset_top;
1245 eptr = md->end_match_ptr;
1246 ecode = md->start_code + code_offset;
1247 matched_once = TRUE;
1248 continue;
1249 }
1250
1251 /* See comment in the code for capturing groups above about handling
1252 THEN. */
1253
1254 if (rrc == MATCH_THEN)
1255 {
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1260 }
1261
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1266 }
1267
1268 if (matched_once || allow_zero)
1269 {
1270 ecode += 1 + LINK_SIZE;
1271 break;
1272 }
1273 RRETURN(MATCH_NOMATCH);
1274
1275 /* Control never reaches here. */
1276
1277 /* Conditional group: compilation checked that there are no more than
1278 two branches. If the condition is false, skipping the first branch takes us
1279 past the end if there is only one branch, but that's OK because that is
1280 exactly what going to the ket would do. */
1281
1282 case OP_COND:
1283 case OP_SCOND:
1284 codelink = GET(ecode, 1);
1285
1286 /* Because of the way auto-callout works during compile, a callout item is
1287 inserted between OP_COND and an assertion condition. */
1288
1289 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1290 {
1291 if (PUBL(callout) != NULL)
1292 {
1293 PUBL(callout_block) cb;
1294 cb.version = 2; /* Version 1 of the callout block */
1295 cb.callout_number = ecode[LINK_SIZE+2];
1296 cb.offset_vector = md->offset_vector;
1297 #if defined COMPILE_PCRE8
1298 cb.subject = (PCRE_SPTR)md->start_subject;
1299 #elif defined COMPILE_PCRE16
1300 cb.subject = (PCRE_SPTR16)md->start_subject;
1301 #elif defined COMPILE_PCRE32
1302 cb.subject = (PCRE_SPTR32)md->start_subject;
1303 #endif
1304 cb.subject_length = (int)(md->end_subject - md->start_subject);
1305 cb.start_match = (int)(mstart - md->start_subject);
1306 cb.current_position = (int)(eptr - md->start_subject);
1307 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1308 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1309 cb.capture_top = offset_top/2;
1310 cb.capture_last = md->capture_last & CAPLMASK;
1311 /* Internal change requires this for API compatibility. */
1312 if (cb.capture_last == 0) cb.capture_last = -1;
1313 cb.callout_data = md->callout_data;
1314 cb.mark = md->nomatch_mark;
1315 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1316 if (rrc < 0) RRETURN(rrc);
1317 }
1318 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1319 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1320 }
1321
1322 condcode = ecode[LINK_SIZE+1];
1323
1324 /* Now see what the actual condition is */
1325
1326 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1327 {
1328 if (md->recursive == NULL) /* Not recursing => FALSE */
1329 {
1330 condition = FALSE;
1331 ecode += GET(ecode, 1);
1332 }
1333 else
1334 {
1335 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1336 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1337
1338 /* If the test is for recursion into a specific subpattern, and it is
1339 false, but the test was set up by name, scan the table to see if the
1340 name refers to any other numbers, and test them. The condition is true
1341 if any one is set. */
1342
1343 if (!condition && condcode == OP_NRREF)
1344 {
1345 pcre_uchar *slotA = md->name_table;
1346 for (i = 0; i < md->name_count; i++)
1347 {
1348 if (GET2(slotA, 0) == recno) break;
1349 slotA += md->name_entry_size;
1350 }
1351
1352 /* Found a name for the number - there can be only one; duplicate
1353 names for different numbers are allowed, but not vice versa. First
1354 scan down for duplicates. */
1355
1356 if (i < md->name_count)
1357 {
1358 pcre_uchar *slotB = slotA;
1359 while (slotB > md->name_table)
1360 {
1361 slotB -= md->name_entry_size;
1362 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1363 {
1364 condition = GET2(slotB, 0) == md->recursive->group_num;
1365 if (condition) break;
1366 }
1367 else break;
1368 }
1369
1370 /* Scan up for duplicates */
1371
1372 if (!condition)
1373 {
1374 slotB = slotA;
1375 for (i++; i < md->name_count; i++)
1376 {
1377 slotB += md->name_entry_size;
1378 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1379 {
1380 condition = GET2(slotB, 0) == md->recursive->group_num;
1381 if (condition) break;
1382 }
1383 else break;
1384 }
1385 }
1386 }
1387 }
1388
1389 /* Choose branch according to the condition */
1390
1391 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1392 }
1393 }
1394
1395 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1396 {
1397 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1398 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1399
1400 /* If the numbered capture is unset, but the reference was by name,
1401 scan the table to see if the name refers to any other numbers, and test
1402 them. The condition is true if any one is set. This is tediously similar
1403 to the code above, but not close enough to try to amalgamate. */
1404
1405 if (!condition && condcode == OP_NCREF)
1406 {
1407 unsigned int refno = offset >> 1;
1408 pcre_uchar *slotA = md->name_table;
1409
1410 for (i = 0; i < md->name_count; i++)
1411 {
1412 if (GET2(slotA, 0) == refno) break;
1413 slotA += md->name_entry_size;
1414 }
1415
1416 /* Found a name for the number - there can be only one; duplicate names
1417 for different numbers are allowed, but not vice versa. First scan down
1418 for duplicates. */
1419
1420 if (i < md->name_count)
1421 {
1422 pcre_uchar *slotB = slotA;
1423 while (slotB > md->name_table)
1424 {
1425 slotB -= md->name_entry_size;
1426 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1427 {
1428 offset = GET2(slotB, 0) << 1;
1429 condition = offset < offset_top &&
1430 md->offset_vector[offset] >= 0;
1431 if (condition) break;
1432 }
1433 else break;
1434 }
1435
1436 /* Scan up for duplicates */
1437
1438 if (!condition)
1439 {
1440 slotB = slotA;
1441 for (i++; i < md->name_count; i++)
1442 {
1443 slotB += md->name_entry_size;
1444 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1445 {
1446 offset = GET2(slotB, 0) << 1;
1447 condition = offset < offset_top &&
1448 md->offset_vector[offset] >= 0;
1449 if (condition) break;
1450 }
1451 else break;
1452 }
1453 }
1454 }
1455 }
1456
1457 /* Choose branch according to the condition */
1458
1459 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1460 }
1461
1462 else if (condcode == OP_DEF) /* DEFINE - always false */
1463 {
1464 condition = FALSE;
1465 ecode += GET(ecode, 1);
1466 }
1467
1468 /* The condition is an assertion. Call match() to evaluate it - setting
1469 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1470 an assertion. */
1471
1472 else
1473 {
1474 md->match_function_type = MATCH_CONDASSERT;
1475 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1476 if (rrc == MATCH_MATCH)
1477 {
1478 if (md->end_offset_top > offset_top)
1479 offset_top = md->end_offset_top; /* Captures may have happened */
1480 condition = TRUE;
1481 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1482 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1483 }
1484
1485 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1486 assertion; it is therefore treated as NOMATCH. */
1487
1488 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1489 {
1490 RRETURN(rrc); /* Need braces because of following else */
1491 }
1492 else
1493 {
1494 condition = FALSE;
1495 ecode += codelink;
1496 }
1497 }
1498
1499 /* We are now at the branch that is to be obeyed. As there is only one, can
1500 use tail recursion to avoid using another stack frame, except when there is
1501 unlimited repeat of a possibly empty group. In the latter case, a recursive
1502 call to match() is always required, unless the second alternative doesn't
1503 exist, in which case we can just plough on. Note that, for compatibility
1504 with Perl, the | in a conditional group is NOT treated as creating two
1505 alternatives. If a THEN is encountered in the branch, it propagates out to
1506 the enclosing alternative (unless nested in a deeper set of alternatives,
1507 of course). */
1508
1509 if (condition || *ecode == OP_ALT)
1510 {
1511 if (op != OP_SCOND)
1512 {
1513 ecode += 1 + LINK_SIZE;
1514 goto TAIL_RECURSE;
1515 }
1516
1517 md->match_function_type = MATCH_CBEGROUP;
1518 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1519 RRETURN(rrc);
1520 }
1521
1522 /* Condition false & no alternative; continue after the group. */
1523
1524 else
1525 {
1526 ecode += 1 + LINK_SIZE;
1527 }
1528 break;
1529
1530
1531 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1532 to close any currently open capturing brackets. */
1533
1534 case OP_CLOSE:
1535 number = GET2(ecode, 1); /* Must be less than 65536 */
1536 offset = number << 1;
1537
1538 #ifdef PCRE_DEBUG
1539 printf("end bracket %d at *ACCEPT", number);
1540 printf("\n");
1541 #endif
1542
1543 md->capture_last = (md->capture_last & OVFLMASK) | number;
1544 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1545 {
1546 md->offset_vector[offset] =
1547 md->offset_vector[md->offset_end - number];
1548 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1549 if (offset_top <= offset) offset_top = offset + 2;
1550 }
1551 ecode += 1 + IMM2_SIZE;
1552 break;
1553
1554
1555 /* End of the pattern, either real or forced. */
1556
1557 case OP_END:
1558 case OP_ACCEPT:
1559 case OP_ASSERT_ACCEPT:
1560
1561 /* If we have matched an empty string, fail if not in an assertion and not
1562 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1563 is set and we have matched at the start of the subject. In both cases,
1564 backtracking will then try other alternatives, if any. */
1565
1566 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1567 md->recursive == NULL &&
1568 (md->notempty ||
1569 (md->notempty_atstart &&
1570 mstart == md->start_subject + md->start_offset)))
1571 RRETURN(MATCH_NOMATCH);
1572
1573 /* Otherwise, we have a match. */
1574
1575 md->end_match_ptr = eptr; /* Record where we ended */
1576 md->end_offset_top = offset_top; /* and how many extracts were taken */
1577 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1578
1579 /* For some reason, the macros don't work properly if an expression is
1580 given as the argument to RRETURN when the heap is in use. */
1581
1582 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1583 RRETURN(rrc);
1584
1585 /* Assertion brackets. Check the alternative branches in turn - the
1586 matching won't pass the KET for an assertion. If any one branch matches,
1587 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1588 start of each branch to move the current point backwards, so the code at
1589 this level is identical to the lookahead case. When the assertion is part
1590 of a condition, we want to return immediately afterwards. The caller of
1591 this incarnation of the match() function will have set MATCH_CONDASSERT in
1592 md->match_function_type, and one of these opcodes will be the first opcode
1593 that is processed. We use a local variable that is preserved over calls to
1594 match() to remember this case. */
1595
1596 case OP_ASSERT:
1597 case OP_ASSERTBACK:
1598 save_mark = md->mark;
1599 if (md->match_function_type == MATCH_CONDASSERT)
1600 {
1601 condassert = TRUE;
1602 md->match_function_type = 0;
1603 }
1604 else condassert = FALSE;
1605
1606 /* Loop for each branch */
1607
1608 do
1609 {
1610 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1611
1612 /* A match means that the assertion is true; break out of the loop
1613 that matches its alternatives. */
1614
1615 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1616 {
1617 mstart = md->start_match_ptr; /* In case \K reset it */
1618 break;
1619 }
1620
1621 /* If not matched, restore the previous mark setting. */
1622
1623 md->mark = save_mark;
1624
1625 /* See comment in the code for capturing groups above about handling
1626 THEN. */
1627
1628 if (rrc == MATCH_THEN)
1629 {
1630 next = ecode + GET(ecode,1);
1631 if (md->start_match_ptr < next &&
1632 (*ecode == OP_ALT || *next == OP_ALT))
1633 rrc = MATCH_NOMATCH;
1634 }
1635
1636 /* Anything other than NOMATCH causes the entire assertion to fail,
1637 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1638 uncaptured THEN, which means they take their normal effect. This
1639 consistent approach does not always have exactly the same effect as in
1640 Perl. */
1641
1642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1643 ecode += GET(ecode, 1);
1644 }
1645 while (*ecode == OP_ALT); /* Continue for next alternative */
1646
1647 /* If we have tried all the alternative branches, the assertion has
1648 failed. If not, we broke out after a match. */
1649
1650 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1651
1652 /* If checking an assertion for a condition, return MATCH_MATCH. */
1653
1654 if (condassert) RRETURN(MATCH_MATCH);
1655
1656 /* Continue from after a successful assertion, updating the offsets high
1657 water mark, since extracts may have been taken during the assertion. */
1658
1659 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1660 ecode += 1 + LINK_SIZE;
1661 offset_top = md->end_offset_top;
1662 continue;
1663
1664 /* Negative assertion: all branches must fail to match for the assertion to
1665 succeed. */
1666
1667 case OP_ASSERT_NOT:
1668 case OP_ASSERTBACK_NOT:
1669 save_mark = md->mark;
1670 if (md->match_function_type == MATCH_CONDASSERT)
1671 {
1672 condassert = TRUE;
1673 md->match_function_type = 0;
1674 }
1675 else condassert = FALSE;
1676
1677 /* Loop for each alternative branch. */
1678
1679 do
1680 {
1681 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1682 md->mark = save_mark; /* Always restore the mark setting */
1683
1684 switch(rrc)
1685 {
1686 case MATCH_MATCH: /* A successful match means */
1687 case MATCH_ACCEPT: /* the assertion has failed. */
1688 RRETURN(MATCH_NOMATCH);
1689
1690 case MATCH_NOMATCH: /* Carry on with next branch */
1691 break;
1692
1693 /* See comment in the code for capturing groups above about handling
1694 THEN. */
1695
1696 case MATCH_THEN:
1697 next = ecode + GET(ecode,1);
1698 if (md->start_match_ptr < next &&
1699 (*ecode == OP_ALT || *next == OP_ALT))
1700 {
1701 rrc = MATCH_NOMATCH;
1702 break;
1703 }
1704 /* Otherwise fall through. */
1705
1706 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1707 assertion to fail to match, without considering any more alternatives.
1708 Failing to match means the assertion is true. This is a consistent
1709 approach, but does not always have the same effect as in Perl. */
1710
1711 case MATCH_COMMIT:
1712 case MATCH_SKIP:
1713 case MATCH_SKIP_ARG:
1714 case MATCH_PRUNE:
1715 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1716 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1717
1718 /* Anything else is an error */
1719
1720 default:
1721 RRETURN(rrc);
1722 }
1723
1724 /* Continue with next branch */
1725
1726 ecode += GET(ecode,1);
1727 }
1728 while (*ecode == OP_ALT);
1729
1730 /* All branches in the assertion failed to match. */
1731
1732 NEG_ASSERT_TRUE:
1733 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1734 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1735 continue;
1736
1737 /* Move the subject pointer back. This occurs only at the start of
1738 each branch of a lookbehind assertion. If we are too close to the start to
1739 move back, this match function fails. When working with UTF-8 we move
1740 back a number of characters, not bytes. */
1741
1742 case OP_REVERSE:
1743 #ifdef SUPPORT_UTF
1744 if (utf)
1745 {
1746 i = GET(ecode, 1);
1747 while (i-- > 0)
1748 {
1749 eptr--;
1750 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1751 BACKCHAR(eptr);
1752 }
1753 }
1754 else
1755 #endif
1756
1757 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1758
1759 {
1760 eptr -= GET(ecode, 1);
1761 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1762 }
1763
1764 /* Save the earliest consulted character, then skip to next op code */
1765
1766 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1767 ecode += 1 + LINK_SIZE;
1768 break;
1769
1770 /* The callout item calls an external function, if one is provided, passing
1771 details of the match so far. This is mainly for debugging, though the
1772 function is able to force a failure. */
1773
1774 case OP_CALLOUT:
1775 if (PUBL(callout) != NULL)
1776 {
1777 PUBL(callout_block) cb;
1778 cb.version = 2; /* Version 1 of the callout block */
1779 cb.callout_number = ecode[1];
1780 cb.offset_vector = md->offset_vector;
1781 #if defined COMPILE_PCRE8
1782 cb.subject = (PCRE_SPTR)md->start_subject;
1783 #elif defined COMPILE_PCRE16
1784 cb.subject = (PCRE_SPTR16)md->start_subject;
1785 #elif defined COMPILE_PCRE32
1786 cb.subject = (PCRE_SPTR32)md->start_subject;
1787 #endif
1788 cb.subject_length = (int)(md->end_subject - md->start_subject);
1789 cb.start_match = (int)(mstart - md->start_subject);
1790 cb.current_position = (int)(eptr - md->start_subject);
1791 cb.pattern_position = GET(ecode, 2);
1792 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1793 cb.capture_top = offset_top/2;
1794 cb.capture_last = md->capture_last & CAPLMASK;
1795 /* Internal change requires this for API compatibility. */
1796 if (cb.capture_last == 0) cb.capture_last = -1;
1797 cb.callout_data = md->callout_data;
1798 cb.mark = md->nomatch_mark;
1799 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1800 if (rrc < 0) RRETURN(rrc);
1801 }
1802 ecode += 2 + 2*LINK_SIZE;
1803 break;
1804
1805 /* Recursion either matches the current regex, or some subexpression. The
1806 offset data is the offset to the starting bracket from the start of the
1807 whole pattern. (This is so that it works from duplicated subpatterns.)
1808
1809 The state of the capturing groups is preserved over recursion, and
1810 re-instated afterwards. We don't know how many are started and not yet
1811 finished (offset_top records the completed total) so we just have to save
1812 all the potential data. There may be up to 65535 such values, which is too
1813 large to put on the stack, but using malloc for small numbers seems
1814 expensive. As a compromise, the stack is used when there are no more than
1815 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1816
1817 There are also other values that have to be saved. We use a chained
1818 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1819 for the original version of this logic. It has, however, been hacked around
1820 a lot, so he is not to blame for the current way it works. */
1821
1822 case OP_RECURSE:
1823 {
1824 recursion_info *ri;
1825 unsigned int recno;
1826
1827 callpat = md->start_code + GET(ecode, 1);
1828 recno = (callpat == md->start_code)? 0 :
1829 GET2(callpat, 1 + LINK_SIZE);
1830
1831 /* Check for repeating a recursion without advancing the subject pointer.
1832 This should catch convoluted mutual recursions. (Some simple cases are
1833 caught at compile time.) */
1834
1835 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1836 if (recno == ri->group_num && eptr == ri->subject_position)
1837 RRETURN(PCRE_ERROR_RECURSELOOP);
1838
1839 /* Add to "recursing stack" */
1840
1841 new_recursive.group_num = recno;
1842 new_recursive.saved_capture_last = md->capture_last;
1843 new_recursive.subject_position = eptr;
1844 new_recursive.prevrec = md->recursive;
1845 md->recursive = &new_recursive;
1846
1847 /* Where to continue from afterwards */
1848
1849 ecode += 1 + LINK_SIZE;
1850
1851 /* Now save the offset data */
1852
1853 new_recursive.saved_max = md->offset_end;
1854 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1855 new_recursive.offset_save = stacksave;
1856 else
1857 {
1858 new_recursive.offset_save =
1859 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1860 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1861 }
1862 memcpy(new_recursive.offset_save, md->offset_vector,
1863 new_recursive.saved_max * sizeof(int));
1864
1865 /* OK, now we can do the recursion. After processing each alternative,
1866 restore the offset data and the last captured value. If there were nested
1867 recursions, md->recursive might be changed, so reset it before looping.
1868 */
1869
1870 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1871 cbegroup = (*callpat >= OP_SBRA);
1872 do
1873 {
1874 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1875 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1876 md, eptrb, RM6);
1877 memcpy(md->offset_vector, new_recursive.offset_save,
1878 new_recursive.saved_max * sizeof(int));
1879 md->capture_last = new_recursive.saved_capture_last;
1880 md->recursive = new_recursive.prevrec;
1881 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1882 {
1883 DPRINTF(("Recursion matched\n"));
1884 if (new_recursive.offset_save != stacksave)
1885 (PUBL(free))(new_recursive.offset_save);
1886
1887 /* Set where we got to in the subject, and reset the start in case
1888 it was changed by \K. This *is* propagated back out of a recursion,
1889 for Perl compatibility. */
1890
1891 eptr = md->end_match_ptr;
1892 mstart = md->start_match_ptr;
1893 goto RECURSION_MATCHED; /* Exit loop; end processing */
1894 }
1895
1896 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1897 recursion; they cause a NOMATCH for the entire recursion. These codes
1898 are defined in a range that can be tested for. */
1899
1900 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1901 RRETURN(MATCH_NOMATCH);
1902
1903 /* Any return code other than NOMATCH is an error. */
1904
1905 if (rrc != MATCH_NOMATCH)
1906 {
1907 DPRINTF(("Recursion gave error %d\n", rrc));
1908 if (new_recursive.offset_save != stacksave)
1909 (PUBL(free))(new_recursive.offset_save);
1910 RRETURN(rrc);
1911 }
1912
1913 md->recursive = &new_recursive;
1914 callpat += GET(callpat, 1);
1915 }
1916 while (*callpat == OP_ALT);
1917
1918 DPRINTF(("Recursion didn't match\n"));
1919 md->recursive = new_recursive.prevrec;
1920 if (new_recursive.offset_save != stacksave)
1921 (PUBL(free))(new_recursive.offset_save);
1922 RRETURN(MATCH_NOMATCH);
1923 }
1924
1925 RECURSION_MATCHED:
1926 break;
1927
1928 /* An alternation is the end of a branch; scan along to find the end of the
1929 bracketed group and go to there. */
1930
1931 case OP_ALT:
1932 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1933 break;
1934
1935 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1936 indicating that it may occur zero times. It may repeat infinitely, or not
1937 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1938 with fixed upper repeat limits are compiled as a number of copies, with the
1939 optional ones preceded by BRAZERO or BRAMINZERO. */
1940
1941 case OP_BRAZERO:
1942 next = ecode + 1;
1943 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945 do next += GET(next, 1); while (*next == OP_ALT);
1946 ecode = next + 1 + LINK_SIZE;
1947 break;
1948
1949 case OP_BRAMINZERO:
1950 next = ecode + 1;
1951 do next += GET(next, 1); while (*next == OP_ALT);
1952 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1953 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1954 ecode++;
1955 break;
1956
1957 case OP_SKIPZERO:
1958 next = ecode+1;
1959 do next += GET(next,1); while (*next == OP_ALT);
1960 ecode = next + 1 + LINK_SIZE;
1961 break;
1962
1963 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1964 here; just jump to the group, with allow_zero set TRUE. */
1965
1966 case OP_BRAPOSZERO:
1967 op = *(++ecode);
1968 allow_zero = TRUE;
1969 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1970 goto POSSESSIVE_NON_CAPTURE;
1971
1972 /* End of a group, repeated or non-repeating. */
1973
1974 case OP_KET:
1975 case OP_KETRMIN:
1976 case OP_KETRMAX:
1977 case OP_KETRPOS:
1978 prev = ecode - GET(ecode, 1);
1979
1980 /* If this was a group that remembered the subject start, in order to break
1981 infinite repeats of empty string matches, retrieve the subject start from
1982 the chain. Otherwise, set it NULL. */
1983
1984 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1985 {
1986 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1987 eptrb = eptrb->epb_prev; /* Backup to previous group */
1988 }
1989 else saved_eptr = NULL;
1990
1991 /* If we are at the end of an assertion group or a non-capturing atomic
1992 group, stop matching and return MATCH_MATCH, but record the current high
1993 water mark for use by positive assertions. We also need to record the match
1994 start in case it was changed by \K. */
1995
1996 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1997 *prev == OP_ONCE_NC)
1998 {
1999 md->end_match_ptr = eptr; /* For ONCE_NC */
2000 md->end_offset_top = offset_top;
2001 md->start_match_ptr = mstart;
2002 RRETURN(MATCH_MATCH); /* Sets md->mark */
2003 }
2004
2005 /* For capturing groups we have to check the group number back at the start
2006 and if necessary complete handling an extraction by setting the offsets and
2007 bumping the high water mark. Whole-pattern recursion is coded as a recurse
2008 into group 0, so it won't be picked up here. Instead, we catch it when the
2009 OP_END is reached. Other recursion is handled here. We just have to record
2010 the current subject position and start match pointer and give a MATCH
2011 return. */
2012
2013 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
2014 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
2015 {
2016 number = GET2(prev, 1+LINK_SIZE);
2017 offset = number << 1;
2018
2019 #ifdef PCRE_DEBUG
2020 printf("end bracket %d", number);
2021 printf("\n");
2022 #endif
2023
2024 /* Handle a recursively called group. */
2025
2026 if (md->recursive != NULL && md->recursive->group_num == number)
2027 {
2028 md->end_match_ptr = eptr;
2029 md->start_match_ptr = mstart;
2030 RRETURN(MATCH_MATCH);
2031 }
2032
2033 /* Deal with capturing */
2034
2035 md->capture_last = (md->capture_last & OVFLMASK) | number;
2036 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
2037 {
2038 /* If offset is greater than offset_top, it means that we are
2039 "skipping" a capturing group, and that group's offsets must be marked
2040 unset. In earlier versions of PCRE, all the offsets were unset at the
2041 start of matching, but this doesn't work because atomic groups and
2042 assertions can cause a value to be set that should later be unset.
2043 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
2044 part of the atomic group, but this is not on the final matching path,
2045 so must be unset when 2 is set. (If there is no group 2, there is no
2046 problem, because offset_top will then be 2, indicating no capture.) */
2047
2048 if (offset > offset_top)
2049 {
2050 register int *iptr = md->offset_vector + offset_top;
2051 register int *iend = md->offset_vector + offset;
2052 while (iptr < iend) *iptr++ = -1;
2053 }
2054
2055 /* Now make the extraction */
2056
2057 md->offset_vector[offset] =
2058 md->offset_vector[md->offset_end - number];
2059 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2060 if (offset_top <= offset) offset_top = offset + 2;
2061 }
2062 }
2063
2064 /* For an ordinary non-repeating ket, just continue at this level. This
2065 also happens for a repeating ket if no characters were matched in the
2066 group. This is the forcible breaking of infinite loops as implemented in
2067 Perl 5.005. For a non-repeating atomic group that includes captures,
2068 establish a backup point by processing the rest of the pattern at a lower
2069 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2070 original OP_ONCE level, thereby bypassing intermediate backup points, but
2071 resetting any captures that happened along the way. */
2072
2073 if (*ecode == OP_KET || eptr == saved_eptr)
2074 {
2075 if (*prev == OP_ONCE)
2076 {
2077 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2080 RRETURN(MATCH_ONCE);
2081 }
2082 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2083 break;
2084 }
2085
2086 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2087 and return MATCH_KETRPOS. This makes it possible to do the repeats one
2088 at a time from the outer level, thus saving stack. */
2089
2090 if (*ecode == OP_KETRPOS)
2091 {
2092 md->end_match_ptr = eptr;
2093 md->end_offset_top = offset_top;
2094 RRETURN(MATCH_KETRPOS);
2095 }
2096
2097 /* The normal repeating kets try the rest of the pattern or restart from
2098 the preceding bracket, in the appropriate order. In the second case, we can
2099 use tail recursion to avoid using another stack frame, unless we have an
2100 atomic group or an unlimited repeat of a group that can match an empty
2101 string. */
2102
2103 if (*ecode == OP_KETRMIN)
2104 {
2105 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2106 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2107 if (*prev == OP_ONCE)
2108 {
2109 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2111 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2112 RRETURN(MATCH_ONCE);
2113 }
2114 if (*prev >= OP_SBRA) /* Could match an empty string */
2115 {
2116 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2117 RRETURN(rrc);
2118 }
2119 ecode = prev;
2120 goto TAIL_RECURSE;
2121 }
2122 else /* OP_KETRMAX */
2123 {
2124 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2125 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2127 if (*prev == OP_ONCE)
2128 {
2129 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2130 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2131 md->once_target = prev;
2132 RRETURN(MATCH_ONCE);
2133 }
2134 ecode += 1 + LINK_SIZE;
2135 goto TAIL_RECURSE;
2136 }
2137 /* Control never gets here */
2138
2139 /* Not multiline mode: start of subject assertion, unless notbol. */
2140
2141 case OP_CIRC:
2142 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2143
2144 /* Start of subject assertion */
2145
2146 case OP_SOD:
2147 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2148 ecode++;
2149 break;
2150
2151 /* Multiline mode: start of subject unless notbol, or after any newline. */
2152
2153 case OP_CIRCM:
2154 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2155 if (eptr != md->start_subject &&
2156 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2157 RRETURN(MATCH_NOMATCH);
2158 ecode++;
2159 break;
2160
2161 /* Start of match assertion */
2162
2163 case OP_SOM:
2164 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2165 ecode++;
2166 break;
2167
2168 /* Reset the start of match point */
2169
2170 case OP_SET_SOM:
2171 mstart = eptr;
2172 ecode++;
2173 break;
2174
2175 /* Multiline mode: assert before any newline, or before end of subject
2176 unless noteol is set. */
2177
2178 case OP_DOLLM:
2179 if (eptr < md->end_subject)
2180 {
2181 if (!IS_NEWLINE(eptr))
2182 {
2183 if (md->partial != 0 &&
2184 eptr + 1 >= md->end_subject &&
2185 NLBLOCK->nltype == NLTYPE_FIXED &&
2186 NLBLOCK->nllen == 2 &&
2187 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2188 {
2189 md->hitend = TRUE;
2190 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2191 }
2192 RRETURN(MATCH_NOMATCH);
2193 }
2194 }
2195 else
2196 {
2197 if (md->noteol) RRETURN(MATCH_NOMATCH);
2198 SCHECK_PARTIAL();
2199 }
2200 ecode++;
2201 break;
2202
2203 /* Not multiline mode: assert before a terminating newline or before end of
2204 subject unless noteol is set. */
2205
2206 case OP_DOLL:
2207 if (md->noteol) RRETURN(MATCH_NOMATCH);
2208 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2209
2210 /* ... else fall through for endonly */
2211
2212 /* End of subject assertion (\z) */
2213
2214 case OP_EOD:
2215 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2216 SCHECK_PARTIAL();
2217 ecode++;
2218 break;
2219
2220 /* End of subject or ending \n assertion (\Z) */
2221
2222 case OP_EODN:
2223 ASSERT_NL_OR_EOS:
2224 if (eptr < md->end_subject &&
2225 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2226 {
2227 if (md->partial != 0 &&
2228 eptr + 1 >= md->end_subject &&
2229 NLBLOCK->nltype == NLTYPE_FIXED &&
2230 NLBLOCK->nllen == 2 &&
2231 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2232 {
2233 md->hitend = TRUE;
2234 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2235 }
2236 RRETURN(MATCH_NOMATCH);
2237 }
2238
2239 /* Either at end of string or \n before end. */
2240
2241 SCHECK_PARTIAL();
2242 ecode++;
2243 break;
2244
2245 /* Word boundary assertions */
2246
2247 case OP_NOT_WORD_BOUNDARY:
2248 case OP_WORD_BOUNDARY:
2249 {
2250
2251 /* Find out if the previous and current characters are "word" characters.
2252 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2253 be "non-word" characters. Remember the earliest consulted character for
2254 partial matching. */
2255
2256 #ifdef SUPPORT_UTF
2257 if (utf)
2258 {
2259 /* Get status of previous character */
2260
2261 if (eptr == md->start_subject) prev_is_word = FALSE; else
2262 {
2263 PCRE_PUCHAR lastptr = eptr - 1;
2264 BACKCHAR(lastptr);
2265 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2266 GETCHAR(c, lastptr);
2267 #ifdef SUPPORT_UCP
2268 if (md->use_ucp)
2269 {
2270 if (c == '_') prev_is_word = TRUE; else
2271 {
2272 int cat = UCD_CATEGORY(c);
2273 prev_is_word = (cat == ucp_L || cat == ucp_N);
2274 }
2275 }
2276 else
2277 #endif
2278 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2279 }
2280
2281 /* Get status of next character */
2282
2283 if (eptr >= md->end_subject)
2284 {
2285 SCHECK_PARTIAL();
2286 cur_is_word = FALSE;
2287 }
2288 else
2289 {
2290 GETCHAR(c, eptr);
2291 #ifdef SUPPORT_UCP
2292 if (md->use_ucp)
2293 {
2294 if (c == '_') cur_is_word = TRUE; else
2295 {
2296 int cat = UCD_CATEGORY(c);
2297 cur_is_word = (cat == ucp_L || cat == ucp_N);
2298 }
2299 }
2300 else
2301 #endif
2302 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2303 }
2304 }
2305 else
2306 #endif
2307
2308 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2309 consistency with the behaviour of \w we do use it in this case. */
2310
2311 {
2312 /* Get status of previous character */
2313
2314 if (eptr == md->start_subject) prev_is_word = FALSE; else
2315 {
2316 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2317 #ifdef SUPPORT_UCP
2318 if (md->use_ucp)
2319 {
2320 c = eptr[-1];
2321 if (c == '_') prev_is_word = TRUE; else
2322 {
2323 int cat = UCD_CATEGORY(c);
2324 prev_is_word = (cat == ucp_L || cat == ucp_N);
2325 }
2326 }
2327 else
2328 #endif
2329 prev_is_word = MAX_255(eptr[-1])
2330 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2331 }
2332
2333 /* Get status of next character */
2334
2335 if (eptr >= md->end_subject)
2336 {
2337 SCHECK_PARTIAL();
2338 cur_is_word = FALSE;
2339 }
2340 else
2341 #ifdef SUPPORT_UCP
2342 if (md->use_ucp)
2343 {
2344 c = *eptr;
2345 if (c == '_') cur_is_word = TRUE; else
2346 {
2347 int cat = UCD_CATEGORY(c);
2348 cur_is_word = (cat == ucp_L || cat == ucp_N);
2349 }
2350 }
2351 else
2352 #endif
2353 cur_is_word = MAX_255(*eptr)
2354 && ((md->ctypes[*eptr] & ctype_word) != 0);
2355 }
2356
2357 /* Now see if the situation is what we want */
2358
2359 if ((*ecode++ == OP_WORD_BOUNDARY)?
2360 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2361 RRETURN(MATCH_NOMATCH);
2362 }
2363 break;
2364
2365 /* Match any single character type except newline; have to take care with
2366 CRLF newlines and partial matching. */
2367
2368 case OP_ANY:
2369 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2370 if (md->partial != 0 &&
2371 eptr + 1 >= md->end_subject &&
2372 NLBLOCK->nltype == NLTYPE_FIXED &&
2373 NLBLOCK->nllen == 2 &&
2374 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2375 {
2376 md->hitend = TRUE;
2377 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2378 }
2379
2380 /* Fall through */
2381
2382 /* Match any single character whatsoever. */
2383
2384 case OP_ALLANY:
2385 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2386 { /* not be updated before SCHECK_PARTIAL. */
2387 SCHECK_PARTIAL();
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 eptr++;
2391 #ifdef SUPPORT_UTF
2392 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2393 #endif
2394 ecode++;
2395 break;
2396
2397 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2398 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2399
2400 case OP_ANYBYTE:
2401 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2402 { /* not be updated before SCHECK_PARTIAL. */
2403 SCHECK_PARTIAL();
2404 RRETURN(MATCH_NOMATCH);
2405 }
2406 eptr++;
2407 ecode++;
2408 break;
2409
2410 case OP_NOT_DIGIT:
2411 if (eptr >= md->end_subject)
2412 {
2413 SCHECK_PARTIAL();
2414 RRETURN(MATCH_NOMATCH);
2415 }
2416 GETCHARINCTEST(c, eptr);
2417 if (
2418 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2419 c < 256 &&
2420 #endif
2421 (md->ctypes[c] & ctype_digit) != 0
2422 )
2423 RRETURN(MATCH_NOMATCH);
2424 ecode++;
2425 break;
2426
2427 case OP_DIGIT:
2428 if (eptr >= md->end_subject)
2429 {
2430 SCHECK_PARTIAL();
2431 RRETURN(MATCH_NOMATCH);
2432 }
2433 GETCHARINCTEST(c, eptr);
2434 if (
2435 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2436 c > 255 ||
2437 #endif
2438 (md->ctypes[c] & ctype_digit) == 0
2439 )
2440 RRETURN(MATCH_NOMATCH);
2441 ecode++;
2442 break;
2443
2444 case OP_NOT_WHITESPACE:
2445 if (eptr >= md->end_subject)
2446 {
2447 SCHECK_PARTIAL();
2448 RRETURN(MATCH_NOMATCH);
2449 }
2450 GETCHARINCTEST(c, eptr);
2451 if (
2452 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2453 c < 256 &&
2454 #endif
2455 (md->ctypes[c] & ctype_space) != 0
2456 )
2457 RRETURN(MATCH_NOMATCH);
2458 ecode++;
2459 break;
2460
2461 case OP_WHITESPACE:
2462 if (eptr >= md->end_subject)
2463 {
2464 SCHECK_PARTIAL();
2465 RRETURN(MATCH_NOMATCH);
2466 }
2467 GETCHARINCTEST(c, eptr);
2468 if (
2469 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2470 c > 255 ||
2471 #endif
2472 (md->ctypes[c] & ctype_space) == 0
2473 )
2474 RRETURN(MATCH_NOMATCH);
2475 ecode++;
2476 break;
2477
2478 case OP_NOT_WORDCHAR:
2479 if (eptr >= md->end_subject)
2480 {
2481 SCHECK_PARTIAL();
2482 RRETURN(MATCH_NOMATCH);
2483 }
2484 GETCHARINCTEST(c, eptr);
2485 if (
2486 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2487 c < 256 &&
2488 #endif
2489 (md->ctypes[c] & ctype_word) != 0
2490 )
2491 RRETURN(MATCH_NOMATCH);
2492 ecode++;
2493 break;
2494
2495 case OP_WORDCHAR:
2496 if (eptr >= md->end_subject)
2497 {
2498 SCHECK_PARTIAL();
2499 RRETURN(MATCH_NOMATCH);
2500 }
2501 GETCHARINCTEST(c, eptr);
2502 if (
2503 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2504 c > 255 ||
2505 #endif
2506 (md->ctypes[c] & ctype_word) == 0
2507 )
2508 RRETURN(MATCH_NOMATCH);
2509 ecode++;
2510 break;
2511
2512 case OP_ANYNL:
2513 if (eptr >= md->end_subject)
2514 {
2515 SCHECK_PARTIAL();
2516 RRETURN(MATCH_NOMATCH);
2517 }
2518 GETCHARINCTEST(c, eptr);
2519 switch(c)
2520 {
2521 default: RRETURN(MATCH_NOMATCH);
2522
2523 case CHAR_CR:
2524 if (eptr >= md->end_subject)
2525 {
2526 SCHECK_PARTIAL();
2527 }
2528 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2529 break;
2530
2531 case CHAR_LF:
2532 break;
2533
2534 case CHAR_VT:
2535 case CHAR_FF:
2536 case CHAR_NEL:
2537 #ifndef EBCDIC
2538 case 0x2028:
2539 case 0x2029:
2540 #endif /* Not EBCDIC */
2541 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2542 break;
2543 }
2544 ecode++;
2545 break;
2546
2547 case OP_NOT_HSPACE:
2548 if (eptr >= md->end_subject)
2549 {
2550 SCHECK_PARTIAL();
2551 RRETURN(MATCH_NOMATCH);
2552 }
2553 GETCHARINCTEST(c, eptr);
2554 switch(c)
2555 {
2556 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2557 default: break;
2558 }
2559 ecode++;
2560 break;
2561
2562 case OP_HSPACE:
2563 if (eptr >= md->end_subject)
2564 {
2565 SCHECK_PARTIAL();
2566 RRETURN(MATCH_NOMATCH);
2567 }
2568 GETCHARINCTEST(c, eptr);
2569 switch(c)
2570 {
2571 HSPACE_CASES: break; /* Byte and multibyte cases */
2572 default: RRETURN(MATCH_NOMATCH);
2573 }
2574 ecode++;
2575 break;
2576
2577 case OP_NOT_VSPACE:
2578 if (eptr >= md->end_subject)
2579 {
2580 SCHECK_PARTIAL();
2581 RRETURN(MATCH_NOMATCH);
2582 }
2583 GETCHARINCTEST(c, eptr);
2584 switch(c)
2585 {
2586 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2587 default: break;
2588 }
2589 ecode++;
2590 break;
2591
2592 case OP_VSPACE:
2593 if (eptr >= md->end_subject)
2594 {
2595 SCHECK_PARTIAL();
2596 RRETURN(MATCH_NOMATCH);
2597 }
2598 GETCHARINCTEST(c, eptr);
2599 switch(c)
2600 {
2601 VSPACE_CASES: break;
2602 default: RRETURN(MATCH_NOMATCH);
2603 }
2604 ecode++;
2605 break;
2606
2607 #ifdef SUPPORT_UCP
2608 /* Check the next character by Unicode property. We will get here only
2609 if the support is in the binary; otherwise a compile-time error occurs. */
2610
2611 case OP_PROP:
2612 case OP_NOTPROP:
2613 if (eptr >= md->end_subject)
2614 {
2615 SCHECK_PARTIAL();
2616 RRETURN(MATCH_NOMATCH);
2617 }
2618 GETCHARINCTEST(c, eptr);
2619 {
2620 const pcre_uint32 *cp;
2621 const ucd_record *prop = GET_UCD(c);
2622
2623 switch(ecode[1])
2624 {
2625 case PT_ANY:
2626 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2627 break;
2628
2629 case PT_LAMP:
2630 if ((prop->chartype == ucp_Lu ||
2631 prop->chartype == ucp_Ll ||
2632 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2633 RRETURN(MATCH_NOMATCH);
2634 break;
2635
2636 case PT_GC:
2637 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2638 RRETURN(MATCH_NOMATCH);
2639 break;
2640
2641 case PT_PC:
2642 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2643 RRETURN(MATCH_NOMATCH);
2644 break;
2645
2646 case PT_SC:
2647 if ((ecode[2] != prop->script) == (op == OP_PROP))
2648 RRETURN(MATCH_NOMATCH);
2649 break;
2650
2651 /* These are specials */
2652
2653 case PT_ALNUM:
2654 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2655 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2656 RRETURN(MATCH_NOMATCH);
2657 break;
2658
2659 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2660 which means that Perl space and POSIX space are now identical. PCRE
2661 was changed at release 8.34. */
2662
2663 case PT_SPACE: /* Perl space */
2664 case PT_PXSPACE: /* POSIX space */
2665 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2666 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2667 c == CHAR_FF || c == CHAR_CR)
2668 == (op == OP_NOTPROP))
2669 RRETURN(MATCH_NOMATCH);
2670 break;
2671
2672 case PT_WORD:
2673 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2674 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2675 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2676 RRETURN(MATCH_NOMATCH);
2677 break;
2678
2679 case PT_CLIST:
2680 cp = PRIV(ucd_caseless_sets) + ecode[2];
2681 for (;;)
2682 {
2683 if (c < *cp)
2684 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2685 if (c == *cp++)
2686 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2687 }
2688 break;
2689
2690 case PT_UCNC:
2691 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2692 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2693 c >= 0xe000) == (op == OP_NOTPROP))
2694 RRETURN(MATCH_NOMATCH);
2695 break;
2696
2697 /* This should never occur */
2698
2699 default:
2700 RRETURN(PCRE_ERROR_INTERNAL);
2701 }
2702
2703 ecode += 3;
2704 }
2705 break;
2706
2707 /* Match an extended Unicode sequence. We will get here only if the support
2708 is in the binary; otherwise a compile-time error occurs. */
2709
2710 case OP_EXTUNI:
2711 if (eptr >= md->end_subject)
2712 {
2713 SCHECK_PARTIAL();
2714 RRETURN(MATCH_NOMATCH);
2715 }
2716 else
2717 {
2718 int lgb, rgb;
2719 GETCHARINCTEST(c, eptr);
2720 lgb = UCD_GRAPHBREAK(c);
2721 while (eptr < md->end_subject)
2722 {
2723 int len = 1;
2724 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2725 rgb = UCD_GRAPHBREAK(c);
2726 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2727 lgb = rgb;
2728 eptr += len;
2729 }
2730 }
2731 CHECK_PARTIAL();
2732 ecode++;
2733 break;
2734 #endif /* SUPPORT_UCP */
2735
2736
2737 /* Match a back reference, possibly repeatedly. Look past the end of the
2738 item to see if there is repeat information following. The code is similar
2739 to that for character classes, but repeated for efficiency. Then obey
2740 similar code to character type repeats - written out again for speed.
2741 However, if the referenced string is the empty string, always treat
2742 it as matched, any number of times (otherwise there could be infinite
2743 loops). If the reference is unset, there are two possibilities:
2744
2745 (a) In the default, Perl-compatible state, set the length negative;
2746 this ensures that every attempt at a match fails. We can't just fail
2747 here, because of the possibility of quantifiers with zero minima.
2748
2749 (b) If the JavaScript compatibility flag is set, set the length to zero
2750 so that the back reference matches an empty string.
2751
2752 Otherwise, set the length to the length of what was matched by the
2753 referenced subpattern.
2754
2755 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2756 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2757 and OP_DNREFI are used. In this case we must scan the list of groups to
2758 which the name refers, and use the first one that is set. */
2759
2760 case OP_DNREF:
2761 case OP_DNREFI:
2762 caseless = op == OP_DNREFI;
2763 {
2764 int count = GET2(ecode, 1+IMM2_SIZE);
2765 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2766 ecode += 1 + 2*IMM2_SIZE;
2767
2768 while (count-- > 0)
2769 {
2770 offset = GET2(slot, 0) << 1;
2771 if (offset < offset_top && md->offset_vector[offset] >= 0) break;
2772 slot += md->name_entry_size;
2773 }
2774 if (count < 0)
2775 length = (md->jscript_compat)? 0 : -1;
2776 else
2777 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2778 }
2779 goto REF_REPEAT;
2780
2781 case OP_REF:
2782 case OP_REFI:
2783 caseless = op == OP_REFI;
2784 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2785 ecode += 1 + IMM2_SIZE;
2786
2787
2788 if (offset >= offset_top || md->offset_vector[offset] < 0)
2789 length = (md->jscript_compat)? 0 : -1;
2790 else
2791 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2792
2793 /* Set up for repetition, or handle the non-repeated case */
2794
2795 REF_REPEAT:
2796 switch (*ecode)
2797 {
2798 case OP_CRSTAR:
2799 case OP_CRMINSTAR:
2800 case OP_CRPLUS:
2801 case OP_CRMINPLUS:
2802 case OP_CRQUERY:
2803 case OP_CRMINQUERY:
2804 c = *ecode++ - OP_CRSTAR;
2805 minimize = (c & 1) != 0;
2806 min = rep_min[c]; /* Pick up values from tables; */
2807 max = rep_max[c]; /* zero for max => infinity */
2808 if (max == 0) max = INT_MAX;
2809 break;
2810
2811 case OP_CRRANGE:
2812 case OP_CRMINRANGE:
2813 minimize = (*ecode == OP_CRMINRANGE);
2814 min = GET2(ecode, 1);
2815 max = GET2(ecode, 1 + IMM2_SIZE);
2816 if (max == 0) max = INT_MAX;
2817 ecode += 1 + 2 * IMM2_SIZE;
2818 break;
2819
2820 default: /* No repeat follows */
2821 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2822 {
2823 if (length == -2) eptr = md->end_subject; /* Partial match */
2824 CHECK_PARTIAL();
2825 RRETURN(MATCH_NOMATCH);
2826 }
2827 eptr += length;
2828 continue; /* With the main loop */
2829 }
2830
2831 /* Handle repeated back references. If the length of the reference is
2832 zero, just continue with the main loop. If the length is negative, it
2833 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2834 zero, we can continue at the same level without recursion. For any other
2835 minimum, carrying on will result in NOMATCH. */
2836
2837 if (length == 0) continue;
2838 if (length < 0 && min == 0) continue;
2839
2840 /* First, ensure the minimum number of matches are present. We get back
2841 the length of the reference string explicitly rather than passing the
2842 address of eptr, so that eptr can be a register variable. */
2843
2844 for (i = 1; i <= min; i++)
2845 {
2846 int slength;
2847 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2848 {
2849 if (slength == -2) eptr = md->end_subject; /* Partial match */
2850 CHECK_PARTIAL();
2851 RRETURN(MATCH_NOMATCH);
2852 }
2853 eptr += slength;
2854 }
2855
2856 /* If min = max, continue at the same level without recursion.
2857 They are not both allowed to be zero. */
2858
2859 if (min == max) continue;
2860
2861 /* If minimizing, keep trying and advancing the pointer */
2862
2863 if (minimize)
2864 {
2865 for (fi = min;; fi++)
2866 {
2867 int slength;
2868 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2869 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2870 if (fi >= max) RRETURN(MATCH_NOMATCH);
2871 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2872 {
2873 if (slength == -2) eptr = md->end_subject; /* Partial match */
2874 CHECK_PARTIAL();
2875 RRETURN(MATCH_NOMATCH);
2876 }
2877 eptr += slength;
2878 }
2879 /* Control never gets here */
2880 }
2881
2882 /* If maximizing, find the longest string and work backwards */
2883
2884 else
2885 {
2886 pp = eptr;
2887 for (i = min; i < max; i++)
2888 {
2889 int slength;
2890 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2891 {
2892 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2893 the soft partial matching case. */
2894
2895 if (slength == -2 && md->partial != 0 &&
2896 md->end_subject > md->start_used_ptr)
2897 {
2898 md->hitend = TRUE;
2899 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2900 }
2901 break;
2902 }
2903 eptr += slength;
2904 }
2905
2906 while (eptr >= pp)
2907 {
2908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 eptr -= length;
2911 }
2912 RRETURN(MATCH_NOMATCH);
2913 }
2914 /* Control never gets here */
2915
2916 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2917 used when all the characters in the class have values in the range 0-255,
2918 and either the matching is caseful, or the characters are in the range
2919 0-127 when UTF-8 processing is enabled. The only difference between
2920 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2921 encountered.
2922
2923 First, look past the end of the item to see if there is repeat information
2924 following. Then obey similar code to character type repeats - written out
2925 again for speed. */
2926
2927 case OP_NCLASS:
2928 case OP_CLASS:
2929 {
2930 /* The data variable is saved across frames, so the byte map needs to
2931 be stored there. */
2932 #define BYTE_MAP ((pcre_uint8 *)data)
2933 data = ecode + 1; /* Save for matching */
2934 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2935
2936 switch (*ecode)
2937 {
2938 case OP_CRSTAR:
2939 case OP_CRMINSTAR:
2940 case OP_CRPLUS:
2941 case OP_CRMINPLUS:
2942 case OP_CRQUERY:
2943 case OP_CRMINQUERY:
2944 c = *ecode++ - OP_CRSTAR;
2945 minimize = (c & 1) != 0;
2946 min = rep_min[c]; /* Pick up values from tables; */
2947 max = rep_max[c]; /* zero for max => infinity */
2948 if (max == 0) max = INT_MAX;
2949 break;
2950
2951 case OP_CRRANGE:
2952 case OP_CRMINRANGE:
2953 minimize = (*ecode == OP_CRMINRANGE);
2954 min = GET2(ecode, 1);
2955 max = GET2(ecode, 1 + IMM2_SIZE);
2956 if (max == 0) max = INT_MAX;
2957 ecode += 1 + 2 * IMM2_SIZE;
2958 break;
2959
2960 default: /* No repeat follows */
2961 min = max = 1;
2962 break;
2963 }
2964
2965 /* First, ensure the minimum number of matches are present. */
2966
2967 #ifdef SUPPORT_UTF
2968 if (utf)
2969 {
2970 for (i = 1; i <= min; i++)
2971 {
2972 if (eptr >= md->end_subject)
2973 {
2974 SCHECK_PARTIAL();
2975 RRETURN(MATCH_NOMATCH);
2976 }
2977 GETCHARINC(c, eptr);
2978 if (c > 255)
2979 {
2980 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2981 }
2982 else
2983 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2984 }
2985 }
2986 else
2987 #endif
2988 /* Not UTF mode */
2989 {
2990 for (i = 1; i <= min; i++)
2991 {
2992 if (eptr >= md->end_subject)
2993 {
2994 SCHECK_PARTIAL();
2995 RRETURN(MATCH_NOMATCH);
2996 }
2997 c = *eptr++;
2998 #ifndef COMPILE_PCRE8
2999 if (c > 255)
3000 {
3001 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3002 }
3003 else
3004 #endif
3005 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3006 }
3007 }
3008
3009 /* If max == min we can continue with the main loop without the
3010 need to recurse. */
3011
3012 if (min == max) continue;
3013
3014 /* If minimizing, keep testing the rest of the expression and advancing
3015 the pointer while it matches the class. */
3016
3017 if (minimize)
3018 {
3019 #ifdef SUPPORT_UTF
3020 if (utf)
3021 {
3022 for (fi = min;; fi++)
3023 {
3024 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
3025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3026 if (fi >= max) RRETURN(MATCH_NOMATCH);
3027 if (eptr >= md->end_subject)
3028 {
3029 SCHECK_PARTIAL();
3030 RRETURN(MATCH_NOMATCH);
3031 }
3032 GETCHARINC(c, eptr);
3033 if (c > 255)
3034 {
3035 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3036 }
3037 else
3038 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3039 }
3040 }
3041 else
3042 #endif
3043 /* Not UTF mode */
3044 {
3045 for (fi = min;; fi++)
3046 {
3047 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3049 if (fi >= max) RRETURN(MATCH_NOMATCH);
3050 if (eptr >= md->end_subject)
3051 {
3052 SCHECK_PARTIAL();
3053 RRETURN(MATCH_NOMATCH);
3054 }
3055 c = *eptr++;
3056 #ifndef COMPILE_PCRE8
3057 if (c > 255)
3058 {
3059 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3060 }
3061 else
3062 #endif
3063 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3064 }
3065 }
3066 /* Control never gets here */
3067 }
3068
3069 /* If maximizing, find the longest possible run, then work backwards. */
3070
3071 else
3072 {
3073 pp = eptr;
3074
3075 #ifdef SUPPORT_UTF
3076 if (utf)
3077 {
3078 for (i = min; i < max; i++)
3079 {
3080 int len = 1;
3081 if (eptr >= md->end_subject)
3082 {
3083 SCHECK_PARTIAL();
3084 break;
3085 }
3086 GETCHARLEN(c, eptr, len);
3087 if (c > 255)
3088 {
3089 if (op == OP_CLASS) break;
3090 }
3091 else
3092 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3093 eptr += len;
3094 }
3095 for (;;)
3096 {
3097 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3099 if (eptr-- == pp) break; /* Stop if tried at original pos */
3100 BACKCHAR(eptr);
3101 }
3102 }
3103 else
3104 #endif
3105 /* Not UTF mode */
3106 {
3107 for (i = min; i < max; i++)
3108 {
3109 if (eptr >= md->end_subject)
3110 {
3111 SCHECK_PARTIAL();
3112 break;
3113 }
3114 c = *eptr;
3115 #ifndef COMPILE_PCRE8
3116 if (c > 255)
3117 {
3118 if (op == OP_CLASS) break;
3119 }
3120 else
3121 #endif
3122 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3123 eptr++;
3124 }
3125 while (eptr >= pp)
3126 {
3127 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3128 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3129 eptr--;
3130 }
3131 }
3132
3133 RRETURN(MATCH_NOMATCH);
3134 }
3135 #undef BYTE_MAP
3136 }
3137 /* Control never gets here */
3138
3139
3140 /* Match an extended character class. This opcode is encountered only
3141 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3142 mode, because Unicode properties are supported in non-UTF-8 mode. */
3143
3144 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3145 case OP_XCLASS:
3146 {
3147 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3148 ecode += GET(ecode, 1); /* Advance past the item */
3149
3150 switch (*ecode)
3151 {
3152 case OP_CRSTAR:
3153 case OP_CRMINSTAR:
3154 case OP_CRPLUS:
3155 case OP_CRMINPLUS:
3156 case OP_CRQUERY:
3157 case OP_CRMINQUERY:
3158 c = *ecode++ - OP_CRSTAR;
3159 minimize = (c & 1) != 0;
3160 min = rep_min[c]; /* Pick up values from tables; */
3161 max = rep_max[c]; /* zero for max => infinity */
3162 if (max == 0) max = INT_MAX;
3163 break;
3164
3165 case OP_CRRANGE:
3166 case OP_CRMINRANGE:
3167 minimize = (*ecode == OP_CRMINRANGE);
3168 min = GET2(ecode, 1);
3169 max = GET2(ecode, 1 + IMM2_SIZE);
3170 if (max == 0) max = INT_MAX;
3171 ecode += 1 + 2 * IMM2_SIZE;
3172 break;
3173
3174 default: /* No repeat follows */
3175 min = max = 1;
3176 break;
3177 }
3178
3179 /* First, ensure the minimum number of matches are present. */
3180
3181 for (i = 1; i <= min; i++)
3182 {
3183 if (eptr >= md->end_subject)
3184 {
3185 SCHECK_PARTIAL();
3186 RRETURN(MATCH_NOMATCH);
3187 }
3188 GETCHARINCTEST(c, eptr);
3189 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3190 }
3191
3192 /* If max == min we can continue with the main loop without the
3193 need to recurse. */
3194
3195 if (min == max) continue;
3196
3197 /* If minimizing, keep testing the rest of the expression and advancing
3198 the pointer while it matches the class. */
3199
3200 if (minimize)
3201 {
3202 for (fi = min;; fi++)
3203 {
3204 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3205 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3206 if (fi >= max) RRETURN(MATCH_NOMATCH);
3207 if (eptr >= md->end_subject)
3208 {
3209 SCHECK_PARTIAL();
3210 RRETURN(MATCH_NOMATCH);
3211 }
3212 GETCHARINCTEST(c, eptr);
3213 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3214 }
3215 /* Control never gets here */
3216 }
3217
3218 /* If maximizing, find the longest possible run, then work backwards. */
3219
3220 else
3221 {
3222 pp = eptr;
3223 for (i = min; i < max; i++)
3224 {
3225 int len = 1;
3226 if (eptr >= md->end_subject)
3227 {
3228 SCHECK_PARTIAL();
3229 break;
3230 }
3231 #ifdef SUPPORT_UTF
3232 GETCHARLENTEST(c, eptr, len);
3233 #else
3234 c = *eptr;
3235 #endif
3236 if (!PRIV(xclass)(c, data, utf)) break;
3237 eptr += len;
3238 }
3239 for(;;)
3240 {
3241 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3243 if (eptr-- == pp) break; /* Stop if tried at original pos */
3244 #ifdef SUPPORT_UTF
3245 if (utf) BACKCHAR(eptr);
3246 #endif
3247 }
3248 RRETURN(MATCH_NOMATCH);
3249 }
3250
3251 /* Control never gets here */
3252 }
3253 #endif /* End of XCLASS */
3254
3255 /* Match a single character, casefully */
3256
3257 case OP_CHAR:
3258 #ifdef SUPPORT_UTF
3259 if (utf)
3260 {
3261 length = 1;
3262 ecode++;
3263 GETCHARLEN(fc, ecode, length);
3264 if (length > md->end_subject - eptr)
3265 {
3266 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3267 RRETURN(MATCH_NOMATCH);
3268 }
3269 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3270 }
3271 else
3272 #endif
3273 /* Not UTF mode */
3274 {
3275 if (md->end_subject - eptr < 1)
3276 {
3277 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3278 RRETURN(MATCH_NOMATCH);
3279 }
3280 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3281 ecode += 2;
3282 }
3283 break;
3284
3285 /* Match a single character, caselessly. If we are at the end of the
3286 subject, give up immediately. */
3287
3288 case OP_CHARI:
3289 if (eptr >= md->end_subject)
3290 {
3291 SCHECK_PARTIAL();
3292 RRETURN(MATCH_NOMATCH);
3293 }
3294
3295 #ifdef SUPPORT_UTF
3296 if (utf)
3297 {
3298 length = 1;
3299 ecode++;
3300 GETCHARLEN(fc, ecode, length);
3301
3302 /* If the pattern character's value is < 128, we have only one byte, and
3303 we know that its other case must also be one byte long, so we can use the
3304 fast lookup table. We know that there is at least one byte left in the
3305 subject. */
3306
3307 if (fc < 128)
3308 {
3309 pcre_uint32 cc = RAWUCHAR(eptr);
3310 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3311 ecode++;
3312 eptr++;
3313 }
3314
3315 /* Otherwise we must pick up the subject character. Note that we cannot
3316 use the value of "length" to check for sufficient bytes left, because the
3317 other case of the character may have more or fewer bytes. */
3318
3319 else
3320 {
3321 pcre_uint32 dc;
3322 GETCHARINC(dc, eptr);
3323 ecode += length;
3324
3325 /* If we have Unicode property support, we can use it to test the other
3326 case of the character, if there is one. */
3327
3328 if (fc != dc)
3329 {
3330 #ifdef SUPPORT_UCP
3331 if (dc != UCD_OTHERCASE(fc))
3332 #endif
3333 RRETURN(MATCH_NOMATCH);
3334 }
3335 }
3336 }
3337 else
3338 #endif /* SUPPORT_UTF */
3339
3340 /* Not UTF mode */
3341 {
3342 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3343 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3344 eptr++;
3345 ecode += 2;
3346 }
3347 break;
3348
3349 /* Match a single character repeatedly. */
3350
3351 case OP_EXACT:
3352 case OP_EXACTI:
3353 min = max = GET2(ecode, 1);
3354 ecode += 1 + IMM2_SIZE;
3355 goto REPEATCHAR;
3356
3357 case OP_POSUPTO:
3358 case OP_POSUPTOI:
3359 possessive = TRUE;
3360 /* Fall through */
3361
3362 case OP_UPTO:
3363 case OP_UPTOI:
3364 case OP_MINUPTO:
3365 case OP_MINUPTOI:
3366 min = 0;
3367 max = GET2(ecode, 1);
3368 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3369 ecode += 1 + IMM2_SIZE;
3370 goto REPEATCHAR;
3371
3372 case OP_POSSTAR:
3373 case OP_POSSTARI:
3374 possessive = TRUE;
3375 min = 0;
3376 max = INT_MAX;
3377 ecode++;
3378 goto REPEATCHAR;
3379
3380 case OP_POSPLUS:
3381 case OP_POSPLUSI:
3382 possessive = TRUE;
3383 min = 1;
3384 max = INT_MAX;
3385 ecode++;
3386 goto REPEATCHAR;
3387
3388 case OP_POSQUERY:
3389 case OP_POSQUERYI:
3390 possessive = TRUE;
3391 min = 0;
3392 max = 1;
3393 ecode++;
3394 goto REPEATCHAR;
3395
3396 case OP_STAR:
3397 case OP_STARI:
3398 case OP_MINSTAR:
3399 case OP_MINSTARI:
3400 case OP_PLUS:
3401 case OP_PLUSI:
3402 case OP_MINPLUS:
3403 case OP_MINPLUSI:
3404 case OP_QUERY:
3405 case OP_QUERYI:
3406 case OP_MINQUERY:
3407 case OP_MINQUERYI:
3408 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3409 minimize = (c & 1) != 0;
3410 min = rep_min[c]; /* Pick up values from tables; */
3411 max = rep_max[c]; /* zero for max => infinity */
3412 if (max == 0) max = INT_MAX;
3413
3414 /* Common code for all repeated single-character matches. We first check
3415 for the minimum number of characters. If the minimum equals the maximum, we
3416 are done. Otherwise, if minimizing, check the rest of the pattern for a
3417 match; if there isn't one, advance up to the maximum, one character at a
3418 time.
3419
3420 If maximizing, advance up to the maximum number of matching characters,
3421 until eptr is past the end of the maximum run. If possessive, we are
3422 then done (no backing up). Otherwise, match at this position; anything
3423 other than no match is immediately returned. For nomatch, back up one
3424 character, unless we are matching \R and the last thing matched was
3425 \r\n, in which case, back up two bytes. When we reach the first optional
3426 character position, we can save stack by doing a tail recurse.
3427
3428 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3429 for speed. */
3430
3431 REPEATCHAR:
3432 #ifdef SUPPORT_UTF
3433 if (utf)
3434 {
3435 length = 1;
3436 charptr = ecode;
3437 GETCHARLEN(fc, ecode, length);
3438 ecode += length;
3439
3440 /* Handle multibyte character matching specially here. There is
3441 support for caseless matching if UCP support is present. */
3442
3443 if (length > 1)
3444 {
3445 #ifdef SUPPORT_UCP
3446 pcre_uint32 othercase;
3447 if (op >= OP_STARI && /* Caseless */
3448 (othercase = UCD_OTHERCASE(fc)) != fc)
3449 oclength = PRIV(ord2utf)(othercase, occhars);
3450 else oclength = 0;
3451 #endif /* SUPPORT_UCP */
3452
3453 for (i = 1; i <= min; i++)
3454 {
3455 if (eptr <= md->end_subject - length &&
3456 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3457 #ifdef SUPPORT_UCP
3458 else if (oclength > 0 &&
3459 eptr <= md->end_subject - oclength &&
3460 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3461 #endif /* SUPPORT_UCP */
3462 else
3463 {
3464 CHECK_PARTIAL();
3465 RRETURN(MATCH_NOMATCH);
3466 }
3467 }
3468
3469 if (min == max) continue;
3470
3471 if (minimize)
3472 {
3473 for (fi = min;; fi++)
3474 {
3475 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3476 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3477 if (fi >= max) RRETURN(MATCH_NOMATCH);
3478 if (eptr <= md->end_subject - length &&
3479 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3480 #ifdef SUPPORT_UCP
3481 else if (oclength > 0 &&
3482 eptr <= md->end_subject - oclength &&
3483 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3484 #endif /* SUPPORT_UCP */
3485 else
3486 {
3487 CHECK_PARTIAL();
3488 RRETURN(MATCH_NOMATCH);
3489 }
3490 }
3491 /* Control never gets here */
3492 }
3493
3494 else /* Maximize */
3495 {
3496 pp = eptr;
3497 for (i = min; i < max; i++)
3498 {
3499 if (eptr <= md->end_subject - length &&
3500 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3501 #ifdef SUPPORT_UCP
3502 else if (oclength > 0 &&
3503 eptr <= md->end_subject - oclength &&
3504 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3505 #endif /* SUPPORT_UCP */
3506 else
3507 {
3508 CHECK_PARTIAL();
3509 break;
3510 }
3511 }
3512
3513 if (possessive) continue; /* No backtracking */
3514 for(;;)
3515 {
3516 if (eptr == pp) goto TAIL_RECURSE;
3517 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3518 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3519 #ifdef SUPPORT_UCP
3520 eptr--;
3521 BACKCHAR(eptr);
3522 #else /* without SUPPORT_UCP */
3523 eptr -= length;
3524 #endif /* SUPPORT_UCP */
3525 }
3526 }
3527 /* Control never gets here */
3528 }
3529
3530 /* If the length of a UTF-8 character is 1, we fall through here, and
3531 obey the code as for non-UTF-8 characters below, though in this case the
3532 value of fc will always be < 128. */
3533 }
3534 else
3535 #endif /* SUPPORT_UTF */
3536 /* When not in UTF-8 mode, load a single-byte character. */
3537 fc = *ecode++;
3538
3539 /* The value of fc at this point is always one character, though we may
3540 or may not be in UTF mode. The code is duplicated for the caseless and
3541 caseful cases, for speed, since matching characters is likely to be quite
3542 common. First, ensure the minimum number of matches are present. If min =
3543 max, continue at the same level without recursing. Otherwise, if
3544 minimizing, keep trying the rest of the expression and advancing one
3545 matching character if failing, up to the maximum. Alternatively, if
3546 maximizing, find the maximum number of characters and work backwards. */
3547
3548 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3549 max, (char *)eptr));
3550
3551 if (op >= OP_STARI) /* Caseless */
3552 {
3553 #ifdef COMPILE_PCRE8
3554 /* fc must be < 128 if UTF is enabled. */
3555 foc = md->fcc[fc];
3556 #else
3557 #ifdef SUPPORT_UTF
3558 #ifdef SUPPORT_UCP
3559 if (utf && fc > 127)
3560 foc = UCD_OTHERCASE(fc);
3561 #else
3562 if (utf && fc > 127)
3563 foc = fc;
3564 #endif /* SUPPORT_UCP */
3565 else
3566 #endif /* SUPPORT_UTF */
3567 foc = TABLE_GET(fc, md->fcc, fc);
3568 #endif /* COMPILE_PCRE8 */
3569
3570 for (i = 1; i <= min; i++)
3571 {
3572 pcre_uint32 cc; /* Faster than pcre_uchar */
3573 if (eptr >= md->end_subject)
3574 {
3575 SCHECK_PARTIAL();
3576 RRETURN(MATCH_NOMATCH);
3577 }
3578 cc = RAWUCHARTEST(eptr);
3579 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3580 eptr++;
3581 }
3582 if (min == max) continue;
3583 if (minimize)
3584 {
3585 for (fi = min;; fi++)
3586 {
3587 pcre_uint32 cc; /* Faster than pcre_uchar */
3588 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3590 if (fi >= max) RRETURN(MATCH_NOMATCH);
3591 if (eptr >= md->end_subject)
3592 {
3593 SCHECK_PARTIAL();
3594 RRETURN(MATCH_NOMATCH);
3595 }
3596 cc = RAWUCHARTEST(eptr);
3597 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3598 eptr++;
3599 }
3600 /* Control never gets here */
3601 }
3602 else /* Maximize */
3603 {
3604 pp = eptr;
3605 for (i = min; i < max; i++)
3606 {
3607 pcre_uint32 cc; /* Faster than pcre_uchar */
3608 if (eptr >= md->end_subject)
3609 {
3610 SCHECK_PARTIAL();
3611 break;
3612 }
3613 cc = RAWUCHARTEST(eptr);
3614 if (fc != cc && foc != cc) break;
3615 eptr++;
3616 }
3617 if (possessive) continue; /* No backtracking */
3618 for (;;)
3619 {
3620 if (eptr == pp) goto TAIL_RECURSE;
3621 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3622 eptr--;
3623 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3624 }
3625 /* Control never gets here */
3626 }
3627 }
3628
3629 /* Caseful comparisons (includes all multi-byte characters) */
3630
3631 else
3632 {
3633 for (i = 1; i <= min; i++)
3634 {
3635 if (eptr >= md->end_subject)
3636 {
3637 SCHECK_PARTIAL();
3638 RRETURN(MATCH_NOMATCH);
3639 }
3640 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3641 }
3642
3643 if (min == max) continue;
3644
3645 if (minimize)
3646 {
3647 for (fi = min;; fi++)
3648 {
3649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3651 if (fi >= max) RRETURN(MATCH_NOMATCH);
3652 if (eptr >= md->end_subject)
3653 {
3654 SCHECK_PARTIAL();
3655 RRETURN(MATCH_NOMATCH);
3656 }
3657 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3658 }
3659 /* Control never gets here */
3660 }
3661 else /* Maximize */
3662 {
3663 pp = eptr;
3664 for (i = min; i < max; i++)
3665 {
3666 if (eptr >= md->end_subject)
3667 {
3668 SCHECK_PARTIAL();
3669 break;
3670 }
3671 if (fc != RAWUCHARTEST(eptr)) break;
3672 eptr++;
3673 }
3674 if (possessive) continue; /* No backtracking */
3675 for (;;)
3676 {
3677 if (eptr == pp) goto TAIL_RECURSE;
3678 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3679 eptr--;
3680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3681 }
3682 /* Control never gets here */
3683 }
3684 }
3685 /* Control never gets here */
3686
3687 /* Match a negated single one-byte character. The character we are
3688 checking can be multibyte. */
3689
3690 case OP_NOT:
3691 case OP_NOTI:
3692 if (eptr >= md->end_subject)
3693 {
3694 SCHECK_PARTIAL();
3695 RRETURN(MATCH_NOMATCH);
3696 }
3697 #ifdef SUPPORT_UTF
3698 if (utf)
3699 {
3700 register pcre_uint32 ch, och;
3701
3702 ecode++;
3703 GETCHARINC(ch, ecode);
3704 GETCHARINC(c, eptr);
3705
3706 if (op == OP_NOT)
3707 {
3708 if (ch == c) RRETURN(MATCH_NOMATCH);
3709 }
3710 else
3711 {
3712 #ifdef SUPPORT_UCP
3713 if (ch > 127)
3714 och = UCD_OTHERCASE(ch);
3715 #else
3716 if (ch > 127)
3717 och = ch;
3718 #endif /* SUPPORT_UCP */
3719 else
3720 och = TABLE_GET(ch, md->fcc, ch);
3721 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3722 }
3723 }
3724 else
3725 #endif
3726 {
3727 register pcre_uint32 ch = ecode[1];
3728 c = *eptr++;
3729 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3730 RRETURN(MATCH_NOMATCH);
3731 ecode += 2;
3732 }
3733 break;
3734
3735 /* Match a negated single one-byte character repeatedly. This is almost a
3736 repeat of the code for a repeated single character, but I haven't found a
3737 nice way of commoning these up that doesn't require a test of the
3738 positive/negative option for each character match. Maybe that wouldn't add
3739 very much to the time taken, but character matching *is* what this is all
3740 about... */
3741
3742 case OP_NOTEXACT:
3743 case OP_NOTEXACTI:
3744 min = max = GET2(ecode, 1);
3745 ecode += 1 + IMM2_SIZE;
3746 goto REPEATNOTCHAR;
3747
3748 case OP_NOTUPTO:
3749 case OP_NOTUPTOI:
3750 case OP_NOTMINUPTO:
3751 case OP_NOTMINUPTOI:
3752 min = 0;
3753 max = GET2(ecode, 1);
3754 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3755 ecode += 1 + IMM2_SIZE;
3756 goto REPEATNOTCHAR;
3757
3758 case OP_NOTPOSSTAR:
3759 case OP_NOTPOSSTARI:
3760 possessive = TRUE;
3761 min = 0;
3762 max = INT_MAX;
3763 ecode++;
3764 goto REPEATNOTCHAR;
3765
3766 case OP_NOTPOSPLUS:
3767 case OP_NOTPOSPLUSI:
3768 possessive = TRUE;
3769 min = 1;
3770 max = INT_MAX;
3771 ecode++;
3772 goto REPEATNOTCHAR;
3773
3774 case OP_NOTPOSQUERY:
3775 case OP_NOTPOSQUERYI:
3776 possessive = TRUE;
3777 min = 0;
3778 max = 1;
3779 ecode++;
3780 goto REPEATNOTCHAR;
3781
3782 case OP_NOTPOSUPTO:
3783 case OP_NOTPOSUPTOI:
3784 possessive = TRUE;
3785 min = 0;
3786 max = GET2(ecode, 1);
3787 ecode += 1 + IMM2_SIZE;
3788 goto REPEATNOTCHAR;
3789
3790 case OP_NOTSTAR:
3791 case OP_NOTSTARI:
3792 case OP_NOTMINSTAR:
3793 case OP_NOTMINSTARI:
3794 case OP_NOTPLUS:
3795 case OP_NOTPLUSI:
3796 case OP_NOTMINPLUS:
3797 case OP_NOTMINPLUSI:
3798 case OP_NOTQUERY:
3799 case OP_NOTQUERYI:
3800 case OP_NOTMINQUERY:
3801 case OP_NOTMINQUERYI:
3802 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3803 minimize = (c & 1) != 0;
3804 min = rep_min[c]; /* Pick up values from tables; */
3805 max = rep_max[c]; /* zero for max => infinity */
3806 if (max == 0) max = INT_MAX;
3807
3808 /* Common code for all repeated single-byte matches. */
3809
3810 REPEATNOTCHAR:
3811 GETCHARINCTEST(fc, ecode);
3812
3813 /* The code is duplicated for the caseless and caseful cases, for speed,
3814 since matching characters is likely to be quite common. First, ensure the
3815 minimum number of matches are present. If min = max, continue at the same
3816 level without recursing. Otherwise, if minimizing, keep trying the rest of
3817 the expression and advancing one matching character if failing, up to the
3818 maximum. Alternatively, if maximizing, find the maximum number of
3819 characters and work backwards. */
3820
3821 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3822 max, (char *)eptr));
3823
3824 if (op >= OP_NOTSTARI) /* Caseless */
3825 {
3826 #ifdef SUPPORT_UTF
3827 #ifdef SUPPORT_UCP
3828 if (utf && fc > 127)
3829 foc = UCD_OTHERCASE(fc);
3830 #else
3831 if (utf && fc > 127)
3832 foc = fc;
3833 #endif /* SUPPORT_UCP */
3834 else
3835 #endif /* SUPPORT_UTF */
3836 foc = TABLE_GET(fc, md->fcc, fc);
3837
3838 #ifdef SUPPORT_UTF
3839 if (utf)
3840 {
3841 register pcre_uint32 d;
3842 for (i = 1; i <= min; i++)
3843 {
3844 if (eptr >= md->end_subject)
3845 {
3846 SCHECK_PARTIAL();
3847 RRETURN(MATCH_NOMATCH);
3848 }
3849 GETCHARINC(d, eptr);
3850 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3851 }
3852 }
3853 else
3854 #endif /* SUPPORT_UTF */
3855 /* Not UTF mode */
3856 {
3857 for (i = 1; i <= min; i++)
3858 {
3859 if (eptr >= md->end_subject)
3860 {
3861 SCHECK_PARTIAL();
3862 RRETURN(MATCH_NOMATCH);
3863 }
3864 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3865 eptr++;
3866 }
3867 }
3868
3869 if (min == max) continue;
3870
3871 if (minimize)
3872 {
3873 #ifdef SUPPORT_UTF
3874 if (utf)
3875 {
3876 register pcre_uint32 d;
3877 for (fi = min;; fi++)
3878 {
3879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3881 if (fi >= max) RRETURN(MATCH_NOMATCH);
3882 if (eptr >= md->end_subject)
3883 {
3884 SCHECK_PARTIAL();
3885 RRETURN(MATCH_NOMATCH);
3886 }
3887 GETCHARINC(d, eptr);
3888 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3889 }
3890 }
3891 else
3892 #endif /*SUPPORT_UTF */
3893 /* Not UTF mode */
3894 {
3895 for (fi = min;; fi++)
3896 {
3897 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3898 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3899 if (fi >= max) RRETURN(MATCH_NOMATCH);
3900 if (eptr >= md->end_subject)
3901 {
3902 SCHECK_PARTIAL();
3903 RRETURN(MATCH_NOMATCH);
3904 }
3905 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3906 eptr++;
3907 }
3908 }
3909 /* Control never gets here */
3910 }
3911
3912 /* Maximize case */
3913
3914 else
3915 {
3916 pp = eptr;
3917
3918 #ifdef SUPPORT_UTF
3919 if (utf)
3920 {
3921 register pcre_uint32 d;
3922 for (i = min; i < max; i++)
3923 {
3924 int len = 1;
3925 if (eptr >= md->end_subject)
3926 {
3927 SCHECK_PARTIAL();
3928 break;
3929 }
3930 GETCHARLEN(d, eptr, len);
3931 if (fc == d || (unsigned int)foc == d) break;
3932 eptr += len;
3933 }
3934 if (possessive) continue; /* No backtracking */
3935 for(;;)
3936 {
3937 if (eptr == pp) goto TAIL_RECURSE;
3938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3940 eptr--;
3941 BACKCHAR(eptr);
3942 }
3943 }
3944 else
3945 #endif /* SUPPORT_UTF */
3946 /* Not UTF mode */
3947 {
3948 for (i = min; i < max; i++)
3949 {
3950 if (eptr >= md->end_subject)
3951 {
3952 SCHECK_PARTIAL();
3953 break;
3954 }
3955 if (fc == *eptr || foc == *eptr) break;
3956 eptr++;
3957 }
3958 if (possessive) continue; /* No backtracking */
3959 for (;;)
3960 {
3961 if (eptr == pp) goto TAIL_RECURSE;
3962 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3964 eptr--;
3965 }
3966 }
3967 /* Control never gets here */
3968 }
3969 }
3970
3971 /* Caseful comparisons */
3972
3973 else
3974 {
3975 #ifdef SUPPORT_UTF
3976 if (utf)
3977 {
3978 register pcre_uint32 d;
3979 for (i = 1; i <= min; i++)
3980 {
3981 if (eptr >= md->end_subject)
3982 {
3983 SCHECK_PARTIAL();
3984 RRETURN(MATCH_NOMATCH);
3985 }
3986 GETCHARINC(d, eptr);
3987 if (fc == d) RRETURN(MATCH_NOMATCH);
3988 }
3989 }
3990 else
3991 #endif
3992 /* Not UTF mode */
3993 {
3994 for (i = 1; i <= min; i++)
3995 {
3996 if (eptr >= md->end_subject)
3997 {
3998 SCHECK_PARTIAL();
3999 RRETURN(MATCH_NOMATCH);
4000 }
4001 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4002 }
4003 }
4004
4005 if (min == max) continue;
4006
4007 if (minimize)
4008 {
4009 #ifdef SUPPORT_UTF
4010 if (utf)
4011 {
4012 register pcre_uint32 d;
4013 for (fi = min;; fi++)
4014 {
4015 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
4016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4017 if (fi >= max) RRETURN(MATCH_NOMATCH);
4018 if (eptr >= md->end_subject)
4019 {
4020 SCHECK_PARTIAL();
4021 RRETURN(MATCH_NOMATCH);
4022 }
4023 GETCHARINC(d, eptr);
4024 if (fc == d) RRETURN(MATCH_NOMATCH);
4025 }
4026 }
4027 else
4028 #endif
4029 /* Not UTF mode */
4030 {
4031 for (fi = min;; fi++)
4032 {
4033 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4035 if (fi >= max) RRETURN(MATCH_NOMATCH);
4036 if (eptr >= md->end_subject)
4037 {
4038 SCHECK_PARTIAL();
4039 RRETURN(MATCH_NOMATCH);
4040 }
4041 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4042 }
4043 }
4044 /* Control never gets here */
4045 }
4046
4047 /* Maximize case */
4048
4049 else
4050 {
4051 pp = eptr;
4052
4053 #ifdef SUPPORT_UTF
4054 if (utf)
4055 {
4056 register pcre_uint32 d;
4057 for (i = min; i < max; i++)
4058 {
4059 int len = 1;
4060 if (eptr >= md->end_subject)
4061 {
4062 SCHECK_PARTIAL();
4063 break;
4064 }
4065 GETCHARLEN(d, eptr, len);
4066 if (fc == d) break;
4067 eptr += len;
4068 }
4069 if (possessive) continue; /* No backtracking */
4070 for(;;)
4071 {
4072 if (eptr == pp) goto TAIL_RECURSE;
4073 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4075 eptr--;
4076 BACKCHAR(eptr);
4077 }
4078 }
4079 else
4080 #endif
4081 /* Not UTF mode */
4082 {
4083 for (i = min; i < max; i++)
4084 {
4085 if (eptr >= md->end_subject)
4086 {
4087 SCHECK_PARTIAL();
4088 break;
4089 }
4090 if (fc == *eptr) break;
4091 eptr++;
4092 }
4093 if (possessive) continue; /* No backtracking */
4094 for (;;)
4095 {
4096 if (eptr == pp) goto TAIL_RECURSE;
4097 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4099 eptr--;
4100 }
4101 }
4102 /* Control never gets here */
4103 }
4104 }
4105 /* Control never gets here */
4106
4107 /* Match a single character type repeatedly; several different opcodes
4108 share code. This is very similar to the code for single characters, but we
4109 repeat it in the interests of efficiency. */
4110
4111 case OP_TYPEEXACT:
4112 min = max = GET2(ecode, 1);
4113 minimize = TRUE;
4114 ecode += 1 + IMM2_SIZE;
4115 goto REPEATTYPE;
4116
4117 case OP_TYPEUPTO:
4118 case OP_TYPEMINUPTO:
4119 min = 0;
4120 max = GET2(ecode, 1);
4121 minimize = *ecode == OP_TYPEMINUPTO;
4122 ecode += 1 + IMM2_SIZE;
4123 goto REPEATTYPE;
4124
4125 case OP_TYPEPOSSTAR:
4126 possessive = TRUE;
4127 min = 0;
4128 max = INT_MAX;
4129 ecode++;
4130 goto REPEATTYPE;
4131
4132 case OP_TYPEPOSPLUS:
4133 possessive = TRUE;
4134 min = 1;
4135 max = INT_MAX;
4136 ecode++;
4137 goto REPEATTYPE;
4138
4139 case OP_TYPEPOSQUERY:
4140 possessive = TRUE;
4141 min = 0;
4142 max = 1;
4143 ecode++;
4144 goto REPEATTYPE;
4145
4146 case OP_TYPEPOSUPTO:
4147 possessive = TRUE;
4148 min = 0;
4149 max = GET2(ecode, 1);
4150 ecode += 1 + IMM2_SIZE;
4151 goto REPEATTYPE;
4152
4153 case OP_TYPESTAR:
4154 case OP_TYPEMINSTAR:
4155 case OP_TYPEPLUS:
4156 case OP_TYPEMINPLUS:
4157 case OP_TYPEQUERY:
4158 case OP_TYPEMINQUERY:
4159 c = *ecode++ - OP_TYPESTAR;
4160 minimize = (c & 1) != 0;
4161 min = rep_min[c]; /* Pick up values from tables; */
4162 max = rep_max[c]; /* zero for max => infinity */
4163 if (max == 0) max = INT_MAX;
4164
4165 /* Common code for all repeated single character type matches. Note that
4166 in UTF-8 mode, '.' matches a character of any length, but for the other
4167 character types, the valid characters are all one-byte long. */
4168
4169 REPEATTYPE:
4170 ctype = *ecode++; /* Code for the character type */
4171
4172 #ifdef SUPPORT_UCP
4173 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4174 {
4175 prop_fail_result = ctype == OP_NOTPROP;
4176 prop_type = *ecode++;
4177 prop_value = *ecode++;
4178 }
4179 else prop_type = -1;
4180 #endif
4181
4182 /* First, ensure the minimum number of matches are present. Use inline
4183 code for maximizing the speed, and do the type test once at the start
4184 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4185 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4186 and single-bytes. */
4187
4188 if (min > 0)
4189 {
4190 #ifdef SUPPORT_UCP
4191 if (prop_type >= 0)
4192 {
4193 switch(prop_type)
4194 {
4195 case PT_ANY:
4196 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4197 for (i = 1; i <= min; i++)
4198 {
4199 if (eptr >= md->end_subject)
4200 {
4201 SCHECK_PARTIAL();
4202 RRETURN(MATCH_NOMATCH);
4203 }
4204 GETCHARINCTEST(c, eptr);
4205 }
4206 break;
4207
4208 case PT_LAMP:
4209 for (i = 1; i <= min; i++)
4210 {
4211 int chartype;
4212 if (eptr >= md->end_subject)
4213 {
4214 SCHECK_PARTIAL();
4215 RRETURN(MATCH_NOMATCH);
4216 }
4217 GETCHARINCTEST(c, eptr);
4218 chartype = UCD_CHARTYPE(c);
4219 if ((chartype == ucp_Lu ||
4220 chartype == ucp_Ll ||
4221 chartype == ucp_Lt) == prop_fail_result)
4222 RRETURN(MATCH_NOMATCH);
4223 }
4224 break;
4225
4226 case PT_GC:
4227 for (i = 1; i <= min; i++)
4228 {
4229 if (eptr >= md->end_subject)
4230 {
4231 SCHECK_PARTIAL();
4232 RRETURN(MATCH_NOMATCH);
4233 }
4234 GETCHARINCTEST(c, eptr);
4235 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4236 RRETURN(MATCH_NOMATCH);
4237 }
4238 break;
4239
4240 case PT_PC:
4241 for (i = 1; i <= min; i++)
4242 {
4243 if (eptr >= md->end_subject)
4244 {
4245 SCHECK_PARTIAL();
4246 RRETURN(MATCH_NOMATCH);
4247 }
4248 GETCHARINCTEST(c, eptr);
4249 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4250 RRETURN(MATCH_NOMATCH);
4251 }
4252 break;
4253
4254 case PT_SC:
4255 for (i = 1; i <= min; i++)
4256 {
4257 if (eptr >= md->end_subject)
4258 {
4259 SCHECK_PARTIAL();
4260 RRETURN(MATCH_NOMATCH);
4261 }
4262 GETCHARINCTEST(c, eptr);
4263 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4264 RRETURN(MATCH_NOMATCH);
4265 }
4266 break;
4267
4268 case PT_ALNUM:
4269 for (i = 1; i <= min; i++)
4270 {
4271 int category;
4272 if (eptr >= md->end_subject)
4273 {
4274 SCHECK_PARTIAL();
4275 RRETURN(MATCH_NOMATCH);
4276 }
4277 GETCHARINCTEST(c, eptr);
4278 category = UCD_CATEGORY(c);
4279 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4280 RRETURN(MATCH_NOMATCH);
4281 }
4282 break;
4283
4284 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4285 which means that Perl space and POSIX space are now identical. PCRE
4286 was changed at release 8.34. */
4287
4288 case PT_SPACE: /* Perl space */
4289 case PT_PXSPACE: /* POSIX space */
4290 for (i = 1; i <= min; i++)
4291 {
4292 if (eptr >= md->end_subject)
4293 {
4294 SCHECK_PARTIAL();
4295 RRETURN(MATCH_NOMATCH);
4296 }
4297 GETCHARINCTEST(c, eptr);
4298 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4299 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4300 == prop_fail_result)
4301 RRETURN(MATCH_NOMATCH);
4302 }
4303 break;
4304
4305 case PT_WORD:
4306 for (i = 1; i <= min; i++)
4307 {
4308 int category;
4309 if (eptr >= md->end_subject)
4310 {
4311 SCHECK_PARTIAL();
4312 RRETURN(MATCH_NOMATCH);
4313 }
4314 GETCHARINCTEST(c, eptr);
4315 category = UCD_CATEGORY(c);
4316 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4317 == prop_fail_result)
4318 RRETURN(MATCH_NOMATCH);
4319 }
4320 break;
4321
4322 case PT_CLIST:
4323 for (i = 1; i <= min; i++)
4324 {
4325 const pcre_uint32 *cp;
4326 if (eptr >= md->end_subject)
4327 {
4328 SCHECK_PARTIAL();
4329 RRETURN(MATCH_NOMATCH);
4330 }
4331 GETCHARINCTEST(c, eptr);
4332 cp = PRIV(ucd_caseless_sets) + prop_value;
4333 for (;;)
4334 {
4335 if (c < *cp)
4336 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4337 if (c == *cp++)
4338 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4339 }
4340 }
4341 break;
4342
4343 case PT_UCNC:
4344 for (i = 1; i <= min; i++)
4345 {
4346 if (eptr >= md->end_subject)
4347 {
4348 SCHECK_PARTIAL();
4349 RRETURN(MATCH_NOMATCH);
4350 }
4351 GETCHARINCTEST(c, eptr);
4352 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4353 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4354 c >= 0xe000) == prop_fail_result)
4355 RRETURN(MATCH_NOMATCH);
4356 }
4357 break;
4358
4359 /* This should not occur */
4360
4361 default:
4362 RRETURN(PCRE_ERROR_INTERNAL);
4363 }
4364 }
4365
4366 /* Match extended Unicode sequences. We will get here only if the
4367 support is in the binary; otherwise a compile-time error occurs. */
4368
4369 else if (ctype == OP_EXTUNI)
4370 {
4371 for (i = 1; i <= min; i++)
4372 {
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 RRETURN(MATCH_NOMATCH);
4377 }
4378 else
4379 {
4380 int lgb, rgb;
4381 GETCHARINCTEST(c, eptr);
4382 lgb = UCD_GRAPHBREAK(c);
4383 while (eptr < md->end_subject)
4384 {
4385 int len = 1;
4386 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4387 rgb = UCD_GRAPHBREAK(c);
4388 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4389 lgb = rgb;
4390 eptr += len;
4391 }
4392 }
4393 CHECK_PARTIAL();
4394 }
4395 }
4396
4397 else
4398 #endif /* SUPPORT_UCP */
4399
4400 /* Handle all other cases when UTF mode is in effect (the utf flag is set) */
4401
4402 #ifdef SUPPORT_UTF
4403 if (utf) switch(ctype)
4404 {
4405 case OP_ANY:
4406 for (i = 1; i <= min; i++)
4407 {
4408 if (eptr >= md->end_subject)
4409 {
4410 SCHECK_PARTIAL();
4411 RRETURN(MATCH_NOMATCH);
4412 }
4413 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4414 if (md->partial != 0 &&
4415 eptr + 1 >= md->end_subject &&
4416 NLBLOCK->nltype == NLTYPE_FIXED &&
4417 NLBLOCK->nllen == 2 &&
4418 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4419 {
4420 md->hitend = TRUE;
4421 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4422 }
4423 eptr++;
4424 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4425 }
4426 break;
4427
4428 case OP_ALLANY:
4429 for (i = 1; i <= min; i++)
4430 {
4431 if (eptr >= md->end_subject)
4432 {
4433 SCHECK_PARTIAL();
4434 RRETURN(MATCH_NOMATCH);
4435 }
4436 eptr++;
4437 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4438 }
4439 break;
4440
4441 case OP_ANYBYTE:
4442 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4443 eptr += min;
4444 break;
4445
4446 case OP_ANYNL:
4447 for (i = 1; i <= min; i++)
4448 {
4449 if (eptr >= md->end_subject)
4450 {
4451 SCHECK_PARTIAL();
4452 RRETURN(MATCH_NOMATCH);
4453 }
4454 GETCHARINC(c, eptr);
4455 switch(c)
4456 {
4457 default: RRETURN(MATCH_NOMATCH);
4458
4459 case CHAR_CR:
4460 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4461 break;
4462
4463 case CHAR_LF:
4464 break;
4465
4466 case CHAR_VT:
4467 case CHAR_FF:
4468 case CHAR_NEL:
4469 #ifndef EBCDIC
4470 case 0x2028:
4471 case 0x2029:
4472 #endif /* Not EBCDIC */
4473 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4474 break;
4475 }
4476 }
4477 break;
4478
4479 case OP_NOT_HSPACE:
4480 for (i = 1; i <= min; i++)
4481 {
4482 if (eptr >= md->end_subject)
4483 {
4484 SCHECK_PARTIAL();
4485 RRETURN(MATCH_NOMATCH);
4486 }
4487 GETCHARINC(c, eptr);
4488 switch(c)
4489 {
4490 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4491 default: break;
4492 }
4493 }
4494 break;
4495
4496 case OP_HSPACE:
4497 for (i = 1; i <= min; i++)
4498 {
4499 if (eptr >= md->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 RRETURN(MATCH_NOMATCH);
4503 }
4504 GETCHARINC(c, eptr);
4505 switch(c)
4506 {
4507 HSPACE_CASES: break; /* Byte and multibyte cases */
4508 default: RRETURN(MATCH_NOMATCH);
4509 }
4510 }
4511 break;
4512
4513 case OP_NOT_VSPACE:
4514 for (i = 1; i <= min; i++)
4515 {
4516 if (eptr >= md->end_subject)
4517 {
4518 SCHECK_PARTIAL();
4519 RRETURN(MATCH_NOMATCH);
4520 }
4521 GETCHARINC(c, eptr);
4522 switch(c)
4523 {
4524 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4525 default: break;
4526 }
4527 }
4528 break;
4529
4530 case OP_VSPACE:
4531 for (i = 1; i <= min; i++)
4532 {
4533 if (eptr >= md->end_subject)
4534 {
4535 SCHECK_PARTIAL();
4536 RRETURN(MATCH_NOMATCH);
4537 }
4538 GETCHARINC(c, eptr);
4539 switch(c)
4540 {
4541 VSPACE_CASES: break;
4542 default: RRETURN(MATCH_NOMATCH);
4543 }
4544 }
4545 break;
4546
4547 case OP_NOT_DIGIT:
4548 for (i = 1; i <= min; i++)
4549 {
4550 if (eptr >= md->end_subject)
4551 {
4552 SCHECK_PARTIAL();
4553 RRETURN(MATCH_NOMATCH);
4554 }
4555 GETCHARINC(c, eptr);
4556 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4557 RRETURN(MATCH_NOMATCH);
4558 }
4559 break;
4560
4561 case OP_DIGIT:
4562 for (i = 1; i <= min; i++)
4563 {
4564 pcre_uint32 cc;
4565 if (eptr >= md->end_subject)
4566 {
4567 SCHECK_PARTIAL();
4568 RRETURN(MATCH_NOMATCH);
4569 }
4570 cc = RAWUCHAR(eptr);
4571 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4572 RRETURN(MATCH_NOMATCH);
4573 eptr++;
4574 /* No need to skip more bytes - we know it's a 1-byte character */
4575 }
4576 break;
4577
4578 case OP_NOT_WHITESPACE:
4579 for (i = 1; i <= min; i++)
4580 {
4581 pcre_uint32 cc;
4582 if (eptr >= md->end_subject)
4583 {
4584 SCHECK_PARTIAL();
4585 RRETURN(MATCH_NOMATCH);
4586 }
4587 cc = RAWUCHAR(eptr);
4588 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4589 RRETURN(MATCH_NOMATCH);
4590 eptr++;
4591 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4592 }
4593 break;
4594
4595 case OP_WHITESPACE:
4596 for (i = 1; i <= min; i++)
4597 {
4598 pcre_uint32 cc;
4599 if (eptr >= md->end_subject)
4600 {
4601 SCHECK_PARTIAL();
4602 RRETURN(MATCH_NOMATCH);
4603 }
4604 cc = RAWUCHAR(eptr);
4605 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4606 RRETURN(MATCH_NOMATCH);
4607 eptr++;
4608 /* No need to skip more bytes - we know it's a 1-byte character */
4609 }
4610 break;
4611
4612 case OP_NOT_WORDCHAR:
4613 for (i = 1; i <= min; i++)
4614 {
4615 pcre_uint32 cc;
4616 if (eptr >= md->end_subject)
4617 {
4618 SCHECK_PARTIAL();
4619 RRETURN(MATCH_NOMATCH);
4620 }
4621 cc = RAWUCHAR(eptr);
4622 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4623 RRETURN(MATCH_NOMATCH);
4624 eptr++;
4625 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4626 }
4627 break;
4628
4629 case OP_WORDCHAR:
4630 for (i = 1; i <= min; i++)
4631 {
4632 pcre_uint32 cc;
4633 if (eptr >= md->end_subject)
4634 {
4635 SCHECK_PARTIAL();
4636 RRETURN(MATCH_NOMATCH);
4637 }
4638 cc = RAWUCHAR(eptr);
4639 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4640 RRETURN(MATCH_NOMATCH);
4641 eptr++;
4642 /* No need to skip more bytes - we know it's a 1-byte character */
4643 }
4644 break;
4645
4646 default:
4647 RRETURN(PCRE_ERROR_INTERNAL);
4648 } /* End switch(ctype) */
4649
4650 else
4651 #endif /* SUPPORT_UTF */
4652
4653 /* Code for the non-UTF case (utf flag not set) for minimum matching of
4654 operators other than OP_PROP and OP_NOTPROP. */
4655
4656 switch(ctype)
4657 {
4658 case OP_ANY:
4659 for (i = 1; i <= min; i++)
4660 {
4661 if (eptr >= md->end_subject)
4662 {
4663 SCHECK_PARTIAL();
4664 RRETURN(MATCH_NOMATCH);
4665 }
4666 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4667 if (md->partial != 0 &&
4668 eptr + 1 >= md->end_subject &&
4669 NLBLOCK->nltype == NLTYPE_FIXED &&
4670 NLBLOCK->nllen == 2 &&
4671 *eptr == NLBLOCK->nl[0])
4672 {
4673 md->hitend = TRUE;
4674 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4675 }
4676 eptr++;
4677 }
4678 break;
4679
4680 case OP_ALLANY:
4681 if (eptr > md->end_subject - min)
4682 {
4683 SCHECK_PARTIAL();
4684 RRETURN(MATCH_NOMATCH);
4685 }
4686 eptr += min;
4687 break;
4688
4689 case OP_ANYBYTE:
4690 if (eptr > md->end_subject - min)
4691 {
4692 SCHECK_PARTIAL();
4693 RRETURN(MATCH_NOMATCH);
4694 }
4695 eptr += min;
4696 break;
4697
4698 case OP_ANYNL:
4699 for (i = 1; i <= min; i++)
4700 {
4701 if (eptr >= md->end_subject)
4702 {
4703 SCHECK_PARTIAL();
4704 RRETURN(MATCH_NOMATCH);
4705 }
4706 switch(*eptr++)
4707 {
4708 default: RRETURN(MATCH_NOMATCH);
4709
4710 case CHAR_CR:
4711 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4712 break;
4713
4714 case CHAR_LF:
4715 break;
4716
4717 case CHAR_VT:
4718 case CHAR_FF:
4719 case CHAR_NEL:
4720 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4721 case 0x2028:
4722 case 0x2029:
4723 #endif
4724 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4725 break;
4726 }
4727 }
4728 break;
4729
4730 case OP_NOT_HSPACE:
4731 for (i = 1; i <= min; i++)
4732 {
4733 if (eptr >= md->end_subject)
4734 {
4735 SCHECK_PARTIAL();
4736 RRETURN(MATCH_NOMATCH);
4737 }
4738 switch(*eptr++)
4739 {
4740 default: break;
4741 HSPACE_BYTE_CASES:
4742 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4743 HSPACE_MULTIBYTE_CASES:
4744 #endif
4745 RRETURN(MATCH_NOMATCH);
4746 }
4747 }
4748 break;
4749
4750 case OP_HSPACE:
4751 for (i = 1; i <= min; i++)
4752 {
4753 if (eptr >= md->end_subject)
4754 {
4755 SCHECK_PARTIAL();
4756 RRETURN(MATCH_NOMATCH);
4757 }
4758 switch(*eptr++)
4759 {
4760 default: RRETURN(MATCH_NOMATCH);
4761 HSPACE_BYTE_CASES:
4762 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4763 HSPACE_MULTIBYTE_CASES:
4764 #endif
4765 break;
4766 }
4767 }
4768 break;
4769
4770 case OP_NOT_VSPACE:
4771 for (i = 1; i <= min; i++)
4772 {
4773 if (eptr >= md->end_subject)
4774 {
4775 SCHECK_PARTIAL();
4776 RRETURN(MATCH_NOMATCH);
4777 }
4778 switch(*eptr++)
4779 {
4780 VSPACE_BYTE_CASES:
4781 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4782 VSPACE_MULTIBYTE_CASES:
4783 #endif
4784 RRETURN(MATCH_NOMATCH);
4785 default: break;
4786 }
4787 }
4788 break;
4789
4790 case OP_VSPACE:
4791 for (i = 1; i <= min; i++)
4792 {
4793 if (eptr >= md->end_subject)
4794 {
4795 SCHECK_PARTIAL();
4796 RRETURN(MATCH_NOMATCH);
4797 }
4798 switch(*eptr++)
4799 {
4800 default: RRETURN(MATCH_NOMATCH);
4801 VSPACE_BYTE_CASES:
4802 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4803 VSPACE_MULTIBYTE_CASES:
4804 #endif
4805 break;
4806 }
4807 }
4808 break;
4809
4810 case OP_NOT_DIGIT:
4811 for (i = 1; i <= min; i++)
4812 {
4813 if (eptr >= md->end_subject)
4814 {
4815 SCHECK_PARTIAL();
4816 RRETURN(MATCH_NOMATCH);
4817 }
4818 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4819 RRETURN(MATCH_NOMATCH);
4820 eptr++;
4821 }
4822 break;
4823
4824 case OP_DIGIT:
4825 for (i = 1; i <= min; i++)
4826 {
4827 if (eptr >= md->end_subject)
4828 {
4829 SCHECK_PARTIAL();
4830 RRETURN(MATCH_NOMATCH);
4831 }
4832 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4833 RRETURN(MATCH_NOMATCH);
4834 eptr++;
4835 }
4836 break;
4837
4838 case OP_NOT_WHITESPACE:
4839 for (i = 1; i <= min; i++)
4840 {
4841 if (eptr >= md->end_subject)
4842 {
4843 SCHECK_PARTIAL();
4844 RRETURN(MATCH_NOMATCH);
4845 }
4846 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4847 RRETURN(MATCH_NOMATCH);
4848 eptr++;
4849 }
4850 break;
4851
4852 case OP_WHITESPACE:
4853 for (i = 1; i <= min; i++)
4854 {
4855 if (eptr >= md->end_subject)
4856 {
4857 SCHECK_PARTIAL();
4858 RRETURN(MATCH_NOMATCH);
4859 }
4860 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4861 RRETURN(MATCH_NOMATCH);
4862 eptr++;
4863 }
4864 break;
4865
4866 case OP_NOT_WORDCHAR:
4867 for (i = 1; i <= min; i++)
4868 {
4869 if (eptr >= md->end_subject)
4870 {
4871 SCHECK_PARTIAL();
4872 RRETURN(MATCH_NOMATCH);
4873 }
4874 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4875 RRETURN(MATCH_NOMATCH);
4876 eptr++;
4877 }
4878 break;
4879
4880 case OP_WORDCHAR:
4881 for (i = 1; i <= min; i++)
4882 {
4883 if (eptr >= md->end_subject)
4884 {
4885 SCHECK_PARTIAL();
4886 RRETURN(MATCH_NOMATCH);
4887 }
4888 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4889 RRETURN(MATCH_NOMATCH);
4890 eptr++;
4891 }
4892 break;
4893
4894 default:
4895 RRETURN(PCRE_ERROR_INTERNAL);
4896 }
4897 }
4898
4899 /* If min = max, continue at the same level without recursing */
4900
4901 if (min == max) continue;
4902
4903 /* If minimizing, we have to test the rest of the pattern before each
4904 subsequent match. Again, separate the UTF-8 case for speed, and also
4905 separate the UCP cases. */
4906
4907 if (minimize)
4908 {
4909 #ifdef SUPPORT_UCP
4910 if (prop_type >= 0)
4911 {
4912 switch(prop_type)
4913 {
4914 case PT_ANY:
4915 for (fi = min;; fi++)
4916 {
4917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919 if (fi >= max) RRETURN(MATCH_NOMATCH);
4920 if (eptr >= md->end_subject)
4921 {
4922 SCHECK_PARTIAL();
4923 RRETURN(MATCH_NOMATCH);
4924 }
4925 GETCHARINCTEST(c, eptr);
4926 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4927 }
4928 /* Control never gets here */
4929
4930 case PT_LAMP:
4931 for (fi = min;; fi++)
4932 {
4933 int chartype;
4934 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4936 if (fi >= max) RRETURN(MATCH_NOMATCH);
4937 if (eptr >= md->end_subject)
4938 {
4939 SCHECK_PARTIAL();
4940 RRETURN(MATCH_NOMATCH);
4941 }
4942 GETCHARINCTEST(c, eptr);
4943 chartype = UCD_CHARTYPE(c);
4944 if ((chartype == ucp_Lu ||
4945 chartype == ucp_Ll ||
4946 chartype == ucp_Lt) == prop_fail_result)
4947 RRETURN(MATCH_NOMATCH);
4948 }
4949 /* Control never gets here */
4950
4951 case PT_GC:
4952 for (fi = min;; fi++)
4953 {
4954 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4955 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4956 if (fi >= max) RRETURN(MATCH_NOMATCH);
4957 if (eptr >= md->end_subject)
4958 {
4959 SCHECK_PARTIAL();
4960 RRETURN(MATCH_NOMATCH);
4961 }
4962 GETCHARINCTEST(c, eptr);
4963 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4964 RRETURN(MATCH_NOMATCH);
4965 }
4966 /* Control never gets here */
4967
4968 case PT_PC:
4969 for (fi = min;; fi++)
4970 {
4971 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4973 if (fi >= max) RRETURN(MATCH_NOMATCH);
4974 if (eptr >= md->end_subject)
4975 {
4976 SCHECK_PARTIAL();
4977 RRETURN(MATCH_NOMATCH);
4978 }
4979 GETCHARINCTEST(c, eptr);
4980 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4981 RRETURN(MATCH_NOMATCH);
4982 }
4983 /* Control never gets here */
4984
4985 case PT_SC:
4986 for (fi = min;; fi++)
4987 {
4988 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4990 if (fi >= max) RRETURN(MATCH_NOMATCH);
4991 if (eptr >= md->end_subject)
4992 {
4993 SCHECK_PARTIAL();
4994 RRETURN(MATCH_NOMATCH);
4995 }
4996 GETCHARINCTEST(c, eptr);
4997 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4998 RRETURN(MATCH_NOMATCH);
4999 }
5000 /* Control never gets here */
5001
5002 case PT_ALNUM:
5003 for (fi = min;; fi++)
5004 {
5005 int category;
5006 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
5007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5008 if (fi >= max) RRETURN(MATCH_NOMATCH);
5009 if (eptr >= md->end_subject)
5010 {
5011 SCHECK_PARTIAL();
5012 RRETURN(MATCH_NOMATCH);
5013 }
5014 GETCHARINCTEST(c, eptr);
5015 category = UCD_CATEGORY(c);
5016 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5017 RRETURN(MATCH_NOMATCH);
5018 }
5019 /* Control never gets here */
5020
5021 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5022 which means that Perl space and POSIX space are now identical. PCRE
5023 was changed at release 8.34. */
5024
5025 case PT_SPACE: /* Perl space */
5026 case PT_PXSPACE: /* POSIX space */
5027 for (fi = min;; fi++)
5028 {
5029 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5030 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5031 if (fi >= max) RRETURN(MATCH_NOMATCH);
5032 if (eptr >= md->end_subject)
5033 {
5034 SCHECK_PARTIAL();
5035 RRETURN(MATCH_NOMATCH);
5036 }
5037 GETCHARINCTEST(c, eptr);
5038 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5039 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5040 == prop_fail_result)
5041 RRETURN(MATCH_NOMATCH);
5042 }
5043 /* Control never gets here */
5044
5045 case PT_WORD:
5046 for (fi = min;; fi++)
5047 {
5048 int category;
5049 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5051 if (fi >= max) RRETURN(MATCH_NOMATCH);
5052 if (eptr >= md->end_subject)
5053 {
5054 SCHECK_PARTIAL();
5055 RRETURN(MATCH_NOMATCH);
5056 }
5057 GETCHARINCTEST(c, eptr);
5058 category = UCD_CATEGORY(c);
5059 if ((category == ucp_L ||
5060 category == ucp_N ||
5061 c == CHAR_UNDERSCORE)
5062 == prop_fail_result)
5063 RRETURN(MATCH_NOMATCH);
5064 }
5065 /* Control never gets here */
5066
5067 case PT_CLIST:
5068 for (fi = min;; fi++)
5069 {
5070 const pcre_uint32 *cp;
5071 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5072 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5073 if (fi >= max) RRETURN(MATCH_NOMATCH);
5074 if (eptr >= md->end_subject)
5075 {
5076 SCHECK_PARTIAL();
5077 RRETURN(MATCH_NOMATCH);
5078 }
5079 GETCHARINCTEST(c, eptr);
5080 cp = PRIV(ucd_caseless_sets) + prop_value;
5081 for (;;)
5082 {
5083 if (c < *cp)
5084 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5085 if (c == *cp++)
5086 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5087 }
5088 }
5089 /* Control never gets here */
5090
5091 case PT_UCNC:
5092 for (fi = min;; fi++)
5093 {
5094 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5096 if (fi >= max) RRETURN(MATCH_NOMATCH);
5097 if (eptr >= md->end_subject)
5098 {
5099 SCHECK_PARTIAL();
5100 RRETURN(MATCH_NOMATCH);
5101 }
5102 GETCHARINCTEST(c, eptr);
5103 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5104 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5105 c >= 0xe000) == prop_fail_result)
5106 RRETURN(MATCH_NOMATCH);
5107 }
5108 /* Control never gets here */
5109
5110 /* This should never occur */
5111 default:
5112 RRETURN(PCRE_ERROR_INTERNAL);
5113 }
5114 }
5115
5116 /* Match extended Unicode sequences. We will get here only if the
5117 support is in the binary; otherwise a compile-time error occurs. */
5118
5119 else if (ctype == OP_EXTUNI)
5120 {
5121 for (fi = min;; fi++)
5122 {
5123 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5124 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5125 if (fi >= max) RRETURN(MATCH_NOMATCH);
5126 if (eptr >= md->end_subject)
5127 {
5128 SCHECK_PARTIAL();
5129 RRETURN(MATCH_NOMATCH);
5130 }
5131 else
5132 {
5133 int lgb, rgb;
5134 GETCHARINCTEST(c, eptr);
5135 lgb = UCD_GRAPHBREAK(c);
5136 while (eptr < md->end_subject)
5137 {
5138 int len = 1;
5139 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5140 rgb = UCD_GRAPHBREAK(c);
5141 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5142 lgb = rgb;
5143 eptr += len;
5144 }
5145 }
5146 CHECK_PARTIAL();
5147 }
5148 }
5149 else
5150 #endif /* SUPPORT_UCP */
5151
5152 #ifdef SUPPORT_UTF
5153 if (utf)
5154 {
5155 for (fi = min;; fi++)
5156 {
5157 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5158 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5159 if (fi >= max) RRETURN(MATCH_NOMATCH);
5160 if (eptr >= md->end_subject)
5161 {
5162 SCHECK_PARTIAL();
5163 RRETURN(MATCH_NOMATCH);
5164 }
5165 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5166 RRETURN(MATCH_NOMATCH);
5167 GETCHARINC(c, eptr);
5168 switch(ctype)
5169 {
5170 case OP_ANY: /* This is the non-NL case */
5171 if (md->partial != 0 && /* Take care with CRLF partial */
5172 eptr >= md->end_subject &&
5173 NLBLOCK->nltype == NLTYPE_FIXED &&
5174 NLBLOCK->nllen == 2 &&
5175 c == NLBLOCK->nl[0])
5176 {
5177 md->hitend = TRUE;
5178 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5179 }
5180 break;
5181
5182 case OP_ALLANY:
5183 case OP_ANYBYTE:
5184 break;
5185
5186 case OP_ANYNL:
5187 switch(c)
5188 {
5189 default: RRETURN(MATCH_NOMATCH);
5190 case CHAR_CR:
5191 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5192 break;
5193
5194 case CHAR_LF:
5195 break;
5196
5197 case CHAR_VT:
5198 case CHAR_FF:
5199 case CHAR_NEL:
5200 #ifndef EBCDIC
5201 case 0x2028:
5202 case 0x2029:
5203 #endif /* Not EBCDIC */
5204 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5205 break;
5206 }
5207 break;
5208
5209 case OP_NOT_HSPACE:
5210 switch(c)
5211 {
5212 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5213 default: break;
5214 }
5215 break;
5216
5217 case OP_HSPACE:
5218 switch(c)
5219 {
5220 HSPACE_CASES: break;
5221 default: RRETURN(MATCH_NOMATCH);
5222 }
5223 break;
5224
5225 case OP_NOT_VSPACE:
5226 switch(c)
5227 {
5228 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5229 default: break;
5230 }
5231 break;
5232
5233 case OP_VSPACE:
5234 switch(c)
5235 {
5236 VSPACE_CASES: break;
5237 default: RRETURN(MATCH_NOMATCH);
5238 }
5239 break;
5240
5241 case OP_NOT_DIGIT:
5242 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5243 RRETURN(MATCH_NOMATCH);
5244 break;
5245
5246 case OP_DIGIT:
5247 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5248 RRETURN(MATCH_NOMATCH);
5249 break;
5250
5251 case OP_NOT_WHITESPACE:
5252 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5253 RRETURN(MATCH_NOMATCH);
5254 break;
5255
5256 case OP_WHITESPACE:
5257 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5258 RRETURN(MATCH_NOMATCH);
5259 break;
5260
5261 case OP_NOT_WORDCHAR:
5262 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5263 RRETURN(MATCH_NOMATCH);
5264 break;
5265
5266 case OP_WORDCHAR:
5267 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5268 RRETURN(MATCH_NOMATCH);
5269 break;
5270
5271 default:
5272 RRETURN(PCRE_ERROR_INTERNAL);
5273 }
5274 }
5275 }
5276 else
5277 #endif
5278 /* Not UTF mode */
5279 {
5280 for (fi = min;; fi++)
5281 {
5282 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5283 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5284 if (fi >= max) RRETURN(MATCH_NOMATCH);
5285 if (eptr >= md->end_subject)
5286 {
5287 SCHECK_PARTIAL();
5288 RRETURN(MATCH_NOMATCH);
5289 }
5290 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5291 RRETURN(MATCH_NOMATCH);
5292 c = *eptr++;
5293 switch(ctype)
5294 {
5295 case OP_ANY: /* This is the non-NL case */
5296 if (md->partial != 0 && /* Take care with CRLF partial */
5297 eptr >= md->end_subject &&
5298 NLBLOCK->nltype == NLTYPE_FIXED &&
5299 NLBLOCK->nllen == 2 &&
5300 c == NLBLOCK->nl[0])
5301 {
5302 md->hitend = TRUE;
5303 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5304 }
5305 break;
5306
5307 case OP_ALLANY:
5308 case OP_ANYBYTE:
5309 break;
5310
5311 case OP_ANYNL:
5312 switch(c)
5313 {
5314 default: RRETURN(MATCH_NOMATCH);
5315 case CHAR_CR:
5316 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5317 break;
5318
5319 case CHAR_LF:
5320 break;
5321
5322 case CHAR_VT:
5323 case CHAR_FF:
5324 case CHAR_NEL:
5325 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5326 case 0x2028:
5327 case 0x2029:
5328 #endif
5329 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5330 break;
5331 }
5332 break;
5333
5334 case OP_NOT_HSPACE:
5335 switch(c)
5336 {
5337 default: break;
5338 HSPACE_BYTE_CASES:
5339 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5340 HSPACE_MULTIBYTE_CASES:
5341 #endif
5342 RRETURN(MATCH_NOMATCH);
5343 }
5344 break;
5345
5346 case OP_HSPACE:
5347 switch(c)
5348 {
5349 default: RRETURN(MATCH_NOMATCH);
5350 HSPACE_BYTE_CASES:
5351 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5352 HSPACE_MULTIBYTE_CASES:
5353 #endif
5354 break;
5355 }
5356 break;
5357
5358 case OP_NOT_VSPACE:
5359 switch(c)
5360 {
5361 default: break;
5362 VSPACE_BYTE_CASES:
5363 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5364 VSPACE_MULTIBYTE_CASES:
5365 #endif
5366 RRETURN(MATCH_NOMATCH);
5367 }
5368 break;
5369
5370 case OP_VSPACE:
5371 switch(c)
5372 {
5373 default: RRETURN(MATCH_NOMATCH);
5374 VSPACE_BYTE_CASES:
5375 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5376 VSPACE_MULTIBYTE_CASES:
5377 #endif
5378 break;
5379 }
5380 break;
5381
5382 case OP_NOT_DIGIT:
5383 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5384 break;
5385
5386 case OP_DIGIT:
5387 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5388 break;
5389
5390 case OP_NOT_WHITESPACE:
5391 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5392 break;
5393
5394 case OP_WHITESPACE:
5395 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5396 break;
5397
5398 case OP_NOT_WORDCHAR:
5399 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5400 break;
5401
5402 case OP_WORDCHAR:
5403 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5404 break;
5405
5406 default:
5407 RRETURN(PCRE_ERROR_INTERNAL);
5408 }
5409 }
5410 }
5411 /* Control never gets here */
5412 }
5413
5414 /* If maximizing, it is worth using inline code for speed, doing the type
5415 test once at the start (i.e. keep it out of the loop). Again, keep the
5416 UTF-8 and UCP stuff separate. */
5417
5418 else
5419 {
5420 pp = eptr; /* Remember where we started */
5421
5422 #ifdef SUPPORT_UCP
5423 if (prop_type >= 0)
5424 {
5425 switch(prop_type)
5426 {
5427 case PT_ANY:
5428 for (i = min; i < max; i++)
5429 {
5430 int len = 1;
5431 if (eptr >= md->end_subject)
5432 {
5433 SCHECK_PARTIAL();
5434 break;
5435 }
5436 GETCHARLENTEST(c, eptr, len);
5437 if (prop_fail_result) break;
5438 eptr+= len;
5439 }
5440 break;
5441
5442 case PT_LAMP:
5443 for (i = min; i < max; i++)
5444 {
5445 int chartype;
5446 int len = 1;
5447 if (eptr >= md->end_subject)
5448 {
5449 SCHECK_PARTIAL();
5450 break;
5451 }
5452 GETCHARLENTEST(c, eptr, len);
5453 chartype = UCD_CHARTYPE(c);
5454 if ((chartype == ucp_Lu ||
5455 chartype == ucp_Ll ||
5456 chartype == ucp_Lt) == prop_fail_result)
5457 break;
5458 eptr+= len;
5459 }
5460 break;
5461
5462 case PT_GC:
5463 for (i = min; i < max; i++)
5464 {
5465 int len = 1;
5466 if (eptr >= md->end_subject)
5467 {
5468 SCHECK_PARTIAL();
5469 break;
5470 }
5471 GETCHARLENTEST(c, eptr, len);
5472 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5473 eptr+= len;
5474 }
5475 break;
5476
5477 case PT_PC:
5478 for (i = min; i < max; i++)
5479 {
5480 int len = 1;
5481 if (eptr >= md->end_subject)
5482 {
5483 SCHECK_PARTIAL();
5484 break;
5485 }
5486 GETCHARLENTEST(c, eptr, len);
5487 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5488 eptr+= len;
5489 }
5490 break;
5491
5492 case PT_SC:
5493 for (i = min; i < max; i++)
5494 {
5495 int len = 1;
5496 if (eptr >= md->end_subject)
5497 {
5498 SCHECK_PARTIAL();
5499 break;
5500 }
5501 GETCHARLENTEST(c, eptr, len);
5502 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5503 eptr+= len;
5504 }
5505 break;
5506
5507 case PT_ALNUM:
5508 for (i = min; i < max; i++)
5509 {
5510 int category;
5511 int len = 1;
5512 if (eptr >= md->end_subject)
5513 {
5514 SCHECK_PARTIAL();
5515 break;
5516 }
5517 GETCHARLENTEST(c, eptr, len);
5518 category = UCD_CATEGORY(c);
5519 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5520 break;
5521 eptr+= len;
5522 }
5523 break;
5524
5525 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5526 which means that Perl space and POSIX space are now identical. PCRE
5527 was changed at release 8.34. */
5528
5529 case PT_SPACE: /* Perl space */
5530 case PT_PXSPACE: /* POSIX space */
5531 for (i = min; i < max; i++)
5532 {
5533 int len = 1;
5534 if (eptr >= md->end_subject)
5535 {
5536 SCHECK_PARTIAL();
5537 break;
5538 }
5539 GETCHARLENTEST(c, eptr, len);
5540 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5541 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5542 == prop_fail_result)
5543 break;
5544 eptr+= len;
5545 }
5546 break;
5547
5548 case PT_WORD:
5549 for (i = min; i < max; i++)
5550 {
5551 int category;
5552 int len = 1;
5553 if (eptr >= md->end_subject)
5554 {
5555 SCHECK_PARTIAL();
5556 break;
5557 }
5558 GETCHARLENTEST(c, eptr, len);
5559 category = UCD_CATEGORY(c);
5560 if ((category == ucp_L || category == ucp_N ||
5561 c == CHAR_UNDERSCORE) == prop_fail_result)
5562 break;
5563 eptr+= len;
5564 }
5565 break;
5566
5567 case PT_CLIST:
5568 for (i = min; i < max; i++)
5569 {
5570 const pcre_uint32 *cp;
5571 int len = 1;
5572 if (eptr >= md->end_subject)
5573 {
5574 SCHECK_PARTIAL();
5575 break;
5576 }
5577 GETCHARLENTEST(c, eptr, len);
5578 cp = PRIV(ucd_caseless_sets) + prop_value;
5579 for (;;)
5580 {
5581 if (c < *cp)
5582 { if (prop_fail_result) break; else goto GOT_MAX; }
5583 if (c == *cp++)
5584 { if (prop_fail_result) goto GOT_MAX; else break; }
5585 }
5586 eptr += len;
5587 }
5588 GOT_MAX:
5589 break;
5590
5591 case PT_UCNC:
5592 for (i = min; i < max; i++)
5593 {
5594 int len = 1;
5595 if (eptr >= md->end_subject)
5596 {
5597 SCHECK_PARTIAL();
5598 break;
5599 }
5600 GETCHARLENTEST(c, eptr, len);
5601 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5602 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5603 c >= 0xe000) == prop_fail_result)
5604 break;
5605 eptr += len;
5606 }
5607 break;
5608
5609 default:
5610 RRETURN(PCRE_ERROR_INTERNAL);
5611 }
5612
5613 /* eptr is now past the end of the maximum run */
5614
5615 if (possessive) continue; /* No backtracking */
5616 for(;;)
5617 {
5618 if (eptr == pp) goto TAIL_RECURSE;
5619 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5620 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5621 eptr--;
5622 if (utf) BACKCHAR(eptr);
5623 }
5624 }
5625
5626 /* Match extended Unicode grapheme clusters. We will get here only if the
5627 support is in the binary; otherwise a compile-time error occurs. */
5628
5629 else if (ctype == OP_EXTUNI)
5630 {
5631 for (i = min; i < max; i++)
5632 {
5633 if (eptr >= md->end_subject)
5634 {
5635 SCHECK_PARTIAL();
5636 break;
5637 }
5638 else
5639 {
5640 int lgb, rgb;
5641 GETCHARINCTEST(c, eptr);
5642 lgb = UCD_GRAPHBREAK(c);
5643 while (eptr < md->end_subject)
5644 {
5645 int len = 1;
5646 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5647 rgb = UCD_GRAPHBREAK(c);
5648 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5649 lgb = rgb;
5650 eptr += len;
5651 }
5652 }
5653 CHECK_PARTIAL();
5654 }
5655
5656 /* eptr is now past the end of the maximum run */
5657
5658 if (possessive) continue; /* No backtracking */
5659
5660 for(;;)
5661 {
5662 int lgb, rgb;
5663 PCRE_PUCHAR fptr;
5664
5665 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5666 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5668
5669 /* Backtracking over an extended grapheme cluster involves inspecting
5670 the previous two characters (if present) to see if a break is
5671 permitted between them. */
5672
5673 eptr--;
5674 if (!utf) c = *eptr; else
5675 {
5676 BACKCHAR(eptr);
5677 GETCHAR(c, eptr);
5678 }
5679 rgb = UCD_GRAPHBREAK(c);
5680
5681 for (;;)
5682 {
5683 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5684 fptr = eptr - 1;
5685 if (!utf) c = *fptr; else
5686 {
5687 BACKCHAR(fptr);
5688 GETCHAR(c, fptr);
5689 }
5690 lgb = UCD_GRAPHBREAK(c);
5691 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5692 eptr = fptr;
5693 rgb = lgb;
5694 }
5695 }
5696 }
5697
5698 else
5699 #endif /* SUPPORT_UCP */
5700
5701 #ifdef SUPPORT_UTF
5702 if (utf)
5703 {
5704 switch(ctype)
5705 {
5706 case OP_ANY:
5707 if (max < INT_MAX)
5708 {
5709 for (i = min; i < max; i++)
5710 {
5711 if (eptr >= md->end_subject)
5712 {
5713 SCHECK_PARTIAL();
5714 break;
5715 }
5716 if (IS_NEWLINE(eptr)) break;
5717 if (md->partial != 0 && /* Take care with CRLF partial */
5718 eptr + 1 >= md->end_subject &&
5719 NLBLOCK->nltype == NLTYPE_FIXED &&
5720 NLBLOCK->nllen == 2 &&
5721 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5722 {
5723 md->hitend = TRUE;
5724 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5725 }
5726 eptr++;
5727 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5728 }
5729 }
5730
5731 /* Handle unlimited UTF-8 repeat */
5732
5733 else
5734 {
5735 for (i = min; i < max; i++)
5736 {
5737 if (eptr >= md->end_subject)
5738 {
5739 SCHECK_PARTIAL();
5740 break;
5741 }
5742 if (IS_NEWLINE(eptr)) break;
5743 if (md->partial != 0 && /* Take care with CRLF partial */
5744 eptr + 1 >= md->end_subject &&
5745 NLBLOCK->nltype == NLTYPE_FIXED &&
5746 NLBLOCK->nllen == 2 &&
5747 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5748 {
5749 md->hitend = TRUE;
5750 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5751 }
5752 eptr++;
5753 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5754 }
5755 }
5756 break;
5757
5758 case OP_ALLANY:
5759 if (max < INT_MAX)
5760 {
5761 for (i = min; i < max; i++)
5762 {
5763 if (eptr >= md->end_subject)
5764 {
5765 SCHECK_PARTIAL();
5766 break;
5767 }
5768 eptr++;
5769 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5770 }
5771 }
5772 else
5773 {
5774 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5775 SCHECK_PARTIAL();
5776 }
5777 break;
5778
5779 /* The byte case is the same as non-UTF8 */
5780
5781 case OP_ANYBYTE:
5782 c = max - min;
5783 if (c > (unsigned int)(md->end_subject - eptr))
5784 {
5785 eptr = md->end_subject;
5786 SCHECK_PARTIAL();
5787 }
5788 else eptr += c;
5789 break;
5790
5791 case OP_ANYNL:
5792 for (i = min; i < max; i++)
5793 {
5794 int len = 1;
5795 if (eptr >= md->end_subject)
5796 {
5797 SCHECK_PARTIAL();
5798 break;
5799 }
5800 GETCHARLEN(c, eptr, len);
5801 if (c == CHAR_CR)
5802 {
5803 if (++eptr >= md->end_subject) break;
5804 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5805 }
5806 else
5807 {
5808 if (c != CHAR_LF &&
5809 (md->bsr_anycrlf ||
5810 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5811 #ifndef EBCDIC
5812 && c != 0x2028 && c != 0x2029
5813 #endif /* Not EBCDIC */
5814 )))
5815 break;
5816 eptr += len;
5817 }
5818 }
5819 break;
5820
5821 case OP_NOT_HSPACE:
5822 case OP_HSPACE:
5823 for (i = min; i < max; i++)
5824 {
5825 BOOL gotspace;
5826 int len = 1;
5827 if (eptr >= md->end_subject)
5828 {
5829 SCHECK_PARTIAL();
5830 break;
5831 }
5832 GETCHARLEN(c, eptr, len);
5833 switch(c)
5834 {
5835 HSPACE_CASES: gotspace = TRUE; break;
5836 default: gotspace = FALSE; break;
5837 }
5838 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5839 eptr += len;
5840 }
5841 break;
5842
5843 case OP_NOT_VSPACE:
5844 case OP_VSPACE:
5845 for (i = min; i < max; i++)
5846 {
5847 BOOL gotspace;
5848 int len = 1;
5849 if (eptr >= md->end_subject)
5850 {
5851 SCHECK_PARTIAL();
5852 break;
5853 }
5854 GETCHARLEN(c, eptr, len);
5855 switch(c)
5856 {
5857 VSPACE_CASES: gotspace = TRUE; break;
5858 default: gotspace = FALSE; break;
5859 }
5860 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5861 eptr += len;
5862 }
5863 break;
5864
5865 case OP_NOT_DIGIT:
5866 for (i = min; i < max; i++)
5867 {
5868 int len = 1;
5869 if (eptr >= md->end_subject)
5870 {
5871 SCHECK_PARTIAL();
5872 break;
5873 }
5874 GETCHARLEN(c, eptr, len);
5875 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5876 eptr+= len;
5877 }
5878 break;
5879
5880 case OP_DIGIT:
5881 for (i = min; i < max; i++)
5882 {
5883 int len = 1;
5884 if (eptr >= md->end_subject)
5885 {
5886 SCHECK_PARTIAL();
5887 break;
5888 }
5889 GETCHARLEN(c, eptr, len);
5890 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5891 eptr+= len;
5892 }
5893 break;
5894
5895 case OP_NOT_WHITESPACE:
5896 for (i = min; i < max; i++)
5897 {
5898 int len = 1;
5899 if (eptr >= md->end_subject)
5900 {
5901 SCHECK_PARTIAL();
5902 break;
5903 }
5904 GETCHARLEN(c, eptr, len);
5905 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5906 eptr+= len;
5907 }
5908 break;
5909
5910 case OP_WHITESPACE:
5911 for (i = min; i < max; i++)
5912 {
5913 int len = 1;
5914 if (eptr >= md->end_subject)
5915 {
5916 SCHECK_PARTIAL();
5917 break;
5918 }
5919 GETCHARLEN(c, eptr, len);
5920 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5921 eptr+= len;
5922 }
5923 break;
5924
5925 case OP_NOT_WORDCHAR:
5926 for (i = min; i < max; i++)
5927 {
5928 int len = 1;
5929 if (eptr >= md->end_subject)
5930 {
5931 SCHECK_PARTIAL();
5932 break;
5933 }
5934 GETCHARLEN(c, eptr, len);
5935 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5936 eptr+= len;
5937 }
5938 break;
5939
5940 case OP_WORDCHAR:
5941 for (i = min; i < max; i++)
5942 {
5943 int len = 1;
5944 if (eptr >= md->end_subject)
5945 {
5946 SCHECK_PARTIAL();
5947 break;
5948 }
5949 GETCHARLEN(c, eptr, len);
5950 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5951 eptr+= len;
5952 }
5953 break;
5954
5955 default:
5956 RRETURN(PCRE_ERROR_INTERNAL);
5957 }
5958
5959 if (possessive) continue; /* No backtracking */
5960 for(;;)
5961 {
5962 if (eptr == pp) goto TAIL_RECURSE;
5963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5965 eptr--;
5966 BACKCHAR(eptr);
5967 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5968 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5969 }
5970 }
5971 else
5972 #endif /* SUPPORT_UTF */
5973 /* Not UTF mode */
5974 {
5975 switch(ctype)
5976 {
5977 case OP_ANY:
5978 for (i = min; i < max; i++)
5979 {
5980 if (eptr >= md->end_subject)
5981 {
5982 SCHECK_PARTIAL();
5983 break;
5984 }
5985 if (IS_NEWLINE(eptr)) break;
5986 if (md->partial != 0 && /* Take care with CRLF partial */
5987 eptr + 1 >= md->end_subject &&
5988 NLBLOCK->nltype == NLTYPE_FIXED &&
5989 NLBLOCK->nllen == 2 &&
5990 *eptr == NLBLOCK->nl[0])
5991 {
5992 md->hitend = TRUE;
5993 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5994 }
5995 eptr++;
5996 }
5997 break;
5998
5999 case OP_ALLANY:
6000 case OP_ANYBYTE:
6001 c = max - min;
6002 if (c > (unsigned int)(md->end_subject - eptr))
6003 {
6004 eptr = md->end_subject;
6005 SCHECK_PARTIAL();
6006 }
6007 else eptr += c;
6008 break;
6009
6010 case OP_ANYNL:
6011 for (i = min; i < max; i++)
6012 {
6013 if (eptr >= md->end_subject)
6014 {
6015 SCHECK_PARTIAL();
6016 break;
6017 }
6018 c = *eptr;
6019 if (c == CHAR_CR)
6020 {
6021 if (++eptr >= md->end_subject) break;
6022 if (*eptr == CHAR_LF) eptr++;
6023 }
6024 else
6025 {
6026 if (c != CHAR_LF && (md->bsr_anycrlf ||
6027 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6028 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6029 && c != 0x2028 && c != 0x2029
6030 #endif
6031 ))) break;
6032 eptr++;
6033 }
6034 }
6035 break;
6036
6037 case OP_NOT_HSPACE:
6038 for (i = min; i < max; i++)
6039 {
6040 if (eptr >= md->end_subject)
6041 {
6042 SCHECK_PARTIAL();
6043 break;
6044 }
6045 switch(*eptr)
6046 {
6047 default: eptr++; break;
6048 HSPACE_BYTE_CASES:
6049 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6050 HSPACE_MULTIBYTE_CASES:
6051 #endif
6052 goto ENDLOOP00;
6053 }
6054 }
6055 ENDLOOP00:
6056 break;
6057
6058 case OP_HSPACE:
6059 for (i = min; i < max; i++)
6060 {
6061 if (eptr >= md->end_subject)
6062 {
6063 SCHECK_PARTIAL();
6064 break;
6065 }
6066 switch(*eptr)
6067 {
6068 default: goto ENDLOOP01;
6069 HSPACE_BYTE_CASES:
6070 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6071 HSPACE_MULTIBYTE_CASES:
6072 #endif
6073 eptr++; break;
6074 }
6075 }
6076 ENDLOOP01:
6077 break;
6078
6079 case OP_NOT_VSPACE:
6080 for (i = min; i < max; i++)
6081 {
6082 if (eptr >= md->end_subject)
6083 {
6084 SCHECK_PARTIAL();
6085 break;
6086 }
6087 switch(*eptr)
6088 {
6089 default: eptr++; break;
6090 VSPACE_BYTE_CASES:
6091 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6092 VSPACE_MULTIBYTE_CASES:
6093 #endif
6094 goto ENDLOOP02;
6095 }
6096 }
6097 ENDLOOP02:
6098 break;
6099
6100 case OP_VSPACE:
6101 for (i = min; i < max; i++)
6102 {
6103 if (eptr >= md->end_subject)
6104 {
6105 SCHECK_PARTIAL();
6106 break;
6107 }
6108 switch(*eptr)
6109 {
6110 default: goto ENDLOOP03;
6111 VSPACE_BYTE_CASES:
6112 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6113 VSPACE_MULTIBYTE_CASES:
6114 #endif
6115 eptr++; break;
6116 }
6117 }
6118 ENDLOOP03:
6119 break;
6120
6121 case OP_NOT_DIGIT:
6122 for (i = min; i < max; i++)
6123 {
6124 if (eptr >= md->end_subject)
6125 {
6126 SCHECK_PARTIAL();
6127 break;
6128 }
6129 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6130 eptr++;
6131 }
6132 break;
6133
6134 case OP_DIGIT:
6135 for (i = min; i < max; i++)
6136 {
6137 if (eptr >= md->end_subject)
6138 {
6139 SCHECK_PARTIAL();
6140 break;
6141 }
6142 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6143 eptr++;
6144 }
6145 break;
6146
6147 case OP_NOT_WHITESPACE:
6148 for (i = min; i < max; i++)
6149 {
6150 if (eptr >= md->end_subject)
6151 {
6152 SCHECK_PARTIAL();
6153 break;
6154 }
6155 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6156 eptr++;
6157 }
6158 break;
6159
6160 case OP_WHITESPACE:
6161 for (i = min; i < max; i++)
6162 {
6163 if (eptr >= md->end_subject)
6164 {
6165 SCHECK_PARTIAL();
6166 break;
6167 }
6168 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6169 eptr++;
6170 }
6171 break;
6172
6173 case OP_NOT_WORDCHAR:
6174 for (i = min; i < max; i++)
6175 {
6176 if (eptr >= md->end_subject)
6177 {
6178 SCHECK_PARTIAL();
6179 break;
6180 }
6181 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6182 eptr++;
6183 }
6184 break;
6185
6186 case OP_WORDCHAR:
6187 for (i = min; i < max; i++)
6188 {
6189 if (eptr >= md->end_subject)
6190 {
6191 SCHECK_PARTIAL();
6192 break;
6193 }
6194 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6195 eptr++;
6196 }
6197 break;
6198
6199 default:
6200 RRETURN(PCRE_ERROR_INTERNAL);
6201 }
6202
6203 if (possessive) continue; /* No backtracking */
6204 for (;;)
6205 {
6206 if (eptr == pp) goto TAIL_RECURSE;
6207 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6209 eptr--;
6210 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6211 eptr[-1] == CHAR_CR) eptr--;
6212 }
6213 }
6214
6215 /* Control never gets here */
6216 }
6217
6218 /* There's been some horrible disaster. Arrival here can only mean there is
6219 something seriously wrong in the code above or the OP_xxx definitions. */
6220
6221 default:
6222 DPRINTF(("Unknown opcode %d\n", *ecode));
6223 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6224 }
6225
6226 /* Do not stick any code in here without much thought; it is assumed
6227 that "continue" in the code above comes out to here to repeat the main
6228 loop. */
6229
6230 } /* End of main loop */
6231 /* Control never reaches here */
6232
6233
6234 /* When compiling to use the heap rather than the stack for recursive calls to
6235 match(), the RRETURN() macro jumps here. The number that is saved in
6236 frame->Xwhere indicates which label we actually want to return to. */
6237
6238 #ifdef NO_RECURSE
6239 #define LBL(val) case val: goto L_RM##val;
6240 HEAP_RETURN:
6241 switch (frame->Xwhere)
6242 {
6243 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6244 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6245 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6246 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6247 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6248 LBL(65) LBL(66)
6249 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6250 LBL(21)
6251 #endif
6252 #ifdef SUPPORT_UTF
6253 LBL(16) LBL(18) LBL(20)
6254 LBL(22) LBL(23) LBL(28) LBL(30)
6255 LBL(32) LBL(34) LBL(42) LBL(46)
6256 #ifdef SUPPORT_UCP
6257 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6258 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6259 #endif /* SUPPORT_UCP */
6260 #endif /* SUPPORT_UTF */
6261 default:
6262 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6263 return PCRE_ERROR_INTERNAL;
6264 }
6265 #undef LBL
6266 #endif /* NO_RECURSE */
6267 }
6268
6269
6270 /***************************************************************************
6271 ****************************************************************************
6272 RECURSION IN THE match() FUNCTION
6273
6274 Undefine all the macros that were defined above to handle this. */
6275
6276 #ifdef NO_RECURSE /* The names below are macros only in the NO_RECURSE build */
6277 #undef eptr
6278 #undef ecode
6279 #undef mstart
6280 #undef offset_top
6281 #undef eptrb
6282 #undef flags
6283
6284 #undef callpat
6285 #undef charptr
6286 #undef data
6287 #undef next
6288 #undef pp
6289 #undef prev
6290 #undef saved_eptr
6291
6292 #undef new_recursive
6293
6294 #undef cur_is_word
6295 #undef condition
6296 #undef prev_is_word
6297
6298 #undef ctype
6299 #undef length
6300 #undef max
6301 #undef min
6302 #undef number
6303 #undef offset
6304 #undef op
6305 #undef save_capture_last
6306 #undef save_offset1
6307 #undef save_offset2
6308 #undef save_offset3
6309 #undef stacksave
6310
6311 #undef newptrb
6312
6313 #endif /* NO_RECURSE */
6314
6315 /* These two are defined as macros in both cases */
6316
6317 #undef fc
6318 #undef fi
6319
6320 /***************************************************************************
6321 ***************************************************************************/
6322
6323
6324 #ifdef NO_RECURSE
6325 /*************************************************
6326 * Release allocated heap frames *
6327 *************************************************/
6328
6329 /* This function releases all the allocated frames. The base frame is on the
6330 machine stack, and so must not be freed.
6331
6332 Argument: the address of the base frame
6333 Returns: nothing
6334 */
6335
6336 static void
6337 release_match_heapframes (heapframe *frame_base)
6338 {
6339 heapframe *doomed, *successor;
6340 /* Walk the chain hanging off the base frame; the base itself is skipped. */
6341 for (doomed = frame_base->Xnextframe; doomed != NULL; doomed = successor)
6342 {
6343 successor = doomed->Xnextframe; /* Fetch the link before freeing the frame */
6344 (PUBL(stack_free))(doomed);
6345 }
6346 }
6347 #endif
6348
6349
6350 /*************************************************
6351 * Execute a Regular Expression *
6352 *************************************************/
6353
6354 /* This function applies a compiled re to a subject string and picks out
6355 portions of the string if it matches. Two elements in the vector are set for
6356 each substring: the offsets to the start and end of the substring.
6357
6358 Arguments:
6359 argument_re points to the compiled expression
6360 extra_data points to extra data or is NULL
6361 subject points to the subject string
6362 length length of subject string (may contain binary zeros)
6363 start_offset where to start in the subject string
6364 options option bits
6365 offsets points to a vector of ints to be filled in with offsets
6366 offsetcount the number of elements in the vector
6367
6368 Returns: > 0 => success; value is the number of elements filled in
6369 = 0 => success, but offsets is not big enough
6370 -1 => failed to match
6371 < -1 => some kind of unexpected problem
6372 */
6373
6374 #if defined COMPILE_PCRE8
6375 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6376 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6377 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6378 int offsetcount)
6379 #elif defined COMPILE_PCRE16
6380 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6381 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6382 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6383 int offsetcount)
6384 #elif defined COMPILE_PCRE32
6385 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6386 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6387 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6388 int offsetcount)
6389 #endif
6390 {
6391 int rc, ocount, arg_offset_max;
6392 int newline;
6393 BOOL using_temporary_offsets = FALSE;
6394 BOOL anchored;
6395 BOOL startline;
6396 BOOL firstline;
6397 BOOL utf;
6398 BOOL has_first_char = FALSE;
6399 BOOL has_req_char = FALSE;
6400 pcre_uchar first_char = 0;
6401 pcre_uchar first_char2 = 0;
6402 pcre_uchar req_char = 0;
6403 pcre_uchar req_char2 = 0;
6404 match_data match_block;
6405 match_data *md = &match_block;
6406 const pcre_uint8 *tables;
6407 const pcre_uint8 *start_bits = NULL;
6408 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6409 PCRE_PUCHAR end_subject;
6410 PCRE_PUCHAR start_partial = NULL;
6411 PCRE_PUCHAR match_partial = NULL;
6412 PCRE_PUCHAR req_char_ptr = start_match - 1;
6413
6414 const pcre_study_data *study;
6415 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6416
6417 #ifdef NO_RECURSE
6418 heapframe frame_zero;
6419 frame_zero.Xprevframe = NULL; /* Marks the top level */
6420 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6421 md->match_frames_base = &frame_zero;
6422 #endif
6423
6424 /* Check for the special magic call that measures the size of the stack used
6425 per recursive call of match(). Without the funny casting for sizeof, a Windows
6426 compiler gave this error: "unary minus operator applied to unsigned type,
6427 result still unsigned". Hopefully the cast fixes that. */
6428
6429 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6430 start_offset == -999)
6431 #ifdef NO_RECURSE
6432 return -((int)sizeof(heapframe));
6433 #else
6434 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6435 #endif
6436
6437 /* Plausibility checks */
6438
6439 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6440 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6441 return PCRE_ERROR_NULL;
6442 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6443 if (length < 0) return PCRE_ERROR_BADLENGTH;
6444 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6445
6446 /* Check that the first field in the block is the magic number. If it is not,
6447 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6448 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6449 means that the pattern is likely compiled with different endianness. */
6450
6451 if (re->magic_number != MAGIC_NUMBER)
6452 return re->magic_number == REVERSED_MAGIC_NUMBER?
6453 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6454 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6455
6456 /* These two settings are used in the code for checking a UTF-8 string that
6457 follows immediately afterwards. Other values in the md block are used only
6458 during "normal" pcre_exec() processing, not when the JIT support is in use,
6459 so they are set up later. */
6460
6461 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6462 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6463 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6464 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6465
6466 /* Check a UTF-8 string if required. Pass back the character offset and error
6467 code for an invalid string if a results vector is available. */
6468
6469 #ifdef SUPPORT_UTF
6470 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6471 {
6472 int erroroffset;
6473 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6474 if (errorcode != 0)
6475 {
6476 if (offsetcount >= 2)
6477 {
6478 offsets[0] = erroroffset;
6479 offsets[1] = errorcode;
6480 }
6481 #if defined COMPILE_PCRE8
6482 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6483 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6484 #elif defined COMPILE_PCRE16
6485 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6486 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6487 #elif defined COMPILE_PCRE32
6488 return PCRE_ERROR_BADUTF32;
6489 #endif
6490 }
6491 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6492 /* Check that a start_offset points to the start of a UTF character. */
6493 if (start_offset > 0 && start_offset < length &&
6494 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6495 return PCRE_ERROR_BADUTF8_OFFSET;
6496 #endif
6497 }
6498 #endif
6499
6500 /* If the pattern was successfully studied with JIT support, run the JIT
6501 executable instead of the rest of this function. Most options must be set at
6502 compile time for the JIT code to be usable. Fall back to the normal code path if
6503 an unsupported flag is set. */
6504
6505 #ifdef SUPPORT_JIT
6506 if (extra_data != NULL
6507 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6508 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6509 && extra_data->executable_jit != NULL
6510 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6511 {
6512 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6513 start_offset, options, offsets, offsetcount);
6514
6515 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial
6516 matching mode is not compiled. In this case we fall back to the interpreter. */
6517
6518 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6519 }
6520 #endif
6521
6522 /* Carry on with non-JIT matching. This information is for finding all the
6523 numbers associated with a given name, for condition testing. */
6524
6525 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6526 md->name_count = re->name_count;
6527 md->name_entry_size = re->name_entry_size;
6528
6529 /* Fish out the optional data from the extra_data structure, first setting
6530 the default values. */
6531
6532 study = NULL;
6533 md->match_limit = MATCH_LIMIT;
6534 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6535 md->callout_data = NULL;
6536
6537 /* The table pointer is always in native byte order. */
6538
6539 tables = re->tables;
6540
6541 /* The two limit values override the defaults, whatever their value. */
6542
6543 if (extra_data != NULL)
6544 {
6545 register unsigned int flags = extra_data->flags;
6546 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6547 study = (const pcre_study_data *)extra_data->study_data;
6548 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6549 md->match_limit = extra_data->match_limit;
6550 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6551 md->match_limit_recursion = extra_data->match_limit_recursion;
6552 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6553 md->callout_data = extra_data->callout_data;
6554 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6555 }
6556
6557 /* Limits in the regex override only if they are smaller. */
6558
6559 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6560 md->match_limit = re->limit_match;
6561
6562 if ((re->flags & PCRE_RLSET) != 0 &&
6563 re->limit_recursion < md->match_limit_recursion)
6564 md->match_limit_recursion = re->limit_recursion;
6565
6566 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6567 is a feature that makes it possible to save compiled regex and re-use them
6568 in other programs later. */
6569
6570 if (tables == NULL) tables = PRIV(default_tables);
6571
6572 /* Set up other data */
6573
6574 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6575 startline = (re->flags & PCRE_STARTLINE) != 0;
6576 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6577
6578 /* The code starts after the real_pcre block and the capture name table. */
6579
6580 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6581 re->name_count * re->name_entry_size;
6582
6583 md->start_subject = (PCRE_PUCHAR)subject;
6584 md->start_offset = start_offset;
6585 md->end_subject = md->start_subject + length;
6586 end_subject = md->end_subject;
6587
6588 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6589 md->use_ucp = (re->options & PCRE_UCP) != 0;
6590 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6591 md->ignore_skip_arg = 0;
6592
6593 /* Some options are unpacked into BOOL variables in the hope that testing
6594 them will be faster than individual option bits. */
6595
6596 md->notbol = (options & PCRE_NOTBOL) != 0;
6597 md->noteol = (options & PCRE_NOTEOL) != 0;
6598 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6599 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6600
6601 md->hitend = FALSE;
6602 md->mark = md->nomatch_mark = NULL; /* In case never set */
6603
6604 md->recursive = NULL; /* No recursion at top level */
6605 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6606
6607 md->lcc = tables + lcc_offset;
6608 md->fcc = tables + fcc_offset;
6609 md->ctypes = tables + ctypes_offset;
6610
6611 /* Handle different \R options. */
6612
6613 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6614 {
6615 case 0:
6616 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6617 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6618 else
6619 #ifdef BSR_ANYCRLF
6620 md->bsr_anycrlf = TRUE;
6621 #else
6622 md->bsr_anycrlf = FALSE;
6623 #endif
6624 break;
6625
6626 case PCRE_BSR_ANYCRLF:
6627 md->bsr_anycrlf = TRUE;
6628 break;
6629
6630 case PCRE_BSR_UNICODE:
6631 md->bsr_anycrlf = FALSE;
6632 break;
6633
6634 default: return PCRE_ERROR_BADNEWLINE;
6635 }
6636
6637 /* Handle different types of newline. The three bits give eight cases. If
6638 nothing is set at run time, whatever was used at compile time applies. */
6639
6640 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6641 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6642 {
6643 case 0: newline = NEWLINE; break; /* Compile-time default */
6644 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6645 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6646 case PCRE_NEWLINE_CR+
6647 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6648 case PCRE_NEWLINE_ANY: newline = -1; break;
6649 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6650 default: return PCRE_ERROR_BADNEWLINE;
6651 }
6652
6653 if (newline == -2)
6654 {
6655 md->nltype = NLTYPE_ANYCRLF;
6656 }
6657 else if (newline < 0)
6658 {
6659 md->nltype = NLTYPE_ANY;
6660 }
6661 else
6662 {
6663 md->nltype = NLTYPE_FIXED;
6664 if (newline > 255)
6665 {
6666 md->nllen = 2;
6667 md->nl[0] = (newline >> 8) & 255;
6668 md->nl[1] = newline & 255;
6669 }
6670 else
6671 {
6672 md->nllen = 1;
6673 md->nl[0] = newline;
6674 }
6675 }
6676
6677 /* Partial matching was originally supported only for a restricted set of
6678 regexes; from release 8.00 there are no restrictions, but the bits are still
6679 defined (though never set). So there's no harm in leaving this code. */
6680
6681 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6682 return PCRE_ERROR_BADPARTIAL;
6683
6684 /* If the expression has got more back references than the offsets supplied can
6685 hold, we get a temporary chunk of working store to use during the matching.
6686 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6687 of 3. */
6688
6689 ocount = offsetcount - (offsetcount % 3);
6690 arg_offset_max = (2*ocount)/3;
6691
6692 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6693 {
6694 ocount = re->top_backref * 3 + 3;
6695 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6696 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6697 using_temporary_offsets = TRUE;
6698 DPRINTF(("Got memory to hold back references\n"));
6699 }
6700 else md->offset_vector = offsets;
6701 md->offset_end = ocount;
6702 md->offset_max = (2*ocount)/3;
6703 md->capture_last = 0;
6704
6705 /* Reset the working variables associated with each extraction. These should
6706 never be used unless previously set, but they get saved and restored, and so we
6707 initialize them to avoid reading uninitialized locations. Also, unset the
6708 offsets for the matched string. This is really just for tidiness with callouts,
6709 in case they inspect these fields. */
6710
6711 if (md->offset_vector != NULL)
6712 {
6713 register int *iptr = md->offset_vector + ocount;
6714 register int *iend = iptr - re->top_bracket;
6715 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6716 while (--iptr >= iend) *iptr = -1;
6717 md->offset_vector[0] = md->offset_vector[1] = -1;
6718 }
6719
6720 /* Set up the first character to match, if available. The first_char value is
6721 never set for an anchored regular expression, but the anchoring may be forced
6722 at run time, so we have to test for anchoring. The first char may be unset for
6723 an unanchored pattern, of course. If there's no first char and the pattern was
6724 studied, there may be a bitmap of possible first characters. */
6725
6726 if (!anchored)
6727 {
6728 if ((re->flags & PCRE_FIRSTSET) != 0)
6729 {
6730 has_first_char = TRUE;
6731 first_char = first_char2 = (pcre_uchar)(re->first_char);
6732 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6733 {
6734 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6735 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6736 if (utf && first_char > 127)
6737 first_char2 = UCD_OTHERCASE(first_char);
6738 #endif
6739 }
6740 }
6741 else
6742 if (!startline && study != NULL &&
6743 (study->flags & PCRE_STUDY_MAPPED) != 0)
6744 start_bits = study->start_bits;
6745 }
6746
6747 /* For anchored or unanchored matches, there may be a "last known required
6748 character" set. */
6749
6750 if ((re->flags & PCRE_REQCHSET) != 0)
6751 {
6752 has_req_char = TRUE;
6753 req_char = req_char2 = (pcre_uchar)(re->req_char); </