/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1251 - (show annotations)
Wed Feb 20 17:42:03 2013 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 213787 byte(s)
Pass back the bumpalong value for partial matches.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_COMMIT (-998) /* Returned by OP_COMMIT when what follows it fails or backtracks via PRUNE, SKIP or THEN */
91 #define MATCH_KETRPOS (-997)
92 #define MATCH_ONCE (-996)
93 #define MATCH_PRUNE (-995) /* Returned by OP_PRUNE and OP_PRUNE_ARG when what follows fails or returns MATCH_THEN */
94 #define MATCH_SKIP (-994) /* Returned by OP_SKIP, and by OP_MARK when a SKIP argument names that mark */
95 #define MATCH_SKIP_ARG (-993) /* Returned by OP_SKIP_ARG; the skip name is passed back in md->start_match_ptr */
96 #define MATCH_THEN (-992) /* Returned by OP_THEN and OP_THEN_ARG; the opcode address is passed back in md->start_match_ptr */
97
98 /* Maximum number of ints of offset to save on the stack for recursive calls.
99 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
100 because the offset vector is always a multiple of 3 long. */
101
102 #define REC_STACK_SAVE_MAX 30
103
104 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
105
106 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* Minimum repeat count for each common repeat */
107 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* Maximum repeat count; 0 means unlimited. NOTE(review): the index order looks like *, *?, +, +?, ?, ?? - confirm against the code that indexes these tables */
108
109 #ifdef PCRE_DEBUG
110 /*************************************************
111 * Debugging function to print chars *
112 *************************************************/
113
114 /* Print a sequence of chars in printable format, stopping at the end of the
115 subject if the requested.
116
117 Arguments:
118 p points to characters
119 length number to print
120 is_subject TRUE if printing from within md->start_subject
121 md pointer to matching data block, if is_subject is TRUE
122
123 Returns: nothing
124 */
125
126 static void
127 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
128 {
129 pcre_uint32 c;
130 BOOL utf = md->utf;
131 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
132 while (length-- > 0)
133 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
134 }
135 #endif
136
137
138
139 /*************************************************
140 * Match a back-reference *
141 *************************************************/
142
143 /* Normally, if a back reference hasn't been set, the length that is passed is
144 negative, so the match always fails. However, in JavaScript compatibility mode,
145 the length passed is zero. Note that in caseless UTF-8 mode, the number of
146 subject bytes matched may be different to the number of reference bytes.
147
148 Arguments:
149 offset index into the offset vector
150 eptr pointer into the subject
151 length length of reference to be matched (number of bytes)
152 md points to match data block
153 caseless TRUE if caseless
154
155 Returns: >= 0 the number of subject bytes matched
156 -1 no match
157 -2 partial match; always given if at end subject
158 */
159
160 static int
161 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
162 BOOL caseless)
163 {
164 PCRE_PUCHAR eptr_start = eptr;
165 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
166 #ifdef SUPPORT_UTF
167 BOOL utf = md->utf;
168 #endif
169
170 #ifdef PCRE_DEBUG
171 if (eptr >= md->end_subject)
172 printf("matching subject <null>");
173 else
174 {
175 printf("matching subject ");
176 pchars(eptr, length, TRUE, md);
177 }
178 printf(" against backref ");
179 pchars(p, length, FALSE, md);
180 printf("\n");
181 #endif
182
183 /* Always fail if reference not set (and not JavaScript compatible - in that
184 case the length is passed as zero). */
185
186 if (length < 0) return -1;
187
188 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
189 properly if Unicode properties are supported. Otherwise, we can check only
190 ASCII characters. */
191
192 if (caseless)
193 {
194 #ifdef SUPPORT_UTF
195 #ifdef SUPPORT_UCP
196 if (utf)
197 {
198 /* Match characters up to the end of the reference. NOTE: the number of
199 data units matched may differ, because in UTF-8 there are some characters
200 whose upper and lower case versions code have different numbers of bytes.
201 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
202 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
203 sequence of two of the latter. It is important, therefore, to check the
204 length along the reference, not along the subject (earlier code did this
205 wrong). */
206
207 PCRE_PUCHAR endptr = p + length;
208 while (p < endptr)
209 {
210 pcre_uint32 c, d;
211 const ucd_record *ur;
212 if (eptr >= md->end_subject) return -2; /* Partial match */
213 GETCHARINC(c, eptr);
214 GETCHARINC(d, p);
215 ur = GET_UCD(d);
216 if (c != d && c != d + ur->other_case)
217 {
218 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
219 for (;;)
220 {
221 if (c < *pp) return -1;
222 if (c == *pp++) break;
223 }
224 }
225 }
226 }
227 else
228 #endif
229 #endif
230
231 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
232 is no UCP support. */
233 {
234 while (length-- > 0)
235 {
236 pcre_uchar cc, cp;
237 if (eptr >= md->end_subject) return -2; /* Partial match */
238 cc = RAWUCHARTEST(eptr);
239 cp = RAWUCHARTEST(p);
240 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
241 p++;
242 eptr++;
243 }
244 }
245 }
246
247 /* In the caseful case, we can just compare the bytes, whether or not we
248 are in UTF-8 mode. */
249
250 else
251 {
252 while (length-- > 0)
253 {
254 if (eptr >= md->end_subject) return -2; /* Partial match */
255 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
256 }
257 }
258
259 return (int)(eptr - eptr_start);
260 }
261
262
263
264 /***************************************************************************
265 ****************************************************************************
266 RECURSION IN THE match() FUNCTION
267
268 The match() function is highly recursive, though not every recursive call
269 increases the recursive depth. Nevertheless, some regular expressions can cause
270 it to recurse to a great depth. I was writing for Unix, so I just let it call
271 itself recursively. This uses the stack for saving everything that has to be
272 saved for a recursive call. On Unix, the stack can be large, and this works
273 fine.
274
275 It turns out that on some non-Unix-like systems there are problems with
276 programs that use a lot of stack. (This despite the fact that every last chip
277 has oodles of memory these days, and techniques for extending the stack have
278 been known for decades.) So....
279
280 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
281 calls by keeping local variables that need to be preserved in blocks of memory
282 obtained from malloc() instead of on the stack. Macros are used to
283 achieve this so that the actual code doesn't look very different to what it
284 always used to.
285
286 The original heap-recursive code used longjmp(). However, it seems that this
287 can be very slow on some operating systems. Following a suggestion from Stan
288 Switzer, the use of longjmp() has been abolished, at the cost of having to
289 provide a unique number for each call to RMATCH. There is no way of generating
290 a sequence of numbers at compile time in C. I have given them names, to make
291 them stand out more clearly.
292
293 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
294 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
295 tests. Furthermore, not using longjmp() means that local dynamic variables
296 don't have indeterminate values; this has meant that the frame size can be
297 reduced because the result can be "passed back" by straight setting of the
298 variable instead of being passed in the frame.
299 ****************************************************************************
300 ***************************************************************************/
301
302 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
303 below must be updated in sync. */
304
305 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
306 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
307 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
308 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
309 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
310 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
311 RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; /* Each value is passed as the rw argument of one RMATCH call; the heap version of RMATCH stores it in Xwhere and pastes it into an L_RMnn label */
312
313 /* These versions of the macros use the stack, as normal. There are debugging
314 versions and production versions. Note that the "rw" argument of RMATCH isn't
315 actually used in this definition. */
316
317 #ifndef NO_RECURSE
318 #define REGISTER register
319
320 #ifdef PCRE_DEBUG
321 #define RMATCH(ra,rb,rc,rd,re,rw) \
322 { \
323 printf("match() called in line %d\n", __LINE__); \
324 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); /* Result is left in the caller's rrc; depth goes up by one */ \
325 printf("to line %d\n", __LINE__); \
326 }
327 #define RRETURN(ra) \
328 { \
329 printf("match() returned %d from line %d\n", ra, __LINE__); \
330 return ra; \
331 }
332 #else
333 #define RMATCH(ra,rb,rc,rd,re,rw) \
334 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1) /* Real recursion: mstart and rdepth are taken from the calling scope */
335 #define RRETURN(ra) return ra /* Plain return; the terminating semicolon comes from the point of use */
336 #endif
337
338 #else
339
340
341 /* These versions of the macros manage a private stack on the heap. Note that
342 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
343 argument of match(), which never changes. */
344
345 #define REGISTER
346
347 #define RMATCH(ra,rb,rc,rd,re,rw)\
348 {\
349 heapframe *newframe = frame->Xnextframe; /* Reuse a frame that is already chained on, if any */ \
350 if (newframe == NULL)\
351 {\
352 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
353 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
354 newframe->Xnextframe = NULL;\
355 frame->Xnextframe = newframe; /* Chain the new frame so a later RMATCH from this frame can reuse it */ \
356 }\
357 frame->Xwhere = rw; /* Record which call site this frame is suspended at */ \
358 newframe->Xeptr = ra;\
359 newframe->Xecode = rb;\
360 newframe->Xmstart = mstart;\
361 newframe->Xoffset_top = rc;\
362 newframe->Xeptrb = re;\
363 newframe->Xrdepth = frame->Xrdepth + 1;\
364 newframe->Xprevframe = frame;\
365 frame = newframe; /* The new frame becomes current before the jump */ \
366 DPRINTF(("restarting from line %d\n", __LINE__));\
367 goto HEAP_RECURSE;\
368 L_##rw: /* Label built from the rw argument, one per call site */ \
369 DPRINTF(("jumped back to line %d\n", __LINE__));\
370 }
371
372 #define RRETURN(ra)\
373 {\
374 heapframe *oldframe = frame;\
375 frame = oldframe->Xprevframe; /* Step back to the frame that made the call */ \
376 if (frame != NULL)\
377 {\
378 rrc = ra; /* Hand the result back through rrc, as the recursive version does */ \
379 goto HEAP_RETURN;\
380 }\
381 return ra; /* No previous frame: this is a real return from match() */ \
382 }
383
384
385 /* Structure for remembering the local variables in a private frame */
386
387 typedef struct heapframe {
388 struct heapframe *Xprevframe; /* Frame that RRETURN resumes; when NULL, RRETURN does a real return */
389 struct heapframe *Xnextframe; /* Frame that RMATCH reuses for its next call, or NULL if none is chained yet */
390
391 /* Function arguments that may change */
392
393 PCRE_PUCHAR Xeptr;
394 const pcre_uchar *Xecode;
395 PCRE_PUCHAR Xmstart;
396 int Xoffset_top;
397 eptrblock *Xeptrb;
398 unsigned int Xrdepth; /* Set by RMATCH to the calling frame's depth plus one */
399
400 /* Function local variables */
401
402 PCRE_PUCHAR Xcallpat;
403 #ifdef SUPPORT_UTF
404 PCRE_PUCHAR Xcharptr;
405 #endif
406 PCRE_PUCHAR Xdata; /* Also used under the name save_mark */
407 PCRE_PUCHAR Xnext;
408 PCRE_PUCHAR Xpp;
409 PCRE_PUCHAR Xprev;
410 PCRE_PUCHAR Xsaved_eptr;
411
412 recursion_info Xnew_recursive;
413
414 BOOL Xcur_is_word; /* Also used under the name allow_zero */
415 BOOL Xcondition; /* Also used under the names cbegroup and condassert */
416 BOOL Xprev_is_word; /* Also used under the name matched_once */
417
418 #ifdef SUPPORT_UCP
419 int Xprop_type;
420 unsigned int Xprop_value;
421 int Xprop_fail_result;
422 int Xoclength;
423 pcre_uchar Xocchars[6];
424 #endif
425
426 int Xcodelink; /* Also used under the name code_offset */
427 int Xctype;
428 unsigned int Xfc;
429 int Xfi;
430 int Xlength;
431 int Xmax;
432 int Xmin;
433 unsigned int Xnumber; /* Also used under the name foc */
434 int Xoffset;
435 unsigned int Xop;
436 pcre_int32 Xsave_capture_last;
437 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
438 int Xstacksave[REC_STACK_SAVE_MAX];
439
440 eptrblock Xnewptrb;
441
442 /* Where to jump back to */
443
444 int Xwhere; /* The RMnn value that RMATCH stored before starting the nested call */
445
446 } heapframe;
447
448 #endif
449
450
451 /***************************************************************************
452 ***************************************************************************/
453
454
455
456 /*************************************************
457 * Match from current position *
458 *************************************************/
459
460 /* This function is called recursively in many circumstances. Whenever it
461 returns a negative (error) response, the outer incarnation must also return the
462 same response. */
463
464 /* These macros pack up tests that are used for partial matching, and which
465 appear several times in the code. We set the "hit end" flag if the pointer is
466 at the end of the subject and also past the start of the subject (i.e.
467 something has been matched). For hard partial matching, we then return
468 immediately. The second one is used when we already know we are past the end of
469 the subject. */
470
471 #define CHECK_PARTIAL()\
472 if (md->partial != 0 && eptr >= md->end_subject && \
473 eptr > md->start_used_ptr) \
474 { \
475 md->hitend = TRUE; \
476 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: return immediately */ \
477 } /* NOTE(review): this expands to a bare if statement, so an else written directly after a use would attach to it */
478
479 #define SCHECK_PARTIAL()\
480 if (md->partial != 0 && eptr > md->start_used_ptr) /* No end-of-subject test: the caller has already established it */ \
481 { \
482 md->hitend = TRUE; \
483 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: return immediately */ \
484 } /* NOTE(review): bare if statement, same caveat as CHECK_PARTIAL */
485
486
487 /* Performance note: It might be tempting to extract commonly used fields from
488 the md structure (e.g. utf, end_subject) into individual variables to improve
489 performance. Tests using gcc on a SPARC disproved this; in the first case, it
490 made performance worse.
491
492 Arguments:
493 eptr pointer to current character in subject
494 ecode pointer to current position in compiled code
495 mstart pointer to the current match start position (can be modified
496 by encountering \K)
497 offset_top current top pointer
498 md pointer to "static" info for the match
499 eptrb pointer to chain of blocks containing eptr at start of
500 brackets - for testing for empty matches
501 rdepth the recursion depth
502
503 Returns: MATCH_MATCH if matched ) these values are >= 0
504 MATCH_NOMATCH if failed to match )
505 a negative MATCH_xxx value for PRUNE, SKIP, etc
506 a negative PCRE_ERROR_xxx value if aborted by an error condition
507 (e.g. stopped by repeated call or recursion limit)
508 */
509
510 static int
511 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
512 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
513 unsigned int rdepth)
514 {
515 /* These variables do not need to be preserved over recursion in this function,
516 so they can be ordinary variables in all cases. Mark some of them with
517 "register" because they are used a lot in loops. */
518
519 register int rrc; /* Returns from recursive calls */
520 register int i; /* Used for loops not involving calls to RMATCH() */
521 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
522 register BOOL utf; /* Local copy of UTF flag for speed */
523
524 BOOL minimize, possessive; /* Quantifier options */
525 BOOL caseless;
526 int condcode;
527
528 /* When recursion is not being used, all "local" variables that have to be
529 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
530 frame on the stack here; subsequent instantiations are obtained from the heap
531 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
532 the top-level on the stack rather than malloc-ing them all gives a performance
533 boost in many cases where there is not much "recursion". */
534
535 #ifdef NO_RECURSE
536 heapframe *frame = (heapframe *)md->match_frames_base;
537
538 /* Copy in the original argument variables */
539
540 frame->Xeptr = eptr;
541 frame->Xecode = ecode;
542 frame->Xmstart = mstart;
543 frame->Xoffset_top = offset_top;
544 frame->Xeptrb = eptrb;
545 frame->Xrdepth = rdepth;
546
547 /* This is where control jumps back to in order to effect "recursion" */
548
549 HEAP_RECURSE:
550
551 /* Macros make the argument variables come from the current frame */
552
553 #define eptr frame->Xeptr
554 #define ecode frame->Xecode
555 #define mstart frame->Xmstart
556 #define offset_top frame->Xoffset_top
557 #define eptrb frame->Xeptrb
558 #define rdepth frame->Xrdepth
559
560 /* Ditto for the local variables */
561
562 #ifdef SUPPORT_UTF
563 #define charptr frame->Xcharptr
564 #endif
565 #define callpat frame->Xcallpat
566 #define codelink frame->Xcodelink
567 #define data frame->Xdata
568 #define next frame->Xnext
569 #define pp frame->Xpp
570 #define prev frame->Xprev
571 #define saved_eptr frame->Xsaved_eptr
572
573 #define new_recursive frame->Xnew_recursive
574
575 #define cur_is_word frame->Xcur_is_word
576 #define condition frame->Xcondition
577 #define prev_is_word frame->Xprev_is_word
578
579 #ifdef SUPPORT_UCP
580 #define prop_type frame->Xprop_type
581 #define prop_value frame->Xprop_value
582 #define prop_fail_result frame->Xprop_fail_result
583 #define oclength frame->Xoclength
584 #define occhars frame->Xocchars
585 #endif
586
587 #define ctype frame->Xctype
588 #define fc frame->Xfc
589 #define fi frame->Xfi
590 #define length frame->Xlength
591 #define max frame->Xmax
592 #define min frame->Xmin
593 #define number frame->Xnumber
594 #define offset frame->Xoffset
595 #define op frame->Xop
596 #define save_capture_last frame->Xsave_capture_last
597 #define save_offset1 frame->Xsave_offset1
598 #define save_offset2 frame->Xsave_offset2
599 #define save_offset3 frame->Xsave_offset3
600 #define stacksave frame->Xstacksave
601
602 #define newptrb frame->Xnewptrb
603
604 /* When recursion is being used, local variables are allocated on the stack and
605 get preserved during recursion in the normal way. In this environment, fi and
606 i, and fc and c, can be the same variables. */
607
608 #else /* NO_RECURSE not defined */
609 #define fi i
610 #define fc c
611
612 /* Many of the following variables are used only in small blocks of the code.
613 My normal style of coding would have declared them within each of those blocks.
614 However, in order to accommodate the version of this code that uses an external
615 "stack" implemented on the heap, it is easier to declare them all here, so the
616 declarations can be cut out in a block. The only declarations within blocks
617 below are for variables that do not have to be preserved over a recursive call
618 to RMATCH(). */
619
620 #ifdef SUPPORT_UTF
621 const pcre_uchar *charptr;
622 #endif
623 const pcre_uchar *callpat;
624 const pcre_uchar *data;
625 const pcre_uchar *next;
626 PCRE_PUCHAR pp;
627 const pcre_uchar *prev;
628 PCRE_PUCHAR saved_eptr;
629
630 recursion_info new_recursive;
631
632 BOOL cur_is_word;
633 BOOL condition;
634 BOOL prev_is_word;
635
636 #ifdef SUPPORT_UCP
637 int prop_type;
638 unsigned int prop_value;
639 int prop_fail_result;
640 int oclength;
641 pcre_uchar occhars[6];
642 #endif
643
644 int codelink;
645 int ctype;
646 int length;
647 int max;
648 int min;
649 unsigned int number;
650 int offset;
651 unsigned int op;
652 pcre_int32 save_capture_last;
653 int save_offset1, save_offset2, save_offset3;
654 int stacksave[REC_STACK_SAVE_MAX];
655
656 eptrblock newptrb;
657
658 /* There is a special fudge for calling match() in a way that causes it to
659 measure the size of its basic stack frame when the stack is being used for
660 recursion. The second argument (ecode) being NULL triggers this behaviour. It
661 cannot normally ever be NULL. The return is the negated value of the frame
662 size. */
663
664 if (ecode == NULL)
665 {
666 if (rdepth == 0)
667 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
668 else
669 {
670 int len = (char *)&rdepth - (char *)eptr;
671 return (len > 0)? -len : len;
672 }
673 }
674 #endif /* NO_RECURSE */
675
676 /* To save space on the stack and in the heap frame, I have doubled up on some
677 of the local variables that are used only in localised parts of the code, but
678 still need to be preserved over recursive calls of match(). These macros define
679 the alternative names that are used. */
680
681 #define allow_zero cur_is_word
682 #define cbegroup condition
683 #define code_offset codelink
684 #define condassert condition
685 #define matched_once prev_is_word
686 #define foc number
687 #define save_mark data
688
689 /* These statements are here to stop the compiler complaining about uninitialized
690 variables. */
691
692 #ifdef SUPPORT_UCP
693 prop_value = 0;
694 prop_fail_result = 0;
695 #endif
696
697
698 /* This label is used for tail recursion, which is used in a few cases even
699 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
700 used. Thanks to Ian Taylor for noticing this possibility and sending the
701 original patch. */
702
703 TAIL_RECURSE:
704
705 /* OK, now we can get on with the real code of the function. Recursive calls
706 are specified by the macro RMATCH and RRETURN is used to return. When
707 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
708 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
709 defined). However, RMATCH isn't like a function call because it's quite a
710 complicated macro. It has to be used in one particular way. This shouldn't,
711 however, impact performance when true recursion is being used. */
712
713 #ifdef SUPPORT_UTF
714 utf = md->utf; /* Local copy of the flag */
715 #else
716 utf = FALSE;
717 #endif
718
719 /* First check that we haven't called match() too many times, or that we
720 haven't exceeded the recursive call limit. */
721
722 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
723 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
724
725 /* At the start of a group with an unlimited repeat that may match an empty
726 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
727 done this way to save having to use another function argument, which would take
728 up space on the stack. See also MATCH_CONDASSERT below.
729
730 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
731 such remembered pointers, to be checked when we hit the closing ket, in order
732 to break infinite loops that match no characters. When match() is called in
733 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
734 NOT be used with tail recursion, because the memory block that is used is on
735 the stack, so a new one may be required for each match(). */
736
737 if (md->match_function_type == MATCH_CBEGROUP)
738 {
739 newptrb.epb_saved_eptr = eptr;
740 newptrb.epb_prev = eptrb;
741 eptrb = &newptrb;
742 md->match_function_type = 0;
743 }
744
745 /* Now start processing the opcodes. */
746
747 for (;;)
748 {
749 minimize = possessive = FALSE;
750 op = *ecode;
751
752 switch(op)
753 {
754 case OP_MARK:
755 md->nomatch_mark = ecode + 2;
756 md->mark = NULL; /* In case previously set by assertion */
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
758 eptrb, RM55);
759 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
760 md->mark == NULL) md->mark = ecode + 2;
761
762 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
763 argument, and we must check whether that argument matches this MARK's
764 argument. It is passed back in md->start_match_ptr (an overloading of that
765 variable). If it does match, we reset that variable to the current subject
766 position and return MATCH_SKIP. Otherwise, pass back the return code
767 unaltered. */
768
769 else if (rrc == MATCH_SKIP_ARG &&
770 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
771 {
772 md->start_match_ptr = eptr;
773 RRETURN(MATCH_SKIP);
774 }
775 RRETURN(rrc);
776
777 case OP_FAIL:
778 RRETURN(MATCH_NOMATCH);
779
780 /* COMMIT overrides PRUNE, SKIP, and THEN */
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
786 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
787 rrc != MATCH_THEN)
788 RRETURN(rrc);
789 RRETURN(MATCH_COMMIT);
790
791 /* PRUNE overrides THEN */
792
793 case OP_PRUNE:
794 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
795 eptrb, RM51);
796 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
797 RRETURN(MATCH_PRUNE);
798
799 case OP_PRUNE_ARG:
800 md->nomatch_mark = ecode + 2;
801 md->mark = NULL; /* In case previously set by assertion */
802 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
803 eptrb, RM56);
804 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
805 md->mark == NULL) md->mark = ecode + 2;
806 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
807 RRETURN(MATCH_PRUNE);
808
809 /* SKIP overrides PRUNE and THEN */
810
811 case OP_SKIP:
812 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
813 eptrb, RM53);
814 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
815 RRETURN(rrc);
816 md->start_match_ptr = eptr; /* Pass back current position */
817 RRETURN(MATCH_SKIP);
818
819 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
820 nomatch_mark. There is a flag that disables this opcode when re-matching a
821 pattern that ended with a SKIP for which there was not a matching MARK. */
822
823 case OP_SKIP_ARG:
824 if (md->ignore_skip_arg)
825 {
826 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
827 break;
828 }
829 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
830 eptrb, RM57);
831 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
832 RRETURN(rrc);
833
834 /* Pass back the current skip name by overloading md->start_match_ptr and
835 returning the special MATCH_SKIP_ARG return code. This will either be
836 caught by a matching MARK, or get to the top, where it causes a rematch
837 with the md->ignore_skip_arg flag set. */
838
839 md->start_match_ptr = ecode + 2;
840 RRETURN(MATCH_SKIP_ARG);
841
842 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
843 the branch in which it occurs can be determined. Overload the start of
844 match pointer to do this. */
845
846 case OP_THEN:
847 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
848 eptrb, RM54);
849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
850 md->start_match_ptr = ecode;
851 RRETURN(MATCH_THEN);
852
853 case OP_THEN_ARG:
854 md->nomatch_mark = ecode + 2;
855 md->mark = NULL; /* In case previously set by assertion */
856 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
857 md, eptrb, RM58);
858 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
859 md->mark == NULL) md->mark = ecode + 2;
860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
861 md->start_match_ptr = ecode;
862 RRETURN(MATCH_THEN);
863
864 /* Handle an atomic group that does not contain any capturing parentheses.
865 This can be handled like an assertion. Prior to 8.13, all atomic groups
866 were handled this way. In 8.13, the code was changed as below for ONCE, so
867 that backups pass through the group and thereby reset captured values.
868 However, this uses a lot more stack, so in 8.20, atomic groups that do not
869 contain any captures generate OP_ONCE_NC, which can be handled in the old,
870 less stack intensive way.
871
872 Check the alternative branches in turn - the matching won't pass the KET
873 for this kind of subpattern. If any one branch matches, we carry on as at
874 the end of a normal bracket, leaving the subject pointer, but resetting
875 the start-of-match value in case it was changed by \K. */
876
877 case OP_ONCE_NC:
878 prev = ecode;
879 saved_eptr = eptr;
880 save_mark = md->mark;
881 do
882 {
883 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
884 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
885 {
886 mstart = md->start_match_ptr;
887 break;
888 }
889 if (rrc == MATCH_THEN)
890 {
891 next = ecode + GET(ecode,1);
892 if (md->start_match_ptr < next &&
893 (*ecode == OP_ALT || *next == OP_ALT))
894 rrc = MATCH_NOMATCH;
895 }
896
897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
898 ecode += GET(ecode,1);
899 md->mark = save_mark;
900 }
901 while (*ecode == OP_ALT);
902
903 /* If hit the end of the group (which could be repeated), fail */
904
905 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
906
907 /* Continue as from after the group, updating the offsets high water
908 mark, since extracts may have been taken. */
909
910 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
911
912 offset_top = md->end_offset_top;
913 eptr = md->end_match_ptr;
914
915 /* For a non-repeating ket, just continue at this level. This also
916 happens for a repeating ket if no characters were matched in the group.
917 This is the forcible breaking of infinite loops as implemented in Perl
918 5.005. */
919
920 if (*ecode == OP_KET || eptr == saved_eptr)
921 {
922 ecode += 1+LINK_SIZE;
923 break;
924 }
925
926 /* The repeating kets try the rest of the pattern or restart from the
927 preceding bracket, in the appropriate order. The second "call" of match()
928 uses tail recursion, to avoid using another stack frame. */
929
930 if (*ecode == OP_KETRMIN)
931 {
932 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
934 ecode = prev;
935 goto TAIL_RECURSE;
936 }
937 else /* OP_KETRMAX */
938 {
939 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
941 ecode += 1 + LINK_SIZE;
942 goto TAIL_RECURSE;
943 }
944 /* Control never gets here */
945
946 /* Handle a capturing bracket, other than those that are possessive with an
947 unlimited repeat. If there is space in the offset vector, save the current
948 subject position in the working slot at the top of the vector. We mustn't
949 change the current values of the data slot, because they may be set from a
950 previous iteration of this group, and be referred to by a reference inside
951 the group. A failure to match might occur after the group has succeeded,
952 if something later on doesn't match. For this reason, we need to restore
953 the working value and also the values of the final offsets, in case they
954 were set by a previous iteration of the same bracket.
955
956 If there isn't enough space in the offset vector, treat this as if it were
957 a non-capturing bracket. Don't worry about setting the flag for the error
958 case here; that is handled in the code for KET. */
959
960 case OP_CBRA:
961 case OP_SCBRA:
962 number = GET2(ecode, 1+LINK_SIZE);
963 offset = number << 1;
964
965 #ifdef PCRE_DEBUG
966 printf("start bracket %d\n", number);
967 printf("subject=");
968 pchars(eptr, 16, TRUE, md);
969 printf("\n");
970 #endif
971
972 if (offset < md->offset_max)
973 {
974 save_offset1 = md->offset_vector[offset];
975 save_offset2 = md->offset_vector[offset+1];
976 save_offset3 = md->offset_vector[md->offset_end - number];
977 save_capture_last = md->capture_last;
978 save_mark = md->mark;
979
980 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
981 md->offset_vector[md->offset_end - number] =
982 (int)(eptr - md->start_subject);
983
984 for (;;)
985 {
986 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
987 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
988 eptrb, RM1);
989 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
990
991 /* If we backed up to a THEN, check whether it is within the current
992 branch by comparing the address of the THEN that is passed back with
993 the end of the branch. If it is within the current branch, and the
994 branch is one of two or more alternatives (it either starts or ends
995 with OP_ALT), we have reached the limit of THEN's action, so convert
996 the return code to NOMATCH, which will cause normal backtracking to
997 happen from now on. Otherwise, THEN is passed back to an outer
998 alternative. This implements Perl's treatment of parenthesized groups,
999 where a group not containing | does not affect the current alternative,
1000 that is, (X) is NOT the same as (X|(*F)). */
1001
1002 if (rrc == MATCH_THEN)
1003 {
1004 next = ecode + GET(ecode,1);
1005 if (md->start_match_ptr < next &&
1006 (*ecode == OP_ALT || *next == OP_ALT))
1007 rrc = MATCH_NOMATCH;
1008 }
1009
1010 /* Anything other than NOMATCH is passed back. */
1011
1012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1013 md->capture_last = save_capture_last;
1014 ecode += GET(ecode, 1);
1015 md->mark = save_mark;
1016 if (*ecode != OP_ALT) break;
1017 }
1018
1019 DPRINTF(("bracket %d failed\n", number));
1020 md->offset_vector[offset] = save_offset1;
1021 md->offset_vector[offset+1] = save_offset2;
1022 md->offset_vector[md->offset_end - number] = save_offset3;
1023
1024 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1025
1026 RRETURN(rrc);
1027 }
1028
1029 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1030 as a non-capturing bracket. */
1031
1032 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034
1035 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1036
1037 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1038 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1039
1040 /* Non-capturing or atomic group, except for possessive with unlimited
1041 repeat and ONCE group with no captures. Loop for all the alternatives.
1042
1043 When we get to the final alternative within the brackets, we used to return
1044 the result of a recursive call to match() whatever happened so it was
1045 possible to reduce stack usage by turning this into a tail recursion,
1046 except in the case of a possibly empty group. However, now that there is
1047 the possibility of (*THEN) occurring in the final alternative, this
1048 optimization is no longer always possible.
1049
1050 We can optimize if we know there are no (*THEN)s in the pattern; at present
1051 this is the best that can be done.
1052
1053 MATCH_ONCE is returned when the end of an atomic group is successfully
1054 reached, but subsequent matching fails. It passes back up the tree (causing
1055 captured values to be reset) until the original atomic group level is
1056 reached. This is tested by comparing md->once_target with the start of the
1057 group. At this point, the return is converted into MATCH_NOMATCH so that
1058 previous backup points can be taken. */
1059
1060 case OP_ONCE:
1061 case OP_BRA:
1062 case OP_SBRA:
1063 DPRINTF(("start non-capturing bracket\n"));
1064
1065 for (;;)
1066 {
1067 if (op >= OP_SBRA || op == OP_ONCE)
1068 md->match_function_type = MATCH_CBEGROUP;
1069
1070 /* If this is not a possibly empty group, and there are no (*THEN)s in
1071 the pattern, and this is the final alternative, optimize as described
1072 above. */
1073
1074 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1075 {
1076 ecode += PRIV(OP_lengths)[*ecode];
1077 goto TAIL_RECURSE;
1078 }
1079
1080 /* In all other cases, we have to make another call to match(). */
1081
1082 save_mark = md->mark;
1083 save_capture_last = md->capture_last;
1084 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1085 RM2);
1086
1087 /* See comment in the code for capturing groups above about handling
1088 THEN. */
1089
1090 if (rrc == MATCH_THEN)
1091 {
1092 next = ecode + GET(ecode,1);
1093 if (md->start_match_ptr < next &&
1094 (*ecode == OP_ALT || *next == OP_ALT))
1095 rrc = MATCH_NOMATCH;
1096 }
1097
1098 if (rrc != MATCH_NOMATCH)
1099 {
1100 if (rrc == MATCH_ONCE)
1101 {
1102 const pcre_uchar *scode = ecode;
1103 if (*scode != OP_ONCE) /* If not at start, find it */
1104 {
1105 while (*scode == OP_ALT) scode += GET(scode, 1);
1106 scode -= GET(scode, 1);
1107 }
1108 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1109 }
1110 RRETURN(rrc);
1111 }
1112 ecode += GET(ecode, 1);
1113 md->mark = save_mark;
1114 if (*ecode != OP_ALT) break;
1115 md->capture_last = save_capture_last;
1116 }
1117
1118 RRETURN(MATCH_NOMATCH);
1119
1120 /* Handle possessive capturing brackets with an unlimited repeat. We come
1121 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1122 handled similarly to the normal case above. However, the matching is
1123 different. The end of these brackets will always be OP_KETRPOS, which
1124 returns MATCH_KETRPOS without going further in the pattern. By this means
1125 we can handle the group by iteration rather than recursion, thereby
1126 reducing the amount of stack needed. */
1127
1128 case OP_CBRAPOS:
1129 case OP_SCBRAPOS:
1130 allow_zero = FALSE;
1131
1132 POSSESSIVE_CAPTURE:
1133 number = GET2(ecode, 1+LINK_SIZE);
1134 offset = number << 1;
1135
1136 #ifdef PCRE_DEBUG
1137 printf("start possessive bracket %d\n", number);
1138 printf("subject=");
1139 pchars(eptr, 16, TRUE, md);
1140 printf("\n");
1141 #endif
1142
1143 if (offset < md->offset_max)
1144 {
1145 matched_once = FALSE;
1146 code_offset = (int)(ecode - md->start_code);
1147
1148 save_offset1 = md->offset_vector[offset];
1149 save_offset2 = md->offset_vector[offset+1];
1150 save_offset3 = md->offset_vector[md->offset_end - number];
1151 save_capture_last = md->capture_last;
1152
1153 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1154
1155 /* Each time round the loop, save the current subject position for use
1156 when the group matches. For MATCH_MATCH, the group has matched, so we
1157 restart it with a new subject starting position, remembering that we had
1158 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1159 usual. If we haven't matched any alternatives in any iteration, check to
1160 see if a previous iteration matched. If so, the group has matched;
1161 continue from afterwards. Otherwise it has failed; restore the previous
1162 capture values before returning NOMATCH. */
1163
1164 for (;;)
1165 {
1166 md->offset_vector[md->offset_end - number] =
1167 (int)(eptr - md->start_subject);
1168 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1169 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1170 eptrb, RM63);
1171 if (rrc == MATCH_KETRPOS)
1172 {
1173 offset_top = md->end_offset_top;
1174 eptr = md->end_match_ptr;
1175 ecode = md->start_code + code_offset;
1176 save_capture_last = md->capture_last;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 md->capture_last = save_capture_last;
1194 ecode += GET(ecode, 1);
1195 if (*ecode != OP_ALT) break;
1196 }
1197
1198 if (!matched_once)
1199 {
1200 md->offset_vector[offset] = save_offset1;
1201 md->offset_vector[offset+1] = save_offset2;
1202 md->offset_vector[md->offset_end - number] = save_offset3;
1203 }
1204
1205 if (allow_zero || matched_once)
1206 {
1207 ecode += 1 + LINK_SIZE;
1208 break;
1209 }
1210
1211 RRETURN(MATCH_NOMATCH);
1212 }
1213
1214 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1215 as a non-capturing bracket. */
1216
1217 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1218 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1219
1220 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1221
1222 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1223 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1224
1225 /* Non-capturing possessive bracket with unlimited repeat. We come here
1226 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1227 without the capturing complication. It is written out separately for speed
1228 and cleanliness. */
1229
1230 case OP_BRAPOS:
1231 case OP_SBRAPOS:
1232 allow_zero = FALSE;
1233
1234 POSSESSIVE_NON_CAPTURE:
1235 matched_once = FALSE;
1236 code_offset = (int)(ecode - md->start_code);
1237 save_capture_last = md->capture_last;
1238
1239 for (;;)
1240 {
1241 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1242 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1243 eptrb, RM48);
1244 if (rrc == MATCH_KETRPOS)
1245 {
1246 offset_top = md->end_offset_top;
1247 eptr = md->end_match_ptr;
1248 ecode = md->start_code + code_offset;
1249 matched_once = TRUE;
1250 continue;
1251 }
1252
1253 /* See comment in the code for capturing groups above about handling
1254 THEN. */
1255
1256 if (rrc == MATCH_THEN)
1257 {
1258 next = ecode + GET(ecode,1);
1259 if (md->start_match_ptr < next &&
1260 (*ecode == OP_ALT || *next == OP_ALT))
1261 rrc = MATCH_NOMATCH;
1262 }
1263
1264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1265 ecode += GET(ecode, 1);
1266 if (*ecode != OP_ALT) break;
1267 md->capture_last = save_capture_last;
1268 }
1269
1270 if (matched_once || allow_zero)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275 RRETURN(MATCH_NOMATCH);
1276
1277 /* Control never reaches here. */
1278
1279 /* Conditional group: compilation checked that there are no more than
1280 two branches. If the condition is false, skipping the first branch takes us
1281 past the end if there is only one branch, but that's OK because that is
1282 exactly what going to the ket would do. */
1283
1284 case OP_COND:
1285 case OP_SCOND:
1286 codelink = GET(ecode, 1);
1287
1288 /* Because of the way auto-callout works during compile, a callout item is
1289 inserted between OP_COND and an assertion condition. */
1290
1291 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1292 {
1293 if (PUBL(callout) != NULL)
1294 {
1295 PUBL(callout_block) cb;
1296 cb.version = 2; /* Version 2 of the callout block */
1297 cb.callout_number = ecode[LINK_SIZE+2];
1298 cb.offset_vector = md->offset_vector;
1299 #if defined COMPILE_PCRE8
1300 cb.subject = (PCRE_SPTR)md->start_subject;
1301 #elif defined COMPILE_PCRE16
1302 cb.subject = (PCRE_SPTR16)md->start_subject;
1303 #elif defined COMPILE_PCRE32
1304 cb.subject = (PCRE_SPTR32)md->start_subject;
1305 #endif
1306 cb.subject_length = (int)(md->end_subject - md->start_subject);
1307 cb.start_match = (int)(mstart - md->start_subject);
1308 cb.current_position = (int)(eptr - md->start_subject);
1309 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1310 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1311 cb.capture_top = offset_top/2;
1312 cb.capture_last = md->capture_last & CAPLMASK;
1313 /* Internal change requires this for API compatibility. */
1314 if (cb.capture_last == 0) cb.capture_last = -1;
1315 cb.callout_data = md->callout_data;
1316 cb.mark = md->nomatch_mark;
1317 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1318 if (rrc < 0) RRETURN(rrc);
1319 }
1320 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1321 }
1322
1323 condcode = ecode[LINK_SIZE+1];
1324
1325 /* Now see what the actual condition is */
1326
1327 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1328 {
1329 if (md->recursive == NULL) /* Not recursing => FALSE */
1330 {
1331 condition = FALSE;
1332 ecode += GET(ecode, 1);
1333 }
1334 else
1335 {
1336 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1337 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1338
1339 /* If the test is for recursion into a specific subpattern, and it is
1340 false, but the test was set up by name, scan the table to see if the
1341 name refers to any other numbers, and test them. The condition is true
1342 if any one is set. */
1343
1344 if (!condition && condcode == OP_NRREF)
1345 {
1346 pcre_uchar *slotA = md->name_table;
1347 for (i = 0; i < md->name_count; i++)
1348 {
1349 if (GET2(slotA, 0) == recno) break;
1350 slotA += md->name_entry_size;
1351 }
1352
1353 /* Found a name for the number - there can be only one; duplicate
1354 names for different numbers are allowed, but not vice versa. First
1355 scan down for duplicates. */
1356
1357 if (i < md->name_count)
1358 {
1359 pcre_uchar *slotB = slotA;
1360 while (slotB > md->name_table)
1361 {
1362 slotB -= md->name_entry_size;
1363 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1364 {
1365 condition = GET2(slotB, 0) == md->recursive->group_num;
1366 if (condition) break;
1367 }
1368 else break;
1369 }
1370
1371 /* Scan up for duplicates */
1372
1373 if (!condition)
1374 {
1375 slotB = slotA;
1376 for (i++; i < md->name_count; i++)
1377 {
1378 slotB += md->name_entry_size;
1379 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1380 {
1381 condition = GET2(slotB, 0) == md->recursive->group_num;
1382 if (condition) break;
1383 }
1384 else break;
1385 }
1386 }
1387 }
1388 }
1389
1390 /* Choose branch according to the condition */
1391
1392 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1393 }
1394 }
1395
1396 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1397 {
1398 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1399 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1400
1401 /* If the numbered capture is unset, but the reference was by name,
1402 scan the table to see if the name refers to any other numbers, and test
1403 them. The condition is true if any one is set. This is tediously similar
1404 to the code above, but not close enough to try to amalgamate. */
1405
1406 if (!condition && condcode == OP_NCREF)
1407 {
1408 unsigned int refno = offset >> 1;
1409 pcre_uchar *slotA = md->name_table;
1410
1411 for (i = 0; i < md->name_count; i++)
1412 {
1413 if (GET2(slotA, 0) == refno) break;
1414 slotA += md->name_entry_size;
1415 }
1416
1417 /* Found a name for the number - there can be only one; duplicate names
1418 for different numbers are allowed, but not vice versa. First scan down
1419 for duplicates. */
1420
1421 if (i < md->name_count)
1422 {
1423 pcre_uchar *slotB = slotA;
1424 while (slotB > md->name_table)
1425 {
1426 slotB -= md->name_entry_size;
1427 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1428 {
1429 offset = GET2(slotB, 0) << 1;
1430 condition = offset < offset_top &&
1431 md->offset_vector[offset] >= 0;
1432 if (condition) break;
1433 }
1434 else break;
1435 }
1436
1437 /* Scan up for duplicates */
1438
1439 if (!condition)
1440 {
1441 slotB = slotA;
1442 for (i++; i < md->name_count; i++)
1443 {
1444 slotB += md->name_entry_size;
1445 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1446 {
1447 offset = GET2(slotB, 0) << 1;
1448 condition = offset < offset_top &&
1449 md->offset_vector[offset] >= 0;
1450 if (condition) break;
1451 }
1452 else break;
1453 }
1454 }
1455 }
1456 }
1457
1458 /* Choose branch according to the condition */
1459
1460 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1461 }
1462
1463 else if (condcode == OP_DEF) /* DEFINE - always false */
1464 {
1465 condition = FALSE;
1466 ecode += GET(ecode, 1);
1467 }
1468
1469 /* The condition is an assertion. Call match() to evaluate it - setting
1470 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1471 an assertion. */
1472
1473 else
1474 {
1475 md->match_function_type = MATCH_CONDASSERT;
1476 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1477 if (rrc == MATCH_MATCH)
1478 {
1479 if (md->end_offset_top > offset_top)
1480 offset_top = md->end_offset_top; /* Captures may have happened */
1481 condition = TRUE;
1482 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1483 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1484 }
1485
1486 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1487 assertion; it is therefore treated as NOMATCH. */
1488
1489 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1490 {
1491 RRETURN(rrc); /* Need braces because of following else */
1492 }
1493 else
1494 {
1495 condition = FALSE;
1496 ecode += codelink;
1497 }
1498 }
1499
1500 /* We are now at the branch that is to be obeyed. As there is only one, can
1501 use tail recursion to avoid using another stack frame, except when there is
1502 unlimited repeat of a possibly empty group. In the latter case, a recursive
1503 call to match() is always required, unless the second alternative doesn't
1504 exist, in which case we can just plough on. Note that, for compatibility
1505 with Perl, the | in a conditional group is NOT treated as creating two
1506 alternatives. If a THEN is encountered in the branch, it propagates out to
1507 the enclosing alternative (unless nested in a deeper set of alternatives,
1508 of course). */
1509
1510 if (condition || *ecode == OP_ALT)
1511 {
1512 if (op != OP_SCOND)
1513 {
1514 ecode += 1 + LINK_SIZE;
1515 goto TAIL_RECURSE;
1516 }
1517
1518 md->match_function_type = MATCH_CBEGROUP;
1519 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1520 RRETURN(rrc);
1521 }
1522
1523 /* Condition false & no alternative; continue after the group. */
1524
1525 else
1526 {
1527 ecode += 1 + LINK_SIZE;
1528 }
1529 break;
1530
1531
1532 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1533 to close any currently open capturing brackets. */
1534
1535 case OP_CLOSE:
1536 number = GET2(ecode, 1); /* Must be less than 65536 */
1537 offset = number << 1;
1538
1539 #ifdef PCRE_DEBUG
1540 printf("end bracket %d at *ACCEPT", number);
1541 printf("\n");
1542 #endif
1543
1544 md->capture_last = (md->capture_last & OVFLMASK) | number;
1545 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1546 {
1547 md->offset_vector[offset] =
1548 md->offset_vector[md->offset_end - number];
1549 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1550 if (offset_top <= offset) offset_top = offset + 2;
1551 }
1552 ecode += 1 + IMM2_SIZE;
1553 break;
1554
1555
1556 /* End of the pattern, either real or forced. */
1557
1558 case OP_END:
1559 case OP_ACCEPT:
1560 case OP_ASSERT_ACCEPT:
1561
1562 /* If we have matched an empty string, fail if not in an assertion and not
1563 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1564 is set and we have matched at the start of the subject. In both cases,
1565 backtracking will then try other alternatives, if any. */
1566
1567 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1568 md->recursive == NULL &&
1569 (md->notempty ||
1570 (md->notempty_atstart &&
1571 mstart == md->start_subject + md->start_offset)))
1572 RRETURN(MATCH_NOMATCH);
1573
1574 /* Otherwise, we have a match. */
1575
1576 md->end_match_ptr = eptr; /* Record where we ended */
1577 md->end_offset_top = offset_top; /* and how many extracts were taken */
1578 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1579
1580 /* For some reason, the macros don't work properly if an expression is
1581 given as the argument to RRETURN when the heap is in use. */
1582
1583 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1584 RRETURN(rrc);
1585
1586 /* Assertion brackets. Check the alternative branches in turn - the
1587 matching won't pass the KET for an assertion. If any one branch matches,
1588 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1589 start of each branch to move the current point backwards, so the code at
1590 this level is identical to the lookahead case. When the assertion is part
1591 of a condition, we want to return immediately afterwards. The caller of
1592 this incarnation of the match() function will have set MATCH_CONDASSERT in
1593 md->match_function_type, and one of these opcodes will be the first opcode
1594 that is processed. We use a local variable that is preserved over calls to
1595 match() to remember this case. */
1596
1597 case OP_ASSERT:
1598 case OP_ASSERTBACK:
1599 save_mark = md->mark;
1600 if (md->match_function_type == MATCH_CONDASSERT)
1601 {
1602 condassert = TRUE;
1603 md->match_function_type = 0;
1604 }
1605 else condassert = FALSE;
1606
1607 do
1608 {
1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1610 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1611 {
1612 mstart = md->start_match_ptr; /* In case \K reset it */
1613 break;
1614 }
1615 md->mark = save_mark;
1616
1617 /* A COMMIT failure must fail the entire assertion, without trying any
1618 subsequent branches. */
1619
1620 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1621
1622 /* PCRE does not allow THEN to escape beyond an assertion; it
1623 is treated as NOMATCH. */
1624
1625 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1626 ecode += GET(ecode, 1);
1627 }
1628 while (*ecode == OP_ALT);
1629
1630 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1631
1632 /* If checking an assertion for a condition, return MATCH_MATCH. */
1633
1634 if (condassert) RRETURN(MATCH_MATCH);
1635
1636 /* Continue from after the assertion, updating the offsets high water
1637 mark, since extracts may have been taken during the assertion. */
1638
1639 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1640 ecode += 1 + LINK_SIZE;
1641 offset_top = md->end_offset_top;
1642 continue;
1643
1644 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1645 PRUNE, or COMMIT means we must assume failure without checking subsequent
1646 branches. */
1647
1648 case OP_ASSERT_NOT:
1649 case OP_ASSERTBACK_NOT:
1650 save_mark = md->mark;
1651 if (md->match_function_type == MATCH_CONDASSERT)
1652 {
1653 condassert = TRUE;
1654 md->match_function_type = 0;
1655 }
1656 else condassert = FALSE;
1657
1658 do
1659 {
1660 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1661 md->mark = save_mark;
1662 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1663 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1664 {
1665 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1666 break;
1667 }
1668
1669 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1670 as NOMATCH. */
1671
1672 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1673 ecode += GET(ecode,1);
1674 }
1675 while (*ecode == OP_ALT);
1676
1677 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1678
1679 ecode += 1 + LINK_SIZE;
1680 continue;
1681
1682 /* Move the subject pointer back. This occurs only at the start of
1683 each branch of a lookbehind assertion. If we are too close to the start to
1684 move back, this match function fails. When working with UTF-8 we move
1685 back a number of characters, not bytes. */
1686
1687 case OP_REVERSE:
1688 #ifdef SUPPORT_UTF
1689 if (utf)
1690 {
1691 i = GET(ecode, 1);
1692 while (i-- > 0)
1693 {
1694 eptr--;
1695 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1696 BACKCHAR(eptr);
1697 }
1698 }
1699 else
1700 #endif
1701
1702 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1703
1704 {
1705 eptr -= GET(ecode, 1);
1706 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1707 }
1708
1709 /* Save the earliest consulted character, then skip to next op code */
1710
1711 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1712 ecode += 1 + LINK_SIZE;
1713 break;
1714
1715 /* The callout item calls an external function, if one is provided, passing
1716 details of the match so far. This is mainly for debugging, though the
1717 function is able to force a failure. */
1718
1719 case OP_CALLOUT:
1720 if (PUBL(callout) != NULL)
1721 {
1722 PUBL(callout_block) cb;
1723 cb.version = 2; /* Version 2 of the callout block */
1724 cb.callout_number = ecode[1];
1725 cb.offset_vector = md->offset_vector;
1726 #if defined COMPILE_PCRE8
1727 cb.subject = (PCRE_SPTR)md->start_subject;
1728 #elif defined COMPILE_PCRE16
1729 cb.subject = (PCRE_SPTR16)md->start_subject;
1730 #elif defined COMPILE_PCRE32
1731 cb.subject = (PCRE_SPTR32)md->start_subject;
1732 #endif
1733 cb.subject_length = (int)(md->end_subject - md->start_subject);
1734 cb.start_match = (int)(mstart - md->start_subject);
1735 cb.current_position = (int)(eptr - md->start_subject);
1736 cb.pattern_position = GET(ecode, 2);
1737 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1738 cb.capture_top = offset_top/2;
1739 cb.capture_last = md->capture_last & CAPLMASK;
1740 /* Internal change requires this for API compatibility. */
1741 if (cb.capture_last == 0) cb.capture_last = -1;
1742 cb.callout_data = md->callout_data;
1743 cb.mark = md->nomatch_mark;
1744 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1745 if (rrc < 0) RRETURN(rrc);
1746 }
1747 ecode += 2 + 2*LINK_SIZE;
1748 break;
1749
1750 /* Recursion either matches the current regex, or some subexpression. The
1751 offset data is the offset to the starting bracket from the start of the
1752 whole pattern. (This is so that it works from duplicated subpatterns.)
1753
1754 The state of the capturing groups is preserved over recursion, and
1755 re-instated afterwards. We don't know how many are started and not yet
1756 finished (offset_top records the completed total) so we just have to save
1757 all the potential data. There may be up to 65535 such values, which is too
1758 large to put on the stack, but using malloc for small numbers seems
1759 expensive. As a compromise, the stack is used when there are no more than
1760 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1761
1762 There are also other values that have to be saved. We use a chained
1763 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1764 for the original version of this logic. It has, however, been hacked around
1765 a lot, so he is not to blame for the current way it works. */
1766
1767 case OP_RECURSE:
1768 {
1769 recursion_info *ri;
1770 unsigned int recno;
1771
1772 callpat = md->start_code + GET(ecode, 1);
1773 recno = (callpat == md->start_code)? 0 :
1774 GET2(callpat, 1 + LINK_SIZE);
1775
1776 /* Check for repeating a recursion without advancing the subject pointer.
1777 This should catch convoluted mutual recursions. (Some simple cases are
1778 caught at compile time.) */
1779
1780 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1781 if (recno == ri->group_num && eptr == ri->subject_position)
1782 RRETURN(PCRE_ERROR_RECURSELOOP);
1783
1784 /* Add to "recursing stack" */
1785
1786 new_recursive.group_num = recno;
1787 new_recursive.saved_capture_last = md->capture_last;
1788 new_recursive.subject_position = eptr;
1789 new_recursive.prevrec = md->recursive;
1790 md->recursive = &new_recursive;
1791
1792 /* Where to continue from afterwards */
1793
1794 ecode += 1 + LINK_SIZE;
1795
1796 /* Now save the offset data */
1797
1798 new_recursive.saved_max = md->offset_end;
1799 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1800 new_recursive.offset_save = stacksave;
1801 else
1802 {
1803 new_recursive.offset_save =
1804 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1805 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1806 }
1807 memcpy(new_recursive.offset_save, md->offset_vector,
1808 new_recursive.saved_max * sizeof(int));
1809
1810 /* OK, now we can do the recursion. After processing each alternative,
1811 restore the offset data and the last captured value. If there were nested
1812 recursions, md->recursive might be changed, so reset it before looping.
1813 */
1814
1815 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1816 cbegroup = (*callpat >= OP_SBRA);
1817 do
1818 {
1819 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1820 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1821 md, eptrb, RM6);
1822 memcpy(md->offset_vector, new_recursive.offset_save,
1823 new_recursive.saved_max * sizeof(int));
1824 md->capture_last = new_recursive.saved_capture_last;
1825 md->recursive = new_recursive.prevrec;
1826 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1827 {
1828 DPRINTF(("Recursion matched\n"));
1829 if (new_recursive.offset_save != stacksave)
1830 (PUBL(free))(new_recursive.offset_save);
1831
1832 /* Set where we got to in the subject, and reset the start in case
1833 it was changed by \K. This *is* propagated back out of a recursion,
1834 for Perl compatibility. */
1835
1836 eptr = md->end_match_ptr;
1837 mstart = md->start_match_ptr;
1838 goto RECURSION_MATCHED; /* Exit loop; end processing */
1839 }
1840
1841 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1842 is treated as NOMATCH. */
1843
1844 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1845 rrc != MATCH_COMMIT)
1846 {
1847 DPRINTF(("Recursion gave error %d\n", rrc));
1848 if (new_recursive.offset_save != stacksave)
1849 (PUBL(free))(new_recursive.offset_save);
1850 RRETURN(rrc);
1851 }
1852
1853 md->recursive = &new_recursive;
1854 callpat += GET(callpat, 1);
1855 }
1856 while (*callpat == OP_ALT);
1857
1858 DPRINTF(("Recursion didn't match\n"));
1859 md->recursive = new_recursive.prevrec;
1860 if (new_recursive.offset_save != stacksave)
1861 (PUBL(free))(new_recursive.offset_save);
1862 RRETURN(MATCH_NOMATCH);
1863 }
1864
1865 RECURSION_MATCHED:
1866 break;
1867
1868 /* An alternation is the end of a branch; scan along to find the end of the
1869 bracketed group and go to there. */
1870
1871 case OP_ALT:
1872 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1873 break;
1874
1875 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1876 indicating that it may occur zero times. It may repeat infinitely, or not
1877 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1878 with fixed upper repeat limits are compiled as a number of copies, with the
1879 optional ones preceded by BRAZERO or BRAMINZERO. */
1880
1881 case OP_BRAZERO:
1882 next = ecode + 1;
1883 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1885 do next += GET(next, 1); while (*next == OP_ALT);
1886 ecode = next + 1 + LINK_SIZE;
1887 break;
1888
1889 case OP_BRAMINZERO:
1890 next = ecode + 1;
1891 do next += GET(next, 1); while (*next == OP_ALT);
1892 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1894 ecode++;
1895 break;
1896
1897 case OP_SKIPZERO:
1898 next = ecode+1;
1899 do next += GET(next,1); while (*next == OP_ALT);
1900 ecode = next + 1 + LINK_SIZE;
1901 break;
1902
1903 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1904 here; just jump to the group, with allow_zero set TRUE. */
1905
1906 case OP_BRAPOSZERO:
1907 op = *(++ecode);
1908 allow_zero = TRUE;
1909 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1910 goto POSSESSIVE_NON_CAPTURE;
1911
1912 /* End of a group, repeated or non-repeating. */
1913
1914 case OP_KET:
1915 case OP_KETRMIN:
1916 case OP_KETRMAX:
1917 case OP_KETRPOS:
1918 prev = ecode - GET(ecode, 1);
1919
1920 /* If this was a group that remembered the subject start, in order to break
1921 infinite repeats of empty string matches, retrieve the subject start from
1922 the chain. Otherwise, set it NULL. */
1923
1924 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1925 {
1926 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1927 eptrb = eptrb->epb_prev; /* Backup to previous group */
1928 }
1929 else saved_eptr = NULL;
1930
1931 /* If we are at the end of an assertion group or a non-capturing atomic
1932 group, stop matching and return MATCH_MATCH, but record the current high
1933 water mark for use by positive assertions. We also need to record the match
1934 start in case it was changed by \K. */
1935
1936 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1937 *prev == OP_ONCE_NC)
1938 {
1939 md->end_match_ptr = eptr; /* For ONCE_NC */
1940 md->end_offset_top = offset_top;
1941 md->start_match_ptr = mstart;
1942 RRETURN(MATCH_MATCH); /* Sets md->mark */
1943 }
1944
1945 /* For capturing groups we have to check the group number back at the start
1946 and if necessary complete handling an extraction by setting the offsets and
1947 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1948 into group 0, so it won't be picked up here. Instead, we catch it when the
1949 OP_END is reached. Other recursion is handled here. We just have to record
1950 the current subject position and start match pointer and give a MATCH
1951 return. */
1952
1953 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1954 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1955 {
1956 number = GET2(prev, 1+LINK_SIZE);
1957 offset = number << 1;
1958
1959 #ifdef PCRE_DEBUG
1960 printf("end bracket %d", number);
1961 printf("\n");
1962 #endif
1963
1964 /* Handle a recursively called group. */
1965
1966 if (md->recursive != NULL && md->recursive->group_num == number)
1967 {
1968 md->end_match_ptr = eptr;
1969 md->start_match_ptr = mstart;
1970 RRETURN(MATCH_MATCH);
1971 }
1972
1973 /* Deal with capturing */
1974
1975 md->capture_last = (md->capture_last & OVFLMASK) | number;
1976 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1977 {
1978 /* If offset is greater than offset_top, it means that we are
1979 "skipping" a capturing group, and that group's offsets must be marked
1980 unset. In earlier versions of PCRE, all the offsets were unset at the
1981 start of matching, but this doesn't work because atomic groups and
1982 assertions can cause a value to be set that should later be unset.
1983 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1984 part of the atomic group, but this is not on the final matching path,
1985 so must be unset when 2 is set. (If there is no group 2, there is no
1986 problem, because offset_top will then be 2, indicating no capture.) */
1987
1988 if (offset > offset_top)
1989 {
1990 register int *iptr = md->offset_vector + offset_top;
1991 register int *iend = md->offset_vector + offset;
1992 while (iptr < iend) *iptr++ = -1;
1993 }
1994
1995 /* Now make the extraction */
1996
1997 md->offset_vector[offset] =
1998 md->offset_vector[md->offset_end - number];
1999 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2000 if (offset_top <= offset) offset_top = offset + 2;
2001 }
2002 }
2003
2004 /* For an ordinary non-repeating ket, just continue at this level. This
2005 also happens for a repeating ket if no characters were matched in the
2006 group. This is the forcible breaking of infinite loops as implemented in
2007 Perl 5.005. For a non-repeating atomic group that includes captures,
2008 establish a backup point by processing the rest of the pattern at a lower
2009 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2010 original OP_ONCE level, thereby bypassing intermediate backup points, but
2011 resetting any captures that happened along the way. */
2012
2013 if (*ecode == OP_KET || eptr == saved_eptr)
2014 {
2015 if (*prev == OP_ONCE)
2016 {
2017 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2020 RRETURN(MATCH_ONCE);
2021 }
2022 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2023 break;
2024 }
2025
2026 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2027 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2028 at a time from the outer level, thus saving stack. */
2029
2030 if (*ecode == OP_KETRPOS)
2031 {
2032 md->end_match_ptr = eptr;
2033 md->end_offset_top = offset_top;
2034 RRETURN(MATCH_KETRPOS);
2035 }
2036
2037 /* The normal repeating kets try the rest of the pattern or restart from
2038 the preceding bracket, in the appropriate order. In the second case, we can
2039 use tail recursion to avoid using another stack frame, unless we have an
2040 atomic group or an unlimited repeat of a group that can match an empty
2041 string. */
2042
2043 if (*ecode == OP_KETRMIN)
2044 {
2045 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2046 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2047 if (*prev == OP_ONCE)
2048 {
2049 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2051 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2052 RRETURN(MATCH_ONCE);
2053 }
2054 if (*prev >= OP_SBRA) /* Could match an empty string */
2055 {
2056 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2057 RRETURN(rrc);
2058 }
2059 ecode = prev;
2060 goto TAIL_RECURSE;
2061 }
2062 else /* OP_KETRMAX */
2063 {
2064 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2065 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 if (*prev == OP_ONCE)
2068 {
2069 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2071 md->once_target = prev;
2072 RRETURN(MATCH_ONCE);
2073 }
2074 ecode += 1 + LINK_SIZE;
2075 goto TAIL_RECURSE;
2076 }
2077 /* Control never gets here */
2078
2079 /* Not multiline mode: start of subject assertion, unless notbol. */
2080
2081 case OP_CIRC:
2082 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2083
2084 /* Start of subject assertion */
2085
2086 case OP_SOD:
2087 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2088 ecode++;
2089 break;
2090
2091 /* Multiline mode: start of subject unless notbol, or after any newline. */
2092
2093 case OP_CIRCM:
2094 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2095 if (eptr != md->start_subject &&
2096 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2097 RRETURN(MATCH_NOMATCH);
2098 ecode++;
2099 break;
2100
2101 /* Start of match assertion */
2102
2103 case OP_SOM:
2104 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2105 ecode++;
2106 break;
2107
2108 /* Reset the start of match point */
2109
2110 case OP_SET_SOM:
2111 mstart = eptr;
2112 ecode++;
2113 break;
2114
2115 /* Multiline mode: assert before any newline, or before end of subject
2116 unless noteol is set. */
2117
2118 case OP_DOLLM:
2119 if (eptr < md->end_subject)
2120 {
2121 if (!IS_NEWLINE(eptr))
2122 {
2123 if (md->partial != 0 &&
2124 eptr + 1 >= md->end_subject &&
2125 NLBLOCK->nltype == NLTYPE_FIXED &&
2126 NLBLOCK->nllen == 2 &&
2127 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2128 {
2129 md->hitend = TRUE;
2130 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2131 }
2132 RRETURN(MATCH_NOMATCH);
2133 }
2134 }
2135 else
2136 {
2137 if (md->noteol) RRETURN(MATCH_NOMATCH);
2138 SCHECK_PARTIAL();
2139 }
2140 ecode++;
2141 break;
2142
2143 /* Not multiline mode: assert before a terminating newline or before end of
2144 subject unless noteol is set. */
2145
2146 case OP_DOLL:
2147 if (md->noteol) RRETURN(MATCH_NOMATCH);
2148 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2149
2150 /* ... else fall through for endonly */
2151
2152 /* End of subject assertion (\z) */
2153
2154 case OP_EOD:
2155 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2156 SCHECK_PARTIAL();
2157 ecode++;
2158 break;
2159
2160 /* End of subject or ending \n assertion (\Z) */
2161
2162 case OP_EODN:
2163 ASSERT_NL_OR_EOS:
2164 if (eptr < md->end_subject &&
2165 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2166 {
2167 if (md->partial != 0 &&
2168 eptr + 1 >= md->end_subject &&
2169 NLBLOCK->nltype == NLTYPE_FIXED &&
2170 NLBLOCK->nllen == 2 &&
2171 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2172 {
2173 md->hitend = TRUE;
2174 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2175 }
2176 RRETURN(MATCH_NOMATCH);
2177 }
2178
2179 /* Either at end of string or \n before end. */
2180
2181 SCHECK_PARTIAL();
2182 ecode++;
2183 break;
2184
2185 /* Word boundary assertions */
2186
2187 case OP_NOT_WORD_BOUNDARY:
2188 case OP_WORD_BOUNDARY:
2189 {
2190
2191 /* Find out if the previous and current characters are "word" characters.
2192 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2193 be "non-word" characters. Remember the earliest consulted character for
2194 partial matching. */
2195
2196 #ifdef SUPPORT_UTF
2197 if (utf)
2198 {
2199 /* Get status of previous character */
2200
2201 if (eptr == md->start_subject) prev_is_word = FALSE; else
2202 {
2203 PCRE_PUCHAR lastptr = eptr - 1;
2204 BACKCHAR(lastptr);
2205 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2206 GETCHAR(c, lastptr);
2207 #ifdef SUPPORT_UCP
2208 if (md->use_ucp)
2209 {
2210 if (c == '_') prev_is_word = TRUE; else
2211 {
2212 int cat = UCD_CATEGORY(c);
2213 prev_is_word = (cat == ucp_L || cat == ucp_N);
2214 }
2215 }
2216 else
2217 #endif
2218 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2219 }
2220
2221 /* Get status of next character */
2222
2223 if (eptr >= md->end_subject)
2224 {
2225 SCHECK_PARTIAL();
2226 cur_is_word = FALSE;
2227 }
2228 else
2229 {
2230 GETCHAR(c, eptr);
2231 #ifdef SUPPORT_UCP
2232 if (md->use_ucp)
2233 {
2234 if (c == '_') cur_is_word = TRUE; else
2235 {
2236 int cat = UCD_CATEGORY(c);
2237 cur_is_word = (cat == ucp_L || cat == ucp_N);
2238 }
2239 }
2240 else
2241 #endif
2242 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2243 }
2244 }
2245 else
2246 #endif
2247
2248 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2249 consistency with the behaviour of \w we do use it in this case. */
2250
2251 {
2252 /* Get status of previous character */
2253
2254 if (eptr == md->start_subject) prev_is_word = FALSE; else
2255 {
2256 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2257 #ifdef SUPPORT_UCP
2258 if (md->use_ucp)
2259 {
2260 c = eptr[-1];
2261 if (c == '_') prev_is_word = TRUE; else
2262 {
2263 int cat = UCD_CATEGORY(c);
2264 prev_is_word = (cat == ucp_L || cat == ucp_N);
2265 }
2266 }
2267 else
2268 #endif
2269 prev_is_word = MAX_255(eptr[-1])
2270 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2271 }
2272
2273 /* Get status of next character */
2274
2275 if (eptr >= md->end_subject)
2276 {
2277 SCHECK_PARTIAL();
2278 cur_is_word = FALSE;
2279 }
2280 else
2281 #ifdef SUPPORT_UCP
2282 if (md->use_ucp)
2283 {
2284 c = *eptr;
2285 if (c == '_') cur_is_word = TRUE; else
2286 {
2287 int cat = UCD_CATEGORY(c);
2288 cur_is_word = (cat == ucp_L || cat == ucp_N);
2289 }
2290 }
2291 else
2292 #endif
2293 cur_is_word = MAX_255(*eptr)
2294 && ((md->ctypes[*eptr] & ctype_word) != 0);
2295 }
2296
2297 /* Now see if the situation is what we want */
2298
2299 if ((*ecode++ == OP_WORD_BOUNDARY)?
2300 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2301 RRETURN(MATCH_NOMATCH);
2302 }
2303 break;
2304
2305 /* Match any single character type except newline; have to take care with
2306 CRLF newlines and partial matching. */
2307
2308 case OP_ANY:
2309 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2310 if (md->partial != 0 &&
2311 eptr + 1 >= md->end_subject &&
2312 NLBLOCK->nltype == NLTYPE_FIXED &&
2313 NLBLOCK->nllen == 2 &&
2314 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2315 {
2316 md->hitend = TRUE;
2317 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2318 }
2319
2320 /* Fall through */
2321
2322 /* Match any single character whatsoever. */
2323
2324 case OP_ALLANY:
2325 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2326 { /* not be updated before SCHECK_PARTIAL. */
2327 SCHECK_PARTIAL();
2328 RRETURN(MATCH_NOMATCH);
2329 }
2330 eptr++;
2331 #ifdef SUPPORT_UTF
2332 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2333 #endif
2334 ecode++;
2335 break;
2336
2337 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2338 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2339
2340 case OP_ANYBYTE:
2341 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2342 { /* not be updated before SCHECK_PARTIAL. */
2343 SCHECK_PARTIAL();
2344 RRETURN(MATCH_NOMATCH);
2345 }
2346 eptr++;
2347 ecode++;
2348 break;
2349
2350 case OP_NOT_DIGIT:
2351 if (eptr >= md->end_subject)
2352 {
2353 SCHECK_PARTIAL();
2354 RRETURN(MATCH_NOMATCH);
2355 }
2356 GETCHARINCTEST(c, eptr);
2357 if (
2358 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2359 c < 256 &&
2360 #endif
2361 (md->ctypes[c] & ctype_digit) != 0
2362 )
2363 RRETURN(MATCH_NOMATCH);
2364 ecode++;
2365 break;
2366
2367 case OP_DIGIT:
2368 if (eptr >= md->end_subject)
2369 {
2370 SCHECK_PARTIAL();
2371 RRETURN(MATCH_NOMATCH);
2372 }
2373 GETCHARINCTEST(c, eptr);
2374 if (
2375 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2376 c > 255 ||
2377 #endif
2378 (md->ctypes[c] & ctype_digit) == 0
2379 )
2380 RRETURN(MATCH_NOMATCH);
2381 ecode++;
2382 break;
2383
2384 case OP_NOT_WHITESPACE:
2385 if (eptr >= md->end_subject)
2386 {
2387 SCHECK_PARTIAL();
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 GETCHARINCTEST(c, eptr);
2391 if (
2392 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2393 c < 256 &&
2394 #endif
2395 (md->ctypes[c] & ctype_space) != 0
2396 )
2397 RRETURN(MATCH_NOMATCH);
2398 ecode++;
2399 break;
2400
2401 case OP_WHITESPACE:
2402 if (eptr >= md->end_subject)
2403 {
2404 SCHECK_PARTIAL();
2405 RRETURN(MATCH_NOMATCH);
2406 }
2407 GETCHARINCTEST(c, eptr);
2408 if (
2409 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2410 c > 255 ||
2411 #endif
2412 (md->ctypes[c] & ctype_space) == 0
2413 )
2414 RRETURN(MATCH_NOMATCH);
2415 ecode++;
2416 break;
2417
2418 case OP_NOT_WORDCHAR:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 RRETURN(MATCH_NOMATCH);
2423 }
2424 GETCHARINCTEST(c, eptr);
2425 if (
2426 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2427 c < 256 &&
2428 #endif
2429 (md->ctypes[c] & ctype_word) != 0
2430 )
2431 RRETURN(MATCH_NOMATCH);
2432 ecode++;
2433 break;
2434
2435 case OP_WORDCHAR:
2436 if (eptr >= md->end_subject)
2437 {
2438 SCHECK_PARTIAL();
2439 RRETURN(MATCH_NOMATCH);
2440 }
2441 GETCHARINCTEST(c, eptr);
2442 if (
2443 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2444 c > 255 ||
2445 #endif
2446 (md->ctypes[c] & ctype_word) == 0
2447 )
2448 RRETURN(MATCH_NOMATCH);
2449 ecode++;
2450 break;
2451
2452 case OP_ANYNL:
2453 if (eptr >= md->end_subject)
2454 {
2455 SCHECK_PARTIAL();
2456 RRETURN(MATCH_NOMATCH);
2457 }
2458 GETCHARINCTEST(c, eptr);
2459 switch(c)
2460 {
2461 default: RRETURN(MATCH_NOMATCH);
2462
2463 case CHAR_CR:
2464 if (eptr >= md->end_subject)
2465 {
2466 SCHECK_PARTIAL();
2467 }
2468 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2469 break;
2470
2471 case CHAR_LF:
2472 break;
2473
2474 case CHAR_VT:
2475 case CHAR_FF:
2476 case CHAR_NEL:
2477 #ifndef EBCDIC
2478 case 0x2028:
2479 case 0x2029:
2480 #endif /* Not EBCDIC */
2481 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2482 break;
2483 }
2484 ecode++;
2485 break;
2486
2487 case OP_NOT_HSPACE:
2488 if (eptr >= md->end_subject)
2489 {
2490 SCHECK_PARTIAL();
2491 RRETURN(MATCH_NOMATCH);
2492 }
2493 GETCHARINCTEST(c, eptr);
2494 switch(c)
2495 {
2496 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2497 default: break;
2498 }
2499 ecode++;
2500 break;
2501
2502 case OP_HSPACE:
2503 if (eptr >= md->end_subject)
2504 {
2505 SCHECK_PARTIAL();
2506 RRETURN(MATCH_NOMATCH);
2507 }
2508 GETCHARINCTEST(c, eptr);
2509 switch(c)
2510 {
2511 HSPACE_CASES: break; /* Byte and multibyte cases */
2512 default: RRETURN(MATCH_NOMATCH);
2513 }
2514 ecode++;
2515 break;
2516
2517 case OP_NOT_VSPACE:
2518 if (eptr >= md->end_subject)
2519 {
2520 SCHECK_PARTIAL();
2521 RRETURN(MATCH_NOMATCH);
2522 }
2523 GETCHARINCTEST(c, eptr);
2524 switch(c)
2525 {
2526 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2527 default: break;
2528 }
2529 ecode++;
2530 break;
2531
2532 case OP_VSPACE:
2533 if (eptr >= md->end_subject)
2534 {
2535 SCHECK_PARTIAL();
2536 RRETURN(MATCH_NOMATCH);
2537 }
2538 GETCHARINCTEST(c, eptr);
2539 switch(c)
2540 {
2541 VSPACE_CASES: break;
2542 default: RRETURN(MATCH_NOMATCH);
2543 }
2544 ecode++;
2545 break;
2546
2547 #ifdef SUPPORT_UCP
2548 /* Check the next character by Unicode property. We will get here only
2549 if the support is in the binary; otherwise a compile-time error occurs. */
2550
2551 case OP_PROP:
2552 case OP_NOTPROP:
2553 if (eptr >= md->end_subject)
2554 {
2555 SCHECK_PARTIAL();
2556 RRETURN(MATCH_NOMATCH);
2557 }
2558 GETCHARINCTEST(c, eptr);
2559 {
2560 const pcre_uint32 *cp;
2561 const ucd_record *prop = GET_UCD(c);
2562
2563 switch(ecode[1])
2564 {
2565 case PT_ANY:
2566 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2567 break;
2568
2569 case PT_LAMP:
2570 if ((prop->chartype == ucp_Lu ||
2571 prop->chartype == ucp_Ll ||
2572 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2573 RRETURN(MATCH_NOMATCH);
2574 break;
2575
2576 case PT_GC:
2577 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2580
2581 case PT_PC:
2582 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2583 RRETURN(MATCH_NOMATCH);
2584 break;
2585
2586 case PT_SC:
2587 if ((ecode[2] != prop->script) == (op == OP_PROP))
2588 RRETURN(MATCH_NOMATCH);
2589 break;
2590
2591 /* These are specials */
2592
2593 case PT_ALNUM:
2594 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2595 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2596 RRETURN(MATCH_NOMATCH);
2597 break;
2598
2599 case PT_SPACE: /* Perl space */
2600 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2601 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2602 == (op == OP_NOTPROP))
2603 RRETURN(MATCH_NOMATCH);
2604 break;
2605
2606 case PT_PXSPACE: /* POSIX space */
2607 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2608 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2609 c == CHAR_FF || c == CHAR_CR)
2610 == (op == OP_NOTPROP))
2611 RRETURN(MATCH_NOMATCH);
2612 break;
2613
2614 case PT_WORD:
2615 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2616 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2617 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2618 RRETURN(MATCH_NOMATCH);
2619 break;
2620
2621 case PT_CLIST:
2622 cp = PRIV(ucd_caseless_sets) + ecode[2];
2623 for (;;)
2624 {
2625 if (c < *cp)
2626 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2627 if (c == *cp++)
2628 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2629 }
2630 break;
2631
2632 /* This should never occur */
2633
2634 default:
2635 RRETURN(PCRE_ERROR_INTERNAL);
2636 }
2637
2638 ecode += 3;
2639 }
2640 break;
2641
2642 /* Match an extended Unicode sequence. We will get here only if the support
2643 is in the binary; otherwise a compile-time error occurs. */
2644
2645 case OP_EXTUNI:
2646 if (eptr >= md->end_subject)
2647 {
2648 SCHECK_PARTIAL();
2649 RRETURN(MATCH_NOMATCH);
2650 }
2651 else
2652 {
2653 int lgb, rgb;
2654 GETCHARINCTEST(c, eptr);
2655 lgb = UCD_GRAPHBREAK(c);
2656 while (eptr < md->end_subject)
2657 {
2658 int len = 1;
2659 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2660 rgb = UCD_GRAPHBREAK(c);
2661 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2662 lgb = rgb;
2663 eptr += len;
2664 }
2665 }
2666 CHECK_PARTIAL();
2667 ecode++;
2668 break;
2669 #endif /* SUPPORT_UCP */
2670
2671
2672 /* Match a back reference, possibly repeatedly. Look past the end of the
2673 item to see if there is repeat information following. The code is similar
2674 to that for character classes, but repeated for efficiency. Then obey
2675 similar code to character type repeats - written out again for speed.
2676 However, if the referenced string is the empty string, always treat
2677 it as matched, any number of times (otherwise there could be infinite
2678 loops). */
2679
2680 case OP_REF:
2681 case OP_REFI:
2682 caseless = op == OP_REFI;
2683 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2684 ecode += 1 + IMM2_SIZE;
2685
2686 /* If the reference is unset, there are two possibilities:
2687
2688 (a) In the default, Perl-compatible state, set the length negative;
2689 this ensures that every attempt at a match fails. We can't just fail
2690 here, because of the possibility of quantifiers with zero minima.
2691
2692 (b) If the JavaScript compatibility flag is set, set the length to zero
2693 so that the back reference matches an empty string.
2694
2695 Otherwise, set the length to the length of what was matched by the
2696 referenced subpattern. */
2697
2698 if (offset >= offset_top || md->offset_vector[offset] < 0)
2699 length = (md->jscript_compat)? 0 : -1;
2700 else
2701 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2702
2703 /* Set up for repetition, or handle the non-repeated case */
2704
2705 switch (*ecode)
2706 {
2707 case OP_CRSTAR:
2708 case OP_CRMINSTAR:
2709 case OP_CRPLUS:
2710 case OP_CRMINPLUS:
2711 case OP_CRQUERY:
2712 case OP_CRMINQUERY:
2713 c = *ecode++ - OP_CRSTAR;
2714 minimize = (c & 1) != 0;
2715 min = rep_min[c]; /* Pick up values from tables; */
2716 max = rep_max[c]; /* zero for max => infinity */
2717 if (max == 0) max = INT_MAX;
2718 break;
2719
2720 case OP_CRRANGE:
2721 case OP_CRMINRANGE:
2722 minimize = (*ecode == OP_CRMINRANGE);
2723 min = GET2(ecode, 1);
2724 max = GET2(ecode, 1 + IMM2_SIZE);
2725 if (max == 0) max = INT_MAX;
2726 ecode += 1 + 2 * IMM2_SIZE;
2727 break;
2728
2729 default: /* No repeat follows */
2730 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2731 {
2732 if (length == -2) eptr = md->end_subject; /* Partial match */
2733 CHECK_PARTIAL();
2734 RRETURN(MATCH_NOMATCH);
2735 }
2736 eptr += length;
2737 continue; /* With the main loop */
2738 }
2739
2740 /* Handle repeated back references. If the length of the reference is
2741 zero, just continue with the main loop. If the length is negative, it
2742 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2743 zero, we can continue at the same level without recursion. For any other
2744 minimum, carrying on will result in NOMATCH. */
2745
2746 if (length == 0) continue;
2747 if (length < 0 && min == 0) continue;
2748
2749 /* First, ensure the minimum number of matches are present. We get back
2750 the length of the reference string explicitly rather than passing the
2751 address of eptr, so that eptr can be a register variable. */
2752
2753 for (i = 1; i <= min; i++)
2754 {
2755 int slength;
2756 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2757 {
2758 if (slength == -2) eptr = md->end_subject; /* Partial match */
2759 CHECK_PARTIAL();
2760 RRETURN(MATCH_NOMATCH);
2761 }
2762 eptr += slength;
2763 }
2764
2765 /* If min = max, continue at the same level without recursion.
2766 They are not both allowed to be zero. */
2767
2768 if (min == max) continue;
2769
2770 /* If minimizing, keep trying and advancing the pointer */
2771
2772 if (minimize)
2773 {
2774 for (fi = min;; fi++)
2775 {
2776 int slength;
2777 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2779 if (fi >= max) RRETURN(MATCH_NOMATCH);
2780 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2781 {
2782 if (slength == -2) eptr = md->end_subject; /* Partial match */
2783 CHECK_PARTIAL();
2784 RRETURN(MATCH_NOMATCH);
2785 }
2786 eptr += slength;
2787 }
2788 /* Control never gets here */
2789 }
2790
2791 /* If maximizing, find the longest string and work backwards */
2792
2793 else
2794 {
2795 pp = eptr;
2796 for (i = min; i < max; i++)
2797 {
2798 int slength;
2799 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2800 {
2801 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2802 the soft partial matching case. */
2803
2804 if (slength == -2 && md->partial != 0 &&
2805 md->end_subject > md->start_used_ptr)
2806 {
2807 md->hitend = TRUE;
2808 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2809 }
2810 break;
2811 }
2812 eptr += slength;
2813 }
2814
2815 while (eptr >= pp)
2816 {
2817 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2818 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2819 eptr -= length;
2820 }
2821 RRETURN(MATCH_NOMATCH);
2822 }
2823 /* Control never gets here */
2824
2825 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2826 used when all the characters in the class have values in the range 0-255,
2827 and either the matching is caseful, or the characters are in the range
2828 0-127 when UTF-8 processing is enabled. The only difference between
2829 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2830 encountered.
2831
2832 First, look past the end of the item to see if there is repeat information
2833 following. Then obey similar code to character type repeats - written out
2834 again for speed. */
2835
2836 case OP_NCLASS:
2837 case OP_CLASS:
2838 {
2839 /* The data variable is saved across frames, so the byte map needs to
2840 be stored there. */
2841 #define BYTE_MAP ((pcre_uint8 *)data)
2842 data = ecode + 1; /* Save for matching */
2843 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2844
2845 switch (*ecode)
2846 {
2847 case OP_CRSTAR:
2848 case OP_CRMINSTAR:
2849 case OP_CRPLUS:
2850 case OP_CRMINPLUS:
2851 case OP_CRQUERY:
2852 case OP_CRMINQUERY:
2853 c = *ecode++ - OP_CRSTAR;
2854 minimize = (c & 1) != 0;
2855 min = rep_min[c]; /* Pick up values from tables; */
2856 max = rep_max[c]; /* zero for max => infinity */
2857 if (max == 0) max = INT_MAX;
2858 break;
2859
2860 case OP_CRRANGE:
2861 case OP_CRMINRANGE:
2862 minimize = (*ecode == OP_CRMINRANGE);
2863 min = GET2(ecode, 1);
2864 max = GET2(ecode, 1 + IMM2_SIZE);
2865 if (max == 0) max = INT_MAX;
2866 ecode += 1 + 2 * IMM2_SIZE;
2867 break;
2868
2869 default: /* No repeat follows */
2870 min = max = 1;
2871 break;
2872 }
2873
2874 /* First, ensure the minimum number of matches are present. */
2875
2876 #ifdef SUPPORT_UTF
2877 if (utf)
2878 {
2879 for (i = 1; i <= min; i++)
2880 {
2881 if (eptr >= md->end_subject)
2882 {
2883 SCHECK_PARTIAL();
2884 RRETURN(MATCH_NOMATCH);
2885 }
2886 GETCHARINC(c, eptr);
2887 if (c > 255)
2888 {
2889 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2890 }
2891 else
2892 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2893 }
2894 }
2895 else
2896 #endif
2897 /* Not UTF mode */
2898 {
2899 for (i = 1; i <= min; i++)
2900 {
2901 if (eptr >= md->end_subject)
2902 {
2903 SCHECK_PARTIAL();
2904 RRETURN(MATCH_NOMATCH);
2905 }
2906 c = *eptr++;
2907 #ifndef COMPILE_PCRE8
2908 if (c > 255)
2909 {
2910 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2911 }
2912 else
2913 #endif
2914 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2915 }
2916 }
2917
2918 /* If max == min we can continue with the main loop without the
2919 need to recurse. */
2920
2921 if (min == max) continue;
2922
2923 /* If minimizing, keep testing the rest of the expression and advancing
2924 the pointer while it matches the class. */
2925
2926 if (minimize)
2927 {
2928 #ifdef SUPPORT_UTF
2929 if (utf)
2930 {
2931 for (fi = min;; fi++)
2932 {
2933 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2935 if (fi >= max) RRETURN(MATCH_NOMATCH);
2936 if (eptr >= md->end_subject)
2937 {
2938 SCHECK_PARTIAL();
2939 RRETURN(MATCH_NOMATCH);
2940 }
2941 GETCHARINC(c, eptr);
2942 if (c > 255)
2943 {
2944 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2945 }
2946 else
2947 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2948 }
2949 }
2950 else
2951 #endif
2952 /* Not UTF mode */
2953 {
2954 for (fi = min;; fi++)
2955 {
2956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2958 if (fi >= max) RRETURN(MATCH_NOMATCH);
2959 if (eptr >= md->end_subject)
2960 {
2961 SCHECK_PARTIAL();
2962 RRETURN(MATCH_NOMATCH);
2963 }
2964 c = *eptr++;
2965 #ifndef COMPILE_PCRE8
2966 if (c > 255)
2967 {
2968 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2969 }
2970 else
2971 #endif
2972 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2973 }
2974 }
2975 /* Control never gets here */
2976 }
2977
2978 /* If maximizing, find the longest possible run, then work backwards. */
2979
2980 else
2981 {
2982 pp = eptr;
2983
2984 #ifdef SUPPORT_UTF
2985 if (utf)
2986 {
2987 for (i = min; i < max; i++)
2988 {
2989 int len = 1;
2990 if (eptr >= md->end_subject)
2991 {
2992 SCHECK_PARTIAL();
2993 break;
2994 }
2995 GETCHARLEN(c, eptr, len);
2996 if (c > 255)
2997 {
2998 if (op == OP_CLASS) break;
2999 }
3000 else
3001 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3002 eptr += len;
3003 }
3004 for (;;)
3005 {
3006 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3008 if (eptr-- == pp) break; /* Stop if tried at original pos */
3009 BACKCHAR(eptr);
3010 }
3011 }
3012 else
3013 #endif
3014 /* Not UTF mode */
3015 {
3016 for (i = min; i < max; i++)
3017 {
3018 if (eptr >= md->end_subject)
3019 {
3020 SCHECK_PARTIAL();
3021 break;
3022 }
3023 c = *eptr;
3024 #ifndef COMPILE_PCRE8
3025 if (c > 255)
3026 {
3027 if (op == OP_CLASS) break;
3028 }
3029 else
3030 #endif
3031 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3032 eptr++;
3033 }
3034 while (eptr >= pp)
3035 {
3036 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3037 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3038 eptr--;
3039 }
3040 }
3041
3042 RRETURN(MATCH_NOMATCH);
3043 }
3044 #undef BYTE_MAP
3045 }
3046 /* Control never gets here */
3047
3048
3049 /* Match an extended character class. This opcode is encountered only
3050 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3051 mode, because Unicode properties are supported in non-UTF-8 mode. */
3052
3053 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3054 case OP_XCLASS:
3055 {
3056 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3057 ecode += GET(ecode, 1); /* Advance past the item */
3058
3059 switch (*ecode)
3060 {
3061 case OP_CRSTAR:
3062 case OP_CRMINSTAR:
3063 case OP_CRPLUS:
3064 case OP_CRMINPLUS:
3065 case OP_CRQUERY:
3066 case OP_CRMINQUERY:
3067 c = *ecode++ - OP_CRSTAR;
3068 minimize = (c & 1) != 0;
3069 min = rep_min[c]; /* Pick up values from tables; */
3070 max = rep_max[c]; /* zero for max => infinity */
3071 if (max == 0) max = INT_MAX;
3072 break;
3073
3074 case OP_CRRANGE:
3075 case OP_CRMINRANGE:
3076 minimize = (*ecode == OP_CRMINRANGE);
3077 min = GET2(ecode, 1);
3078 max = GET2(ecode, 1 + IMM2_SIZE);
3079 if (max == 0) max = INT_MAX;
3080 ecode += 1 + 2 * IMM2_SIZE;
3081 break;
3082
3083 default: /* No repeat follows */
3084 min = max = 1;
3085 break;
3086 }
3087
3088 /* First, ensure the minimum number of matches are present. */
3089
3090 for (i = 1; i <= min; i++)
3091 {
3092 if (eptr >= md->end_subject)
3093 {
3094 SCHECK_PARTIAL();
3095 RRETURN(MATCH_NOMATCH);
3096 }
3097 GETCHARINCTEST(c, eptr);
3098 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3099 }
3100
3101 /* If max == min we can continue with the main loop without the
3102 need to recurse. */
3103
3104 if (min == max) continue;
3105
3106 /* If minimizing, keep testing the rest of the expression and advancing
3107 the pointer while it matches the class. */
3108
3109 if (minimize)
3110 {
3111 for (fi = min;; fi++)
3112 {
3113 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3114 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3115 if (fi >= max) RRETURN(MATCH_NOMATCH);
3116 if (eptr >= md->end_subject)
3117 {
3118 SCHECK_PARTIAL();
3119 RRETURN(MATCH_NOMATCH);
3120 }
3121 GETCHARINCTEST(c, eptr);
3122 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3123 }
3124 /* Control never gets here */
3125 }
3126
3127 /* If maximizing, find the longest possible run, then work backwards. */
3128
3129 else
3130 {
3131 pp = eptr;
3132 for (i = min; i < max; i++)
3133 {
3134 int len = 1;
3135 if (eptr >= md->end_subject)
3136 {
3137 SCHECK_PARTIAL();
3138 break;
3139 }
3140 #ifdef SUPPORT_UTF
3141 GETCHARLENTEST(c, eptr, len);
3142 #else
3143 c = *eptr;
3144 #endif
3145 if (!PRIV(xclass)(c, data, utf)) break;
3146 eptr += len;
3147 }
3148 for(;;)
3149 {
3150 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3151 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3152 if (eptr-- == pp) break; /* Stop if tried at original pos */
3153 #ifdef SUPPORT_UTF
3154 if (utf) BACKCHAR(eptr);
3155 #endif
3156 }
3157 RRETURN(MATCH_NOMATCH);
3158 }
3159
3160 /* Control never gets here */
3161 }
3162 #endif /* End of XCLASS */
3163
3164 /* Match a single character, casefully */
3165
3166 case OP_CHAR:
3167 #ifdef SUPPORT_UTF
3168 if (utf)
3169 {
3170 length = 1;
3171 ecode++;
3172 GETCHARLEN(fc, ecode, length);
3173 if (length > md->end_subject - eptr)
3174 {
3175 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3179 }
3180 else
3181 #endif
3182 /* Not UTF mode */
3183 {
3184 if (md->end_subject - eptr < 1)
3185 {
3186 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3187 RRETURN(MATCH_NOMATCH);
3188 }
3189 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3190 ecode += 2;
3191 }
3192 break;
3193
3194 /* Match a single character, caselessly. If we are at the end of the
3195 subject, give up immediately. */
3196
3197 case OP_CHARI:
3198 if (eptr >= md->end_subject)
3199 {
3200 SCHECK_PARTIAL();
3201 RRETURN(MATCH_NOMATCH);
3202 }
3203
3204 #ifdef SUPPORT_UTF
3205 if (utf)
3206 {
3207 length = 1;
3208 ecode++;
3209 GETCHARLEN(fc, ecode, length);
3210
3211 /* If the pattern character's value is < 128, we have only one byte, and
3212 we know that its other case must also be one byte long, so we can use the
3213 fast lookup table. We know that there is at least one byte left in the
3214 subject. */
3215
3216 if (fc < 128)
3217 {
3218 pcre_uchar cc = RAWUCHAR(eptr);
3219 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3220 ecode++;
3221 eptr++;
3222 }
3223
3224 /* Otherwise we must pick up the subject character. Note that we cannot
3225 use the value of "length" to check for sufficient bytes left, because the
3226 other case of the character may have more or fewer bytes. */
3227
3228 else
3229 {
3230 pcre_uint32 dc;
3231 GETCHARINC(dc, eptr);
3232 ecode += length;
3233
3234 /* If we have Unicode property support, we can use it to test the other
3235 case of the character, if there is one. */
3236
3237 if (fc != dc)
3238 {
3239 #ifdef SUPPORT_UCP
3240 if (dc != UCD_OTHERCASE(fc))
3241 #endif
3242 RRETURN(MATCH_NOMATCH);
3243 }
3244 }
3245 }
3246 else
3247 #endif /* SUPPORT_UTF */
3248
3249 /* Not UTF mode */
3250 {
3251 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3252 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3253 eptr++;
3254 ecode += 2;
3255 }
3256 break;
3257
3258 /* Match a single character repeatedly. */
3259
3260 case OP_EXACT:
3261 case OP_EXACTI:
3262 min = max = GET2(ecode, 1);
3263 ecode += 1 + IMM2_SIZE;
3264 goto REPEATCHAR;
3265
3266 case OP_POSUPTO:
3267 case OP_POSUPTOI:
3268 possessive = TRUE;
3269 /* Fall through */
3270
3271 case OP_UPTO:
3272 case OP_UPTOI:
3273 case OP_MINUPTO:
3274 case OP_MINUPTOI:
3275 min = 0;
3276 max = GET2(ecode, 1);
3277 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3278 ecode += 1 + IMM2_SIZE;
3279 goto REPEATCHAR;
3280
3281 case OP_POSSTAR:
3282 case OP_POSSTARI:
3283 possessive = TRUE;
3284 min = 0;
3285 max = INT_MAX;
3286 ecode++;
3287 goto REPEATCHAR;
3288
3289 case OP_POSPLUS:
3290 case OP_POSPLUSI:
3291 possessive = TRUE;
3292 min = 1;
3293 max = INT_MAX;
3294 ecode++;
3295 goto REPEATCHAR;
3296
3297 case OP_POSQUERY:
3298 case OP_POSQUERYI:
3299 possessive = TRUE;
3300 min = 0;
3301 max = 1;
3302 ecode++;
3303 goto REPEATCHAR;
3304
3305 case OP_STAR:
3306 case OP_STARI:
3307 case OP_MINSTAR:
3308 case OP_MINSTARI:
3309 case OP_PLUS:
3310 case OP_PLUSI:
3311 case OP_MINPLUS:
3312 case OP_MINPLUSI:
3313 case OP_QUERY:
3314 case OP_QUERYI:
3315 case OP_MINQUERY:
3316 case OP_MINQUERYI:
3317 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3318 minimize = (c & 1) != 0;
3319 min = rep_min[c]; /* Pick up values from tables; */
3320 max = rep_max[c]; /* zero for max => infinity */
3321 if (max == 0) max = INT_MAX;
3322
3323 /* Common code for all repeated single-character matches. */
3324
3325 REPEATCHAR:
3326 #ifdef SUPPORT_UTF
3327 if (utf)
3328 {
3329 length = 1;
3330 charptr = ecode;
3331 GETCHARLEN(fc, ecode, length);
3332 ecode += length;
3333
3334 /* Handle multibyte character matching specially here. There is
3335 support for caseless matching if UCP support is present. */
3336
3337 if (length > 1)
3338 {
3339 #ifdef SUPPORT_UCP
3340 pcre_uint32 othercase;
3341 if (op >= OP_STARI && /* Caseless */
3342 (othercase = UCD_OTHERCASE(fc)) != fc)
3343 oclength = PRIV(ord2utf)(othercase, occhars);
3344 else oclength = 0;
3345 #endif /* SUPPORT_UCP */
3346
3347 for (i = 1; i <= min; i++)
3348 {
3349 if (eptr <= md->end_subject - length &&
3350 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3351 #ifdef SUPPORT_UCP
3352 else if (oclength > 0 &&
3353 eptr <= md->end_subject - oclength &&
3354 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3355 #endif /* SUPPORT_UCP */
3356 else
3357 {
3358 CHECK_PARTIAL();
3359 RRETURN(MATCH_NOMATCH);
3360 }
3361 }
3362
3363 if (min == max) continue;
3364
3365 if (minimize)
3366 {
3367 for (fi = min;; fi++)
3368 {
3369 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3370 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3371 if (fi >= max) RRETURN(MATCH_NOMATCH);
3372 if (eptr <= md->end_subject - length &&
3373 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3374 #ifdef SUPPORT_UCP
3375 else if (oclength > 0 &&
3376 eptr <= md->end_subject - oclength &&
3377 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3378 #endif /* SUPPORT_UCP */
3379 else
3380 {
3381 CHECK_PARTIAL();
3382 RRETURN(MATCH_NOMATCH);
3383 }
3384 }
3385 /* Control never gets here */
3386 }
3387
3388 else /* Maximize */
3389 {
3390 pp = eptr;
3391 for (i = min; i < max; i++)
3392 {
3393 if (eptr <= md->end_subject - length &&
3394 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3395 #ifdef SUPPORT_UCP
3396 else if (oclength > 0 &&
3397 eptr <= md->end_subject - oclength &&
3398 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3399 #endif /* SUPPORT_UCP */
3400 else
3401 {
3402 CHECK_PARTIAL();
3403 break;
3404 }
3405 }
3406
3407 if (possessive) continue;
3408
3409 for(;;)
3410 {
3411 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3412 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3413 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3414 #ifdef SUPPORT_UCP
3415 eptr--;
3416 BACKCHAR(eptr);
3417 #else /* without SUPPORT_UCP */
3418 eptr -= length;
3419 #endif /* SUPPORT_UCP */
3420 }
3421 }
3422 /* Control never gets here */
3423 }
3424
3425 /* If the length of a UTF-8 character is 1, we fall through here, and
3426 obey the code as for non-UTF-8 characters below, though in this case the
3427 value of fc will always be < 128. */
3428 }
3429 else
3430 #endif /* SUPPORT_UTF */
3431 /* When not in UTF-8 mode, load a single-byte character. */
3432 fc = *ecode++;
3433
3434 /* The value of fc at this point is always one character, though we may
3435 or may not be in UTF mode. The code is duplicated for the caseless and
3436 caseful cases, for speed, since matching characters is likely to be quite
3437 common. First, ensure the minimum number of matches are present. If min =
3438 max, continue at the same level without recursing. Otherwise, if
3439 minimizing, keep trying the rest of the expression and advancing one
3440 matching character if failing, up to the maximum. Alternatively, if
3441 maximizing, find the maximum number of characters and work backwards. */
3442
3443 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3444 max, (char *)eptr));
3445
3446 if (op >= OP_STARI) /* Caseless */
3447 {
3448 #ifdef COMPILE_PCRE8
3449 /* fc must be < 128 if UTF is enabled. */
3450 foc = md->fcc[fc];
3451 #else
3452 #ifdef SUPPORT_UTF
3453 #ifdef SUPPORT_UCP
3454 if (utf && fc > 127)
3455 foc = UCD_OTHERCASE(fc);
3456 #else
3457 if (utf && fc > 127)
3458 foc = fc;
3459 #endif /* SUPPORT_UCP */
3460 else
3461 #endif /* SUPPORT_UTF */
3462 foc = TABLE_GET(fc, md->fcc, fc);
3463 #endif /* COMPILE_PCRE8 */
3464
3465 for (i = 1; i <= min; i++)
3466 {
3467 pcre_uint32 cc; /* Faster than pcre_uchar */
3468 if (eptr >= md->end_subject)
3469 {
3470 SCHECK_PARTIAL();
3471 RRETURN(MATCH_NOMATCH);
3472 }
3473 cc = RAWUCHARTEST(eptr);
3474 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3475 eptr++;
3476 }
3477 if (min == max) continue;
3478 if (minimize)
3479 {
3480 for (fi = min;; fi++)
3481 {
3482 pcre_uint32 cc; /* Faster than pcre_uchar */
3483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3485 if (fi >= max) RRETURN(MATCH_NOMATCH);
3486 if (eptr >= md->end_subject)
3487 {
3488 SCHECK_PARTIAL();
3489 RRETURN(MATCH_NOMATCH);
3490 }
3491 cc = RAWUCHARTEST(eptr);
3492 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3493 eptr++;
3494 }
3495 /* Control never gets here */
3496 }
3497 else /* Maximize */
3498 {
3499 pp = eptr;
3500 for (i = min; i < max; i++)
3501 {
3502 pcre_uint32 cc; /* Faster than pcre_uchar */
3503 if (eptr >= md->end_subject)
3504 {
3505 SCHECK_PARTIAL();
3506 break;
3507 }
3508 cc = RAWUCHARTEST(eptr);
3509 if (fc != cc && foc != cc) break;
3510 eptr++;
3511 }
3512
3513 if (possessive) continue;
3514
3515 while (eptr >= pp)
3516 {
3517 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3518 eptr--;
3519 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3520 }
3521 RRETURN(MATCH_NOMATCH);
3522 }
3523 /* Control never gets here */
3524 }
3525
3526 /* Caseful comparisons (includes all multi-byte characters) */
3527
3528 else
3529 {
3530 for (i = 1; i <= min; i++)
3531 {
3532 if (eptr >= md->end_subject)
3533 {
3534 SCHECK_PARTIAL();
3535 RRETURN(MATCH_NOMATCH);
3536 }
3537 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3538 }
3539
3540 if (min == max) continue;
3541
3542 if (minimize)
3543 {
3544 for (fi = min;; fi++)
3545 {
3546 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3547 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3548 if (fi >= max) RRETURN(MATCH_NOMATCH);
3549 if (eptr >= md->end_subject)
3550 {
3551 SCHECK_PARTIAL();
3552 RRETURN(MATCH_NOMATCH);
3553 }
3554 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3555 }
3556 /* Control never gets here */
3557 }
3558 else /* Maximize */
3559 {
3560 pp = eptr;
3561 for (i = min; i < max; i++)
3562 {
3563 if (eptr >= md->end_subject)
3564 {
3565 SCHECK_PARTIAL();
3566 break;
3567 }
3568 if (fc != RAWUCHARTEST(eptr)) break;
3569 eptr++;
3570 }
3571 if (possessive) continue;
3572
3573 while (eptr >= pp)
3574 {
3575 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3576 eptr--;
3577 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3578 }
3579 RRETURN(MATCH_NOMATCH);
3580 }
3581 }
3582 /* Control never gets here */
3583
3584 /* Match a negated single character. In UTF mode both the pattern character
3585 and the subject character can be multibyte. */
3586
3587 case OP_NOT:
3588 case OP_NOTI:
3589 if (eptr >= md->end_subject)
3590 {
3591 SCHECK_PARTIAL();
3592 RRETURN(MATCH_NOMATCH);
3593 }
3594 #ifdef SUPPORT_UTF
3595 if (utf)
3596 {
3597 register pcre_uint32 ch, och;
3598
3599 ecode++;
3600 GETCHARINC(ch, ecode);
3601 GETCHARINC(c, eptr);
3602
3603 if (op == OP_NOT)
3604 {
3605 if (ch == c) RRETURN(MATCH_NOMATCH);
3606 }
3607 else
3608 {
3609 #ifdef SUPPORT_UCP
3610 if (ch > 127)
3611 och = UCD_OTHERCASE(ch);
3612 #else
3613 if (ch > 127)
3614 och = ch;
3615 #endif /* SUPPORT_UCP */
3616 else
3617 och = TABLE_GET(ch, md->fcc, ch);
3618 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3619 }
3620 }
3621 else
3622 #endif
3623 {
3624 register pcre_uint32 ch = ecode[1];
3625 c = *eptr++;
3626 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3627 RRETURN(MATCH_NOMATCH);
3628 ecode += 2;
3629 }
3630 break;
3631
3632 /* Match a negated single one-byte character repeatedly. This is almost a
3633 repeat of the code for a repeated single character, but I haven't found a
3634 nice way of commoning these up that doesn't require a test of the
3635 positive/negative option for each character match. Maybe that wouldn't add
3636 very much to the time taken, but character matching *is* what this is all
3637 about... */
3638
3639 case OP_NOTEXACT:
3640 case OP_NOTEXACTI:
3641 min = max = GET2(ecode, 1);
3642 ecode += 1 + IMM2_SIZE;
3643 goto REPEATNOTCHAR;
3644
3645 case OP_NOTUPTO:
3646 case OP_NOTUPTOI:
3647 case OP_NOTMINUPTO:
3648 case OP_NOTMINUPTOI:
3649 min = 0;
3650 max = GET2(ecode, 1);
3651 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3652 ecode += 1 + IMM2_SIZE;
3653 goto REPEATNOTCHAR;
3654
3655 case OP_NOTPOSSTAR:
3656 case OP_NOTPOSSTARI:
3657 possessive = TRUE;
3658 min = 0;
3659 max = INT_MAX;
3660 ecode++;
3661 goto REPEATNOTCHAR;
3662
3663 case OP_NOTPOSPLUS:
3664 case OP_NOTPOSPLUSI:
3665 possessive = TRUE;
3666 min = 1;
3667 max = INT_MAX;
3668 ecode++;
3669 goto REPEATNOTCHAR;
3670
3671 case OP_NOTPOSQUERY:
3672 case OP_NOTPOSQUERYI:
3673 possessive = TRUE;
3674 min = 0;
3675 max = 1;
3676 ecode++;
3677 goto REPEATNOTCHAR;
3678
3679 case OP_NOTPOSUPTO:
3680 case OP_NOTPOSUPTOI:
3681 possessive = TRUE;
3682 min = 0;
3683 max = GET2(ecode, 1);
3684 ecode += 1 + IMM2_SIZE;
3685 goto REPEATNOTCHAR;
3686
3687 case OP_NOTSTAR:
3688 case OP_NOTSTARI:
3689 case OP_NOTMINSTAR:
3690 case OP_NOTMINSTARI:
3691 case OP_NOTPLUS:
3692 case OP_NOTPLUSI:
3693 case OP_NOTMINPLUS:
3694 case OP_NOTMINPLUSI:
3695 case OP_NOTQUERY:
3696 case OP_NOTQUERYI:
3697 case OP_NOTMINQUERY:
3698 case OP_NOTMINQUERYI:
3699 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3700 minimize = (c & 1) != 0;
3701 min = rep_min[c]; /* Pick up values from tables; */
3702 max = rep_max[c]; /* zero for max => infinity */
3703 if (max == 0) max = INT_MAX;
3704
3705 /* Common code for all repeated single-byte matches. */
3706
3707 REPEATNOTCHAR:
3708 GETCHARINCTEST(fc, ecode);
3709
3710 /* The code is duplicated for the caseless and caseful cases, for speed,
3711 since matching characters is likely to be quite common. First, ensure the
3712 minimum number of matches are present. If min = max, continue at the same
3713 level without recursing. Otherwise, if minimizing, keep trying the rest of
3714 the expression and advancing one matching character if failing, up to the
3715 maximum. Alternatively, if maximizing, find the maximum number of
3716 characters and work backwards. */
3717
3718 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3719 max, (char *)eptr));
3720
3721 if (op >= OP_NOTSTARI) /* Caseless */
3722 {
3723 #ifdef SUPPORT_UTF
3724 #ifdef SUPPORT_UCP
3725 if (utf && fc > 127)
3726 foc = UCD_OTHERCASE(fc);
3727 #else
3728 if (utf && fc > 127)
3729 foc = fc;
3730 #endif /* SUPPORT_UCP */
3731 else
3732 #endif /* SUPPORT_UTF */
3733 foc = TABLE_GET(fc, md->fcc, fc);
3734
3735 #ifdef SUPPORT_UTF
3736 if (utf)
3737 {
3738 register pcre_uint32 d;
3739 for (i = 1; i <= min; i++)
3740 {
3741 if (eptr >= md->end_subject)
3742 {
3743 SCHECK_PARTIAL();
3744 RRETURN(MATCH_NOMATCH);
3745 }
3746 GETCHARINC(d, eptr);
3747 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3748 }
3749 }
3750 else
3751 #endif
3752 /* Not UTF mode */
3753 {
3754 for (i = 1; i <= min; i++)
3755 {
3756 if (eptr >= md->end_subject)
3757 {
3758 SCHECK_PARTIAL();
3759 RRETURN(MATCH_NOMATCH);
3760 }
3761 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3762 eptr++;
3763 }
3764 }
3765
3766 if (min == max) continue;
3767
3768 if (minimize)
3769 {
3770 #ifdef SUPPORT_UTF
3771 if (utf)
3772 {
3773 register pcre_uint32 d;
3774 for (fi = min;; fi++)
3775 {
3776 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3777 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3778 if (fi >= max) RRETURN(MATCH_NOMATCH);
3779 if (eptr >= md->end_subject)
3780 {
3781 SCHECK_PARTIAL();
3782 RRETURN(MATCH_NOMATCH);
3783 }
3784 GETCHARINC(d, eptr);
3785 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3786 }
3787 }
3788 else
3789 #endif
3790 /* Not UTF mode */
3791 {
3792 for (fi = min;; fi++)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (fi >= max) RRETURN(MATCH_NOMATCH);
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 RRETURN(MATCH_NOMATCH);
3801 }
3802 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3803 eptr++;
3804 }
3805 }
3806 /* Control never gets here */
3807 }
3808
3809 /* Maximize case */
3810
3811 else
3812 {
3813 pp = eptr;
3814
3815 #ifdef SUPPORT_UTF
3816 if (utf)
3817 {
3818 register pcre_uint32 d;
3819 for (i = min; i < max; i++)
3820 {
3821 int len = 1;
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 break;
3826 }
3827 GETCHARLEN(d, eptr, len);
3828 if (fc == d || (unsigned int)foc == d) break;
3829 eptr += len;
3830 }
3831 if (possessive) continue;
3832 for(;;)
3833 {
3834 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3836 if (eptr-- == pp) break; /* Stop if tried at original pos */
3837 BACKCHAR(eptr);
3838 }
3839 }
3840 else
3841 #endif
3842 /* Not UTF mode */
3843 {
3844 for (i = min; i < max; i++)
3845 {
3846 if (eptr >= md->end_subject)
3847 {
3848 SCHECK_PARTIAL();
3849 break;
3850 }
3851 if (fc == *eptr || foc == *eptr) break;
3852 eptr++;
3853 }
3854 if (possessive) continue;
3855 while (eptr >= pp)
3856 {
3857 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3859 eptr--;
3860 }
3861 }
3862
3863 RRETURN(MATCH_NOMATCH);
3864 }
3865 /* Control never gets here */
3866 }
3867
3868 /* Caseful comparisons */
3869
3870 else
3871 {
3872 #ifdef SUPPORT_UTF
3873 if (utf)
3874 {
3875 register pcre_uint32 d;
3876 for (i = 1; i <= min; i++)
3877 {
3878 if (eptr >= md->end_subject)
3879 {
3880 SCHECK_PARTIAL();
3881 RRETURN(MATCH_NOMATCH);
3882 }
3883 GETCHARINC(d, eptr);
3884 if (fc == d) RRETURN(MATCH_NOMATCH);
3885 }
3886 }
3887 else
3888 #endif
3889 /* Not UTF mode */
3890 {
3891 for (i = 1; i <= min; i++)
3892 {
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 RRETURN(MATCH_NOMATCH);
3897 }
3898 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3899 }
3900 }
3901
3902 if (min == max) continue;
3903
3904 if (minimize)
3905 {
3906 #ifdef SUPPORT_UTF
3907 if (utf)
3908 {
3909 register pcre_uint32 d;
3910 for (fi = min;; fi++)
3911 {
3912 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3914 if (fi >= max) RRETURN(MATCH_NOMATCH);
3915 if (eptr >= md->end_subject)
3916 {
3917 SCHECK_PARTIAL();
3918 RRETURN(MATCH_NOMATCH);
3919 }
3920 GETCHARINC(d, eptr);
3921 if (fc == d) RRETURN(MATCH_NOMATCH);
3922 }
3923 }
3924 else
3925 #endif
3926 /* Not UTF mode */
3927 {
3928 for (fi = min;; fi++)
3929 {
3930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3932 if (fi >= max) RRETURN(MATCH_NOMATCH);
3933 if (eptr >= md->end_subject)
3934 {
3935 SCHECK_PARTIAL();
3936 RRETURN(MATCH_NOMATCH);
3937 }
3938 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3939 }
3940 }
3941 /* Control never gets here */
3942 }
3943
3944 /* Maximize case */
3945
3946 else
3947 {
3948 pp = eptr;
3949
3950 #ifdef SUPPORT_UTF
3951 if (utf)
3952 {
3953 register pcre_uint32 d;
3954 for (i = min; i < max; i++)
3955 {
3956 int len = 1;
3957 if (eptr >= md->end_subject)
3958 {
3959 SCHECK_PARTIAL();
3960 break;
3961 }
3962 GETCHARLEN(d, eptr, len);
3963 if (fc == d) break;
3964 eptr += len;
3965 }
3966 if (possessive) continue;
3967 for(;;)
3968 {
3969 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3971 if (eptr-- == pp) break; /* Stop if tried at original pos */
3972 BACKCHAR(eptr);
3973 }
3974 }
3975 else
3976 #endif
3977 /* Not UTF mode */
3978 {
3979 for (i = min; i < max; i++)
3980 {
3981 if (eptr >= md->end_subject)
3982 {
3983 SCHECK_PARTIAL();
3984 break;
3985 }
3986 if (fc == *eptr) break;
3987 eptr++;
3988 }
3989 if (possessive) continue;
3990 while (eptr >= pp)
3991 {
3992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3994 eptr--;
3995 }
3996 }
3997
3998 RRETURN(MATCH_NOMATCH);
3999 }
4000 }
4001 /* Control never gets here */
4002
4003 /* Match a single character type repeatedly; several different opcodes
4004 share code. This is very similar to the code for single characters, but we
4005 repeat it in the interests of efficiency. */
4006
4007 case OP_TYPEEXACT:
4008 min = max = GET2(ecode, 1);
4009 minimize = TRUE;
4010 ecode += 1 + IMM2_SIZE;
4011 goto REPEATTYPE;
4012
4013 case OP_TYPEUPTO:
4014 case OP_TYPEMINUPTO:
4015 min = 0;
4016 max = GET2(ecode, 1);
4017 minimize = *ecode == OP_TYPEMINUPTO;
4018 ecode += 1 + IMM2_SIZE;
4019 goto REPEATTYPE;
4020
4021 case OP_TYPEPOSSTAR:
4022 possessive = TRUE;
4023 min = 0;
4024 max = INT_MAX;
4025 ecode++;
4026 goto REPEATTYPE;
4027
4028 case OP_TYPEPOSPLUS:
4029 possessive = TRUE;
4030 min = 1;
4031 max = INT_MAX;
4032 ecode++;
4033 goto REPEATTYPE;
4034
4035 case OP_TYPEPOSQUERY:
4036 possessive = TRUE;
4037 min = 0;
4038 max = 1;
4039 ecode++;
4040 goto REPEATTYPE;
4041
4042 case OP_TYPEPOSUPTO:
4043 possessive = TRUE;
4044 min = 0;
4045 max = GET2(ecode, 1);
4046 ecode += 1 + IMM2_SIZE;
4047 goto REPEATTYPE;
4048
4049 case OP_TYPESTAR:
4050 case OP_TYPEMINSTAR:
4051 case OP_TYPEPLUS:
4052 case OP_TYPEMINPLUS:
4053 case OP_TYPEQUERY:
4054 case OP_TYPEMINQUERY:
4055 c = *ecode++ - OP_TYPESTAR;
4056 minimize = (c & 1) != 0;
4057 min = rep_min[c]; /* Pick up values from tables; */
4058 max = rep_max[c]; /* zero for max => infinity */
4059 if (max == 0) max = INT_MAX;
4060
4061 /* Common code for all repeated single character type matches. Note that
4062 in UTF-8 mode, '.' matches a character of any length, but for the other
4063 character types, the valid characters are all one-byte long. */
4064
4065 REPEATTYPE:
4066 ctype = *ecode++; /* Code for the character type */
4067
4068 #ifdef SUPPORT_UCP
4069 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4070 {
4071 prop_fail_result = ctype == OP_NOTPROP;
4072 prop_type = *ecode++;
4073 prop_value = *ecode++;
4074 }
4075 else prop_type = -1;
4076 #endif
4077
4078 /* First, ensure the minimum number of matches are present. Use inline
4079 code for maximizing the speed, and do the type test once at the start
4080 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4081 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4082 and single-bytes. */
4083
4084 if (min > 0)
4085 {
4086 #ifdef SUPPORT_UCP
4087 if (prop_type >= 0)
4088 {
4089 switch(prop_type)
4090 {
4091 case PT_ANY:
4092 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4093 for (i = 1; i <= min; i++)
4094 {
4095 if (eptr >= md->end_subject)
4096 {
4097 SCHECK_PARTIAL();
4098 RRETURN(MATCH_NOMATCH);
4099 }
4100 GETCHARINCTEST(c, eptr);
4101 }
4102 break;
4103
4104 case PT_LAMP:
4105 for (i = 1; i <= min; i++)
4106 {
4107 int chartype;
4108 if (eptr >= md->end_subject)
4109 {
4110 SCHECK_PARTIAL();
4111 RRETURN(MATCH_NOMATCH);
4112 }
4113 GETCHARINCTEST(c, eptr);
4114 chartype = UCD_CHARTYPE(c);
4115 if ((chartype == ucp_Lu ||
4116 chartype == ucp_Ll ||
4117 chartype == ucp_Lt) == prop_fail_result)
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 break;
4121
4122 case PT_GC:
4123 for (i = 1; i <= min; i++)
4124 {
4125 if (eptr >= md->end_subject)
4126 {
4127 SCHECK_PARTIAL();
4128 RRETURN(MATCH_NOMATCH);
4129 }
4130 GETCHARINCTEST(c, eptr);
4131 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4132 RRETURN(MATCH_NOMATCH);
4133 }
4134 break;
4135
4136 case PT_PC:
4137 for (i = 1; i <= min; i++)
4138 {
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 RRETURN(MATCH_NOMATCH);
4143 }
4144 GETCHARINCTEST(c, eptr);
4145 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4146 RRETURN(MATCH_NOMATCH);
4147 }
4148 break;
4149
4150 case PT_SC:
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 RRETURN(MATCH_NOMATCH);
4157 }
4158 GETCHARINCTEST(c, eptr);
4159 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4160 RRETURN(MATCH_NOMATCH);
4161 }
4162 break;
4163
4164 case PT_ALNUM:
4165 for (i = 1; i <= min; i++)
4166 {
4167 int category;
4168 if (eptr >= md->end_subject)
4169 {
4170 SCHECK_PARTIAL();
4171 RRETURN(MATCH_NOMATCH);
4172 }
4173 GETCHARINCTEST(c, eptr);
4174 category = UCD_CATEGORY(c);
4175 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4176 RRETURN(MATCH_NOMATCH);
4177 }
4178 break;
4179
4180 case PT_SPACE: /* Perl space */
4181 for (i = 1; i <= min; i++)
4182 {
4183 if (eptr >= md->end_subject)
4184 {
4185 SCHECK_PARTIAL();
4186 RRETURN(MATCH_NOMATCH);
4187 }
4188 GETCHARINCTEST(c, eptr);
4189 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4190 c == CHAR_FF || c == CHAR_CR)
4191 == prop_fail_result)
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 break;
4195
4196 case PT_PXSPACE: /* POSIX space */
4197 for (i = 1; i <= min; i++)
4198 {
4199 if (eptr >= md->end_subject)
4200 {
4201 SCHECK_PARTIAL();
4202 RRETURN(MATCH_NOMATCH);
4203 }
4204 GETCHARINCTEST(c, eptr);
4205 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4206 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4207 == prop_fail_result)
4208 RRETURN(MATCH_NOMATCH);
4209 }
4210 break;
4211
4212 case PT_WORD:
4213 for (i = 1; i <= min; i++)
4214 {
4215 int category;
4216 if (eptr >= md->end_subject)
4217 {
4218 SCHECK_PARTIAL();
4219 RRETURN(MATCH_NOMATCH);
4220 }
4221 GETCHARINCTEST(c, eptr);
4222 category = UCD_CATEGORY(c);
4223 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4224 == prop_fail_result)
4225 RRETURN(MATCH_NOMATCH);
4226 }
4227 break;
4228
4229 case PT_CLIST:
4230 for (i = 1; i <= min; i++)
4231 {
4232 const pcre_uint32 *cp;
4233 if (eptr >= md->end_subject)
4234 {
4235 SCHECK_PARTIAL();
4236 RRETURN(MATCH_NOMATCH);
4237 }
4238 GETCHARINCTEST(c, eptr);
4239 cp = PRIV(ucd_caseless_sets) + prop_value;
4240 for (;;)
4241 {
4242 if (c < *cp)
4243 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4244 if (c == *cp++)
4245 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4246 }
4247 }
4248 break;
4249
4250 /* This should not occur */
4251
4252 default:
4253 RRETURN(PCRE_ERROR_INTERNAL);
4254 }
4255 }
4256
4257 /* Match extended Unicode sequences. We will get here only if the
4258 support is in the binary; otherwise a compile-time error occurs. */
4259
4260 else if (ctype == OP_EXTUNI)
4261 {
4262 for (i = 1; i <= min; i++)
4263 {
4264 if (eptr >= md->end_subject)
4265 {
4266 SCHECK_PARTIAL();
4267 RRETURN(MATCH_NOMATCH);
4268 }
4269 else
4270 {
4271 int lgb, rgb;
4272 GETCHARINCTEST(c, eptr);
4273 lgb = UCD_GRAPHBREAK(c);
4274 while (eptr < md->end_subject)
4275 {
4276 int len = 1;
4277 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4278 rgb = UCD_GRAPHBREAK(c);
4279 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4280 lgb = rgb;
4281 eptr += len;
4282 }
4283 }
4284 CHECK_PARTIAL();
4285 }
4286 }
4287
4288 else
4289 #endif /* SUPPORT_UCP */
4290
4291 /* Handle all other cases when the coding is UTF-8 */
4292
4293 #ifdef SUPPORT_UTF
4294 if (utf) switch(ctype)
4295 {
4296 case OP_ANY:
4297 for (i = 1; i <= min; i++)
4298 {
4299 if (eptr >= md->end_subject)
4300 {
4301 SCHECK_PARTIAL();
4302 RRETURN(MATCH_NOMATCH);
4303 }
4304 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4305 if (md->partial != 0 &&
4306 eptr + 1 >= md->end_subject &&
4307 NLBLOCK->nltype == NLTYPE_FIXED &&
4308 NLBLOCK->nllen == 2 &&
4309 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4310 {
4311 md->hitend = TRUE;
4312 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4313 }
4314 eptr++;
4315 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4316 }
4317 break;
4318
4319 case OP_ALLANY:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 RRETURN(MATCH_NOMATCH);
4326 }
4327 eptr++;
4328 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4329 }
4330 break;
4331
4332 case OP_ANYBYTE:
4333 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4334 eptr += min;
4335 break;
4336
4337 case OP_ANYNL:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 RRETURN(MATCH_NOMATCH);
4344 }
4345 GETCHARINC(c, eptr);
4346 switch(c)
4347 {
4348 default: RRETURN(MATCH_NOMATCH);
4349
4350 case CHAR_CR:
4351 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4352 break;
4353
4354 case CHAR_LF:
4355 break;
4356
4357 case CHAR_VT:
4358 case CHAR_FF:
4359 case CHAR_NEL:
4360 #ifndef EBCDIC
4361 case 0x2028:
4362 case 0x2029:
4363 #endif /* Not EBCDIC */
4364 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4365 break;
4366 }
4367 }
4368 break;
4369
4370 case OP_NOT_HSPACE:
4371 for (i = 1; i <= min; i++)
4372 {
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 RRETURN(MATCH_NOMATCH);
4377 }
4378 GETCHARINC(c, eptr);
4379 switch(c)
4380 {
4381 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4382 default: break;
4383 }
4384 }
4385 break;
4386
4387 case OP_HSPACE:
4388 for (i = 1; i <= min; i++)
4389 {
4390 if (eptr >= md->end_subject)
4391 {
4392 SCHECK_PARTIAL();
4393 RRETURN(MATCH_NOMATCH);
4394 }
4395 GETCHARINC(c, eptr);
4396 switch(c)
4397 {
4398 HSPACE_CASES: break; /* Byte and multibyte cases */
4399 default: RRETURN(MATCH_NOMATCH);
4400 }
4401 }
4402 break;
4403
4404 case OP_NOT_VSPACE:
4405 for (i = 1; i <= min; i++)
4406 {
4407 if (eptr >= md->end_subject)
4408 {
4409 SCHECK_PARTIAL();
4410 RRETURN(MATCH_NOMATCH);
4411 }
4412 GETCHARINC(c, eptr);
4413 switch(c)
4414 {
4415 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4416 default: break;
4417 }
4418 }
4419 break;
4420
4421 case OP_VSPACE:
4422 for (i = 1; i <= min; i++)
4423 {
4424 if (eptr >= md->end_subject)
4425 {
4426 SCHECK_PARTIAL();
4427 RRETURN(MATCH_NOMATCH);
4428 }
4429 GETCHARINC(c, eptr);
4430 switch(c)
4431 {
4432 VSPACE_CASES: break;
4433 default: RRETURN(MATCH_NOMATCH);
4434 }
4435 }
4436 break;
4437
4438 case OP_NOT_DIGIT:
4439 for (i = 1; i <= min; i++)
4440 {
4441 if (eptr >= md->end_subject)
4442 {
4443 SCHECK_PARTIAL();
4444 RRETURN(MATCH_NOMATCH);
4445 }
4446 GETCHARINC(c, eptr);
4447 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4448 RRETURN(MATCH_NOMATCH);
4449 }
4450 break;
4451
4452 case OP_DIGIT:
4453 for (i = 1; i <= min; i++)
4454 {
4455 pcre_uchar cc;
4456
4457 if (eptr >= md->end_subject)
4458 {
4459 SCHECK_PARTIAL();
4460 RRETURN(MATCH_NOMATCH);
4461 }
4462 cc = RAWUCHAR(eptr);
4463 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4464 RRETURN(MATCH_NOMATCH);
4465 eptr++;
4466 /* No need to skip more bytes - we know it's a 1-byte character */
4467 }
4468 break;
4469
4470 case OP_NOT_WHITESPACE:
4471 for (i = 1; i <= min; i++)
4472 {
4473 pcre_uchar cc;
4474
4475 if (eptr >= md->end_subject)
4476 {
4477 SCHECK_PARTIAL();
4478 RRETURN(MATCH_NOMATCH);
4479 }
4480 cc = RAWUCHAR(eptr);
4481 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4482 RRETURN(MATCH_NOMATCH);
4483 eptr++;
4484 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4485 }
4486 break;
4487
4488 case OP_WHITESPACE:
4489 for (i = 1; i <= min; i++)
4490 {
4491 pcre_uchar cc;
4492
4493 if (eptr >= md->end_subject)
4494 {
4495 SCHECK_PARTIAL();
4496 RRETURN(MATCH_NOMATCH);
4497 }
4498 cc = RAWUCHAR(eptr);
4499 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4500 RRETURN(MATCH_NOMATCH);
4501 eptr++;
4502 /* No need to skip more bytes - we know it's a 1-byte character */
4503 }
4504 break;
4505
4506 case OP_NOT_WORDCHAR:
4507 for (i = 1; i <= min; i++)
4508 {
4509 pcre_uchar cc;
4510
4511 if (eptr >= md->end_subject)
4512 {
4513 SCHECK_PARTIAL();
4514 RRETURN(MATCH_NOMATCH);
4515 }
4516 cc = RAWUCHAR(eptr);
4517 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4518 RRETURN(MATCH_NOMATCH);
4519 eptr++;
4520 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4521 }
4522 break;
4523
4524 case OP_WORDCHAR:
4525 for (i = 1; i <= min; i++)
4526 {
4527 pcre_uchar cc;
4528
4529 if (eptr >= md->end_subject)
4530 {
4531 SCHECK_PARTIAL();
4532 RRETURN(MATCH_NOMATCH);
4533 }
4534 cc = RAWUCHAR(eptr);
4535 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4536 RRETURN(MATCH_NOMATCH);
4537 eptr++;
4538 /* No need to skip more bytes - we know it's a 1-byte character */
4539 }
4540 break;
4541
4542 default:
4543 RRETURN(PCRE_ERROR_INTERNAL);
4544 } /* End switch(ctype) */
4545
4546 else
4547 #endif /* SUPPORT_UTF */
4548
4549 /* Code for the non-UTF-8 case for minimum matching of operators other
4550 than OP_PROP and OP_NOTPROP. */
4551
4552 switch(ctype)
4553 {
4554 case OP_ANY:
4555 for (i = 1; i <= min; i++)
4556 {
4557 if (eptr >= md->end_subject)
4558 {
4559 SCHECK_PARTIAL();
4560 RRETURN(MATCH_NOMATCH);
4561 }
4562 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4563 if (md->partial != 0 &&
4564 eptr + 1 >= md->end_subject &&
4565 NLBLOCK->nltype == NLTYPE_FIXED &&
4566 NLBLOCK->nllen == 2 &&
4567 *eptr == NLBLOCK->nl[0])
4568 {
4569 md->hitend = TRUE;
4570 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4571 }
4572 eptr++;
4573 }
4574 break;
4575
4576 case OP_ALLANY:
4577 if (eptr > md->end_subject - min)
4578 {
4579 SCHECK_PARTIAL();
4580 RRETURN(MATCH_NOMATCH);
4581 }
4582 eptr += min;
4583 break;
4584
4585 case OP_ANYBYTE:
4586 if (eptr > md->end_subject - min)
4587 {
4588 SCHECK_PARTIAL();
4589 RRETURN(MATCH_NOMATCH);
4590 }
4591 eptr += min;
4592 break;
4593
4594 case OP_ANYNL:
4595 for (i = 1; i <= min; i++)
4596 {
4597 if (eptr >= md->end_subject)
4598 {
4599 SCHECK_PARTIAL();
4600 RRETURN(MATCH_NOMATCH);
4601 }
4602 switch(*eptr++)
4603 {
4604 default: RRETURN(MATCH_NOMATCH);
4605
4606 case CHAR_CR:
4607 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4608 break;
4609
4610 case CHAR_LF:
4611 break;
4612
4613 case CHAR_VT:
4614 case CHAR_FF:
4615 case CHAR_NEL:
4616 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4617 case 0x2028:
4618 case 0x2029:
4619 #endif
4620 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4621 break;
4622 }
4623 }
4624 break;
4625
4626 case OP_NOT_HSPACE:
4627 for (i = 1; i <= min; i++)
4628 {
4629 if (eptr >= md->end_subject)
4630 {
4631 SCHECK_PARTIAL();
4632 RRETURN(MATCH_NOMATCH);
4633 }
4634 switch(*eptr++)
4635 {
4636 default: break;
4637 HSPACE_BYTE_CASES:
4638 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4639 HSPACE_MULTIBYTE_CASES:
4640 #endif
4641 RRETURN(MATCH_NOMATCH);
4642 }
4643 }
4644 break;
4645
4646 case OP_HSPACE:
4647 for (i = 1; i <= min; i++)
4648 {
4649 if (eptr >= md->end_subject)
4650 {
4651 SCHECK_PARTIAL();
4652 RRETURN(MATCH_NOMATCH);
4653 }
4654 switch(*eptr++)
4655 {
4656 default: RRETURN(MATCH_NOMATCH);
4657 HSPACE_BYTE_CASES:
4658 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4659 HSPACE_MULTIBYTE_CASES:
4660 #endif
4661 break;
4662 }
4663 }
4664 break;
4665
4666 case OP_NOT_VSPACE:
4667 for (i = 1; i <= min; i++)
4668 {
4669 if (eptr >= md->end_subject)
4670 {
4671 SCHECK_PARTIAL();
4672 RRETURN(MATCH_NOMATCH);
4673 }
4674 switch(*eptr++)
4675 {
4676 VSPACE_BYTE_CASES:
4677 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4678 VSPACE_MULTIBYTE_CASES:
4679 #endif
4680 RRETURN(MATCH_NOMATCH);
4681 default: break;
4682 }
4683 }
4684 break;
4685
4686 case OP_VSPACE:
4687 for (i = 1; i <= min; i++)
4688 {
4689 if (eptr >= md->end_subject)
4690 {
4691 SCHECK_PARTIAL();
4692 RRETURN(MATCH_NOMATCH);
4693 }
4694 switch(*eptr++)
4695 {
4696 default: RRETURN(MATCH_NOMATCH);
4697 VSPACE_BYTE_CASES:
4698 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4699 VSPACE_MULTIBYTE_CASES:
4700 #endif
4701 break;
4702 }
4703 }
4704 break;
4705
4706 case OP_NOT_DIGIT:
4707 for (i = 1; i <= min; i++)
4708 {
4709 if (eptr >= md->end_subject)
4710 {
4711 SCHECK_PARTIAL();
4712 RRETURN(MATCH_NOMATCH);
4713 }
4714 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4715 RRETURN(MATCH_NOMATCH);
4716 eptr++;
4717 }
4718 break;
4719
4720 case OP_DIGIT:
4721 for (i = 1; i <= min; i++)
4722 {
4723 if (eptr >= md->end_subject)
4724 {
4725 SCHECK_PARTIAL();
4726 RRETURN(MATCH_NOMATCH);
4727 }
4728 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4729 RRETURN(MATCH_NOMATCH);
4730 eptr++;
4731 }
4732 break;
4733
4734 case OP_NOT_WHITESPACE:
4735 for (i = 1; i <= min; i++)
4736 {
4737 if (eptr >= md->end_subject)
4738 {
4739 SCHECK_PARTIAL();
4740 RRETURN(MATCH_NOMATCH);
4741 }
4742 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4743 RRETURN(MATCH_NOMATCH);
4744 eptr++;
4745 }
4746 break;
4747
4748 case OP_WHITESPACE:
4749 for (i = 1; i <= min; i++)
4750 {
4751 if (eptr >= md->end_subject)
4752 {
4753 SCHECK_PARTIAL();
4754 RRETURN(MATCH_NOMATCH);
4755 }
4756 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4757 RRETURN(MATCH_NOMATCH);
4758 eptr++;
4759 }
4760 break;
4761
4762 case OP_NOT_WORDCHAR:
4763 for (i = 1; i <= min; i++)
4764 {
4765 if (eptr >= md->end_subject)
4766 {
4767 SCHECK_PARTIAL();
4768 RRETURN(MATCH_NOMATCH);
4769 }
4770 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4771 RRETURN(MATCH_NOMATCH);
4772 eptr++;
4773 }
4774 break;
4775
4776 case OP_WORDCHAR:
4777 for (i = 1; i <= min; i++)
4778 {
4779 if (eptr >= md->end_subject)
4780 {
4781 SCHECK_PARTIAL();
4782 RRETURN(MATCH_NOMATCH);
4783 }
4784 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4785 RRETURN(MATCH_NOMATCH);
4786 eptr++;
4787 }
4788 break;
4789
4790 default:
4791 RRETURN(PCRE_ERROR_INTERNAL);
4792 }
4793 }
4794
4795 /* If min = max, continue at the same level without recursing */
4796
4797 if (min == max) continue;
4798
4799 /* If minimizing, we have to test the rest of the pattern before each
4800 subsequent match. Again, separate the UTF-8 case for speed, and also
4801 separate the UCP cases. */
4802
4803 if (minimize)
4804 {
4805 #ifdef SUPPORT_UCP
4806 if (prop_type >= 0)
4807 {
4808 switch(prop_type)
4809 {
4810 case PT_ANY:
4811 for (fi = min;; fi++)
4812 {
4813 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4815 if (fi >= max) RRETURN(MATCH_NOMATCH);
4816 if (eptr >= md->end_subject)
4817 {
4818 SCHECK_PARTIAL();
4819 RRETURN(MATCH_NOMATCH);
4820 }
4821 GETCHARINCTEST(c, eptr);
4822 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4823 }
4824 /* Control never gets here */
4825
4826 case PT_LAMP:
4827 for (fi = min;; fi++)
4828 {
4829 int chartype;
4830 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4832 if (fi >= max) RRETURN(MATCH_NOMATCH);
4833 if (eptr >= md->end_subject)
4834 {
4835 SCHECK_PARTIAL();
4836 RRETURN(MATCH_NOMATCH);
4837 }
4838 GETCHARINCTEST(c, eptr);
4839 chartype = UCD_CHARTYPE(c);
4840 if ((chartype == ucp_Lu ||
4841 chartype == ucp_Ll ||
4842 chartype == ucp_Lt) == prop_fail_result)
4843 RRETURN(MATCH_NOMATCH);
4844 }
4845 /* Control never gets here */
4846
4847 case PT_GC:
4848 for (fi = min;; fi++)
4849 {
4850 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4852 if (fi >= max) RRETURN(MATCH_NOMATCH);
4853 if (eptr >= md->end_subject)
4854 {
4855 SCHECK_PARTIAL();
4856 RRETURN(MATCH_NOMATCH);
4857 }
4858 GETCHARINCTEST(c, eptr);
4859 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4860 RRETURN(MATCH_NOMATCH);
4861 }
4862 /* Control never gets here */
4863
4864 case PT_PC:
4865 for (fi = min;; fi++)
4866 {
4867 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4869 if (fi >= max) RRETURN(MATCH_NOMATCH);
4870 if (eptr >= md->end_subject)
4871 {
4872 SCHECK_PARTIAL();
4873 RRETURN(MATCH_NOMATCH);
4874 }
4875 GETCHARINCTEST(c, eptr);
4876 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4877 RRETURN(MATCH_NOMATCH);
4878 }
4879 /* Control never gets here */
4880
4881 case PT_SC:
4882 for (fi = min;; fi++)
4883 {
4884 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4886 if (fi >= max) RRETURN(MATCH_NOMATCH);
4887 if (eptr >= md->end_subject)
4888 {
4889 SCHECK_PARTIAL();
4890 RRETURN(MATCH_NOMATCH);
4891 }
4892 GETCHARINCTEST(c, eptr);
4893 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4894 RRETURN(MATCH_NOMATCH);
4895 }
4896 /* Control never gets here */
4897
4898 case PT_ALNUM:
4899 for (fi = min;; fi++)
4900 {
4901 int category;
4902 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4904 if (fi >= max) RRETURN(MATCH_NOMATCH);
4905 if (eptr >= md->end_subject)
4906 {
4907 SCHECK_PARTIAL();
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 GETCHARINCTEST(c, eptr);
4911 category = UCD_CATEGORY(c);
4912 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4913 RRETURN(MATCH_NOMATCH);
4914 }
4915 /* Control never gets here */
4916
4917 case PT_SPACE: /* Perl space */
4918 for (fi = min;; fi++)
4919 {
4920 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4922 if (fi >= max) RRETURN(MATCH_NOMATCH);
4923 if (eptr >= md->end_subject)
4924 {
4925 SCHECK_PARTIAL();
4926 RRETURN(MATCH_NOMATCH);
4927 }
4928 GETCHARINCTEST(c, eptr);
4929 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4930 c == CHAR_FF || c == CHAR_CR)
4931 == prop_fail_result)
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 /* Control never gets here */
4935
4936 case PT_PXSPACE: /* POSIX space */
4937 for (fi = min;; fi++)
4938 {
4939 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4941 if (fi >= max) RRETURN(MATCH_NOMATCH);
4942 if (eptr >= md->end_subject)
4943 {
4944 SCHECK_PARTIAL();
4945 RRETURN(MATCH_NOMATCH);
4946 }
4947 GETCHARINCTEST(c, eptr);
4948 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4949 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4950 == prop_fail_result)
4951 RRETURN(MATCH_NOMATCH);
4952 }
4953 /* Control never gets here */
4954
4955 case PT_WORD:
4956 for (fi = min;; fi++)
4957 {
4958 int category;
4959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4961 if (fi >= max) RRETURN(MATCH_NOMATCH);
4962 if (eptr >= md->end_subject)
4963 {
4964 SCHECK_PARTIAL();
4965 RRETURN(MATCH_NOMATCH);
4966 }
4967 GETCHARINCTEST(c, eptr);
4968 category = UCD_CATEGORY(c);
4969 if ((category == ucp_L ||
4970 category == ucp_N ||
4971 c == CHAR_UNDERSCORE)
4972 == prop_fail_result)
4973 RRETURN(MATCH_NOMATCH);
4974 }
4975 /* Control never gets here */
4976
4977 case PT_CLIST:
4978 for (fi = min;; fi++)
4979 {
4980 const pcre_uint32 *cp;
4981 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
4982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4983 if (fi >= max) RRETURN(MATCH_NOMATCH);
4984 if (eptr >= md->end_subject)
4985 {
4986 SCHECK_PARTIAL();
4987 RRETURN(MATCH_NOMATCH);
4988 }
4989 GETCHARINCTEST(c, eptr);
4990 cp = PRIV(ucd_caseless_sets) + prop_value;
4991 for (;;)
4992 {
4993 if (c < *cp)
4994 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4995 if (c == *cp++)
4996 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4997 }
4998 }
4999 /* Control never gets here */
5000
5001 /* This should never occur */
5002 default:
5003 RRETURN(PCRE_ERROR_INTERNAL);
5004 }
5005 }
5006
5007 /* Match extended Unicode sequences. We will get here only if the
5008 support is in the binary; otherwise a compile-time error occurs. */
5009
5010 else if (ctype == OP_EXTUNI)
5011 {
5012 for (fi = min;; fi++)
5013 {
5014 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5016 if (fi >= max) RRETURN(MATCH_NOMATCH);
5017 if (eptr >= md->end_subject)
5018 {
5019 SCHECK_PARTIAL();
5020 RRETURN(MATCH_NOMATCH);
5021 }
5022 else
5023 {
5024 int lgb, rgb;
5025 GETCHARINCTEST(c, eptr);
5026 lgb = UCD_GRAPHBREAK(c);
5027 while (eptr < md->end_subject)
5028 {
5029 int len = 1;
5030 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5031 rgb = UCD_GRAPHBREAK(c);
5032 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5033 lgb = rgb;
5034 eptr += len;
5035 }
5036 }
5037 CHECK_PARTIAL();
5038 }
5039 }
5040 else
5041 #endif /* SUPPORT_UCP */
5042
5043 #ifdef SUPPORT_UTF
5044 if (utf)
5045 {
5046 for (fi = min;; fi++)
5047 {
5048 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5050 if (fi >= max) RRETURN(MATCH_NOMATCH);
5051 if (eptr >= md->end_subject)
5052 {
5053 SCHECK_PARTIAL();
5054 RRETURN(MATCH_NOMATCH);
5055 }
5056 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5057 RRETURN(MATCH_NOMATCH);
5058 GETCHARINC(c, eptr);
5059 switch(ctype)
5060 {
5061 case OP_ANY: /* This is the non-NL case */
5062 if (md->partial != 0 && /* Take care with CRLF partial */
5063 eptr >= md->end_subject &&
5064 NLBLOCK->nltype == NLTYPE_FIXED &&
5065 NLBLOCK->nllen == 2 &&
5066 c == NLBLOCK->nl[0])
5067 {
5068 md->hitend = TRUE;
5069 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5070 }
5071 break;
5072
5073 case OP_ALLANY:
5074 case OP_ANYBYTE:
5075 break;
5076
5077 case OP_ANYNL:
5078 switch(c)
5079 {
5080 default: RRETURN(MATCH_NOMATCH);
5081 case CHAR_CR:
5082 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5083 break;
5084
5085 case CHAR_LF:
5086 break;
5087
5088 case CHAR_VT:
5089 case CHAR_FF:
5090 case CHAR_NEL:
5091 #ifndef EBCDIC
5092 case 0x2028:
5093 case 0x2029:
5094 #endif /* Not EBCDIC */
5095 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5096 break;
5097 }
5098 break;
5099
5100 case OP_NOT_HSPACE:
5101 switch(c)
5102 {
5103 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5104 default: break;
5105 }
5106 break;
5107
5108 case OP_HSPACE:
5109 switch(c)
5110 {
5111 HSPACE_CASES: break;
5112 default: RRETURN(MATCH_NOMATCH);
5113 }
5114 break;
5115
5116 case OP_NOT_VSPACE:
5117 switch(c)
5118 {
5119 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5120 default: break;
5121 }
5122 break;
5123
5124 case OP_VSPACE:
5125 switch(c)
5126 {
5127 VSPACE_CASES: break;
5128 default: RRETURN(MATCH_NOMATCH);
5129 }
5130 break;
5131
5132 case OP_NOT_DIGIT:
5133 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5134 RRETURN(MATCH_NOMATCH);
5135 break;
5136
5137 case OP_DIGIT:
5138 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5139 RRETURN(MATCH_NOMATCH);
5140 break;
5141
5142 case OP_NOT_WHITESPACE:
5143 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5144 RRETURN(MATCH_NOMATCH);
5145 break;
5146
5147 case OP_WHITESPACE:
5148 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5149 RRETURN(MATCH_NOMATCH);
5150 break;
5151
5152 case OP_NOT_WORDCHAR:
5153 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5154 RRETURN(MATCH_NOMATCH);
5155 break;
5156
5157 case OP_WORDCHAR:
5158 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5159 RRETURN(MATCH_NOMATCH);
5160 break;
5161
5162 default:
5163 RRETURN(PCRE_ERROR_INTERNAL);
5164 }
5165 }
5166 }
5167 else
5168 #endif
5169 /* Not UTF mode */
5170 {
5171 for (fi = min;; fi++)
5172 {
5173 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5175 if (fi >= max) RRETURN(MATCH_NOMATCH);
5176 if (eptr >= md->end_subject)
5177 {
5178 SCHECK_PARTIAL();
5179 RRETURN(MATCH_NOMATCH);
5180 }
5181 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5182 RRETURN(MATCH_NOMATCH);
5183 c = *eptr++;
5184 switch(ctype)
5185 {
5186 case OP_ANY: /* This is the non-NL case */
5187 if (md->partial != 0 && /* Take care with CRLF partial */
5188 eptr >= md->end_subject &&
5189 NLBLOCK->nltype == NLTYPE_FIXED &&
5190 NLBLOCK->nllen == 2 &&
5191 c == NLBLOCK->nl[0])
5192 {
5193 md->hitend = TRUE;
5194 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5195 }
5196 break;
5197
5198 case OP_ALLANY:
5199 case OP_ANYBYTE:
5200 break;
5201
5202 case OP_ANYNL:
5203 switch(c)
5204 {
5205 default: RRETURN(MATCH_NOMATCH);
5206 case CHAR_CR:
5207 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5208 break;
5209
5210 case CHAR_LF:
5211 break;
5212
5213 case CHAR_VT:
5214 case CHAR_FF:
5215 case CHAR_NEL:
5216 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5217 case 0x2028:
5218 case 0x2029:
5219 #endif
5220 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5221 break;
5222 }
5223 break;
5224
5225 case OP_NOT_HSPACE:
5226 switch(c)
5227 {
5228 default: break;
5229 HSPACE_BYTE_CASES:
5230 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5231 HSPACE_MULTIBYTE_CASES:
5232 #endif
5233 RRETURN(MATCH_NOMATCH);
5234 }
5235 break;
5236
5237 case OP_HSPACE:
5238 switch(c)
5239 {
5240 default: RRETURN(MATCH_NOMATCH);
5241 HSPACE_BYTE_CASES:
5242 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5243 HSPACE_MULTIBYTE_CASES:
5244 #endif
5245 break;
5246 }
5247 break;
5248
5249 case OP_NOT_VSPACE:
5250 switch(c)
5251 {
5252 default: break;
5253 VSPACE_BYTE_CASES:
5254 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5255 VSPACE_MULTIBYTE_CASES:
5256 #endif
5257 RRETURN(MATCH_NOMATCH);
5258 }
5259 break;
5260
5261 case OP_VSPACE:
5262 switch(c)
5263 {
5264 default: RRETURN(MATCH_NOMATCH);
5265 VSPACE_BYTE_CASES:
5266 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5267 VSPACE_MULTIBYTE_CASES:
5268 #endif
5269 break;
5270 }
5271 break;
5272
5273 case OP_NOT_DIGIT:
5274 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5275 break;
5276
5277 case OP_DIGIT:
5278 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5279 break;
5280
5281 case OP_NOT_WHITESPACE:
5282 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5283 break;
5284
5285 case OP_WHITESPACE:
5286 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5287 break;
5288
5289 case OP_NOT_WORDCHAR:
5290 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5291 break;
5292
5293 case OP_WORDCHAR:
5294 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5295 break;
5296
5297 default:
5298 RRETURN(PCRE_ERROR_INTERNAL);
5299 }
5300 }
5301 }
5302 /* Control never gets here */
5303 }
5304
5305 /* If maximizing, it is worth using inline code for speed, doing the type
5306 test once at the start (i.e. keep it out of the loop). Again, keep the
5307 UTF-8 and UCP stuff separate. */
5308
5309 else
5310 {
5311 pp = eptr; /* Remember where we started */
5312
5313 #ifdef SUPPORT_UCP
5314 if (prop_type >= 0)
5315 {
5316 switch(prop_type)
5317 {
5318 case PT_ANY:
5319 for (i = min; i < max; i++)
5320 {
5321 int len = 1;
5322 if (eptr >= md->end_subject)
5323 {
5324 SCHECK_PARTIAL();
5325 break;
5326 }
5327 GETCHARLENTEST(c, eptr, len);
5328 if (prop_fail_result) break;
5329 eptr+= len;
5330 }
5331 break;
5332
5333 case PT_LAMP:
5334 for (i = min; i < max; i++)
5335 {
5336 int chartype;
5337 int len = 1;
5338 if (eptr >= md->end_subject)
5339 {
5340 SCHECK_PARTIAL();
5341 break;
5342 }
5343 GETCHARLENTEST(c, eptr, len);
5344 chartype = UCD_CHARTYPE(c);
5345 if ((chartype == ucp_Lu ||
5346 chartype == ucp_Ll ||
5347 chartype == ucp_Lt) == prop_fail_result)
5348 break;
5349 eptr+= len;
5350 }
5351 break;
5352
5353 case PT_GC:
5354 for (i = min; i < max; i++)
5355 {
5356 int len = 1;
5357 if (eptr >= md->end_subject)
5358 {
5359 SCHECK_PARTIAL();
5360 break;
5361 }
5362 GETCHARLENTEST(c, eptr, len);
5363 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5364 eptr+= len;
5365 }
5366 break;
5367
5368 case PT_PC:
5369 for (i = min; i < max; i++)
5370 {
5371 int len = 1;
5372 if (eptr >= md->end_subject)
5373 {
5374 SCHECK_PARTIAL();
5375 break;
5376 }
5377 GETCHARLENTEST(c, eptr, len);
5378 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5379 eptr+= len;
5380 }
5381 break;
5382
5383 case PT_SC:
5384 for (i = min; i < max; i++)
5385 {
5386 int len = 1;
5387 if (eptr >= md->end_subject)
5388 {
5389 SCHECK_PARTIAL();
5390 break;
5391 }
5392 GETCHARLENTEST(c, eptr, len);
5393 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5394 eptr+= len;
5395 }
5396 break;
5397
5398 case PT_ALNUM:
5399 for (i = min; i < max; i++)
5400 {
5401 int category;
5402 int len = 1;
5403 if (eptr >= md->end_subject)
5404 {
5405 SCHECK_PARTIAL();
5406 break;
5407 }
5408 GETCHARLENTEST(c, eptr, len);
5409 category = UCD_CATEGORY(c);
5410 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5411 break;
5412 eptr+= len;
5413 }
5414 break;
5415
5416 case PT_SPACE: /* Perl space */
5417 for (i = min; i < max; i++)
5418 {
5419 int len = 1;
5420 if (eptr >= md->end_subject)
5421 {
5422 SCHECK_PARTIAL();
5423 break;
5424 }
5425 GETCHARLENTEST(c, eptr, len);
5426 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5427 c == CHAR_FF || c == CHAR_CR)
5428 == prop_fail_result)
5429 break;
5430 eptr+= len;
5431 }
5432 break;
5433
5434 case PT_PXSPACE: /* POSIX space */
5435 for (i = min; i < max; i++)
5436 {
5437 int len = 1;
5438 if (eptr >= md->end_subject)
5439 {
5440 SCHECK_PARTIAL();
5441 break;
5442 }
5443 GETCHARLENTEST(c, eptr, len);
5444 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5445 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5446 == prop_fail_result)
5447 break;
5448 eptr+= len;
5449 }
5450 break;
5451
5452 case PT_WORD:
5453 for (i = min; i < max; i++)
5454 {
5455 int category;
5456 int len = 1;
5457 if (eptr >= md->end_subject)
5458 {
5459 SCHECK_PARTIAL();
5460 break;
5461 }
5462 GETCHARLENTEST(c, eptr, len);
5463 category = UCD_CATEGORY(c);
5464 if ((category == ucp_L || category == ucp_N ||
5465 c == CHAR_UNDERSCORE) == prop_fail_result)
5466 break;
5467 eptr+= len;
5468 }
5469 break;
5470
5471 case PT_CLIST:
5472 for (i = min; i < max; i++)
5473 {
5474 const pcre_uint32 *cp;
5475 int len = 1;
5476 if (eptr >= md->end_subject)
5477 {
5478 SCHECK_PARTIAL();
5479 break;
5480 }
5481 GETCHARLENTEST(c, eptr, len);
5482 cp = PRIV(ucd_caseless_sets) + prop_value;
5483 for (;;)
5484 {
5485 if (c < *cp)
5486 { if (prop_fail_result) break; else goto GOT_MAX; }
5487 if (c == *cp++)
5488 { if (prop_fail_result) goto GOT_MAX; else break; }
5489 }
5490 eptr += len;
5491 }
5492 GOT_MAX:
5493 break;
5494
5495 default:
5496 RRETURN(PCRE_ERROR_INTERNAL);
5497 }
5498
5499 /* eptr is now past the end of the maximum run */
5500
5501 if (possessive) continue;
5502 for(;;)
5503 {
5504 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5505 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5506 if (eptr-- == pp) break; /* Stop if tried at original pos */
5507 if (utf) BACKCHAR(eptr);
5508 }
5509 }
5510
5511 /* Match extended Unicode sequences. We will get here only if the
5512 support is in the binary; otherwise a compile-time error occurs. */
5513
5514 else if (ctype == OP_EXTUNI)
5515 {
5516 for (i = min; i < max; i++)
5517 {
5518 if (eptr >= md->end_subject)
5519 {
5520 SCHECK_PARTIAL();
5521 break;
5522 }
5523 else
5524 {
5525 int lgb, rgb;
5526 GETCHARINCTEST(c, eptr);
5527 lgb = UCD_GRAPHBREAK(c);
5528 while (eptr < md->end_subject)
5529 {
5530 int len = 1;
5531 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5532 rgb = UCD_GRAPHBREAK(c);
5533 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5534 lgb = rgb;
5535 eptr += len;
5536 }
5537 }
5538 CHECK_PARTIAL();
5539 }
5540
5541 /* eptr is now past the end of the maximum run */
5542
5543 if (possessive) continue;
5544
5545 for(;;)
5546 {
5547 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5548 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5549 if (eptr-- == pp) break; /* Stop if tried at original pos */
5550 for (;;) /* Move back over one extended */
5551 {
5552 if (!utf) c = *eptr; else
5553 {
5554 BACKCHAR(eptr);
5555 GETCHAR(c, eptr);
5556 }
5557 if (UCD_CATEGORY(c) != ucp_M) break;
5558 eptr--;
5559 }
5560 }
5561 }
5562
5563 else
5564 #endif /* SUPPORT_UCP */
5565
5566 #ifdef SUPPORT_UTF
5567 if (utf)
5568 {
5569 switch(ctype)
5570 {
5571 case OP_ANY:
5572 if (max < INT_MAX)
5573 {
5574 for (i = min; i < max; i++)
5575 {
5576 if (eptr >= md->end_subject)
5577 {
5578 SCHECK_PARTIAL();
5579 break;
5580 }
5581 if (IS_NEWLINE(eptr)) break;
5582 if (md->partial != 0 && /* Take care with CRLF partial */
5583 eptr + 1 >= md->end_subject &&
5584 NLBLOCK->nltype == NLTYPE_FIXED &&
5585 NLBLOCK->nllen == 2 &&
5586 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5587 {
5588 md->hitend = TRUE;
5589 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5590 }
5591 eptr++;
5592 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5593 }
5594 }
5595
5596 /* Handle unlimited UTF-8 repeat */
5597
5598 else
5599 {
5600 for (i = min; i < max; i++)
5601 {
5602 if (eptr >= md->end_subject)
5603 {
5604 SCHECK_PARTIAL();
5605 break;
5606 }
5607 if (IS_NEWLINE(eptr)) break;
5608 if (md->partial != 0 && /* Take care with CRLF partial */
5609 eptr + 1 >= md->end_subject &&
5610 NLBLOCK->nltype == NLTYPE_FIXED &&
5611 NLBLOCK->nllen == 2 &&
5612 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5613 {
5614 md->hitend = TRUE;
5615 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5616 }
5617 eptr++;
5618 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5619 }
5620 }
5621 break;
5622
5623 case OP_ALLANY:
5624 if (max < INT_MAX)
5625 {
5626 for (i = min; i < max; i++)
5627 {
5628 if (eptr >= md->end_subject)
5629 {
5630 SCHECK_PARTIAL();
5631 break;
5632 }
5633 eptr++;
5634 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5635 }
5636 }
5637 else
5638 {
5639 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5640 SCHECK_PARTIAL();
5641 }
5642 break;
5643
5644 /* The byte case is the same as non-UTF8 */
5645
5646 case OP_ANYBYTE:
5647 c = max - min;
5648 if (c > (unsigned int)(md->end_subject - eptr))
5649 {
5650 eptr = md->end_subject;
5651 SCHECK_PARTIAL();
5652 }
5653 else eptr += c;
5654 break;
5655
5656 case OP_ANYNL:
5657 for (i = min; i < max; i++)
5658 {
5659 int len = 1;
5660 if (eptr >= md->end_subject)
5661 {
5662 SCHECK_PARTIAL();
5663 break;
5664 }
5665 GETCHARLEN(c, eptr, len);
5666 if (c == CHAR_CR)
5667 {
5668 if (++eptr >= md->end_subject) break;
5669 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5670 }
5671 else
5672 {
5673 if (c != CHAR_LF &&
5674 (md->bsr_anycrlf ||
5675 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5676 #ifndef EBCDIC
5677 && c != 0x2028 && c != 0x2029
5678 #endif /* Not EBCDIC */
5679 )))
5680 break;
5681 eptr += len;
5682 }
5683 }
5684 break;
5685
5686 case OP_NOT_HSPACE:
5687 case OP_HSPACE:
5688 for (i = min; i < max; i++)
5689 {
5690 BOOL gotspace;
5691 int len = 1;
5692 if (eptr >= md->end_subject)
5693 {
5694 SCHECK_PARTIAL();
5695 break;
5696 }
5697 GETCHARLEN(c, eptr, len);
5698 switch(c)
5699 {
5700 HSPACE_CASES: gotspace = TRUE; break;
5701 default: gotspace = FALSE; break;
5702 }
5703 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5704 eptr += len;
5705 }
5706 break;
5707
5708 case OP_NOT_VSPACE:
5709 case OP_VSPACE:
5710 for (i = min; i < max; i++)
5711 {
5712 BOOL gotspace;
5713 int len = 1;
5714 if (eptr >= md->end_subject)
5715 {
5716 SCHECK_PARTIAL();
5717 break;
5718 }
5719 GETCHARLEN(c, eptr, len);
5720 switch(c)
5721 {
5722 VSPACE_CASES: gotspace = TRUE; break;
5723 default: gotspace = FALSE; break;
5724 }
5725 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5726 eptr += len;
5727 }
5728 break;
5729
5730 case OP_NOT_DIGIT:
5731 for (i = min; i < max; i++)
5732 {
5733 int len = 1;
5734 if (eptr >= md->end_subject)
5735 {
5736 SCHECK_PARTIAL();
5737 break;
5738 }
5739 GETCHARLEN(c, eptr, len);
5740 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5741 eptr+= len;
5742 }
5743 break;
5744
5745 case OP_DIGIT:
5746 for (i = min; i < max; i++)
5747 {
5748 int len = 1;
5749 if (eptr >= md->end_subject)
5750 {
5751 SCHECK_PARTIAL();
5752 break;
5753 }
5754 GETCHARLEN(c, eptr, len);
5755 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5756 eptr+= len;
5757 }
5758 break;
5759
5760 case OP_NOT_WHITESPACE:
5761 for (i = min; i < max; i++)
5762 {
5763 int len = 1;
5764 if (eptr >= md->end_subject)
5765 {
5766 SCHECK_PARTIAL();
5767 break;
5768 }
5769 GETCHARLEN(c, eptr, len);
5770 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5771 eptr+= len;
5772 }
5773 break;
5774
5775 case OP_WHITESPACE:
5776 for (i = min; i < max; i++)
5777 {
5778 int len = 1;
5779 if (eptr >= md->end_subject)
5780 {
5781 SCHECK_PARTIAL();
5782 break;
5783 }
5784 GETCHARLEN(c, eptr, len);
5785 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5786 eptr+= len;
5787 }
5788 break;
5789
5790 case OP_NOT_WORDCHAR:
5791 for (i = min; i < max; i++)
5792 {
5793 int len = 1;
5794 if (eptr >= md->end_subject)
5795 {
5796 SCHECK_PARTIAL();
5797 break;
5798 }
5799 GETCHARLEN(c, eptr, len);
5800 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5801 eptr+= len;
5802 }
5803 break;
5804
5805 case OP_WORDCHAR:
5806 for (i = min; i < max; i++)
5807 {
5808 int len = 1;
5809 if (eptr >= md->end_subject)
5810 {
5811 SCHECK_PARTIAL();
5812 break;
5813 }
5814 GETCHARLEN(c, eptr, len);
5815 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5816 eptr+= len;
5817 }
5818 break;
5819
5820 default:
5821 RRETURN(PCRE_ERROR_INTERNAL);
5822 }
5823
5824 /* eptr is now past the end of the maximum run. If possessive, we are
5825 done (no backing up). Otherwise, match at this position; anything other
5826 than no match is immediately returned. For nomatch, back up one
5827 character, unless we are matching \R and the last thing matched was
5828 \r\n, in which case, back up two bytes. */
5829
5830 if (possessive) continue;
5831 for(;;)
5832 {
5833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5835 if (eptr-- == pp) break; /* Stop if tried at original pos */
5836 BACKCHAR(eptr);
5837 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5838 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5839 }
5840 }
5841 else
5842 #endif /* SUPPORT_UTF */
5843 /* Not UTF mode */
5844 {
5845 switch(ctype)
5846 {
5847 case OP_ANY:
5848 for (i = min; i < max; i++)
5849 {
5850 if (eptr >= md->end_subject)
5851 {
5852 SCHECK_PARTIAL();
5853 break;
5854 }
5855 if (IS_NEWLINE(eptr)) break;
5856 if (md->partial != 0 && /* Take care with CRLF partial */
5857 eptr + 1 >= md->end_subject &&
5858 NLBLOCK->nltype == NLTYPE_FIXED &&
5859 NLBLOCK->nllen == 2 &&
5860 *eptr == NLBLOCK->nl[0])
5861 {
5862 md->hitend = TRUE;
5863 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5864 }
5865 eptr++;
5866 }
5867 break;
5868
5869 case OP_ALLANY:
5870 case OP_ANYBYTE:
5871 c = max - min;
5872 if (c > (unsigned int)(md->end_subject - eptr))
5873 {
5874 eptr = md->end_subject;
5875 SCHECK_PARTIAL();
5876 }
5877 else eptr += c;
5878 break;
5879
5880 case OP_ANYNL:
5881 for (i = min; i < max; i++)
5882 {
5883 if (eptr >= md->end_subject)
5884 {
5885 SCHECK_PARTIAL();
5886 break;
5887 }
5888 c = *eptr;
5889 if (c == CHAR_CR)
5890 {
5891 if (++eptr >= md->end_subject) break;
5892 if (*eptr == CHAR_LF) eptr++;
5893 }
5894 else
5895 {
5896 if (c != CHAR_LF && (md->bsr_anycrlf ||
5897 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5898 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5899 && c != 0x2028 && c != 0x2029
5900 #endif
5901 ))) break;
5902 eptr++;
5903 }
5904 }
5905 break;
5906
5907 case OP_NOT_HSPACE:
5908 for (i = min; i < max; i++)
5909 {
5910 if (eptr >= md->end_subject)
5911 {
5912 SCHECK_PARTIAL();
5913 break;
5914 }
5915 switch(*eptr)
5916 {
5917 default: eptr++; break;
5918 HSPACE_BYTE_CASES:
5919 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5920 HSPACE_MULTIBYTE_CASES:
5921 #endif
5922 goto ENDLOOP00;
5923 }
5924 }
5925 ENDLOOP00:
5926 break;
5927
5928 case OP_HSPACE:
5929 for (i = min; i < max; i++)
5930 {
5931 if (eptr >= md->end_subject)
5932 {
5933 SCHECK_PARTIAL();
5934 break;
5935 }
5936 switch(*eptr)
5937 {
5938 default: goto ENDLOOP01;
5939 HSPACE_BYTE_CASES:
5940 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5941 HSPACE_MULTIBYTE_CASES:
5942 #endif
5943 eptr++; break;
5944 }
5945 }
5946 ENDLOOP01:
5947 break;
5948
5949 case OP_NOT_VSPACE:
5950 for (i = min; i < max; i++)
5951 {
5952 if (eptr >= md->end_subject)
5953 {
5954 SCHECK_PARTIAL();
5955 break;
5956 }
5957 switch(*eptr)
5958 {
5959 default: eptr++; break;
5960 VSPACE_BYTE_CASES:
5961 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5962 VSPACE_MULTIBYTE_CASES:
5963 #endif
5964 goto ENDLOOP02;
5965 }
5966 }
5967 ENDLOOP02:
5968 break;
5969
5970 case OP_VSPACE:
5971 for (i = min; i < max; i++)
5972 {
5973 if (eptr >= md->end_subject)
5974 {
5975 SCHECK_PARTIAL();
5976 break;
5977 }
5978 switch(*eptr)
5979 {
5980 default: goto ENDLOOP03;
5981 VSPACE_BYTE_CASES:
5982 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5983 VSPACE_MULTIBYTE_CASES:
5984 #endif
5985 eptr++; break;
5986 }
5987 }
5988 ENDLOOP03:
5989 break;
5990
5991 case OP_NOT_DIGIT:
5992 for (i = min; i < max; i++)
5993 {
5994 if (eptr >= md->end_subject)
5995 {
5996 SCHECK_PARTIAL();
5997 break;
5998 }
5999 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6000 eptr++;
6001 }
6002 break;
6003
6004 case OP_DIGIT:
6005 for (i = min; i < max; i++)
6006 {
6007 if (eptr >= md->end_subject)
6008 {
6009 SCHECK_PARTIAL();
6010 break;
6011 }
6012 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6013 eptr++;
6014 }
6015 break;
6016
6017 case OP_NOT_WHITESPACE:
6018 for (i = min; i < max; i++)
6019 {
6020 if (eptr >= md->end_subject)
6021 {
6022 SCHECK_PARTIAL();
6023 break;
6024 }
6025 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6026 eptr++;
6027 }
6028 break;
6029
6030 case OP_WHITESPACE:
6031 for (i = min; i < max; i++)
6032 {
6033 if (eptr >= md->end_subject)
6034 {
6035 SCHECK_PARTIAL();
6036 break;
6037 }
6038 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6039 eptr++;
6040 }
6041 break;
6042
6043 case OP_NOT_WORDCHAR:
6044 for (i = min; i < max; i++)
6045 {
6046 if (eptr >= md->end_subject)
6047 {
6048 SCHECK_PARTIAL();
6049 break;
6050 }
6051 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6052 eptr++;
6053 }
6054 break;
6055
6056 case OP_WORDCHAR:
6057 for (i = min; i < max; i++)
6058 {
6059 if (eptr >= md->end_subject)
6060 {
6061 SCHECK_PARTIAL();
6062 break;
6063 }
6064 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6065 eptr++;
6066 }
6067 break;
6068
6069 default:
6070 RRETURN(PCRE_ERROR_INTERNAL);
6071 }
6072
6073 /* eptr is now past the end of the maximum run. If possessive, we are
6074 done (no backing up). Otherwise, match at this position; anything other
6075 than no match is immediately returned. For nomatch, back up one
6076 character (byte), unless we are matching \R and the last thing matched
6077 was \r\n, in which case, back up two bytes. */
6078
6079 if (possessive) continue;
6080 while (eptr >= pp)
6081 {
6082 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6083 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6084 eptr--;
6085 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6086 eptr[-1] == CHAR_CR) eptr--;
6087 }
6088 }
6089
6090 /* Get here if we can't make it match with any permitted repetitions */
6091
6092 RRETURN(MATCH_NOMATCH);
6093 }
6094 /* Control never gets here */
6095
6096 /* There's been some horrible disaster. Arrival here can only mean there is
6097 something seriously wrong in the code above or the OP_xxx definitions. */
6098
6099 default:
6100 DPRINTF(("Unknown opcode %d\n", *ecode));
6101 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6102 }
6103
6104 /* Do not stick any code in here without much thought; it is assumed
6105 that "continue" in the code above comes out to here to repeat the main
6106 loop. */
6107
6108 } /* End of main loop */
6109 /* Control never reaches here */
6110
6111
6112 /* When compiling to use the heap rather than the stack for recursive calls to
6113 match(), the RRETURN() macro jumps here. The number that is saved in
6114 frame->Xwhere indicates which label we actually want to return to. */
6115
6116 #ifdef NO_RECURSE
6117 #define LBL(val) case val: goto L_RM##val;
6118 HEAP_RETURN:
6119 switch (frame->Xwhere)
6120 {
6121 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6122 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6123 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6124 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6125 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6126 LBL(65) LBL(66)
6127 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6128 LBL(21)
6129 #endif
6130 #ifdef SUPPORT_UTF
6131 LBL(16) LBL(18) LBL(20)
6132 LBL(22) LBL(23) LBL(28) LBL(30)
6133 LBL(32) LBL(34) LBL(42) LBL(46)
6134 #ifdef SUPPORT_UCP
6135 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6136 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6137 #endif /* SUPPORT_UCP */
6138 #endif /* SUPPORT_UTF */
6139 default:
6140 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6141 return PCRE_ERROR_INTERNAL;
6142 }
6143 #undef LBL
6144 #endif /* NO_RECURSE */
6145 }
6146
6147
6148 /***************************************************************************
6149 ****************************************************************************
6150 RECURSION IN THE match() FUNCTION
6151
6152 Undefine all the macros that were defined above to handle this. */
6153
/* Clean-up of macro names used by match(). Per the header comment above, the
names in the conditional section were #defined earlier in this file for the
NO_RECURSE build; they are removed here.
NOTE(review): the first group (eptr, ecode, ..., eptrb) matches the argument
names seen in the RMATCH() calls inside match() - the blank-line grouping
presumably mirrors the order of the original #defines; confirm against them. */
6154 #ifdef NO_RECURSE
6155 #undef eptr
6156 #undef ecode
6157 #undef mstart
6158 #undef offset_top
6159 #undef eptrb
6160 #undef flags
6161
6162 #undef callpat
6163 #undef charptr
6164 #undef data
6165 #undef next
6166 #undef pp
6167 #undef prev
6168 #undef saved_eptr
6169
6170 #undef new_recursive
6171
6172 #undef cur_is_word
6173 #undef condition
6174 #undef prev_is_word
6175
6176 #undef ctype
6177 #undef length
6178 #undef max
6179 #undef min
6180 #undef number
6181 #undef offset
6182 #undef op
6183 #undef save_capture_last
6184 #undef save_offset1
6185 #undef save_offset2
6186 #undef save_offset3
6187 #undef stacksave
6188
6189 #undef newptrb
6190
6191 #endif /* NO_RECURSE */
6192
6193 /* These two are defined as macros in both cases (whether or not NO_RECURSE is defined), so they are undefined outside the conditional section above */
6194
6195 #undef fc
6196 #undef fi
6197
6198 /***************************************************************************
6199 ***************************************************************************/
6200
6201
6202 #ifdef NO_RECURSE
6203 /*************************************************
6204 * Release allocated heap frames *
6205 *************************************************/
6206
6207 /* This function releases all the allocated frames. The base frame is on the
6208 machine stack, and so must not be freed.
6209
6210 Argument: the address of the base frame
6211 Returns: nothing
6212 */
6213
6214 static void
6215 release_match_heapframes (heapframe *frame_base)
6216 {
6217 heapframe *nextframe = frame_base->Xnextframe;
6218 while (nextframe != NULL)
6219 {
6220 heapframe *oldframe = nextframe;
6221 nextframe = nextframe->Xnextframe;
6222 (PUBL(stack_free))(oldframe);
6223 }
6224 }
6225 #endif
6226
6227
6228 /*************************************************
6229 * Execute a Regular Expression *
6230 *************************************************/
6231
6232 /* This function applies a compiled re to a subject string and picks out
6233 portions of the string if it matches. Two elements in the vector are set for
6234 each substring: the offsets to the start and end of the substring.
6235
6236 Arguments:
6237 argument_re points to the compiled expression
6238 extra_data points to extra data or is NULL
6239 subject points to the subject string
6240 length length of subject string (may contain binary zeros)
6241 start_offset where to start in the subject string
6242 options option bits
6243 offsets points to a vector of ints to be filled in with offsets
6244 offsetcount the number of elements in the vector
6245
6246 Returns: > 0 => success; value is the number of elements filled in
6247 = 0 => success, but offsets is not big enough
6248 -1 => failed to match
6249 < -1 => some kind of unexpected problem
6250 */
6251
6252 #if defined COMPILE_PCRE8
6253 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6254 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6255 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6256 int offsetcount)
6257 #elif defined COMPILE_PCRE16
6258 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6259 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6260 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6261 int offsetcount)
6262 #elif defined COMPILE_PCRE32
6263 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6264 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6265 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6266 int offsetcount)
6267 #endif
6268 {
6269 int rc, ocount, arg_offset_max;
6270 int newline;
6271 BOOL using_temporary_offsets = FALSE;
6272 BOOL anchored;
6273 BOOL startline;
6274 BOOL firstline;
6275 BOOL utf;
6276 BOOL has_first_char = FALSE;
6277 BOOL has_req_char = FALSE;
6278 pcre_uchar first_char = 0;
6279 pcre_uchar first_char2 = 0;
6280 pcre_uchar req_char = 0;
6281 pcre_uchar req_char2 = 0;
6282 match_data match_block;
6283 match_data *md = &match_block;
6284 const pcre_uint8 *tables;
6285 const pcre_uint8 *start_bits = NULL;
6286 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6287 PCRE_PUCHAR end_subject;
6288 PCRE_PUCHAR start_partial = NULL;
6289 PCRE_PUCHAR match_partial;
6290 PCRE_PUCHAR req_char_ptr = start_match - 1;
6291
6292 const pcre_study_data *study;
6293 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6294
6295 #ifdef NO_RECURSE
6296 heapframe frame_zero;
6297 frame_zero.Xprevframe = NULL; /* Marks the top level */
6298 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6299 md->match_frames_base = &frame_zero;
6300 #endif
6301
6302 /* Check for the special magic call that measures the size of the stack used
6303 per recursive call of match(). Without the funny casting for sizeof, a Windows
6304 compiler gave this error: "unary minus operator applied to unsigned type,
6305 result still unsigned". Hopefully the cast fixes that. */
6306
6307 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6308 start_offset == -999)
6309 #ifdef NO_RECURSE
6310 return -((int)sizeof(heapframe));
6311 #else
6312 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6313 #endif
6314
6315 /* Plausibility checks */
6316
6317 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6318 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6319 return PCRE_ERROR_NULL;
6320 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6321 if (length < 0) return PCRE_ERROR_BADLENGTH;
6322 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6323
6324 /* Check that the first field in the block is the magic number. If it is not,
6325 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6326 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6327 means that the pattern is likely compiled with different endianness. */
6328
6329 if (re->magic_number != MAGIC_NUMBER)
6330 return re->magic_number == REVERSED_MAGIC_NUMBER?
6331 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6332 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6333
6334 /* These two settings are used in the code for checking a UTF-8 string that
6335 follows immediately afterwards. Other values in the md block are used only
6336 during "normal" pcre_exec() processing, not when the JIT support is in use,
6337 so they are set up later. */
6338
6339 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6340 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6341 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6342 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6343
6344 /* Check a UTF-8 string if required. Pass back the character offset and error
6345 code for an invalid string if a results vector is available. */
6346
6347 #ifdef SUPPORT_UTF
6348 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6349 {
6350 int erroroffset;
6351 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6352 if (errorcode != 0)
6353 {
6354 if (offsetcount >= 2)
6355 {
6356 offsets[0] = erroroffset;
6357 offsets[1] = errorcode;
6358 }
6359 #if defined COMPILE_PCRE8
6360 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6361 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6362 #elif defined COMPILE_PCRE16
6363 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6364 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6365 #elif defined COMPILE_PCRE32
6366 return PCRE_ERROR_BADUTF32;
6367 #endif
6368 }
6369 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6370 /* Check that a start_offset points to the start of a UTF character. */
6371 if (start_offset > 0 && start_offset < length &&
6372 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6373 return PCRE_ERROR_BADUTF8_OFFSET;
6374 #endif
6375 }
6376 #endif
6377
6378 /* If the pattern was successfully studied with JIT support, run the JIT
6379 executable instead of the rest of this function. Most options must be set at
6380 compile time for the JIT code to be usable. Fallback to the normal code path if
6381 an unsupported flag is set. */
6382
6383 #ifdef SUPPORT_JIT
6384 if (extra_data != NULL
6385 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6386 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6387 && extra_data->executable_jit != NULL
6388 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6389 {
6390 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6391 start_offset, options, offsets, offsetcount);
6392
6393 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial
6394 matching mode is not compiled. In this case we simply fall back to the interpreter. */
6395
6396 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6397 }
6398 #endif
6399
6400 /* Carry on with non-JIT matching. This information is for finding all the
6401 numbers associated with a given name, for condition testing. */
6402
6403 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6404 md->name_count = re->name_count;
6405 md->name_entry_size = re->name_entry_size;
6406
6407 /* Fish out the optional data from the extra_data structure, first setting
6408 the default values. */
6409
6410 study = NULL;
6411 md->match_limit = MATCH_LIMIT;
6412 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6413 md->callout_data = NULL;
6414
6415 /* The table pointer is always in native byte order. */
6416
6417 tables = re->tables;
6418
6419 if (extra_data != NULL)
6420 {
6421 register unsigned int flags = extra_data->flags;
6422 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6423 study = (const pcre_study_data *)extra_data->study_data;
6424 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6425 md->match_limit = extra_data->match_limit;
6426 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6427 md->match_limit_recursion = extra_data->match_limit_recursion;
6428 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6429 md->callout_data = extra_data->callout_data;
6430 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6431 }
6432
6433 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6434 is a feature that makes it possible to save compiled regex and re-use them
6435 in other programs later. */
6436
6437 if (tables == NULL) tables = PRIV(default_tables);
6438
6439 /* Set up other data */
6440
6441 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6442 startline = (re->flags & PCRE_STARTLINE) != 0;
6443 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6444
6445 /* The code starts after the real_pcre block and the capture name table. */
6446
6447 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6448 re->name_count * re->name_entry_size;
6449
6450 md->start_subject = (PCRE_PUCHAR)subject;
6451 md->start_offset = start_offset;
6452 md->end_subject = md->start_subject + length;
6453 end_subject = md->end_subject;
6454
6455 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6456 md->use_ucp = (re->options & PCRE_UCP) != 0;
6457 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6458 md->ignore_skip_arg = FALSE;
6459
6460 /* Some options are unpacked into BOOL variables in the hope that testing
6461 them will be faster than individual option bits. */
6462
6463 md->notbol = (options & PCRE_NOTBOL) != 0;
6464 md->noteol = (options & PCRE_NOTEOL) != 0;
6465 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6466 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6467
6468 md->hitend = FALSE;
6469 md->mark = md->nomatch_mark = NULL; /* In case never set */
6470
6471 md->recursive = NULL; /* No recursion at top level */
6472 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6473
6474 md->lcc = tables + lcc_offset;
6475 md->fcc = tables + fcc_offset;
6476 md->ctypes = tables + ctypes_offset;
6477
6478 /* Handle different \R options. */
6479
6480 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6481 {
6482 case 0:
6483 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6484 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6485 else
6486 #ifdef BSR_ANYCRLF
6487 md->bsr_anycrlf = TRUE;
6488 #else
6489 md->bsr_anycrlf = FALSE;
6490 #endif
6491 break;
6492
6493 case PCRE_BSR_ANYCRLF:
6494 md->bsr_anycrlf = TRUE;
6495 break;
6496
6497 case PCRE_BSR_UNICODE:
6498 md->bsr_anycrlf = FALSE;
6499 break;
6500
6501 default: return PCRE_ERROR_BADNEWLINE;
6502 }
6503
6504 /* Handle different types of newline. The three bits give eight cases. If
6505 nothing is set at run time, whatever was used at compile time applies. */
6506
6507 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6508 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6509 {
6510 case 0: newline = NEWLINE; break; /* Compile-time default */
6511 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6512 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6513 case PCRE_NEWLINE_CR+
6514 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6515 case PCRE_NEWLINE_ANY: newline = -1; break;
6516 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6517 default: return PCRE_ERROR_BADNEWLINE;
6518 }
6519
6520 if (newline == -2)
6521 {
6522 md->nltype = NLTYPE_ANYCRLF;
6523 }
6524 else if (newline < 0)
6525 {
6526 md->nltype = NLTYPE_ANY;
6527 }
6528 else
6529 {
6530 md->nltype = NLTYPE_FIXED;
6531 if (newline > 255)
6532 {
6533 md->nllen = 2;
6534 md->nl[0] = (newline >> 8) & 255;
6535 md->nl[1] = newline & 255;
6536 }
6537 else
6538 {
6539 md->nllen = 1;
6540 md->nl[0] = newline;
6541 }
6542 }
6543
6544 /* Partial matching was originally supported only for a restricted set of
6545 regexes; from release 8.00 there are no restrictions, but the bits are still
6546 defined (though never set). So there's no harm in leaving this code. */
6547
6548 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6549 return PCRE_ERROR_BADPARTIAL;
6550
6551 /* If the expression has got more back references than the offsets supplied can
6552 hold, we get a temporary chunk of working store to use during the matching.
6553 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6554 of 3. */
6555
6556 ocount = offsetcount - (offsetcount % 3);
6557 arg_offset_max = (2*ocount)/3;
6558
6559 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6560 {
6561 ocount = re->top_backref * 3 + 3;
6562 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6563 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6564 using_temporary_offsets = TRUE;
6565 DPRINTF(("Got memory to hold back references\n"));
6566 }
6567 else md->offset_vector = offsets;
6568 md->offset_end = ocount;
6569 md->offset_max = (2*ocount)/3;
6570 md->capture_last = 0;
6571
6572 /* Reset the working variable associated with each extraction. These should
6573 never be used unless previously set, but they get saved and restored, and so we
6574 initialize them to avoid reading uninitialized locations. Also, unset the
6575 offsets for the matched string. This is really just for tidiness with callouts,
6576 in case they inspect these fields. */
6577
6578 if (md->offset_vector != NULL)
6579 {
6580 register int *iptr = md->offset_vector + ocount;
6581 register int *iend = iptr - re->top_bracket;
6582 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6583 while (--iptr >= iend) *iptr = -1;
6584 md->offset_vector[0] = md->offset_vector[1] = -1;
6585 }
6586
6587 /* Set up the first character to match, if available. The first_char value is
6588 never set for an anchored regular expression, but the anchoring may be forced
6589 at run time, so we have to test for anchoring. The first char may be unset for
6590 an unanchored pattern, of course. If there's no first char and the pattern was
6591 studied, there may be a bitmap of possible first characters. */
6592
6593 if (!anchored)
6594 {
6595 if ((re->flags & PCRE_FIRSTSET) != 0)
6596 {
6597 has_first_char = TRUE;
6598 first_char = first_char2 = (pcre_uchar)(re->first_char);
6599 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6600 {
6601 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6602 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6603 if (utf && first_char > 127)
6604 first_char2 = UCD_OTHERCASE(first_char);
6605 #endif
6606 }
6607 }
6608 else
6609 if (!startline && study != NULL &&
6610 (study->flags & PCRE_STUDY_MAPPED) != 0)
6611 start_bits = study->start_bits;
6612 }
6613
6614 /* For anchored or unanchored matches, there may be a "last known required
6615 character" set. */
6616
6617 if ((re->flags & PCRE_REQCHSET) != 0)
6618 {
6619 has_req_char = TRUE;
6620 req_char = req_char2 = (pcre_uchar)(re->req_char);
6621 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6622 {
6623 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6624 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6625 if (utf && req_char > 127)
6626 req_char2 = UCD_OTHERCASE(req_char);
6627 #endif
6628 }
6629 }
6630
6631
6632 /* ==========================================================================*/
6633
6634 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6635 the loop runs just once. */
6636
6637 for(;;)
6638 {
6639 PCRE_PUCHAR save_end_subject = end_subject;
6640 PCRE_PUCHAR new_start_match;
6641
6642 /* If firstline is TRUE, the start of the match is constrained to the first
6643 line of a multiline string. That is, the match must be before or at the first
6644 newline. Implement this by temporarily adjusting end_subject so that we stop
6645 scanning at a newline. If the match fails at the newline, later code breaks
6646 this loop. */
6647
6648 if (firstline)
6649 {
6650 PCRE_PUCHAR t = start_match;
6651 #ifdef SUPPORT_UTF
6652 if (utf)
6653 {
6654 while (t < md->end_subject && !IS_NEWLINE(t))
6655 {
6656 t++;
6657 ACROSSCHAR(t < end_subject, *t, t++);
6658 }
6659 }
6660 else
6661 #endif
6662 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6663 end_subject = t;
6664 }
6665
6666 /* There are some optimizations that avoid running the match if a known
6667 starting point is not found, or if a known later character is not present.
6668 However, there is an option that disables these, for testing and for ensuring
6669 that all callouts do actually occur. The option can be set in the regex by
6670 (*NO_START_OPT) or passed in match-time options. */
6671
6672 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6673 {
6674 /* Advance to a unique first char if there is one. */
6675
6676 if (has_first_char)
6677 {
6678 pcre_uchar smc;
6679
6680 if (first_char != first_char2)
6681 while (start_match < end_subject &&
6682 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6683 start_match++;
6684 else
6685 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6686 start_match++;
6687 }
6688
6689 /* Or to just after a linebreak for a multiline match */
6690
6691 else if (startline)
6692 {
6693 if (start_match > md->start_subject + start_offset)
6694 {
6695 #ifdef SUPPORT_UTF
6696 if (utf)
6697 {
6698 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6699 {
6700 start_match++;
6701 ACROSSCHAR(start_match < end_subject, *start_match,
6702 start_match++);
6703 }
6704 }
6705 else
6706 #endif
6707 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6708 start_match++;
6709
6710 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6711 and we are now at a LF, advance the match position by one more character.
6712 */
6713
6714 if (start_match[-1] == CHAR_CR &&
6715 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6716 start_match < end_subject &&
6717 RAWUCHARTEST(start_match) == CHAR_NL)
6718 start_match++;
6719 }
6720 }
6721
6722 /* Or to a non-unique first byte after study */
6723
6724 else if (start_bits != NULL)
6725 {
6726 while (start_match < end_subject)
6727 {
6728 register pcre_uint32 c = RAWUCHARTEST(start_match);
6729 #ifndef COMPILE_PCRE8
6730 if (c > 255) c = 255;
6731 #endif
6732 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6733 {
6734 start_match++;
6735 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6736 /* In non 8-bit mode, the iteration will stop for
6737 characters > 255 at the beginning or not stop at all. */
6738 if (utf)
6739 ACROSSCHAR(start_match < end_subject, *start_match,
6740 start_match++);
6741 #endif
6742 }
6743 else break;
6744 }
6745 }
6746 } /* Starting optimizations */
6747
6748 /* Restore fudged end_subject */
6749
6750 end_subject = save_end_subject;
6751
6752 /* The following two optimizations are disabled for partial matching or if
6753 disabling is explicitly requested. */
6754
6755 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6756 {
6757 /* If the pattern was studied, a minimum subject length may be set. This is
6758 a lower bound; no actual string of that length may actually match the
6759 pattern. Although the value is, strictly, in characters, we treat it as
6760 bytes to avoid spending too much time in this optimization. */
6761