/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1273 - (show annotations)
Fri Mar 8 10:25:02 2013 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 216181 byte(s)
Fix *SKIP infinite loop.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51 /* NOTE(review): the three names above are defined before pcre_internal.h is included, presumably because macros in that header expand them - confirm there. */
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols: match() below uses min and max as local variable names (or as frame-member macros when NO_RECURSE is defined). */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* Lower 16 bits of md->capture_last: the last captured substring number */
70 #define OVFLMASK 0xffff0000 /* Upper 16 bits of md->capture_last: reserved for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The one bit (lowest of OVFLMASK) that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1 /* Matched */
84 #define MATCH_NOMATCH 0 /* Failed to match */
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range (MATCH_BACKTRACK_MIN to MATCH_BACKTRACK_MAX). */
94 #define MATCH_COMMIT (-996) /* Passed back by OP_COMMIT */
95 #define MATCH_PRUNE (-995) /* Passed back by OP_PRUNE and OP_PRUNE_ARG */
96 #define MATCH_SKIP (-994) /* Passed back by OP_SKIP (or a MARK that matched a SKIP name); md->start_match_ptr holds the subject position */
97 #define MATCH_SKIP_ARG (-993) /* Passed back by OP_SKIP_ARG; md->start_match_ptr is overloaded to point to the skip name */
98 #define MATCH_THEN (-992) /* Passed back by OP_THEN and OP_THEN_ARG; md->start_match_ptr is overloaded to point to the opcode */
99 #define MATCH_BACKTRACK_MAX MATCH_THEN /* Highest value in the range above */
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lowest value in the range above */
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. It sizes the stacksave[] array in match() and in the heapframe structure. */
105
106 #define REC_STACK_SAVE_MAX 30
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity. The two tables are read in parallel, giving the six pairs (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1). NOTE(review): presumably the greedy/minimizing forms of *, + and ? in that order - the indexing code is not in this part of the file, so confirm there. */
109
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if requested. A value is written as itself only when it is no greater than 0xff and isprint() accepts it; anything else is written in \x{..} form. The 0xff bound matters because passing isprint() a value that is not representable as unsigned char is undefined behaviour.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject; the count is then cut down to what remains before md->end_subject
125 md pointer to matching data block; md->utf is always read, so this must be valid even when is_subject is FALSE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf; /* Not named again below; presumably read by RAWUCHARINCTEST in some builds - confirm in pcre_internal.h */
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if ((c = RAWUCHARINCTEST(p)) <= 0xff && isprint((int)c)) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Compare the text of a previously captured substring against the subject at eptr. Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector; md->offset_vector[offset] is added to md->start_subject to find the start of the reference
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes; it is used as a count of pcre_uchar units); negative means the reference is unset
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched (how far eptr advanced)
160 -1 no match
161 -2 partial match; given whenever the end of the subject is reached with part of the reference still to be compared
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case) /* Neither identical nor d plus its other-case offset */
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;) /* NOTE(review): this scan assumes the set is in ascending order and ends with a value that no character exceeds - confirm against the table */
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. Both characters are mapped through md->lcc before being compared. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start); /* Distance moved along the subject */
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
306 /* Numbers for RMATCH calls, passed as its "rw" argument. In the heap version of RMATCH the number is stored in frame->Xwhere and also forms the name of the resume label (L_RMn). When this list is changed, the code at HEAP_RETURN
307 below must be updated in sync. */
308
309 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
310 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
311 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
312 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
313 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
314 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
315 RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
316
317 /* These versions of the macros use the stack, as normal. There are debugging
318 versions and production versions. Note that the "rw" argument of RMATCH isn't
319 actually used in this definition. RMATCH calls match() with the caller's mstart and rdepth+1 and leaves the result in rrc; RRETURN is an ordinary return. */
320
321 #ifndef NO_RECURSE
322 #define REGISTER register
323
324 #ifdef PCRE_DEBUG
325 #define RMATCH(ra,rb,rc,rd,re,rw) \
326 { \
327 printf("match() called in line %d\n", __LINE__); \
328 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
329 printf("to line %d\n", __LINE__); \
330 }
331 #define RRETURN(ra) \
332 { \
333 printf("match() returned %d from line %d\n", ra, __LINE__); \
334 return ra; \
335 }
336 #else
337 #define RMATCH(ra,rb,rc,rd,re,rw) \
338 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
339 #define RRETURN(ra) return ra
340 #endif
341
342 #else
343
344
345 /* These versions of the macros manage a private stack on the heap. Note that
346 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347 argument of match(), which never changes. RMATCH reuses frame->Xnextframe when one is already chained, and otherwise gets a new frame from PUBL(stack_malloc), doing RRETURN(PCRE_ERROR_NOMEMORY) if that fails. It records rw in the current frame's Xwhere, fills in the new frame's arguments, and jumps to HEAP_RECURSE; the label L_<rw> marks where execution resumes. RRETURN steps back to Xprevframe and jumps to HEAP_RETURN with the value in rrc, or does a real return when there is no previous frame. It does not free the frame it leaves. */
348
349 #define REGISTER
350
351 #define RMATCH(ra,rb,rc,rd,re,rw)\
352 {\
353 heapframe *newframe = frame->Xnextframe;\
354 if (newframe == NULL)\
355 {\
356 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
357 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
358 newframe->Xnextframe = NULL;\
359 frame->Xnextframe = newframe;\
360 }\
361 frame->Xwhere = rw;\
362 newframe->Xeptr = ra;\
363 newframe->Xecode = rb;\
364 newframe->Xmstart = mstart;\
365 newframe->Xoffset_top = rc;\
366 newframe->Xeptrb = re;\
367 newframe->Xrdepth = frame->Xrdepth + 1;\
368 newframe->Xprevframe = frame;\
369 frame = newframe;\
370 DPRINTF(("restarting from line %d\n", __LINE__));\
371 goto HEAP_RECURSE;\
372 L_##rw:\
373 DPRINTF(("jumped back to line %d\n", __LINE__));\
374 }
375
376 #define RRETURN(ra)\
377 {\
378 heapframe *oldframe = frame;\
379 frame = oldframe->Xprevframe;\
380 if (frame != NULL)\
381 {\
382 rrc = ra;\
383 goto HEAP_RETURN;\
384 }\
385 return ra;\
386 }
387
388
389 /* Structure for remembering the local variables in a private frame. match() reaches each X-prefixed member through a macro of the same name without the X (for example, eptr is frame->Xeptr), and the heap versions of RMATCH and RRETURN follow the Xnextframe and Xprevframe links. */
390
391 typedef struct heapframe {
392 struct heapframe *Xprevframe; /* Frame to go back to; RRETURN does a real return when this is NULL */
393 struct heapframe *Xnextframe; /* Following frame, if one has been allocated; RMATCH reuses it */
394
395 /* Function arguments that may change */
396
397 PCRE_PUCHAR Xeptr;
398 const pcre_uchar *Xecode;
399 PCRE_PUCHAR Xmstart;
400 int Xoffset_top;
401 eptrblock *Xeptrb;
402 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's value plus one */
403
404 /* Function local variables (the ones that have to be preserved over calls to RMATCH) */
405
406 PCRE_PUCHAR Xcallpat;
407 #ifdef SUPPORT_UTF
408 PCRE_PUCHAR Xcharptr;
409 #endif
410 PCRE_PUCHAR Xdata;
411 PCRE_PUCHAR Xnext;
412 PCRE_PUCHAR Xpp;
413 PCRE_PUCHAR Xprev;
414 PCRE_PUCHAR Xsaved_eptr;
415
416 recursion_info Xnew_recursive;
417
418 BOOL Xcur_is_word;
419 BOOL Xcondition;
420 BOOL Xprev_is_word;
421
422 #ifdef SUPPORT_UCP
423 int Xprop_type;
424 unsigned int Xprop_value;
425 int Xprop_fail_result;
426 int Xoclength;
427 pcre_uchar Xocchars[6];
428 #endif
429
430 int Xcodelink;
431 int Xctype;
432 unsigned int Xfc;
433 int Xfi;
434 int Xlength;
435 int Xmax;
436 int Xmin;
437 unsigned int Xnumber;
438 int Xoffset;
439 unsigned int Xop;
440 pcre_int32 Xsave_capture_last;
441 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442 int Xstacksave[REC_STACK_SAVE_MAX];
443
444 eptrblock Xnewptrb;
445
446 /* Where to jump back to: the RMn number that RMATCH stored before "recursing" from this frame */
447
448 int Xwhere;
449
450 } heapframe;
451
452 #endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
468 /* These macros pack up tests that are used for partial matching, and which
469 appear several times in the code. We set the "hit end" flag (md->hitend) if the pointer is
470 at the end of the subject and also past the start of the subject, as recorded in md->start_used_ptr (i.e.
471 something has been matched). For hard partial matching (md->partial greater than 1), we then return PCRE_ERROR_PARTIAL
472 immediately. The second one leaves out the end-of-subject test; it is used when we already know we are past the end of
473 the subject. Each macro expands to a bare "if" statement, so an invocation must not be followed directly by an "else". */
474
475 #define CHECK_PARTIAL()\
476 if (md->partial != 0 && eptr >= md->end_subject && \
477 eptr > md->start_used_ptr) \
478 { \
479 md->hitend = TRUE; \
480 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
481 }
482
483 #define SCHECK_PARTIAL()\
484 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 { \
486 md->hitend = TRUE; \
487 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
488 }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to in order to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 /* COMMIT overrides PRUNE, SKIP, and THEN */
785
786 case OP_COMMIT:
787 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
788 eptrb, RM52);
789 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
790 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
791 rrc != MATCH_THEN)
792 RRETURN(rrc);
793 RRETURN(MATCH_COMMIT);
794
795 /* PRUNE overrides THEN */
796
797 case OP_PRUNE:
798 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
799 eptrb, RM51);
800 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
801 RRETURN(MATCH_PRUNE);
802
803 case OP_PRUNE_ARG:
804 md->nomatch_mark = ecode + 2;
805 md->mark = NULL; /* In case previously set by assertion */
806 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
807 eptrb, RM56);
808 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
809 md->mark == NULL) md->mark = ecode + 2;
810 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
811 RRETURN(MATCH_PRUNE);
812
813 /* SKIP overrides PRUNE and THEN */
814
815 case OP_SKIP:
816 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
817 eptrb, RM53);
818 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
819 RRETURN(rrc);
820 md->start_match_ptr = eptr; /* Pass back current position */
821 RRETURN(MATCH_SKIP);
822
823 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
824 nomatch_mark. There is a flag that disables this opcode when re-matching a
825 pattern that ended with a SKIP for which there was not a matching MARK. */
826
827 case OP_SKIP_ARG:
828 if (md->ignore_skip_arg)
829 {
830 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
831 break;
832 }
833 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
834 eptrb, RM57);
835 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
836 RRETURN(rrc);
837
838 /* Pass back the current skip name by overloading md->start_match_ptr and
839 returning the special MATCH_SKIP_ARG return code. This will either be
840 caught by a matching MARK, or get to the top, where it causes a rematch
841 with the md->ignore_skip_arg flag set. */
842
843 md->start_match_ptr = ecode + 2;
844 RRETURN(MATCH_SKIP_ARG);
845
846 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
847 the branch in which it occurs can be determined. Overload the start of
848 match pointer to do this. */
849
850 case OP_THEN:
851 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
852 eptrb, RM54);
853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
854 md->start_match_ptr = ecode;
855 RRETURN(MATCH_THEN);
856
857 case OP_THEN_ARG:
858 md->nomatch_mark = ecode + 2;
859 md->mark = NULL; /* In case previously set by assertion */
860 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
861 md, eptrb, RM58);
862 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
863 md->mark == NULL) md->mark = ecode + 2;
864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865 md->start_match_ptr = ecode;
866 RRETURN(MATCH_THEN);
867
868 /* Handle an atomic group that does not contain any capturing parentheses.
869 This can be handled like an assertion. Prior to 8.13, all atomic groups
870 were handled this way. In 8.13, the code was changed as below for ONCE, so
871 that backups pass through the group and thereby reset captured values.
872 However, this uses a lot more stack, so in 8.20, atomic groups that do not
873 contain any captures generate OP_ONCE_NC, which can be handled in the old,
874 less stack intensive way.
875
876 Check the alternative branches in turn - the matching won't pass the KET
877 for this kind of subpattern. If any one branch matches, we carry on as at
878 the end of a normal bracket, leaving the subject pointer, but resetting
879 the start-of-match value in case it was changed by \K. */
880
881 case OP_ONCE_NC:
882 prev = ecode;
883 saved_eptr = eptr;
884 save_mark = md->mark;
885 do
886 {
887 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
888 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
889 {
890 mstart = md->start_match_ptr;
891 break;
892 }
893 if (rrc == MATCH_THEN)
894 {
895 next = ecode + GET(ecode,1);
896 if (md->start_match_ptr < next &&
897 (*ecode == OP_ALT || *next == OP_ALT))
898 rrc = MATCH_NOMATCH;
899 }
900
901 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
902 ecode += GET(ecode,1);
903 md->mark = save_mark;
904 }
905 while (*ecode == OP_ALT);
906
907 /* If we hit the end of the group (which could be repeated), fail */
908
909 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
910
911 /* Continue as from after the group, updating the offsets high water
912 mark, since extracts may have been taken. */
913
914 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
915
916 offset_top = md->end_offset_top;
917 eptr = md->end_match_ptr;
918
919 /* For a non-repeating ket, just continue at this level. This also
920 happens for a repeating ket if no characters were matched in the group.
921 This is the forcible breaking of infinite loops as implemented in Perl
922 5.005. */
923
924 if (*ecode == OP_KET || eptr == saved_eptr)
925 {
926 ecode += 1+LINK_SIZE;
927 break;
928 }
929
930 /* The repeating kets try the rest of the pattern or restart from the
931 preceding bracket, in the appropriate order. The second "call" of match()
932 uses tail recursion, to avoid using another stack frame. */
933
934 if (*ecode == OP_KETRMIN)
935 {
936 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
938 ecode = prev;
939 goto TAIL_RECURSE;
940 }
941 else /* OP_KETRMAX */
942 {
943 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
945 ecode += 1 + LINK_SIZE;
946 goto TAIL_RECURSE;
947 }
948 /* Control never gets here */
949
950 /* Handle a capturing bracket, other than those that are possessive with an
951 unlimited repeat. If there is space in the offset vector, save the current
952 subject position in the working slot at the top of the vector. We mustn't
953 change the current values of the data slot, because they may be set from a
954 previous iteration of this group, and be referred to by a reference inside
955 the group. A failure to match might occur after the group has succeeded,
956 if something later on doesn't match. For this reason, we need to restore
957 the working value and also the values of the final offsets, in case they
958 were set by a previous iteration of the same bracket.
959
960 If there isn't enough space in the offset vector, treat this as if it were
961 a non-capturing bracket. Don't worry about setting the flag for the error
962 case here; that is handled in the code for KET. */
963
964 case OP_CBRA:
965 case OP_SCBRA:
966 number = GET2(ecode, 1+LINK_SIZE);
967 offset = number << 1;
968
969 #ifdef PCRE_DEBUG
970 printf("start bracket %d\n", number);
971 printf("subject=");
972 pchars(eptr, 16, TRUE, md);
973 printf("\n");
974 #endif
975
976 if (offset < md->offset_max)
977 {
978 save_offset1 = md->offset_vector[offset];
979 save_offset2 = md->offset_vector[offset+1];
980 save_offset3 = md->offset_vector[md->offset_end - number];
981 save_capture_last = md->capture_last;
982 save_mark = md->mark;
983
984 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
985 md->offset_vector[md->offset_end - number] =
986 (int)(eptr - md->start_subject);
987
988 for (;;)
989 {
990 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
991 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
992 eptrb, RM1);
993 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
994
995 /* If we backed up to a THEN, check whether it is within the current
996 branch by comparing the address of the THEN that is passed back with
997 the end of the branch. If it is within the current branch, and the
998 branch is one of two or more alternatives (it either starts or ends
999 with OP_ALT), we have reached the limit of THEN's action, so convert
1000 the return code to NOMATCH, which will cause normal backtracking to
1001 happen from now on. Otherwise, THEN is passed back to an outer
1002 alternative. This implements Perl's treatment of parenthesized groups,
1003 where a group not containing | does not affect the current alternative,
1004 that is, (X) is NOT the same as (X|(*F)). */
1005
1006 if (rrc == MATCH_THEN)
1007 {
1008 next = ecode + GET(ecode,1);
1009 if (md->start_match_ptr < next &&
1010 (*ecode == OP_ALT || *next == OP_ALT))
1011 rrc = MATCH_NOMATCH;
1012 }
1013
1014 /* Anything other than NOMATCH is passed back. */
1015
1016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1017 md->capture_last = save_capture_last;
1018 ecode += GET(ecode, 1);
1019 md->mark = save_mark;
1020 if (*ecode != OP_ALT) break;
1021 }
1022
1023 DPRINTF(("bracket %d failed\n", number));
1024 md->offset_vector[offset] = save_offset1;
1025 md->offset_vector[offset+1] = save_offset2;
1026 md->offset_vector[md->offset_end - number] = save_offset3;
1027
1028 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1029
1030 RRETURN(rrc);
1031 }
1032
1033 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1034 as a non-capturing bracket. */
1035
1036 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1038
1039 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1040
1041 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1042 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1043
1044 /* Non-capturing or atomic group, except for possessive with unlimited
1045 repeat and ONCE group with no captures. Loop for all the alternatives.
1046
1047 When we get to the final alternative within the brackets, we used to return
1048 the result of a recursive call to match() whatever happened so it was
1049 possible to reduce stack usage by turning this into a tail recursion,
1050 except in the case of a possibly empty group. However, now that there is
1051 the possibility of (*THEN) occurring in the final alternative, this
1052 optimization is no longer always possible.
1053
1054 We can optimize if we know there are no (*THEN)s in the pattern; at present
1055 this is the best that can be done.
1056
1057 MATCH_ONCE is returned when the end of an atomic group is successfully
1058 reached, but subsequent matching fails. It passes back up the tree (causing
1059 captured values to be reset) until the original atomic group level is
1060 reached. This is tested by comparing md->once_target with the start of the
1061 group. At this point, the return is converted into MATCH_NOMATCH so that
1062 previous backup points can be taken. */
1063
1064 case OP_ONCE:
1065 case OP_BRA:
1066 case OP_SBRA:
1067 DPRINTF(("start non-capturing bracket\n"));
1068
1069 for (;;)
1070 {
1071 if (op >= OP_SBRA || op == OP_ONCE)
1072 md->match_function_type = MATCH_CBEGROUP;
1073
1074 /* If this is not a possibly empty group, and there are no (*THEN)s in
1075 the pattern, and this is the final alternative, optimize as described
1076 above. */
1077
1078 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1079 {
1080 ecode += PRIV(OP_lengths)[*ecode];
1081 goto TAIL_RECURSE;
1082 }
1083
1084 /* In all other cases, we have to make another call to match(). */
1085
1086 save_mark = md->mark;
1087 save_capture_last = md->capture_last;
1088 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1089 RM2);
1090
1091 /* See comment in the code for capturing groups above about handling
1092 THEN. */
1093
1094 if (rrc == MATCH_THEN)
1095 {
1096 next = ecode + GET(ecode,1);
1097 if (md->start_match_ptr < next &&
1098 (*ecode == OP_ALT || *next == OP_ALT))
1099 rrc = MATCH_NOMATCH;
1100 }
1101
1102 if (rrc != MATCH_NOMATCH)
1103 {
1104 if (rrc == MATCH_ONCE)
1105 {
1106 const pcre_uchar *scode = ecode;
1107 if (*scode != OP_ONCE) /* If not at start, find it */
1108 {
1109 while (*scode == OP_ALT) scode += GET(scode, 1);
1110 scode -= GET(scode, 1);
1111 }
1112 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1113 }
1114 RRETURN(rrc);
1115 }
1116 ecode += GET(ecode, 1);
1117 md->mark = save_mark;
1118 if (*ecode != OP_ALT) break;
1119 md->capture_last = save_capture_last;
1120 }
1121
1122 RRETURN(MATCH_NOMATCH);
1123
1124 /* Handle possessive capturing brackets with an unlimited repeat. We come
1125 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1126 handled similarly to the normal case above. However, the matching is
1127 different. The end of these brackets will always be OP_KETRPOS, which
1128 returns MATCH_KETRPOS without going further in the pattern. By this means
1129 we can handle the group by iteration rather than recursion, thereby
1130 reducing the amount of stack needed. */
1131
1132 case OP_CBRAPOS:
1133 case OP_SCBRAPOS:
1134 allow_zero = FALSE;
1135
1136 POSSESSIVE_CAPTURE:
1137 number = GET2(ecode, 1+LINK_SIZE);
1138 offset = number << 1;
1139
1140 #ifdef PCRE_DEBUG
1141 printf("start possessive bracket %d\n", number);
1142 printf("subject=");
1143 pchars(eptr, 16, TRUE, md);
1144 printf("\n");
1145 #endif
1146
1147 if (offset < md->offset_max)
1148 {
1149 matched_once = FALSE;
1150 code_offset = (int)(ecode - md->start_code);
1151
1152 save_offset1 = md->offset_vector[offset];
1153 save_offset2 = md->offset_vector[offset+1];
1154 save_offset3 = md->offset_vector[md->offset_end - number];
1155 save_capture_last = md->capture_last;
1156
1157 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1158
1159 /* Each time round the loop, save the current subject position for use
1160 when the group matches. For MATCH_MATCH, the group has matched, so we
1161 restart it with a new subject starting position, remembering that we had
1162 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1163 usual. If we haven't matched any alternatives in any iteration, check to
1164 see if a previous iteration matched. If so, the group has matched;
1165 continue from afterwards. Otherwise it has failed; restore the previous
1166 capture values before returning NOMATCH. */
1167
1168 for (;;)
1169 {
1170 md->offset_vector[md->offset_end - number] =
1171 (int)(eptr - md->start_subject);
1172 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1173 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1174 eptrb, RM63);
1175 if (rrc == MATCH_KETRPOS)
1176 {
1177 offset_top = md->end_offset_top;
1178 eptr = md->end_match_ptr;
1179 ecode = md->start_code + code_offset;
1180 save_capture_last = md->capture_last;
1181 matched_once = TRUE;
1182 continue;
1183 }
1184
1185 /* See comment in the code for capturing groups above about handling
1186 THEN. */
1187
1188 if (rrc == MATCH_THEN)
1189 {
1190 next = ecode + GET(ecode,1);
1191 if (md->start_match_ptr < next &&
1192 (*ecode == OP_ALT || *next == OP_ALT))
1193 rrc = MATCH_NOMATCH;
1194 }
1195
1196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 md->capture_last = save_capture_last;
1198 ecode += GET(ecode, 1);
1199 if (*ecode != OP_ALT) break;
1200 }
1201
1202 if (!matched_once)
1203 {
1204 md->offset_vector[offset] = save_offset1;
1205 md->offset_vector[offset+1] = save_offset2;
1206 md->offset_vector[md->offset_end - number] = save_offset3;
1207 }
1208
1209 if (allow_zero || matched_once)
1210 {
1211 ecode += 1 + LINK_SIZE;
1212 break;
1213 }
1214
1215 RRETURN(MATCH_NOMATCH);
1216 }
1217
1218 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1219 as a non-capturing bracket. */
1220
1221 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1223
1224 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1225
1226 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1227 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1228
1229 /* Non-capturing possessive bracket with unlimited repeat. We come here
1230 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1231 without the capturing complication. It is written out separately for speed
1232 and cleanliness. */
1233
1234 case OP_BRAPOS:
1235 case OP_SBRAPOS:
1236 allow_zero = FALSE;
1237
1238 POSSESSIVE_NON_CAPTURE:
1239 matched_once = FALSE;
1240 code_offset = (int)(ecode - md->start_code);
1241 save_capture_last = md->capture_last;
1242
1243 for (;;)
1244 {
1245 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1246 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1247 eptrb, RM48);
1248 if (rrc == MATCH_KETRPOS)
1249 {
1250 offset_top = md->end_offset_top;
1251 eptr = md->end_match_ptr;
1252 ecode = md->start_code + code_offset;
1253 matched_once = TRUE;
1254 continue;
1255 }
1256
1257 /* See comment in the code for capturing groups above about handling
1258 THEN. */
1259
1260 if (rrc == MATCH_THEN)
1261 {
1262 next = ecode + GET(ecode,1);
1263 if (md->start_match_ptr < next &&
1264 (*ecode == OP_ALT || *next == OP_ALT))
1265 rrc = MATCH_NOMATCH;
1266 }
1267
1268 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1269 ecode += GET(ecode, 1);
1270 if (*ecode != OP_ALT) break;
1271 md->capture_last = save_capture_last;
1272 }
1273
1274 if (matched_once || allow_zero)
1275 {
1276 ecode += 1 + LINK_SIZE;
1277 break;
1278 }
1279 RRETURN(MATCH_NOMATCH);
1280
1281 /* Control never reaches here. */
1282
1283 /* Conditional group: compilation checked that there are no more than
1284 two branches. If the condition is false, skipping the first branch takes us
1285 past the end if there is only one branch, but that's OK because that is
1286 exactly what going to the ket would do. */
1287
1288 case OP_COND:
1289 case OP_SCOND:
1290 codelink = GET(ecode, 1);
1291
1292 /* Because of the way auto-callout works during compile, a callout item is
1293 inserted between OP_COND and an assertion condition. */
1294
1295 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1296 {
1297 if (PUBL(callout) != NULL)
1298 {
1299 PUBL(callout_block) cb;
1300 cb.version = 2; /* Version 2 of the callout block */
1301 cb.callout_number = ecode[LINK_SIZE+2];
1302 cb.offset_vector = md->offset_vector;
1303 #if defined COMPILE_PCRE8
1304 cb.subject = (PCRE_SPTR)md->start_subject;
1305 #elif defined COMPILE_PCRE16
1306 cb.subject = (PCRE_SPTR16)md->start_subject;
1307 #elif defined COMPILE_PCRE32
1308 cb.subject = (PCRE_SPTR32)md->start_subject;
1309 #endif
1310 cb.subject_length = (int)(md->end_subject - md->start_subject);
1311 cb.start_match = (int)(mstart - md->start_subject);
1312 cb.current_position = (int)(eptr - md->start_subject);
1313 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1314 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1315 cb.capture_top = offset_top/2;
1316 cb.capture_last = md->capture_last & CAPLMASK;
1317 /* Internal change requires this for API compatibility. */
1318 if (cb.capture_last == 0) cb.capture_last = -1;
1319 cb.callout_data = md->callout_data;
1320 cb.mark = md->nomatch_mark;
1321 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1322 if (rrc < 0) RRETURN(rrc);
1323 }
1324 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1325 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1326 }
1327
1328 condcode = ecode[LINK_SIZE+1];
1329
1330 /* Now see what the actual condition is */
1331
1332 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1333 {
1334 if (md->recursive == NULL) /* Not recursing => FALSE */
1335 {
1336 condition = FALSE;
1337 ecode += GET(ecode, 1);
1338 }
1339 else
1340 {
1341 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1342 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1343
1344 /* If the test is for recursion into a specific subpattern, and it is
1345 false, but the test was set up by name, scan the table to see if the
1346 name refers to any other numbers, and test them. The condition is true
1347 if any one is set. */
1348
1349 if (!condition && condcode == OP_NRREF)
1350 {
1351 pcre_uchar *slotA = md->name_table;
1352 for (i = 0; i < md->name_count; i++)
1353 {
1354 if (GET2(slotA, 0) == recno) break;
1355 slotA += md->name_entry_size;
1356 }
1357
1358 /* Found a name for the number - there can be only one; duplicate
1359 names for different numbers are allowed, but not vice versa. First
1360 scan down for duplicates. */
1361
1362 if (i < md->name_count)
1363 {
1364 pcre_uchar *slotB = slotA;
1365 while (slotB > md->name_table)
1366 {
1367 slotB -= md->name_entry_size;
1368 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1369 {
1370 condition = GET2(slotB, 0) == md->recursive->group_num;
1371 if (condition) break;
1372 }
1373 else break;
1374 }
1375
1376 /* Scan up for duplicates */
1377
1378 if (!condition)
1379 {
1380 slotB = slotA;
1381 for (i++; i < md->name_count; i++)
1382 {
1383 slotB += md->name_entry_size;
1384 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1385 {
1386 condition = GET2(slotB, 0) == md->recursive->group_num;
1387 if (condition) break;
1388 }
1389 else break;
1390 }
1391 }
1392 }
1393 }
1394
1395 /* Choose branch according to the condition */
1396
1397 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1398 }
1399 }
1400
1401 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1402 {
1403 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1404 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1405
1406 /* If the numbered capture is unset, but the reference was by name,
1407 scan the table to see if the name refers to any other numbers, and test
1408 them. The condition is true if any one is set. This is tediously similar
1409 to the code above, but not close enough to try to amalgamate. */
1410
1411 if (!condition && condcode == OP_NCREF)
1412 {
1413 unsigned int refno = offset >> 1;
1414 pcre_uchar *slotA = md->name_table;
1415
1416 for (i = 0; i < md->name_count; i++)
1417 {
1418 if (GET2(slotA, 0) == refno) break;
1419 slotA += md->name_entry_size;
1420 }
1421
1422 /* Found a name for the number - there can be only one; duplicate names
1423 for different numbers are allowed, but not vice versa. First scan down
1424 for duplicates. */
1425
1426 if (i < md->name_count)
1427 {
1428 pcre_uchar *slotB = slotA;
1429 while (slotB > md->name_table)
1430 {
1431 slotB -= md->name_entry_size;
1432 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1433 {
1434 offset = GET2(slotB, 0) << 1;
1435 condition = offset < offset_top &&
1436 md->offset_vector[offset] >= 0;
1437 if (condition) break;
1438 }
1439 else break;
1440 }
1441
1442 /* Scan up for duplicates */
1443
1444 if (!condition)
1445 {
1446 slotB = slotA;
1447 for (i++; i < md->name_count; i++)
1448 {
1449 slotB += md->name_entry_size;
1450 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1451 {
1452 offset = GET2(slotB, 0) << 1;
1453 condition = offset < offset_top &&
1454 md->offset_vector[offset] >= 0;
1455 if (condition) break;
1456 }
1457 else break;
1458 }
1459 }
1460 }
1461 }
1462
1463 /* Choose branch according to the condition */
1464
1465 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1466 }
1467
1468 else if (condcode == OP_DEF) /* DEFINE - always false */
1469 {
1470 condition = FALSE;
1471 ecode += GET(ecode, 1);
1472 }
1473
1474 /* The condition is an assertion. Call match() to evaluate it - setting
1475 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1476 an assertion. */
1477
1478 else
1479 {
1480 md->match_function_type = MATCH_CONDASSERT;
1481 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1482 if (rrc == MATCH_MATCH)
1483 {
1484 if (md->end_offset_top > offset_top)
1485 offset_top = md->end_offset_top; /* Captures may have happened */
1486 condition = TRUE;
1487 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1488 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1489 }
1490
1491 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1492 assertion; it is therefore treated as NOMATCH. */
1493
1494 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1495 {
1496 RRETURN(rrc); /* Need braces because of following else */
1497 }
1498 else
1499 {
1500 condition = FALSE;
1501 ecode += codelink;
1502 }
1503 }
1504
1505 /* We are now at the branch that is to be obeyed. As there is only one, can
1506 use tail recursion to avoid using another stack frame, except when there is
1507 unlimited repeat of a possibly empty group. In the latter case, a recursive
1508 call to match() is always required, unless the second alternative doesn't
1509 exist, in which case we can just plough on. Note that, for compatibility
1510 with Perl, the | in a conditional group is NOT treated as creating two
1511 alternatives. If a THEN is encountered in the branch, it propagates out to
1512 the enclosing alternative (unless nested in a deeper set of alternatives,
1513 of course). */
1514
1515 if (condition || *ecode == OP_ALT)
1516 {
1517 if (op != OP_SCOND)
1518 {
1519 ecode += 1 + LINK_SIZE;
1520 goto TAIL_RECURSE;
1521 }
1522
1523 md->match_function_type = MATCH_CBEGROUP;
1524 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1525 RRETURN(rrc);
1526 }
1527
1528 /* Condition false & no alternative; continue after the group. */
1529
1530 else
1531 {
1532 ecode += 1 + LINK_SIZE;
1533 }
1534 break;
1535
1536
1537 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1538 to close any currently open capturing brackets. */
1539
1540 case OP_CLOSE:
1541 number = GET2(ecode, 1); /* Must be less than 65536 */
1542 offset = number << 1;
1543
1544 #ifdef PCRE_DEBUG
1545 printf("end bracket %d at *ACCEPT", number);
1546 printf("\n");
1547 #endif
1548
1549 md->capture_last = (md->capture_last & OVFLMASK) | number;
1550 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1551 {
1552 md->offset_vector[offset] =
1553 md->offset_vector[md->offset_end - number];
1554 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1555 if (offset_top <= offset) offset_top = offset + 2;
1556 }
1557 ecode += 1 + IMM2_SIZE;
1558 break;
1559
1560
1561 /* End of the pattern, either real or forced. */
1562
1563 case OP_END:
1564 case OP_ACCEPT:
1565 case OP_ASSERT_ACCEPT:
1566
1567 /* If we have matched an empty string, fail if not in an assertion and not
1568 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1569 is set and we have matched at the start of the subject. In both cases,
1570 backtracking will then try other alternatives, if any. */
1571
1572 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1573 md->recursive == NULL &&
1574 (md->notempty ||
1575 (md->notempty_atstart &&
1576 mstart == md->start_subject + md->start_offset)))
1577 RRETURN(MATCH_NOMATCH);
1578
1579 /* Otherwise, we have a match. */
1580
1581 md->end_match_ptr = eptr; /* Record where we ended */
1582 md->end_offset_top = offset_top; /* and how many extracts were taken */
1583 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1584
1585 /* For some reason, the macros don't work properly if an expression is
1586 given as the argument to RRETURN when the heap is in use. */
1587
1588 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1589 RRETURN(rrc);
1590
1591 /* Assertion brackets. Check the alternative branches in turn - the
1592 matching won't pass the KET for an assertion. If any one branch matches,
1593 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1594 start of each branch to move the current point backwards, so the code at
1595 this level is identical to the lookahead case. When the assertion is part
1596 of a condition, we want to return immediately afterwards. The caller of
1597 this incarnation of the match() function will have set MATCH_CONDASSERT in
1598 md->match_function_type, and one of these opcodes will be the first opcode
1599 that is processed. We use a local variable that is preserved over calls to
1600 match() to remember this case. */
1601
1602 case OP_ASSERT:
1603 case OP_ASSERTBACK:
1604 save_mark = md->mark;
1605 if (md->match_function_type == MATCH_CONDASSERT)
1606 {
1607 condassert = TRUE;
1608 md->match_function_type = 0;
1609 }
1610 else condassert = FALSE;
1611
1612 do
1613 {
1614 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1615 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1616 {
1617 mstart = md->start_match_ptr; /* In case \K reset it */
1618 break;
1619 }
1620 md->mark = save_mark;
1621
1622 /* A COMMIT failure must fail the entire assertion, without trying any
1623 subsequent branches. */
1624
1625 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1626
1627 /* PCRE does not allow THEN to escape beyond an assertion; it
1628 is treated as NOMATCH. */
1629
1630 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1631 ecode += GET(ecode, 1);
1632 }
1633 while (*ecode == OP_ALT);
1634
1635 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1636
1637 /* If checking an assertion for a condition, return MATCH_MATCH. */
1638
1639 if (condassert) RRETURN(MATCH_MATCH);
1640
1641 /* Continue from after the assertion, updating the offsets high water
1642 mark, since extracts may have been taken during the assertion. */
1643
1644 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1645 ecode += 1 + LINK_SIZE;
1646 offset_top = md->end_offset_top;
1647 continue;
1648
1649 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1650 PRUNE, or COMMIT means we must assume failure without checking subsequent
1651 branches. */
1652
1653 case OP_ASSERT_NOT:
1654 case OP_ASSERTBACK_NOT:
1655 save_mark = md->mark;
1656 if (md->match_function_type == MATCH_CONDASSERT)
1657 {
1658 condassert = TRUE;
1659 md->match_function_type = 0;
1660 }
1661 else condassert = FALSE;
1662
1663 do
1664 {
1665 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1666 md->mark = save_mark;
1667 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1668 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1669 {
1670 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1671 break;
1672 }
1673
1674 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1675 as NOMATCH. */
1676
1677 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1678 ecode += GET(ecode,1);
1679 }
1680 while (*ecode == OP_ALT);
1681
1682 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1683
1684 ecode += 1 + LINK_SIZE;
1685 continue;
1686
1687 /* Move the subject pointer back. This occurs only at the start of
1688 each branch of a lookbehind assertion. If we are too close to the start to
1689 move back, this match function fails. When working with UTF-8 we move
1690 back a number of characters, not bytes. */
1691
1692 case OP_REVERSE:
1693 #ifdef SUPPORT_UTF
1694 if (utf)
1695 {
1696 i = GET(ecode, 1);
1697 while (i-- > 0)
1698 {
1699 eptr--;
1700 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1701 BACKCHAR(eptr);
1702 }
1703 }
1704 else
1705 #endif
1706
1707 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1708
1709 {
1710 eptr -= GET(ecode, 1);
1711 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1712 }
1713
1714 /* Save the earliest consulted character, then skip to next op code */
1715
1716 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1717 ecode += 1 + LINK_SIZE;
1718 break;
1719
1720 /* The callout item calls an external function, if one is provided, passing
1721 details of the match so far. This is mainly for debugging, though the
1722 function is able to force a failure. */
1723
1724 case OP_CALLOUT:
1725 if (PUBL(callout) != NULL)
1726 {
1727 PUBL(callout_block) cb;
1728 cb.version = 2; /* Version 2 of the callout block */
1729 cb.callout_number = ecode[1];
1730 cb.offset_vector = md->offset_vector;
1731 #if defined COMPILE_PCRE8
1732 cb.subject = (PCRE_SPTR)md->start_subject;
1733 #elif defined COMPILE_PCRE16
1734 cb.subject = (PCRE_SPTR16)md->start_subject;
1735 #elif defined COMPILE_PCRE32
1736 cb.subject = (PCRE_SPTR32)md->start_subject;
1737 #endif
1738 cb.subject_length = (int)(md->end_subject - md->start_subject);
1739 cb.start_match = (int)(mstart - md->start_subject);
1740 cb.current_position = (int)(eptr - md->start_subject);
1741 cb.pattern_position = GET(ecode, 2);
1742 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1743 cb.capture_top = offset_top/2;
1744 cb.capture_last = md->capture_last & CAPLMASK;
1745 /* Internal change requires this for API compatibility. */
1746 if (cb.capture_last == 0) cb.capture_last = -1;
1747 cb.callout_data = md->callout_data;
1748 cb.mark = md->nomatch_mark;
1749 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1750 if (rrc < 0) RRETURN(rrc);
1751 }
1752 ecode += 2 + 2*LINK_SIZE;
1753 break;
1754
1755 /* Recursion either matches the current regex, or some subexpression. The
1756 offset data is the offset to the starting bracket from the start of the
1757 whole pattern. (This is so that it works from duplicated subpatterns.)
1758
1759 The state of the capturing groups is preserved over recursion, and
1760 re-instated afterwards. We don't know how many are started and not yet
1761 finished (offset_top records the completed total) so we just have to save
1762 all the potential data. There may be up to 65535 such values, which is too
1763 large to put on the stack, but using malloc for small numbers seems
1764 expensive. As a compromise, the stack is used when there are no more than
1765 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1766
1767 There are also other values that have to be saved. We use a chained
1768 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1769 for the original version of this logic. It has, however, been hacked around
1770 a lot, so he is not to blame for the current way it works. */
1771
1772 case OP_RECURSE:
1773 {
1774 recursion_info *ri;
1775 unsigned int recno;
1776
1777 callpat = md->start_code + GET(ecode, 1);
1778 recno = (callpat == md->start_code)? 0 :
1779 GET2(callpat, 1 + LINK_SIZE);
1780
1781 /* Check for repeating a recursion without advancing the subject pointer.
1782 This should catch convoluted mutual recursions. (Some simple cases are
1783 caught at compile time.) */
1784
1785 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1786 if (recno == ri->group_num && eptr == ri->subject_position)
1787 RRETURN(PCRE_ERROR_RECURSELOOP);
1788
1789 /* Add to "recursing stack" */
1790
1791 new_recursive.group_num = recno;
1792 new_recursive.saved_capture_last = md->capture_last;
1793 new_recursive.subject_position = eptr;
1794 new_recursive.prevrec = md->recursive;
1795 md->recursive = &new_recursive;
1796
1797 /* Where to continue from afterwards */
1798
1799 ecode += 1 + LINK_SIZE;
1800
1801 /* Now save the offset data */
1802
1803 new_recursive.saved_max = md->offset_end;
1804 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1805 new_recursive.offset_save = stacksave;
1806 else
1807 {
1808 new_recursive.offset_save =
1809 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1810 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1811 }
1812 memcpy(new_recursive.offset_save, md->offset_vector,
1813 new_recursive.saved_max * sizeof(int));
1814
1815 /* OK, now we can do the recursion. After processing each alternative,
1816 restore the offset data and the last captured value. If there were nested
1817 recursions, md->recursive might be changed, so reset it before looping.
1818 */
1819
1820 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1821 cbegroup = (*callpat >= OP_SBRA);
1822 do
1823 {
1824 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1825 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1826 md, eptrb, RM6);
1827 memcpy(md->offset_vector, new_recursive.offset_save,
1828 new_recursive.saved_max * sizeof(int));
1829 md->capture_last = new_recursive.saved_capture_last;
1830 md->recursive = new_recursive.prevrec;
1831 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1832 {
1833 DPRINTF(("Recursion matched\n"));
1834 if (new_recursive.offset_save != stacksave)
1835 (PUBL(free))(new_recursive.offset_save);
1836
1837 /* Set where we got to in the subject, and reset the start in case
1838 it was changed by \K. This *is* propagated back out of a recursion,
1839 for Perl compatibility. */
1840
1841 eptr = md->end_match_ptr;
1842 mstart = md->start_match_ptr;
1843 goto RECURSION_MATCHED; /* Exit loop; end processing */
1844 }
1845
1846 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1847 recursion; they are treated as NOMATCH. These codes are defined in a
1848 range that can be tested for. Any other return code is an error. */
1849
1850 else if (rrc != MATCH_NOMATCH &&
1851 (rrc < MATCH_BACKTRACK_MIN || rrc > MATCH_BACKTRACK_MAX))
1852 {
1853 DPRINTF(("Recursion gave error %d\n", rrc));
1854 if (new_recursive.offset_save != stacksave)
1855 (PUBL(free))(new_recursive.offset_save);
1856 RRETURN(rrc);
1857 }
1858
1859 md->recursive = &new_recursive;
1860 callpat += GET(callpat, 1);
1861 }
1862 while (*callpat == OP_ALT);
1863
1864 DPRINTF(("Recursion didn't match\n"));
1865 md->recursive = new_recursive.prevrec;
1866 if (new_recursive.offset_save != stacksave)
1867 (PUBL(free))(new_recursive.offset_save);
1868 RRETURN(MATCH_NOMATCH);
1869 }
1870
1871 RECURSION_MATCHED:
1872 break;
1873
1874 /* An alternation is the end of a branch; scan along to find the end of the
1875 bracketed group and go to there. */
1876
1877 case OP_ALT:
1878 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1879 break;
1880
1881 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1882 indicating that it may occur zero times. It may repeat infinitely, or not
1883 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1884 with fixed upper repeat limits are compiled as a number of copies, with the
1885 optional ones preceded by BRAZERO or BRAMINZERO. */
1886
1887 case OP_BRAZERO:
1888 next = ecode + 1;
1889 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1890 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1891 do next += GET(next, 1); while (*next == OP_ALT);
1892 ecode = next + 1 + LINK_SIZE;
1893 break;
1894
1895 case OP_BRAMINZERO:
1896 next = ecode + 1;
1897 do next += GET(next, 1); while (*next == OP_ALT);
1898 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1899 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1900 ecode++;
1901 break;
1902
1903 case OP_SKIPZERO:
1904 next = ecode+1;
1905 do next += GET(next,1); while (*next == OP_ALT);
1906 ecode = next + 1 + LINK_SIZE;
1907 break;
1908
1909 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1910 here; just jump to the group, with allow_zero set TRUE. */
1911
1912 case OP_BRAPOSZERO:
1913 op = *(++ecode);
1914 allow_zero = TRUE;
1915 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1916 goto POSSESSIVE_NON_CAPTURE;
1917
1918 /* End of a group, repeated or non-repeating. */
1919
1920 case OP_KET:
1921 case OP_KETRMIN:
1922 case OP_KETRMAX:
1923 case OP_KETRPOS:
1924 prev = ecode - GET(ecode, 1);
1925
1926 /* If this was a group that remembered the subject start, in order to break
1927 infinite repeats of empty string matches, retrieve the subject start from
1928 the chain. Otherwise, set it NULL. */
1929
1930 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1931 {
1932 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1933 eptrb = eptrb->epb_prev; /* Backup to previous group */
1934 }
1935 else saved_eptr = NULL;
1936
1937 /* If we are at the end of an assertion group or a non-capturing atomic
1938 group, stop matching and return MATCH_MATCH, but record the current high
1939 water mark for use by positive assertions. We also need to record the match
1940 start in case it was changed by \K. */
1941
1942 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1943 *prev == OP_ONCE_NC)
1944 {
1945 md->end_match_ptr = eptr; /* For ONCE_NC */
1946 md->end_offset_top = offset_top;
1947 md->start_match_ptr = mstart;
1948 RRETURN(MATCH_MATCH); /* Sets md->mark */
1949 }
1950
1951 /* For capturing groups we have to check the group number back at the start
1952 and if necessary complete handling an extraction by setting the offsets and
1953 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1954 into group 0, so it won't be picked up here. Instead, we catch it when the
1955 OP_END is reached. Other recursion is handled here. We just have to record
1956 the current subject position and start match pointer and give a MATCH
1957 return. */
1958
1959 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1960 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1961 {
1962 number = GET2(prev, 1+LINK_SIZE);
1963 offset = number << 1;
1964
1965 #ifdef PCRE_DEBUG
1966 printf("end bracket %d", number);
1967 printf("\n");
1968 #endif
1969
1970 /* Handle a recursively called group. */
1971
1972 if (md->recursive != NULL && md->recursive->group_num == number)
1973 {
1974 md->end_match_ptr = eptr;
1975 md->start_match_ptr = mstart;
1976 RRETURN(MATCH_MATCH);
1977 }
1978
1979 /* Deal with capturing */
1980
1981 md->capture_last = (md->capture_last & OVFLMASK) | number;
1982 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1983 {
1984 /* If offset is greater than offset_top, it means that we are
1985 "skipping" a capturing group, and that group's offsets must be marked
1986 unset. In earlier versions of PCRE, all the offsets were unset at the
1987 start of matching, but this doesn't work because atomic groups and
1988 assertions can cause a value to be set that should later be unset.
1989 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1990 part of the atomic group, but this is not on the final matching path,
1991 so must be unset when 2 is set. (If there is no group 2, there is no
1992 problem, because offset_top will then be 2, indicating no capture.) */
1993
1994 if (offset > offset_top)
1995 {
1996 register int *iptr = md->offset_vector + offset_top;
1997 register int *iend = md->offset_vector + offset;
1998 while (iptr < iend) *iptr++ = -1;
1999 }
2000
2001 /* Now make the extraction */
2002
2003 md->offset_vector[offset] =
2004 md->offset_vector[md->offset_end - number];
2005 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2006 if (offset_top <= offset) offset_top = offset + 2;
2007 }
2008 }
2009
2010 /* For an ordinary non-repeating ket, just continue at this level. This
2011 also happens for a repeating ket if no characters were matched in the
2012 group. This is the forcible breaking of infinite loops as implemented in
2013 Perl 5.005. For a non-repeating atomic group that includes captures,
2014 establish a backup point by processing the rest of the pattern at a lower
2015 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2016 original OP_ONCE level, thereby bypassing intermediate backup points, but
2017 resetting any captures that happened along the way. */
2018
2019 if (*ecode == OP_KET || eptr == saved_eptr)
2020 {
2021 if (*prev == OP_ONCE)
2022 {
2023 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2025 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2026 RRETURN(MATCH_ONCE);
2027 }
2028 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2029 break;
2030 }
2031
2032 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2033 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2034 at a time from the outer level, thus saving stack. */
2035
2036 if (*ecode == OP_KETRPOS)
2037 {
2038 md->end_match_ptr = eptr;
2039 md->end_offset_top = offset_top;
2040 RRETURN(MATCH_KETRPOS);
2041 }
2042
2043 /* The normal repeating kets try the rest of the pattern or restart from
2044 the preceding bracket, in the appropriate order. In the second case, we can
2045 use tail recursion to avoid using another stack frame, unless we have an
2046 atomic group or an unlimited repeat of a group that can match an empty
2047 string. */
2048
2049 if (*ecode == OP_KETRMIN)
2050 {
2051 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2053 if (*prev == OP_ONCE)
2054 {
2055 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2057 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2058 RRETURN(MATCH_ONCE);
2059 }
2060 if (*prev >= OP_SBRA) /* Could match an empty string */
2061 {
2062 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2063 RRETURN(rrc);
2064 }
2065 ecode = prev;
2066 goto TAIL_RECURSE;
2067 }
2068 else /* OP_KETRMAX */
2069 {
2070 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2071 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2072 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2073 if (*prev == OP_ONCE)
2074 {
2075 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2076 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2077 md->once_target = prev;
2078 RRETURN(MATCH_ONCE);
2079 }
2080 ecode += 1 + LINK_SIZE;
2081 goto TAIL_RECURSE;
2082 }
2083 /* Control never gets here */
2084
2085 /* Not multiline mode: start of subject assertion, unless notbol. */
2086
2087 case OP_CIRC:
2088 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2089
2090 /* Start of subject assertion */
2091
2092 case OP_SOD:
2093 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2094 ecode++;
2095 break;
2096
2097 /* Multiline mode: start of subject unless notbol, or after any newline. */
2098
2099 case OP_CIRCM:
2100 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2101 if (eptr != md->start_subject &&
2102 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2103 RRETURN(MATCH_NOMATCH);
2104 ecode++;
2105 break;
2106
2107 /* Start of match assertion */
2108
2109 case OP_SOM:
2110 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2111 ecode++;
2112 break;
2113
2114 /* Reset the start of match point */
2115
2116 case OP_SET_SOM:
2117 mstart = eptr;
2118 ecode++;
2119 break;
2120
2121 /* Multiline mode: assert before any newline, or before end of subject
2122 unless noteol is set. */
2123
2124 case OP_DOLLM:
2125 if (eptr < md->end_subject)
2126 {
2127 if (!IS_NEWLINE(eptr))
2128 {
2129 if (md->partial != 0 &&
2130 eptr + 1 >= md->end_subject &&
2131 NLBLOCK->nltype == NLTYPE_FIXED &&
2132 NLBLOCK->nllen == 2 &&
2133 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2134 {
2135 md->hitend = TRUE;
2136 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2137 }
2138 RRETURN(MATCH_NOMATCH);
2139 }
2140 }
2141 else
2142 {
2143 if (md->noteol) RRETURN(MATCH_NOMATCH);
2144 SCHECK_PARTIAL();
2145 }
2146 ecode++;
2147 break;
2148
2149 /* Not multiline mode: assert before a terminating newline or before end of
2150 subject unless noteol is set. */
2151
2152 case OP_DOLL:
2153 if (md->noteol) RRETURN(MATCH_NOMATCH);
2154 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2155
2156 /* ... else fall through for endonly */
2157
2158 /* End of subject assertion (\z) */
2159
2160 case OP_EOD:
2161 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2162 SCHECK_PARTIAL();
2163 ecode++;
2164 break;
2165
2166 /* End of subject or ending \n assertion (\Z) */
2167
2168 case OP_EODN:
2169 ASSERT_NL_OR_EOS:
2170 if (eptr < md->end_subject &&
2171 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2172 {
2173 if (md->partial != 0 &&
2174 eptr + 1 >= md->end_subject &&
2175 NLBLOCK->nltype == NLTYPE_FIXED &&
2176 NLBLOCK->nllen == 2 &&
2177 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2178 {
2179 md->hitend = TRUE;
2180 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2181 }
2182 RRETURN(MATCH_NOMATCH);
2183 }
2184
2185 /* Either at end of string or \n before end. */
2186
2187 SCHECK_PARTIAL();
2188 ecode++;
2189 break;
2190
2191 /* Word boundary assertions */
2192
2193 case OP_NOT_WORD_BOUNDARY:
2194 case OP_WORD_BOUNDARY:
2195 {
2196
2197 /* Find out if the previous and current characters are "word" characters.
2198 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2199 be "non-word" characters. Remember the earliest consulted character for
2200 partial matching. */
2201
2202 #ifdef SUPPORT_UTF
2203 if (utf)
2204 {
2205 /* Get status of previous character */
2206
2207 if (eptr == md->start_subject) prev_is_word = FALSE; else
2208 {
2209 PCRE_PUCHAR lastptr = eptr - 1;
2210 BACKCHAR(lastptr);
2211 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2212 GETCHAR(c, lastptr);
2213 #ifdef SUPPORT_UCP
2214 if (md->use_ucp)
2215 {
2216 if (c == '_') prev_is_word = TRUE; else
2217 {
2218 int cat = UCD_CATEGORY(c);
2219 prev_is_word = (cat == ucp_L || cat == ucp_N);
2220 }
2221 }
2222 else
2223 #endif
2224 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2225 }
2226
2227 /* Get status of next character */
2228
2229 if (eptr >= md->end_subject)
2230 {
2231 SCHECK_PARTIAL();
2232 cur_is_word = FALSE;
2233 }
2234 else
2235 {
2236 GETCHAR(c, eptr);
2237 #ifdef SUPPORT_UCP
2238 if (md->use_ucp)
2239 {
2240 if (c == '_') cur_is_word = TRUE; else
2241 {
2242 int cat = UCD_CATEGORY(c);
2243 cur_is_word = (cat == ucp_L || cat == ucp_N);
2244 }
2245 }
2246 else
2247 #endif
2248 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2249 }
2250 }
2251 else
2252 #endif
2253
2254 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2255 consistency with the behaviour of \w we do use it in this case. */
2256
2257 {
2258 /* Get status of previous character */
2259
2260 if (eptr == md->start_subject) prev_is_word = FALSE; else
2261 {
2262 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2263 #ifdef SUPPORT_UCP
2264 if (md->use_ucp)
2265 {
2266 c = eptr[-1];
2267 if (c == '_') prev_is_word = TRUE; else
2268 {
2269 int cat = UCD_CATEGORY(c);
2270 prev_is_word = (cat == ucp_L || cat == ucp_N);
2271 }
2272 }
2273 else
2274 #endif
2275 prev_is_word = MAX_255(eptr[-1])
2276 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2277 }
2278
2279 /* Get status of next character */
2280
2281 if (eptr >= md->end_subject)
2282 {
2283 SCHECK_PARTIAL();
2284 cur_is_word = FALSE;
2285 }
2286 else
2287 #ifdef SUPPORT_UCP
2288 if (md->use_ucp)
2289 {
2290 c = *eptr;
2291 if (c == '_') cur_is_word = TRUE; else
2292 {
2293 int cat = UCD_CATEGORY(c);
2294 cur_is_word = (cat == ucp_L || cat == ucp_N);
2295 }
2296 }
2297 else
2298 #endif
2299 cur_is_word = MAX_255(*eptr)
2300 && ((md->ctypes[*eptr] & ctype_word) != 0);
2301 }
2302
2303 /* Now see if the situation is what we want */
2304
2305 if ((*ecode++ == OP_WORD_BOUNDARY)?
2306 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2307 RRETURN(MATCH_NOMATCH);
2308 }
2309 break;
2310
2311 /* Match any single character type except newline; have to take care with
2312 CRLF newlines and partial matching. */
2313
2314 case OP_ANY:
2315 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2316 if (md->partial != 0 &&
2317 eptr + 1 >= md->end_subject &&
2318 NLBLOCK->nltype == NLTYPE_FIXED &&
2319 NLBLOCK->nllen == 2 &&
2320 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2321 {
2322 md->hitend = TRUE;
2323 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2324 }
2325
2326 /* Fall through */
2327
2328 /* Match any single character whatsoever. */
2329
2330 case OP_ALLANY:
2331 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2332 { /* not be updated before SCHECK_PARTIAL. */
2333 SCHECK_PARTIAL();
2334 RRETURN(MATCH_NOMATCH);
2335 }
2336 eptr++;
2337 #ifdef SUPPORT_UTF
2338 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2339 #endif
2340 ecode++;
2341 break;
2342
2343 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2344 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2345
2346 case OP_ANYBYTE:
2347 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2348 { /* not be updated before SCHECK_PARTIAL. */
2349 SCHECK_PARTIAL();
2350 RRETURN(MATCH_NOMATCH);
2351 }
2352 eptr++;
2353 ecode++;
2354 break;
2355
2356 case OP_NOT_DIGIT:
2357 if (eptr >= md->end_subject)
2358 {
2359 SCHECK_PARTIAL();
2360 RRETURN(MATCH_NOMATCH);
2361 }
2362 GETCHARINCTEST(c, eptr);
2363 if (
2364 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2365 c < 256 &&
2366 #endif
2367 (md->ctypes[c] & ctype_digit) != 0
2368 )
2369 RRETURN(MATCH_NOMATCH);
2370 ecode++;
2371 break;
2372
2373 case OP_DIGIT:
2374 if (eptr >= md->end_subject)
2375 {
2376 SCHECK_PARTIAL();
2377 RRETURN(MATCH_NOMATCH);
2378 }
2379 GETCHARINCTEST(c, eptr);
2380 if (
2381 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2382 c > 255 ||
2383 #endif
2384 (md->ctypes[c] & ctype_digit) == 0
2385 )
2386 RRETURN(MATCH_NOMATCH);
2387 ecode++;
2388 break;
2389
2390 case OP_NOT_WHITESPACE:
2391 if (eptr >= md->end_subject)
2392 {
2393 SCHECK_PARTIAL();
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 GETCHARINCTEST(c, eptr);
2397 if (
2398 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2399 c < 256 &&
2400 #endif
2401 (md->ctypes[c] & ctype_space) != 0
2402 )
2403 RRETURN(MATCH_NOMATCH);
2404 ecode++;
2405 break;
2406
2407 case OP_WHITESPACE:
2408 if (eptr >= md->end_subject)
2409 {
2410 SCHECK_PARTIAL();
2411 RRETURN(MATCH_NOMATCH);
2412 }
2413 GETCHARINCTEST(c, eptr);
2414 if (
2415 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2416 c > 255 ||
2417 #endif
2418 (md->ctypes[c] & ctype_space) == 0
2419 )
2420 RRETURN(MATCH_NOMATCH);
2421 ecode++;
2422 break;
2423
2424 case OP_NOT_WORDCHAR:
2425 if (eptr >= md->end_subject)
2426 {
2427 SCHECK_PARTIAL();
2428 RRETURN(MATCH_NOMATCH);
2429 }
2430 GETCHARINCTEST(c, eptr);
2431 if (
2432 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2433 c < 256 &&
2434 #endif
2435 (md->ctypes[c] & ctype_word) != 0
2436 )
2437 RRETURN(MATCH_NOMATCH);
2438 ecode++;
2439 break;
2440
2441 case OP_WORDCHAR:
2442 if (eptr >= md->end_subject)
2443 {
2444 SCHECK_PARTIAL();
2445 RRETURN(MATCH_NOMATCH);
2446 }
2447 GETCHARINCTEST(c, eptr);
2448 if (
2449 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2450 c > 255 ||
2451 #endif
2452 (md->ctypes[c] & ctype_word) == 0
2453 )
2454 RRETURN(MATCH_NOMATCH);
2455 ecode++;
2456 break;
2457
2458 case OP_ANYNL:
2459 if (eptr >= md->end_subject)
2460 {
2461 SCHECK_PARTIAL();
2462 RRETURN(MATCH_NOMATCH);
2463 }
2464 GETCHARINCTEST(c, eptr);
2465 switch(c)
2466 {
2467 default: RRETURN(MATCH_NOMATCH);
2468
2469 case CHAR_CR:
2470 if (eptr >= md->end_subject)
2471 {
2472 SCHECK_PARTIAL();
2473 }
2474 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2475 break;
2476
2477 case CHAR_LF:
2478 break;
2479
2480 case CHAR_VT:
2481 case CHAR_FF:
2482 case CHAR_NEL:
2483 #ifndef EBCDIC
2484 case 0x2028:
2485 case 0x2029:
2486 #endif /* Not EBCDIC */
2487 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2488 break;
2489 }
2490 ecode++;
2491 break;
2492
2493 case OP_NOT_HSPACE:
2494 if (eptr >= md->end_subject)
2495 {
2496 SCHECK_PARTIAL();
2497 RRETURN(MATCH_NOMATCH);
2498 }
2499 GETCHARINCTEST(c, eptr);
2500 switch(c)
2501 {
2502 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2503 default: break;
2504 }
2505 ecode++;
2506 break;
2507
2508 case OP_HSPACE:
2509 if (eptr >= md->end_subject)
2510 {
2511 SCHECK_PARTIAL();
2512 RRETURN(MATCH_NOMATCH);
2513 }
2514 GETCHARINCTEST(c, eptr);
2515 switch(c)
2516 {
2517 HSPACE_CASES: break; /* Byte and multibyte cases */
2518 default: RRETURN(MATCH_NOMATCH);
2519 }
2520 ecode++;
2521 break;
2522
2523 case OP_NOT_VSPACE:
2524 if (eptr >= md->end_subject)
2525 {
2526 SCHECK_PARTIAL();
2527 RRETURN(MATCH_NOMATCH);
2528 }
2529 GETCHARINCTEST(c, eptr);
2530 switch(c)
2531 {
2532 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2533 default: break;
2534 }
2535 ecode++;
2536 break;
2537
2538 case OP_VSPACE:
2539 if (eptr >= md->end_subject)
2540 {
2541 SCHECK_PARTIAL();
2542 RRETURN(MATCH_NOMATCH);
2543 }
2544 GETCHARINCTEST(c, eptr);
2545 switch(c)
2546 {
2547 VSPACE_CASES: break;
2548 default: RRETURN(MATCH_NOMATCH);
2549 }
2550 ecode++;
2551 break;
2552
2553 #ifdef SUPPORT_UCP
2554 /* Check the next character by Unicode property. We will get here only
2555 if the support is in the binary; otherwise a compile-time error occurs. */
2556
2557 case OP_PROP:
2558 case OP_NOTPROP:
2559 if (eptr >= md->end_subject)
2560 {
2561 SCHECK_PARTIAL();
2562 RRETURN(MATCH_NOMATCH);
2563 }
2564 GETCHARINCTEST(c, eptr);
2565 {
2566 const pcre_uint32 *cp;
2567 const ucd_record *prop = GET_UCD(c);
2568
2569 switch(ecode[1])
2570 {
2571 case PT_ANY:
2572 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2573 break;
2574
2575 case PT_LAMP:
2576 if ((prop->chartype == ucp_Lu ||
2577 prop->chartype == ucp_Ll ||
2578 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2579 RRETURN(MATCH_NOMATCH);
2580 break;
2581
2582 case PT_GC:
2583 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2586
2587 case PT_PC:
2588 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2589 RRETURN(MATCH_NOMATCH);
2590 break;
2591
2592 case PT_SC:
2593 if ((ecode[2] != prop->script) == (op == OP_PROP))
2594 RRETURN(MATCH_NOMATCH);
2595 break;
2596
2597 /* These are specials */
2598
2599 case PT_ALNUM:
2600 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2601 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2604
2605 case PT_SPACE: /* Perl space */
2606 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2607 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2608 == (op == OP_NOTPROP))
2609 RRETURN(MATCH_NOMATCH);
2610 break;
2611
2612 case PT_PXSPACE: /* POSIX space */
2613 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2614 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2615 c == CHAR_FF || c == CHAR_CR)
2616 == (op == OP_NOTPROP))
2617 RRETURN(MATCH_NOMATCH);
2618 break;
2619
2620 case PT_WORD:
2621 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2622 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2623 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2624 RRETURN(MATCH_NOMATCH);
2625 break;
2626
2627 case PT_CLIST:
2628 cp = PRIV(ucd_caseless_sets) + ecode[2];
2629 for (;;)
2630 {
2631 if (c < *cp)
2632 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2633 if (c == *cp++)
2634 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2635 }
2636 break;
2637
2638 case PT_UCNC:
2639 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2640 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2641 c >= 0xe000) == (op == OP_NOTPROP))
2642 RRETURN(MATCH_NOMATCH);
2643 break;
2644
2645 /* This should never occur */
2646
2647 default:
2648 RRETURN(PCRE_ERROR_INTERNAL);
2649 }
2650
2651 ecode += 3;
2652 }
2653 break;
2654
2655 /* Match an extended Unicode sequence. We will get here only if the support
2656 is in the binary; otherwise a compile-time error occurs. */
2657
2658 case OP_EXTUNI:
2659 if (eptr >= md->end_subject)
2660 {
2661 SCHECK_PARTIAL();
2662 RRETURN(MATCH_NOMATCH);
2663 }
2664 else
2665 {
2666 int lgb, rgb;
2667 GETCHARINCTEST(c, eptr);
2668 lgb = UCD_GRAPHBREAK(c);
2669 while (eptr < md->end_subject)
2670 {
2671 int len = 1;
2672 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2673 rgb = UCD_GRAPHBREAK(c);
2674 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2675 lgb = rgb;
2676 eptr += len;
2677 }
2678 }
2679 CHECK_PARTIAL();
2680 ecode++;
2681 break;
2682 #endif /* SUPPORT_UCP */
2683
2684
2685 /* Match a back reference, possibly repeatedly. Look past the end of the
2686 item to see if there is repeat information following. The code is similar
2687 to that for character classes, but repeated for efficiency. Then obey
2688 similar code to character type repeats - written out again for speed.
2689 However, if the referenced string is the empty string, always treat
2690 it as matched, any number of times (otherwise there could be infinite
2691 loops). */
2692
2693 case OP_REF:
2694 case OP_REFI:
2695 caseless = op == OP_REFI;
2696 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2697 ecode += 1 + IMM2_SIZE;
2698
2699 /* If the reference is unset, there are two possibilities:
2700
2701 (a) In the default, Perl-compatible state, set the length negative;
2702 this ensures that every attempt at a match fails. We can't just fail
2703 here, because of the possibility of quantifiers with zero minima.
2704
2705 (b) If the JavaScript compatibility flag is set, set the length to zero
2706 so that the back reference matches an empty string.
2707
2708 Otherwise, set the length to the length of what was matched by the
2709 referenced subpattern. */
2710
2711 if (offset >= offset_top || md->offset_vector[offset] < 0)
2712 length = (md->jscript_compat)? 0 : -1;
2713 else
2714 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2715
2716 /* Set up for repetition, or handle the non-repeated case */
2717
2718 switch (*ecode)
2719 {
2720 case OP_CRSTAR:
2721 case OP_CRMINSTAR:
2722 case OP_CRPLUS:
2723 case OP_CRMINPLUS:
2724 case OP_CRQUERY:
2725 case OP_CRMINQUERY:
2726 c = *ecode++ - OP_CRSTAR;
2727 minimize = (c & 1) != 0;
2728 min = rep_min[c]; /* Pick up values from tables; */
2729 max = rep_max[c]; /* zero for max => infinity */
2730 if (max == 0) max = INT_MAX;
2731 break;
2732
2733 case OP_CRRANGE:
2734 case OP_CRMINRANGE:
2735 minimize = (*ecode == OP_CRMINRANGE);
2736 min = GET2(ecode, 1);
2737 max = GET2(ecode, 1 + IMM2_SIZE);
2738 if (max == 0) max = INT_MAX;
2739 ecode += 1 + 2 * IMM2_SIZE;
2740 break;
2741
2742 default: /* No repeat follows */
2743 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2744 {
2745 if (length == -2) eptr = md->end_subject; /* Partial match */
2746 CHECK_PARTIAL();
2747 RRETURN(MATCH_NOMATCH);
2748 }
2749 eptr += length;
2750 continue; /* With the main loop */
2751 }
2752
2753 /* Handle repeated back references. If the length of the reference is
2754 zero, just continue with the main loop. If the length is negative, it
2755 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2756 zero, we can continue at the same level without recursion. For any other
2757 minimum, carrying on will result in NOMATCH. */
2758
2759 if (length == 0) continue;
2760 if (length < 0 && min == 0) continue;
2761
2762 /* First, ensure the minimum number of matches are present. We get back
2763 the length of the reference string explicitly rather than passing the
2764 address of eptr, so that eptr can be a register variable. */
2765
2766 for (i = 1; i <= min; i++)
2767 {
2768 int slength;
2769 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2770 {
2771 if (slength == -2) eptr = md->end_subject; /* Partial match */
2772 CHECK_PARTIAL();
2773 RRETURN(MATCH_NOMATCH);
2774 }
2775 eptr += slength;
2776 }
2777
2778 /* If min = max, continue at the same level without recursion.
2779 They are not both allowed to be zero. */
2780
2781 if (min == max) continue;
2782
2783 /* If minimizing, keep trying and advancing the pointer */
2784
2785 if (minimize)
2786 {
2787 for (fi = min;; fi++)
2788 {
2789 int slength;
2790 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2792 if (fi >= max) RRETURN(MATCH_NOMATCH);
2793 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2794 {
2795 if (slength == -2) eptr = md->end_subject; /* Partial match */
2796 CHECK_PARTIAL();
2797 RRETURN(MATCH_NOMATCH);
2798 }
2799 eptr += slength;
2800 }
2801 /* Control never gets here */
2802 }
2803
2804 /* If maximizing, find the longest string and work backwards */
2805
2806 else
2807 {
2808 pp = eptr;
2809 for (i = min; i < max; i++)
2810 {
2811 int slength;
2812 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2813 {
2814 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2815 the soft partial matching case. */
2816
2817 if (slength == -2 && md->partial != 0 &&
2818 md->end_subject > md->start_used_ptr)
2819 {
2820 md->hitend = TRUE;
2821 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2822 }
2823 break;
2824 }
2825 eptr += slength;
2826 }
2827
2828 while (eptr >= pp)
2829 {
2830 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2832 eptr -= length;
2833 }
2834 RRETURN(MATCH_NOMATCH);
2835 }
2836 /* Control never gets here */
2837
2838 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2839 used when all the characters in the class have values in the range 0-255,
2840 and either the matching is caseful, or the characters are in the range
2841 0-127 when UTF-8 processing is enabled. The only difference between
2842 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2843 encountered.
2844
2845 First, look past the end of the item to see if there is repeat information
2846 following. Then obey similar code to character type repeats - written out
2847 again for speed. */
2848
2849 case OP_NCLASS:
2850 case OP_CLASS:
2851 {
2852 /* The data variable is saved across frames, so the byte map needs to
2853 be stored there. */
2854 #define BYTE_MAP ((pcre_uint8 *)data)
2855 data = ecode + 1; /* Save for matching */
2856 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2857
2858 switch (*ecode)
2859 {
2860 case OP_CRSTAR:
2861 case OP_CRMINSTAR:
2862 case OP_CRPLUS:
2863 case OP_CRMINPLUS:
2864 case OP_CRQUERY:
2865 case OP_CRMINQUERY:
2866 c = *ecode++ - OP_CRSTAR;
2867 minimize = (c & 1) != 0;
2868 min = rep_min[c]; /* Pick up values from tables; */
2869 max = rep_max[c]; /* zero for max => infinity */
2870 if (max == 0) max = INT_MAX;
2871 break;
2872
2873 case OP_CRRANGE:
2874 case OP_CRMINRANGE:
2875 minimize = (*ecode == OP_CRMINRANGE);
2876 min = GET2(ecode, 1);
2877 max = GET2(ecode, 1 + IMM2_SIZE);
2878 if (max == 0) max = INT_MAX;
2879 ecode += 1 + 2 * IMM2_SIZE;
2880 break;
2881
2882 default: /* No repeat follows */
2883 min = max = 1;
2884 break;
2885 }
2886
2887 /* First, ensure the minimum number of matches are present. */
2888
2889 #ifdef SUPPORT_UTF
2890 if (utf)
2891 {
2892 for (i = 1; i <= min; i++)
2893 {
2894 if (eptr >= md->end_subject)
2895 {
2896 SCHECK_PARTIAL();
2897 RRETURN(MATCH_NOMATCH);
2898 }
2899 GETCHARINC(c, eptr);
2900 if (c > 255)
2901 {
2902 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2903 }
2904 else
2905 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2906 }
2907 }
2908 else
2909 #endif
2910 /* Not UTF mode */
2911 {
2912 for (i = 1; i <= min; i++)
2913 {
2914 if (eptr >= md->end_subject)
2915 {
2916 SCHECK_PARTIAL();
2917 RRETURN(MATCH_NOMATCH);
2918 }
2919 c = *eptr++;
2920 #ifndef COMPILE_PCRE8
2921 if (c > 255)
2922 {
2923 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2924 }
2925 else
2926 #endif
2927 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2928 }
2929 }
2930
2931 /* If max == min we can continue with the main loop without the
2932 need to recurse. */
2933
2934 if (min == max) continue;
2935
2936 /* If minimizing, keep testing the rest of the expression and advancing
2937 the pointer while it matches the class. */
2938
2939 if (minimize)
2940 {
2941 #ifdef SUPPORT_UTF
2942 if (utf)
2943 {
2944 for (fi = min;; fi++)
2945 {
2946 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2948 if (fi >= max) RRETURN(MATCH_NOMATCH);
2949 if (eptr >= md->end_subject)
2950 {
2951 SCHECK_PARTIAL();
2952 RRETURN(MATCH_NOMATCH);
2953 }
2954 GETCHARINC(c, eptr);
2955 if (c > 255)
2956 {
2957 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2958 }
2959 else
2960 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2961 }
2962 }
2963 else
2964 #endif
2965 /* Not UTF mode */
2966 {
2967 for (fi = min;; fi++)
2968 {
2969 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2971 if (fi >= max) RRETURN(MATCH_NOMATCH);
2972 if (eptr >= md->end_subject)
2973 {
2974 SCHECK_PARTIAL();
2975 RRETURN(MATCH_NOMATCH);
2976 }
2977 c = *eptr++;
2978 #ifndef COMPILE_PCRE8
2979 if (c > 255)
2980 {
2981 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2982 }
2983 else
2984 #endif
2985 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2986 }
2987 }
2988 /* Control never gets here */
2989 }
2990
2991 /* If maximizing, find the longest possible run, then work backwards. */
2992
2993 else
2994 {
2995 pp = eptr;
2996
2997 #ifdef SUPPORT_UTF
2998 if (utf)
2999 {
3000 for (i = min; i < max; i++)
3001 {
3002 int len = 1;
3003 if (eptr >= md->end_subject)
3004 {
3005 SCHECK_PARTIAL();
3006 break;
3007 }
3008 GETCHARLEN(c, eptr, len);
3009 if (c > 255)
3010 {
3011 if (op == OP_CLASS) break;
3012 }
3013 else
3014 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3015 eptr += len;
3016 }
3017 for (;;)
3018 {
3019 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3020 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3021 if (eptr-- == pp) break; /* Stop if tried at original pos */
3022 BACKCHAR(eptr);
3023 }
3024 }
3025 else
3026 #endif
3027 /* Not UTF mode */
3028 {
3029 for (i = min; i < max; i++)
3030 {
3031 if (eptr >= md->end_subject)
3032 {
3033 SCHECK_PARTIAL();
3034 break;
3035 }
3036 c = *eptr;
3037 #ifndef COMPILE_PCRE8
3038 if (c > 255)
3039 {
3040 if (op == OP_CLASS) break;
3041 }
3042 else
3043 #endif
3044 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3045 eptr++;
3046 }
3047 while (eptr >= pp)
3048 {
3049 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3051 eptr--;
3052 }
3053 }
3054
3055 RRETURN(MATCH_NOMATCH);
3056 }
3057 #undef BYTE_MAP
3058 }
3059 /* Control never gets here */
3060
3061
3062 /* Match an extended character class. This opcode is encountered only
3063 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3064 mode, because Unicode properties are supported in non-UTF-8 mode. */
3065
3066 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3067 case OP_XCLASS:
3068 {
3069 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3070 ecode += GET(ecode, 1); /* Advance past the item */
3071
3072 switch (*ecode)
3073 {
3074 case OP_CRSTAR:
3075 case OP_CRMINSTAR:
3076 case OP_CRPLUS:
3077 case OP_CRMINPLUS:
3078 case OP_CRQUERY:
3079 case OP_CRMINQUERY:
3080 c = *ecode++ - OP_CRSTAR;
3081 minimize = (c & 1) != 0;
3082 min = rep_min[c]; /* Pick up values from tables; */
3083 max = rep_max[c]; /* zero for max => infinity */
3084 if (max == 0) max = INT_MAX;
3085 break;
3086
3087 case OP_CRRANGE:
3088 case OP_CRMINRANGE:
3089 minimize = (*ecode == OP_CRMINRANGE);
3090 min = GET2(ecode, 1);
3091 max = GET2(ecode, 1 + IMM2_SIZE);
3092 if (max == 0) max = INT_MAX;
3093 ecode += 1 + 2 * IMM2_SIZE;
3094 break;
3095
3096 default: /* No repeat follows */
3097 min = max = 1;
3098 break;
3099 }
3100
3101 /* First, ensure the minimum number of matches are present. */
3102
3103 for (i = 1; i <= min; i++)
3104 {
3105 if (eptr >= md->end_subject)
3106 {
3107 SCHECK_PARTIAL();
3108 RRETURN(MATCH_NOMATCH);
3109 }
3110 GETCHARINCTEST(c, eptr);
3111 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3112 }
3113
3114 /* If max == min we can continue with the main loop without the
3115 need to recurse. */
3116
3117 if (min == max) continue;
3118
3119 /* If minimizing, keep testing the rest of the expression and advancing
3120 the pointer while it matches the class. */
3121
3122 if (minimize)
3123 {
3124 for (fi = min;; fi++)
3125 {
3126 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3127 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3128 if (fi >= max) RRETURN(MATCH_NOMATCH);
3129 if (eptr >= md->end_subject)
3130 {
3131 SCHECK_PARTIAL();
3132 RRETURN(MATCH_NOMATCH);
3133 }
3134 GETCHARINCTEST(c, eptr);
3135 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3136 }
3137 /* Control never gets here */
3138 }
3139
3140 /* If maximizing, find the longest possible run, then work backwards. */
3141
3142 else
3143 {
3144 pp = eptr;
3145 for (i = min; i < max; i++)
3146 {
3147 int len = 1;
3148 if (eptr >= md->end_subject)
3149 {
3150 SCHECK_PARTIAL();
3151 break;
3152 }
3153 #ifdef SUPPORT_UTF
3154 GETCHARLENTEST(c, eptr, len);
3155 #else
3156 c = *eptr;
3157 #endif
3158 if (!PRIV(xclass)(c, data, utf)) break;
3159 eptr += len;
3160 }
3161 for(;;)
3162 {
3163 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3164 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3165 if (eptr-- == pp) break; /* Stop if tried at original pos */
3166 #ifdef SUPPORT_UTF
3167 if (utf) BACKCHAR(eptr);
3168 #endif
3169 }
3170 RRETURN(MATCH_NOMATCH);
3171 }
3172
3173 /* Control never gets here */
3174 }
3175 #endif /* End of XCLASS */
3176
3177 /* Match a single character, casefully */
3178
3179 case OP_CHAR:
3180 #ifdef SUPPORT_UTF
3181 if (utf)
3182 {
3183 length = 1;
3184 ecode++;
3185 GETCHARLEN(fc, ecode, length);
3186 if (length > md->end_subject - eptr)
3187 {
3188 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3189 RRETURN(MATCH_NOMATCH);
3190 }
3191 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3192 }
3193 else
3194 #endif
3195 /* Not UTF mode */
3196 {
3197 if (md->end_subject - eptr < 1)
3198 {
3199 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3200 RRETURN(MATCH_NOMATCH);
3201 }
3202 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3203 ecode += 2;
3204 }
3205 break;
3206
3207 /* Match a single character, caselessly. If we are at the end of the
3208 subject, give up immediately. */
3209
3210 case OP_CHARI:
3211 if (eptr >= md->end_subject)
3212 {
3213 SCHECK_PARTIAL();
3214 RRETURN(MATCH_NOMATCH);
3215 }
3216
3217 #ifdef SUPPORT_UTF
3218 if (utf)
3219 {
3220 length = 1;
3221 ecode++;
3222 GETCHARLEN(fc, ecode, length);
3223
3224 /* If the pattern character's value is < 128, we have only one byte, and
3225 we know that its other case must also be one byte long, so we can use the
3226 fast lookup table. We know that there is at least one byte left in the
3227 subject. */
3228
3229 if (fc < 128)
3230 {
3231 pcre_uint32 cc = RAWUCHAR(eptr);
3232 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3233 ecode++;
3234 eptr++;
3235 }
3236
3237 /* Otherwise we must pick up the subject character. Note that we cannot
3238 use the value of "length" to check for sufficient bytes left, because the
3239 other case of the character may have more or fewer bytes. */
3240
3241 else
3242 {
3243 pcre_uint32 dc;
3244 GETCHARINC(dc, eptr);
3245 ecode += length;
3246
3247 /* If we have Unicode property support, we can use it to test the other
3248 case of the character, if there is one. */
3249
3250 if (fc != dc)
3251 {
3252 #ifdef SUPPORT_UCP
3253 if (dc != UCD_OTHERCASE(fc))
3254 #endif
3255 RRETURN(MATCH_NOMATCH);
3256 }
3257 }
3258 }
3259 else
3260 #endif /* SUPPORT_UTF */
3261
3262 /* Not UTF mode */
3263 {
3264 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3265 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3266 eptr++;
3267 ecode += 2;
3268 }
3269 break;
3270
3271 /* Match a single character repeatedly. */
3272
3273 case OP_EXACT:
3274 case OP_EXACTI:
3275 min = max = GET2(ecode, 1);
3276 ecode += 1 + IMM2_SIZE;
3277 goto REPEATCHAR;
3278
3279 case OP_POSUPTO:
3280 case OP_POSUPTOI:
3281 possessive = TRUE;
3282 /* Fall through */
3283
3284 case OP_UPTO:
3285 case OP_UPTOI:
3286 case OP_MINUPTO:
3287 case OP_MINUPTOI:
3288 min = 0;
3289 max = GET2(ecode, 1);
3290 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3291 ecode += 1 + IMM2_SIZE;
3292 goto REPEATCHAR;
3293
3294 case OP_POSSTAR:
3295 case OP_POSSTARI:
3296 possessive = TRUE;
3297 min = 0;
3298 max = INT_MAX;
3299 ecode++;
3300 goto REPEATCHAR;
3301
3302 case OP_POSPLUS:
3303 case OP_POSPLUSI:
3304 possessive = TRUE;
3305 min = 1;
3306 max = INT_MAX;
3307 ecode++;
3308 goto REPEATCHAR;
3309
3310 case OP_POSQUERY:
3311 case OP_POSQUERYI:
3312 possessive = TRUE;
3313 min = 0;
3314 max = 1;
3315 ecode++;
3316 goto REPEATCHAR;
3317
3318 case OP_STAR:
3319 case OP_STARI:
3320 case OP_MINSTAR:
3321 case OP_MINSTARI:
3322 case OP_PLUS:
3323 case OP_PLUSI:
3324 case OP_MINPLUS:
3325 case OP_MINPLUSI:
3326 case OP_QUERY:
3327 case OP_QUERYI:
3328 case OP_MINQUERY:
3329 case OP_MINQUERYI:
3330 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3331 minimize = (c & 1) != 0;
3332 min = rep_min[c]; /* Pick up values from tables; */
3333 max = rep_max[c]; /* zero for max => infinity */
3334 if (max == 0) max = INT_MAX;
3335
3336 /* Common code for all repeated single-character matches. */
3337
3338 REPEATCHAR:
3339 #ifdef SUPPORT_UTF
3340 if (utf)
3341 {
3342 length = 1;
3343 charptr = ecode;
3344 GETCHARLEN(fc, ecode, length);
3345 ecode += length;
3346
3347 /* Handle multibyte character matching specially here. There is
3348 support for caseless matching if UCP support is present. */
3349
3350 if (length > 1)
3351 {
3352 #ifdef SUPPORT_UCP
3353 pcre_uint32 othercase;
3354 if (op >= OP_STARI && /* Caseless */
3355 (othercase = UCD_OTHERCASE(fc)) != fc)
3356 oclength = PRIV(ord2utf)(othercase, occhars);
3357 else oclength = 0;
3358 #endif /* SUPPORT_UCP */
3359
3360 for (i = 1; i <= min; i++)
3361 {
3362 if (eptr <= md->end_subject - length &&
3363 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3364 #ifdef SUPPORT_UCP
3365 else if (oclength > 0 &&
3366 eptr <= md->end_subject - oclength &&
3367 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3368 #endif /* SUPPORT_UCP */
3369 else
3370 {
3371 CHECK_PARTIAL();
3372 RRETURN(MATCH_NOMATCH);
3373 }
3374 }
3375
3376 if (min == max) continue;
3377
3378 if (minimize)
3379 {
3380 for (fi = min;; fi++)
3381 {
3382 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3383 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3384 if (fi >= max) RRETURN(MATCH_NOMATCH);
3385 if (eptr <= md->end_subject - length &&
3386 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3387 #ifdef SUPPORT_UCP
3388 else if (oclength > 0 &&
3389 eptr <= md->end_subject - oclength &&
3390 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3391 #endif /* SUPPORT_UCP */
3392 else
3393 {
3394 CHECK_PARTIAL();
3395 RRETURN(MATCH_NOMATCH);
3396 }
3397 }
3398 /* Control never gets here */
3399 }
3400
3401 else /* Maximize */
3402 {
3403 pp = eptr;
3404 for (i = min; i < max; i++)
3405 {
3406 if (eptr <= md->end_subject - length &&
3407 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3408 #ifdef SUPPORT_UCP
3409 else if (oclength > 0 &&
3410 eptr <= md->end_subject - oclength &&
3411 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3412 #endif /* SUPPORT_UCP */
3413 else
3414 {
3415 CHECK_PARTIAL();
3416 break;
3417 }
3418 }
3419
3420 if (possessive) continue;
3421
3422 for(;;)
3423 {
3424 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3425 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3426 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3427 #ifdef SUPPORT_UCP
3428 eptr--;
3429 BACKCHAR(eptr);
3430 #else /* without SUPPORT_UCP */
3431 eptr -= length;
3432 #endif /* SUPPORT_UCP */
3433 }
3434 }
3435 /* Control never gets here */
3436 }
3437
3438 /* If the length of a UTF-8 character is 1, we fall through here, and
3439 obey the code as for non-UTF-8 characters below, though in this case the
3440 value of fc will always be < 128. */
3441 }
3442 else
3443 #endif /* SUPPORT_UTF */
3444 /* When not in UTF-8 mode, load a single-byte character. */
3445 fc = *ecode++;
3446
3447 /* The value of fc at this point is always one character, though we may
3448 or may not be in UTF mode. The code is duplicated for the caseless and
3449 caseful cases, for speed, since matching characters is likely to be quite
3450 common. First, ensure the minimum number of matches are present. If min =
3451 max, continue at the same level without recursing. Otherwise, if
3452 minimizing, keep trying the rest of the expression and advancing one
3453 matching character if failing, up to the maximum. Alternatively, if
3454 maximizing, find the maximum number of characters and work backwards. */
3455
3456 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3457 max, (char *)eptr));
3458
3459 if (op >= OP_STARI) /* Caseless */
3460 {
3461 #ifdef COMPILE_PCRE8
3462 /* fc must be < 128 if UTF is enabled. */
3463 foc = md->fcc[fc];
3464 #else
3465 #ifdef SUPPORT_UTF
3466 #ifdef SUPPORT_UCP
3467 if (utf && fc > 127)
3468 foc = UCD_OTHERCASE(fc);
3469 #else
3470 if (utf && fc > 127)
3471 foc = fc;
3472 #endif /* SUPPORT_UCP */
3473 else
3474 #endif /* SUPPORT_UTF */
3475 foc = TABLE_GET(fc, md->fcc, fc);
3476 #endif /* COMPILE_PCRE8 */
3477
3478 for (i = 1; i <= min; i++)
3479 {
3480 pcre_uint32 cc; /* Faster than pcre_uchar */
3481 if (eptr >= md->end_subject)
3482 {
3483 SCHECK_PARTIAL();
3484 RRETURN(MATCH_NOMATCH);
3485 }
3486 cc = RAWUCHARTEST(eptr);
3487 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3488 eptr++;
3489 }
3490 if (min == max) continue;
3491 if (minimize)
3492 {
3493 for (fi = min;; fi++)
3494 {
3495 pcre_uint32 cc; /* Faster than pcre_uchar */
3496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3498 if (fi >= max) RRETURN(MATCH_NOMATCH);
3499 if (eptr >= md->end_subject)
3500 {
3501 SCHECK_PARTIAL();
3502 RRETURN(MATCH_NOMATCH);
3503 }
3504 cc = RAWUCHARTEST(eptr);
3505 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3506 eptr++;
3507 }
3508 /* Control never gets here */
3509 }
3510 else /* Maximize */
3511 {
3512 pp = eptr;
3513 for (i = min; i < max; i++)
3514 {
3515 pcre_uint32 cc; /* Faster than pcre_uchar */
3516 if (eptr >= md->end_subject)
3517 {
3518 SCHECK_PARTIAL();
3519 break;
3520 }
3521 cc = RAWUCHARTEST(eptr);
3522 if (fc != cc && foc != cc) break;
3523 eptr++;
3524 }
3525
3526 if (possessive) continue;
3527
3528 while (eptr >= pp)
3529 {
3530 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3531 eptr--;
3532 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3533 }
3534 RRETURN(MATCH_NOMATCH);
3535 }
3536 /* Control never gets here */
3537 }
3538
3539 /* Caseful comparisons (includes all multi-byte characters) */
3540
3541 else
3542 {
3543 for (i = 1; i <= min; i++)
3544 {
3545 if (eptr >= md->end_subject)
3546 {
3547 SCHECK_PARTIAL();
3548 RRETURN(MATCH_NOMATCH);
3549 }
3550 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3551 }
3552
3553 if (min == max) continue;
3554
3555 if (minimize)
3556 {
3557 for (fi = min;; fi++)
3558 {
3559 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3561 if (fi >= max) RRETURN(MATCH_NOMATCH);
3562 if (eptr >= md->end_subject)
3563 {
3564 SCHECK_PARTIAL();
3565 RRETURN(MATCH_NOMATCH);
3566 }
3567 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3568 }
3569 /* Control never gets here */
3570 }
3571 else /* Maximize */
3572 {
3573 pp = eptr;
3574 for (i = min; i < max; i++)
3575 {
3576 if (eptr >= md->end_subject)
3577 {
3578 SCHECK_PARTIAL();
3579 break;
3580 }
3581 if (fc != RAWUCHARTEST(eptr)) break;
3582 eptr++;
3583 }
3584 if (possessive) continue;
3585
3586 while (eptr >= pp)
3587 {
3588 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3589 eptr--;
3590 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3591 }
3592 RRETURN(MATCH_NOMATCH);
3593 }
3594 }
3595 /* Control never gets here */
3596
3597 /* Match a negated single one-byte character. The character we are
3598 checking can be multibyte. */
3599
3600 case OP_NOT:
3601 case OP_NOTI:
3602 if (eptr >= md->end_subject)
3603 {
3604 SCHECK_PARTIAL();
3605 RRETURN(MATCH_NOMATCH);
3606 }
3607 #ifdef SUPPORT_UTF
3608 if (utf)
3609 {
3610 register pcre_uint32 ch, och;
3611
3612 ecode++;
3613 GETCHARINC(ch, ecode);
3614 GETCHARINC(c, eptr);
3615
3616 if (op == OP_NOT)
3617 {
3618 if (ch == c) RRETURN(MATCH_NOMATCH);
3619 }
3620 else
3621 {
3622 #ifdef SUPPORT_UCP
3623 if (ch > 127)
3624 och = UCD_OTHERCASE(ch);
3625 #else
3626 if (ch > 127)
3627 och = ch;
3628 #endif /* SUPPORT_UCP */
3629 else
3630 och = TABLE_GET(ch, md->fcc, ch);
3631 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3632 }
3633 }
3634 else
3635 #endif
3636 {
3637 register pcre_uint32 ch = ecode[1];
3638 c = *eptr++;
3639 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3640 RRETURN(MATCH_NOMATCH);
3641 ecode += 2;
3642 }
3643 break;
3644
3645 /* Match a negated single one-byte character repeatedly. This is almost a
3646 repeat of the code for a repeated single character, but I haven't found a
3647 nice way of commoning these up that doesn't require a test of the
3648 positive/negative option for each character match. Maybe that wouldn't add
3649 very much to the time taken, but character matching *is* what this is all
3650 about... */
3651
3652 case OP_NOTEXACT:
3653 case OP_NOTEXACTI:
3654 min = max = GET2(ecode, 1);
3655 ecode += 1 + IMM2_SIZE;
3656 goto REPEATNOTCHAR;
3657
3658 case OP_NOTUPTO:
3659 case OP_NOTUPTOI:
3660 case OP_NOTMINUPTO:
3661 case OP_NOTMINUPTOI:
3662 min = 0;
3663 max = GET2(ecode, 1);
3664 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3665 ecode += 1 + IMM2_SIZE;
3666 goto REPEATNOTCHAR;
3667
3668 case OP_NOTPOSSTAR:
3669 case OP_NOTPOSSTARI:
3670 possessive = TRUE;
3671 min = 0;
3672 max = INT_MAX;
3673 ecode++;
3674 goto REPEATNOTCHAR;
3675
3676 case OP_NOTPOSPLUS:
3677 case OP_NOTPOSPLUSI:
3678 possessive = TRUE;
3679 min = 1;
3680 max = INT_MAX;
3681 ecode++;
3682 goto REPEATNOTCHAR;
3683
3684 case OP_NOTPOSQUERY:
3685 case OP_NOTPOSQUERYI:
3686 possessive = TRUE;
3687 min = 0;
3688 max = 1;
3689 ecode++;
3690 goto REPEATNOTCHAR;
3691
3692 case OP_NOTPOSUPTO:
3693 case OP_NOTPOSUPTOI:
3694 possessive = TRUE;
3695 min = 0;
3696 max = GET2(ecode, 1);
3697 ecode += 1 + IMM2_SIZE;
3698 goto REPEATNOTCHAR;
3699
3700 case OP_NOTSTAR:
3701 case OP_NOTSTARI:
3702 case OP_NOTMINSTAR:
3703 case OP_NOTMINSTARI:
3704 case OP_NOTPLUS:
3705 case OP_NOTPLUSI:
3706 case OP_NOTMINPLUS:
3707 case OP_NOTMINPLUSI:
3708 case OP_NOTQUERY:
3709 case OP_NOTQUERYI:
3710 case OP_NOTMINQUERY:
3711 case OP_NOTMINQUERYI:
3712 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3713 minimize = (c & 1) != 0;
3714 min = rep_min[c]; /* Pick up values from tables; */
3715 max = rep_max[c]; /* zero for max => infinity */
3716 if (max == 0) max = INT_MAX;
3717
3718 /* Common code for all repeated single-byte matches. */
3719
3720 REPEATNOTCHAR:
3721 GETCHARINCTEST(fc, ecode);
3722
3723 /* The code is duplicated for the caseless and caseful cases, for speed,
3724 since matching characters is likely to be quite common. First, ensure the
3725 minimum number of matches are present. If min = max, continue at the same
3726 level without recursing. Otherwise, if minimizing, keep trying the rest of
3727 the expression and advancing one matching character if failing, up to the
3728 maximum. Alternatively, if maximizing, find the maximum number of
3729 characters and work backwards. */
3730
3731 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3732 max, (char *)eptr));
3733
3734 if (op >= OP_NOTSTARI) /* Caseless */
3735 {
3736 #ifdef SUPPORT_UTF
3737 #ifdef SUPPORT_UCP
3738 if (utf && fc > 127)
3739 foc = UCD_OTHERCASE(fc);
3740 #else
3741 if (utf && fc > 127)
3742 foc = fc;
3743 #endif /* SUPPORT_UCP */
3744 else
3745 #endif /* SUPPORT_UTF */
3746 foc = TABLE_GET(fc, md->fcc, fc);
3747
3748 #ifdef SUPPORT_UTF
3749 if (utf)
3750 {
3751 register pcre_uint32 d;
3752 for (i = 1; i <= min; i++)
3753 {
3754 if (eptr >= md->end_subject)
3755 {
3756 SCHECK_PARTIAL();
3757 RRETURN(MATCH_NOMATCH);
3758 }
3759 GETCHARINC(d, eptr);
3760 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3761 }
3762 }
3763 else
3764 #endif
3765 /* Not UTF mode */
3766 {
3767 for (i = 1; i <= min; i++)
3768 {
3769 if (eptr >= md->end_subject)
3770 {
3771 SCHECK_PARTIAL();
3772 RRETURN(MATCH_NOMATCH);
3773 }
3774 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3775 eptr++;
3776 }
3777 }
3778
3779 if (min == max) continue;
3780
3781 if (minimize)
3782 {
3783 #ifdef SUPPORT_UTF
3784 if (utf)
3785 {
3786 register pcre_uint32 d;
3787 for (fi = min;; fi++)
3788 {
3789 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3791 if (fi >= max) RRETURN(MATCH_NOMATCH);
3792 if (eptr >= md->end_subject)
3793 {
3794 SCHECK_PARTIAL();
3795 RRETURN(MATCH_NOMATCH);
3796 }
3797 GETCHARINC(d, eptr);
3798 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3799 }
3800 }
3801 else
3802 #endif
3803 /* Not UTF mode */
3804 {
3805 for (fi = min;; fi++)
3806 {
3807 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3809 if (fi >= max) RRETURN(MATCH_NOMATCH);
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 RRETURN(MATCH_NOMATCH);
3814 }
3815 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3816 eptr++;
3817 }
3818 }
3819 /* Control never gets here */
3820 }
3821
3822 /* Maximize case */
3823
3824 else
3825 {
3826 pp = eptr;
3827
3828 #ifdef SUPPORT_UTF
3829 if (utf)
3830 {
3831 register pcre_uint32 d;
3832 for (i = min; i < max; i++)
3833 {
3834 int len = 1;
3835 if (eptr >= md->end_subject)
3836 {
3837 SCHECK_PARTIAL();
3838 break;
3839 }
3840 GETCHARLEN(d, eptr, len);
3841 if (fc == d || (unsigned int)foc == d) break;
3842 eptr += len;
3843 }
3844 if (possessive) continue;
3845 for(;;)
3846 {
3847 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3849 if (eptr-- == pp) break; /* Stop if tried at original pos */
3850 BACKCHAR(eptr);
3851 }
3852 }
3853 else
3854 #endif
3855 /* Not UTF mode */
3856 {
3857 for (i = min; i < max; i++)
3858 {
3859 if (eptr >= md->end_subject)
3860 {
3861 SCHECK_PARTIAL();
3862 break;
3863 }
3864 if (fc == *eptr || foc == *eptr) break;
3865 eptr++;
3866 }
3867 if (possessive) continue;
3868 while (eptr >= pp)
3869 {
3870 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3872 eptr--;
3873 }
3874 }
3875
3876 RRETURN(MATCH_NOMATCH);
3877 }
3878 /* Control never gets here */
3879 }
3880
3881 /* Caseful comparisons */
3882
3883 else
3884 {
3885 #ifdef SUPPORT_UTF
3886 if (utf)
3887 {
3888 register pcre_uint32 d;
3889 for (i = 1; i <= min; i++)
3890 {
3891 if (eptr >= md->end_subject)
3892 {
3893 SCHECK_PARTIAL();
3894 RRETURN(MATCH_NOMATCH);
3895 }
3896 GETCHARINC(d, eptr);
3897 if (fc == d) RRETURN(MATCH_NOMATCH);
3898 }
3899 }
3900 else
3901 #endif
3902 /* Not UTF mode */
3903 {
3904 for (i = 1; i <= min; i++)
3905 {
3906 if (eptr >= md->end_subject)
3907 {
3908 SCHECK_PARTIAL();
3909 RRETURN(MATCH_NOMATCH);
3910 }
3911 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3912 }
3913 }
3914
3915 if (min == max) continue;
3916
3917 if (minimize)
3918 {
3919 #ifdef SUPPORT_UTF
3920 if (utf)
3921 {
3922 register pcre_uint32 d;
3923 for (fi = min;; fi++)
3924 {
3925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3927 if (fi >= max) RRETURN(MATCH_NOMATCH);
3928 if (eptr >= md->end_subject)
3929 {
3930 SCHECK_PARTIAL();
3931 RRETURN(MATCH_NOMATCH);
3932 }
3933 GETCHARINC(d, eptr);
3934 if (fc == d) RRETURN(MATCH_NOMATCH);
3935 }
3936 }
3937 else
3938 #endif
3939 /* Not UTF mode */
3940 {
3941 for (fi = min;; fi++)
3942 {
3943 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3945 if (fi >= max) RRETURN(MATCH_NOMATCH);
3946 if (eptr >= md->end_subject)
3947 {
3948 SCHECK_PARTIAL();
3949 RRETURN(MATCH_NOMATCH);
3950 }
3951 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3952 }
3953 }
3954 /* Control never gets here */
3955 }
3956
3957 /* Maximize case */
3958
3959 else
3960 {
3961 pp = eptr;
3962
3963 #ifdef SUPPORT_UTF
3964 if (utf)
3965 {
3966 register pcre_uint32 d;
3967 for (i = min; i < max; i++)
3968 {
3969 int len = 1;
3970 if (eptr >= md->end_subject)
3971 {
3972 SCHECK_PARTIAL();
3973 break;
3974 }
3975 GETCHARLEN(d, eptr, len);
3976 if (fc == d) break;
3977 eptr += len;
3978 }
3979 if (possessive) continue;
3980 for(;;)
3981 {
3982 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3983 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3984 if (eptr-- == pp) break; /* Stop if tried at original pos */
3985 BACKCHAR(eptr);
3986 }
3987 }
3988 else
3989 #endif
3990 /* Not UTF mode */
3991 {
3992 for (i = min; i < max; i++)
3993 {
3994 if (eptr >= md->end_subject)
3995 {
3996 SCHECK_PARTIAL();
3997 break;
3998 }
3999 if (fc == *eptr) break;
4000 eptr++;
4001 }
4002 if (possessive) continue;
4003 while (eptr >= pp)
4004 {
4005 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4007 eptr--;
4008 }
4009 }
4010
4011 RRETURN(MATCH_NOMATCH);
4012 }
4013 }
4014 /* Control never gets here */
4015
4016 /* Match a single character type repeatedly; several different opcodes
4017 share code. This is very similar to the code for single characters, but we
4018 repeat it in the interests of efficiency. */
4019
4020 case OP_TYPEEXACT:
4021 min = max = GET2(ecode, 1);
4022 minimize = TRUE;
4023 ecode += 1 + IMM2_SIZE;
4024 goto REPEATTYPE;
4025
4026 case OP_TYPEUPTO:
4027 case OP_TYPEMINUPTO:
4028 min = 0;
4029 max = GET2(ecode, 1);
4030 minimize = *ecode == OP_TYPEMINUPTO;
4031 ecode += 1 + IMM2_SIZE;
4032 goto REPEATTYPE;
4033
4034 case OP_TYPEPOSSTAR:
4035 possessive = TRUE;
4036 min = 0;
4037 max = INT_MAX;
4038 ecode++;
4039 goto REPEATTYPE;
4040
4041 case OP_TYPEPOSPLUS:
4042 possessive = TRUE;
4043 min = 1;
4044 max = INT_MAX;
4045 ecode++;
4046 goto REPEATTYPE;
4047
4048 case OP_TYPEPOSQUERY:
4049 possessive = TRUE;
4050 min = 0;
4051 max = 1;
4052 ecode++;
4053 goto REPEATTYPE;
4054
4055 case OP_TYPEPOSUPTO:
4056 possessive = TRUE;
4057 min = 0;
4058 max = GET2(ecode, 1);
4059 ecode += 1 + IMM2_SIZE;
4060 goto REPEATTYPE;
4061
4062 case OP_TYPESTAR:
4063 case OP_TYPEMINSTAR:
4064 case OP_TYPEPLUS:
4065 case OP_TYPEMINPLUS:
4066 case OP_TYPEQUERY:
4067 case OP_TYPEMINQUERY:
4068 c = *ecode++ - OP_TYPESTAR;
4069 minimize = (c & 1) != 0;
4070 min = rep_min[c]; /* Pick up values from tables; */
4071 max = rep_max[c]; /* zero for max => infinity */
4072 if (max == 0) max = INT_MAX;
4073
4074 /* Common code for all repeated single character type matches. Note that
4075 in UTF-8 mode, '.' matches a character of any length, but for the other
4076 character types, the valid characters are all one-byte long. */
4077
4078 REPEATTYPE:
4079 ctype = *ecode++; /* Code for the character type */
4080
4081 #ifdef SUPPORT_UCP
4082 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4083 {
4084 prop_fail_result = ctype == OP_NOTPROP;
4085 prop_type = *ecode++;
4086 prop_value = *ecode++;
4087 }
4088 else prop_type = -1;
4089 #endif
4090
4091 /* First, ensure the minimum number of matches are present. Use inline
4092 code for maximizing the speed, and do the type test once at the start
4093 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4094 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4095 and single-bytes. */
4096
4097 if (min > 0)
4098 {
4099 #ifdef SUPPORT_UCP
4100 if (prop_type >= 0)
4101 {
4102 switch(prop_type)
4103 {
4104 case PT_ANY:
4105 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4106 for (i = 1; i <= min; i++)
4107 {
4108 if (eptr >= md->end_subject)
4109 {
4110 SCHECK_PARTIAL();
4111 RRETURN(MATCH_NOMATCH);
4112 }
4113 GETCHARINCTEST(c, eptr);
4114 }
4115 break;
4116
4117 case PT_LAMP:
4118 for (i = 1; i <= min; i++)
4119 {
4120 int chartype;
4121 if (eptr >= md->end_subject)
4122 {
4123 SCHECK_PARTIAL();
4124 RRETURN(MATCH_NOMATCH);
4125 }
4126 GETCHARINCTEST(c, eptr);
4127 chartype = UCD_CHARTYPE(c);
4128 if ((chartype == ucp_Lu ||
4129 chartype == ucp_Ll ||
4130 chartype == ucp_Lt) == prop_fail_result)
4131 RRETURN(MATCH_NOMATCH);
4132 }
4133 break;
4134
4135 case PT_GC:
4136 for (i = 1; i <= min; i++)
4137 {
4138 if (eptr >= md->end_subject)
4139 {
4140 SCHECK_PARTIAL();
4141 RRETURN(MATCH_NOMATCH);
4142 }
4143 GETCHARINCTEST(c, eptr);
4144 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4145 RRETURN(MATCH_NOMATCH);
4146 }
4147 break;
4148
4149 case PT_PC:
4150 for (i = 1; i <= min; i++)
4151 {
4152 if (eptr >= md->end_subject)
4153 {
4154 SCHECK_PARTIAL();
4155 RRETURN(MATCH_NOMATCH);
4156 }
4157 GETCHARINCTEST(c, eptr);
4158 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4159 RRETURN(MATCH_NOMATCH);
4160 }
4161 break;
4162
4163 case PT_SC:
4164 for (i = 1; i <= min; i++)
4165 {
4166 if (eptr >= md->end_subject)
4167 {
4168 SCHECK_PARTIAL();
4169 RRETURN(MATCH_NOMATCH);
4170 }
4171 GETCHARINCTEST(c, eptr);
4172 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4173 RRETURN(MATCH_NOMATCH);
4174 }
4175 break;
4176
4177 case PT_ALNUM:
4178 for (i = 1; i <= min; i++)
4179 {
4180 int category;
4181 if (eptr >= md->end_subject)
4182 {
4183 SCHECK_PARTIAL();
4184 RRETURN(MATCH_NOMATCH);
4185 }
4186 GETCHARINCTEST(c, eptr);
4187 category = UCD_CATEGORY(c);
4188 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 break;
4192
4193 case PT_SPACE: /* Perl space */
4194 for (i = 1; i <= min; i++)
4195 {
4196 if (eptr >= md->end_subject)
4197 {
4198 SCHECK_PARTIAL();
4199 RRETURN(MATCH_NOMATCH);
4200 }
4201 GETCHARINCTEST(c, eptr);
4202 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4203 c == CHAR_FF || c == CHAR_CR)
4204 == prop_fail_result)
4205 RRETURN(MATCH_NOMATCH);
4206 }
4207 break;
4208
4209 case PT_PXSPACE: /* POSIX space */
4210 for (i = 1; i <= min; i++)
4211 {
4212 if (eptr >= md->end_subject)
4213 {
4214 SCHECK_PARTIAL();
4215 RRETURN(MATCH_NOMATCH);
4216 }
4217 GETCHARINCTEST(c, eptr);
4218 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4219 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4220 == prop_fail_result)
4221 RRETURN(MATCH_NOMATCH);
4222 }
4223 break;
4224
4225 case PT_WORD:
4226 for (i = 1; i <= min; i++)
4227 {
4228 int category;
4229 if (eptr >= md->end_subject)
4230 {
4231 SCHECK_PARTIAL();
4232 RRETURN(MATCH_NOMATCH);
4233 }
4234 GETCHARINCTEST(c, eptr);
4235 category = UCD_CATEGORY(c);
4236 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4237 == prop_fail_result)
4238 RRETURN(MATCH_NOMATCH);
4239 }
4240 break;
4241
4242 case PT_CLIST:
4243 for (i = 1; i <= min; i++)
4244 {
4245 const pcre_uint32 *cp;
4246 if (eptr >= md->end_subject)
4247 {
4248 SCHECK_PARTIAL();
4249 RRETURN(MATCH_NOMATCH);
4250 }
4251 GETCHARINCTEST(c, eptr);
4252 cp = PRIV(ucd_caseless_sets) + prop_value;
4253 for (;;)
4254 {
4255 if (c < *cp)
4256 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4257 if (c == *cp++)
4258 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4259 }
4260 }
4261 break;
4262
4263 case PT_UCNC:
4264 for (i = 1; i <= min; i++)
4265 {
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 RRETURN(MATCH_NOMATCH);
4270 }
4271 GETCHARINCTEST(c, eptr);
4272 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4273 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4274 c >= 0xe000) == prop_fail_result)
4275 RRETURN(MATCH_NOMATCH);
4276 }
4277 break;
4278
4279 /* This should not occur */
4280
4281 default:
4282 RRETURN(PCRE_ERROR_INTERNAL);
4283 }
4284 }
4285
4286 /* Match extended Unicode sequences. We will get here only if the
4287 support is in the binary; otherwise a compile-time error occurs. */
4288
4289 else if (ctype == OP_EXTUNI)
4290 {
4291 for (i = 1; i <= min; i++)
4292 {
4293 if (eptr >= md->end_subject)
4294 {
4295 SCHECK_PARTIAL();
4296 RRETURN(MATCH_NOMATCH);
4297 }
4298 else
4299 {
4300 int lgb, rgb;
4301 GETCHARINCTEST(c, eptr);
4302 lgb = UCD_GRAPHBREAK(c);
4303 while (eptr < md->end_subject)
4304 {
4305 int len = 1;
4306 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4307 rgb = UCD_GRAPHBREAK(c);
4308 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4309 lgb = rgb;
4310 eptr += len;
4311 }
4312 }
4313 CHECK_PARTIAL();
4314 }
4315 }
4316
4317 else
4318 #endif /* SUPPORT_UCP */
4319
4320 /* Handle all other cases when the coding is UTF-8, UTF-16, or UTF-32 */
4321
4322 #ifdef SUPPORT_UTF
4323 if (utf) switch(ctype)
4324 {
4325 case OP_ANY:
4326 for (i = 1; i <= min; i++)
4327 {
4328 if (eptr >= md->end_subject)
4329 {
4330 SCHECK_PARTIAL();
4331 RRETURN(MATCH_NOMATCH);
4332 }
4333 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4334 if (md->partial != 0 &&
4335 eptr + 1 >= md->end_subject &&
4336 NLBLOCK->nltype == NLTYPE_FIXED &&
4337 NLBLOCK->nllen == 2 &&
4338 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4339 {
4340 md->hitend = TRUE;
4341 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4342 }
4343 eptr++;
4344 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4345 }
4346 break;
4347
4348 case OP_ALLANY:
4349 for (i = 1; i <= min; i++)
4350 {
4351 if (eptr >= md->end_subject)
4352 {
4353 SCHECK_PARTIAL();
4354 RRETURN(MATCH_NOMATCH);
4355 }
4356 eptr++;
4357 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4358 }
4359 break;
4360
4361 case OP_ANYBYTE:
4362 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4363 eptr += min;
4364 break;
4365
4366 case OP_ANYNL:
4367 for (i = 1; i <= min; i++)
4368 {
4369 if (eptr >= md->end_subject)
4370 {
4371 SCHECK_PARTIAL();
4372 RRETURN(MATCH_NOMATCH);
4373 }
4374 GETCHARINC(c, eptr);
4375 switch(c)
4376 {
4377 default: RRETURN(MATCH_NOMATCH);
4378
4379 case CHAR_CR:
4380 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4381 break;
4382
4383 case CHAR_LF:
4384 break;
4385
4386 case CHAR_VT:
4387 case CHAR_FF:
4388 case CHAR_NEL:
4389 #ifndef EBCDIC
4390 case 0x2028:
4391 case 0x2029:
4392 #endif /* Not EBCDIC */
4393 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4394 break;
4395 }
4396 }
4397 break;
4398
4399 case OP_NOT_HSPACE:
4400 for (i = 1; i <= min; i++)
4401 {
4402 if (eptr >= md->end_subject)
4403 {
4404 SCHECK_PARTIAL();
4405 RRETURN(MATCH_NOMATCH);
4406 }
4407 GETCHARINC(c, eptr);
4408 switch(c)
4409 {
4410 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4411 default: break;
4412 }
4413 }
4414 break;
4415
4416 case OP_HSPACE:
4417 for (i = 1; i <= min; i++)
4418 {
4419 if (eptr >= md->end_subject)
4420 {
4421 SCHECK_PARTIAL();
4422 RRETURN(MATCH_NOMATCH);
4423 }
4424 GETCHARINC(c, eptr);
4425 switch(c)
4426 {
4427 HSPACE_CASES: break; /* Byte and multibyte cases */
4428 default: RRETURN(MATCH_NOMATCH);
4429 }
4430 }
4431 break;
4432
4433 case OP_NOT_VSPACE:
4434 for (i = 1; i <= min; i++)
4435 {
4436 if (eptr >= md->end_subject)
4437 {
4438 SCHECK_PARTIAL();
4439 RRETURN(MATCH_NOMATCH);
4440 }
4441 GETCHARINC(c, eptr);
4442 switch(c)
4443 {
4444 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4445 default: break;
4446 }
4447 }
4448 break;
4449
4450 case OP_VSPACE:
4451 for (i = 1; i <= min; i++)
4452 {
4453 if (eptr >= md->end_subject)
4454 {
4455 SCHECK_PARTIAL();
4456 RRETURN(MATCH_NOMATCH);
4457 }
4458 GETCHARINC(c, eptr);
4459 switch(c)
4460 {
4461 VSPACE_CASES: break;
4462 default: RRETURN(MATCH_NOMATCH);
4463 }
4464 }
4465 break;
4466
4467 case OP_NOT_DIGIT:
4468 for (i = 1; i <= min; i++)
4469 {
4470 if (eptr >= md->end_subject)
4471 {
4472 SCHECK_PARTIAL();
4473 RRETURN(MATCH_NOMATCH);
4474 }
4475 GETCHARINC(c, eptr);
4476 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4477 RRETURN(MATCH_NOMATCH);
4478 }
4479 break;
4480
4481 case OP_DIGIT:
4482 for (i = 1; i <= min; i++)
4483 {
4484 pcre_uint32 cc;
4485 if (eptr >= md->end_subject)
4486 {
4487 SCHECK_PARTIAL();
4488 RRETURN(MATCH_NOMATCH);
4489 }
4490 cc = RAWUCHAR(eptr);
4491 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4492 RRETURN(MATCH_NOMATCH);
4493 eptr++;
4494 /* No need to skip more bytes - we know it's a 1-byte character */
4495 }
4496 break;
4497
4498 case OP_NOT_WHITESPACE:
4499 for (i = 1; i <= min; i++)
4500 {
4501 pcre_uint32 cc;
4502 if (eptr >= md->end_subject)
4503 {
4504 SCHECK_PARTIAL();
4505 RRETURN(MATCH_NOMATCH);
4506 }
4507 cc = RAWUCHAR(eptr);
4508 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4509 RRETURN(MATCH_NOMATCH);
4510 eptr++;
4511 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4512 }
4513 break;
4514
4515 case OP_WHITESPACE:
4516 for (i = 1; i <= min; i++)
4517 {
4518 pcre_uint32 cc;
4519 if (eptr >= md->end_subject)
4520 {
4521 SCHECK_PARTIAL();
4522 RRETURN(MATCH_NOMATCH);
4523 }
4524 cc = RAWUCHAR(eptr);
4525 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4526 RRETURN(MATCH_NOMATCH);
4527 eptr++;
4528 /* No need to skip more bytes - we know it's a 1-byte character */
4529 }
4530 break;
4531
4532 case OP_NOT_WORDCHAR:
4533 for (i = 1; i <= min; i++)
4534 {
4535 pcre_uint32 cc;
4536 if (eptr >= md->end_subject)
4537 {
4538 SCHECK_PARTIAL();
4539 RRETURN(MATCH_NOMATCH);
4540 }
4541 cc = RAWUCHAR(eptr);
4542 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4543 RRETURN(MATCH_NOMATCH);
4544 eptr++;
4545 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4546 }
4547 break;
4548
4549 case OP_WORDCHAR:
4550 for (i = 1; i <= min; i++)
4551 {
4552 pcre_uint32 cc;
4553 if (eptr >= md->end_subject)
4554 {
4555 SCHECK_PARTIAL();
4556 RRETURN(MATCH_NOMATCH);
4557 }
4558 cc = RAWUCHAR(eptr);
4559 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4560 RRETURN(MATCH_NOMATCH);
4561 eptr++;
4562 /* No need to skip more bytes - we know it's a 1-byte character */
4563 }
4564 break;
4565
4566 default:
4567 RRETURN(PCRE_ERROR_INTERNAL);
4568 } /* End switch(ctype) */
4569
4570 else
4571 #endif /* SUPPORT_UTF */
4572
4573 /* Code for the non-UTF case for minimum matching of operators other
4574 than OP_PROP and OP_NOTPROP. */
4575
4576 switch(ctype)
4577 {
4578 case OP_ANY:
4579 for (i = 1; i <= min; i++)
4580 {
4581 if (eptr >= md->end_subject)
4582 {
4583 SCHECK_PARTIAL();
4584 RRETURN(MATCH_NOMATCH);
4585 }
4586 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4587 if (md->partial != 0 &&
4588 eptr + 1 >= md->end_subject &&
4589 NLBLOCK->nltype == NLTYPE_FIXED &&
4590 NLBLOCK->nllen == 2 &&
4591 *eptr == NLBLOCK->nl[0])
4592 {
4593 md->hitend = TRUE;
4594 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4595 }
4596 eptr++;
4597 }
4598 break;
4599
4600 case OP_ALLANY:
4601 if (eptr > md->end_subject - min)
4602 {
4603 SCHECK_PARTIAL();
4604 RRETURN(MATCH_NOMATCH);
4605 }
4606 eptr += min;
4607 break;
4608
4609 case OP_ANYBYTE:
4610 if (eptr > md->end_subject - min)
4611 {
4612 SCHECK_PARTIAL();
4613 RRETURN(MATCH_NOMATCH);
4614 }
4615 eptr += min;
4616 break;
4617
4618 case OP_ANYNL:
4619 for (i = 1; i <= min; i++)
4620 {
4621 if (eptr >= md->end_subject)
4622 {
4623 SCHECK_PARTIAL();
4624 RRETURN(MATCH_NOMATCH);
4625 }
4626 switch(*eptr++)
4627 {
4628 default: RRETURN(MATCH_NOMATCH);
4629
4630 case CHAR_CR:
4631 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4632 break;
4633
4634 case CHAR_LF:
4635 break;
4636
4637 case CHAR_VT:
4638 case CHAR_FF:
4639 case CHAR_NEL:
4640 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4641 case 0x2028:
4642 case 0x2029:
4643 #endif
4644 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4645 break;
4646 }
4647 }
4648 break;
4649
4650 case OP_NOT_HSPACE:
4651 for (i = 1; i <= min; i++)
4652 {
4653 if (eptr >= md->end_subject)
4654 {
4655 SCHECK_PARTIAL();
4656 RRETURN(MATCH_NOMATCH);
4657 }
4658 switch(*eptr++)
4659 {
4660 default: break;
4661 HSPACE_BYTE_CASES:
4662 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4663 HSPACE_MULTIBYTE_CASES:
4664 #endif
4665 RRETURN(MATCH_NOMATCH);
4666 }
4667 }
4668 break;
4669
4670 case OP_HSPACE:
4671 for (i = 1; i <= min; i++)
4672 {
4673 if (eptr >= md->end_subject)
4674 {
4675 SCHECK_PARTIAL();
4676 RRETURN(MATCH_NOMATCH);
4677 }
4678 switch(*eptr++)
4679 {
4680 default: RRETURN(MATCH_NOMATCH);
4681 HSPACE_BYTE_CASES:
4682 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4683 HSPACE_MULTIBYTE_CASES:
4684 #endif
4685 break;
4686 }
4687 }
4688 break;
4689
4690 case OP_NOT_VSPACE:
4691 for (i = 1; i <= min; i++)
4692 {
4693 if (eptr >= md->end_subject)
4694 {
4695 SCHECK_PARTIAL();
4696 RRETURN(MATCH_NOMATCH);
4697 }
4698 switch(*eptr++)
4699 {
4700 VSPACE_BYTE_CASES:
4701 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4702 VSPACE_MULTIBYTE_CASES:
4703 #endif
4704 RRETURN(MATCH_NOMATCH);
4705 default: break;
4706 }
4707 }
4708 break;
4709
4710 case OP_VSPACE:
4711 for (i = 1; i <= min; i++)
4712 {
4713 if (eptr >= md->end_subject)
4714 {
4715 SCHECK_PARTIAL();
4716 RRETURN(MATCH_NOMATCH);
4717 }
4718 switch(*eptr++)
4719 {
4720 default: RRETURN(MATCH_NOMATCH);
4721 VSPACE_BYTE_CASES:
4722 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4723 VSPACE_MULTIBYTE_CASES:
4724 #endif
4725 break;
4726 }
4727 }
4728 break;
4729
4730 case OP_NOT_DIGIT:
4731 for (i = 1; i <= min; i++)
4732 {
4733 if (eptr >= md->end_subject)
4734 {
4735 SCHECK_PARTIAL();
4736 RRETURN(MATCH_NOMATCH);
4737 }
4738 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4739 RRETURN(MATCH_NOMATCH);
4740 eptr++;
4741 }
4742 break;
4743
4744 case OP_DIGIT:
4745 for (i = 1; i <= min; i++)
4746 {
4747 if (eptr >= md->end_subject)
4748 {
4749 SCHECK_PARTIAL();
4750 RRETURN(MATCH_NOMATCH);
4751 }
4752 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4753 RRETURN(MATCH_NOMATCH);
4754 eptr++;
4755 }
4756 break;
4757
4758 case OP_NOT_WHITESPACE:
4759 for (i = 1; i <= min; i++)
4760 {
4761 if (eptr >= md->end_subject)
4762 {
4763 SCHECK_PARTIAL();
4764 RRETURN(MATCH_NOMATCH);
4765 }
4766 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4767 RRETURN(MATCH_NOMATCH);
4768 eptr++;
4769 }
4770 break;
4771
4772 case OP_WHITESPACE:
4773 for (i = 1; i <= min; i++)
4774 {
4775 if (eptr >= md->end_subject)
4776 {
4777 SCHECK_PARTIAL();
4778 RRETURN(MATCH_NOMATCH);
4779 }
4780 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4781 RRETURN(MATCH_NOMATCH);
4782 eptr++;
4783 }
4784 break;
4785
4786 case OP_NOT_WORDCHAR:
4787 for (i = 1; i <= min; i++)
4788 {
4789 if (eptr >= md->end_subject)
4790 {
4791 SCHECK_PARTIAL();
4792 RRETURN(MATCH_NOMATCH);
4793 }
4794 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4795 RRETURN(MATCH_NOMATCH);
4796 eptr++;
4797 }
4798 break;
4799
4800 case OP_WORDCHAR:
4801 for (i = 1; i <= min; i++)
4802 {
4803 if (eptr >= md->end_subject)
4804 {
4805 SCHECK_PARTIAL();
4806 RRETURN(MATCH_NOMATCH);
4807 }
4808 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4809 RRETURN(MATCH_NOMATCH);
4810 eptr++;
4811 }
4812 break;
4813
4814 default:
4815 RRETURN(PCRE_ERROR_INTERNAL);
4816 }
4817 }
4818
4819 /* If min = max, continue at the same level without recursing */
4820
4821 if (min == max) continue;
4822
4823 /* If minimizing, we have to test the rest of the pattern before each
4824 subsequent match. Again, separate the UTF case for speed, and also
4825 separate the UCP cases. */
4826
4827 if (minimize)
4828 {
4829 #ifdef SUPPORT_UCP
4830 if (prop_type >= 0)
4831 {
4832 switch(prop_type)
4833 {
4834 case PT_ANY:
4835 for (fi = min;; fi++)
4836 {
4837 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4838 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4839 if (fi >= max) RRETURN(MATCH_NOMATCH);
4840 if (eptr >= md->end_subject)
4841 {
4842 SCHECK_PARTIAL();
4843 RRETURN(MATCH_NOMATCH);
4844 }
4845 GETCHARINCTEST(c, eptr);
4846 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4847 }
4848 /* Control never gets here */
4849
4850 case PT_LAMP:
4851 for (fi = min;; fi++)
4852 {
4853 int chartype;
4854 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4855 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4856 if (fi >= max) RRETURN(MATCH_NOMATCH);
4857 if (eptr >= md->end_subject)
4858 {
4859 SCHECK_PARTIAL();
4860 RRETURN(MATCH_NOMATCH);
4861 }
4862 GETCHARINCTEST(c, eptr);
4863 chartype = UCD_CHARTYPE(c);
4864 if ((chartype == ucp_Lu ||
4865 chartype == ucp_Ll ||
4866 chartype == ucp_Lt) == prop_fail_result)
4867 RRETURN(MATCH_NOMATCH);
4868 }
4869 /* Control never gets here */
4870
4871 case PT_GC:
4872 for (fi = min;; fi++)
4873 {
4874 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4875 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4876 if (fi >= max) RRETURN(MATCH_NOMATCH);
4877 if (eptr >= md->end_subject)
4878 {
4879 SCHECK_PARTIAL();
4880 RRETURN(MATCH_NOMATCH);
4881 }
4882 GETCHARINCTEST(c, eptr);
4883 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4884 RRETURN(MATCH_NOMATCH);
4885 }
4886 /* Control never gets here */
4887
4888 case PT_PC:
4889 for (fi = min;; fi++)
4890 {
4891 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4893 if (fi >= max) RRETURN(MATCH_NOMATCH);
4894 if (eptr >= md->end_subject)
4895 {
4896 SCHECK_PARTIAL();
4897 RRETURN(MATCH_NOMATCH);
4898 }
4899 GETCHARINCTEST(c, eptr);
4900 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4901 RRETURN(MATCH_NOMATCH);
4902 }
4903 /* Control never gets here */
4904
4905 case PT_SC:
4906 for (fi = min;; fi++)
4907 {
4908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4910 if (fi >= max) RRETURN(MATCH_NOMATCH);
4911 if (eptr >= md->end_subject)
4912 {
4913 SCHECK_PARTIAL();
4914 RRETURN(MATCH_NOMATCH);
4915 }
4916 GETCHARINCTEST(c, eptr);
4917 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4918 RRETURN(MATCH_NOMATCH);
4919 }
4920 /* Control never gets here */
4921
4922 case PT_ALNUM:
4923 for (fi = min;; fi++)
4924 {
4925 int category;
4926 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928 if (fi >= max) RRETURN(MATCH_NOMATCH);
4929 if (eptr >= md->end_subject)
4930 {
4931 SCHECK_PARTIAL();
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 GETCHARINCTEST(c, eptr);
4935 category = UCD_CATEGORY(c);
4936 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4937 RRETURN(MATCH_NOMATCH);
4938 }
4939 /* Control never gets here */
4940
4941 case PT_SPACE: /* Perl space */
4942 for (fi = min;; fi++)
4943 {
4944 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4946 if (fi >= max) RRETURN(MATCH_NOMATCH);
4947 if (eptr >= md->end_subject)
4948 {
4949 SCHECK_PARTIAL();
4950 RRETURN(MATCH_NOMATCH);
4951 }
4952 GETCHARINCTEST(c, eptr);
4953 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4954 c == CHAR_FF || c == CHAR_CR)
4955 == prop_fail_result)
4956 RRETURN(MATCH_NOMATCH);
4957 }
4958 /* Control never gets here */
4959
4960 case PT_PXSPACE: /* POSIX space */
4961 for (fi = min;; fi++)
4962 {
4963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4965 if (fi >= max) RRETURN(MATCH_NOMATCH);
4966 if (eptr >= md->end_subject)
4967 {
4968 SCHECK_PARTIAL();
4969 RRETURN(MATCH_NOMATCH);
4970 }
4971 GETCHARINCTEST(c, eptr);
4972 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4973 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4974 == prop_fail_result)
4975 RRETURN(MATCH_NOMATCH);
4976 }
4977 /* Control never gets here */
4978
4979 case PT_WORD:
4980 for (fi = min;; fi++)
4981 {
4982 int category;
4983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4985 if (fi >= max) RRETURN(MATCH_NOMATCH);
4986 if (eptr >= md->end_subject)
4987 {
4988 SCHECK_PARTIAL();
4989 RRETURN(MATCH_NOMATCH);
4990 }
4991 GETCHARINCTEST(c, eptr);
4992 category = UCD_CATEGORY(c);
4993 if ((category == ucp_L ||
4994 category == ucp_N ||
4995 c == CHAR_UNDERSCORE)
4996 == prop_fail_result)
4997 RRETURN(MATCH_NOMATCH);
4998 }
4999 /* Control never gets here */
5000
5001 case PT_CLIST:
5002 for (fi = min;; fi++)
5003 {
5004 const pcre_uint32 *cp;
5005 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5007 if (fi >= max) RRETURN(MATCH_NOMATCH);
5008 if (eptr >= md->end_subject)
5009 {
5010 SCHECK_PARTIAL();
5011 RRETURN(MATCH_NOMATCH);
5012 }
5013 GETCHARINCTEST(c, eptr);
5014 cp = PRIV(ucd_caseless_sets) + prop_value;
5015 for (;;)
5016 {
5017 if (c < *cp)
5018 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5019 if (c == *cp++)
5020 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5021 }
5022 }
5023 /* Control never gets here */
5024
5025 case PT_UCNC:
5026 for (fi = min;; fi++)
5027 {
5028 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5030 if (fi >= max) RRETURN(MATCH_NOMATCH);
5031 if (eptr >= md->end_subject)
5032 {
5033 SCHECK_PARTIAL();
5034 RRETURN(MATCH_NOMATCH);
5035 }
5036 GETCHARINCTEST(c, eptr);
5037 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5038 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5039 c >= 0xe000) == prop_fail_result)
5040 RRETURN(MATCH_NOMATCH);
5041 }
5042 /* Control never gets here */
5043
5044 /* This should never occur */
5045 default:
5046 RRETURN(PCRE_ERROR_INTERNAL);
5047 }
5048 }
5049
5050 /* Match extended Unicode sequences. We will get here only if the
5051 support is in the binary; otherwise a compile-time error occurs. */
5052
5053 else if (ctype == OP_EXTUNI)
5054 {
5055 for (fi = min;; fi++)
5056 {
5057 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5059 if (fi >= max) RRETURN(MATCH_NOMATCH);
5060 if (eptr >= md->end_subject)
5061 {
5062 SCHECK_PARTIAL();
5063 RRETURN(MATCH_NOMATCH);
5064 }
5065 else
5066 {
5067 int lgb, rgb;
5068 GETCHARINCTEST(c, eptr);
5069 lgb = UCD_GRAPHBREAK(c);
5070 while (eptr < md->end_subject)
5071 {
5072 int len = 1;
5073 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5074 rgb = UCD_GRAPHBREAK(c);
5075 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5076 lgb = rgb;
5077 eptr += len;
5078 }
5079 }
5080 CHECK_PARTIAL();
5081 }
5082 }
5083 else
5084 #endif /* SUPPORT_UCP */
5085
5086 #ifdef SUPPORT_UTF
5087 if (utf)
5088 {
5089 for (fi = min;; fi++)
5090 {
5091 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5092 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5093 if (fi >= max) RRETURN(MATCH_NOMATCH);
5094 if (eptr >= md->end_subject)
5095 {
5096 SCHECK_PARTIAL();
5097 RRETURN(MATCH_NOMATCH);
5098 }
5099 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5100 RRETURN(MATCH_NOMATCH);
5101 GETCHARINC(c, eptr);
5102 switch(ctype)
5103 {
5104 case OP_ANY: /* This is the non-NL case */
5105 if (md->partial != 0 && /* Take care with CRLF partial */
5106 eptr >= md->end_subject &&
5107 NLBLOCK->nltype == NLTYPE_FIXED &&
5108 NLBLOCK->nllen == 2 &&
5109 c == NLBLOCK->nl[0])
5110 {
5111 md->hitend = TRUE;
5112 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5113 }
5114 break;
5115
5116 case OP_ALLANY:
5117 case OP_ANYBYTE:
5118 break;
5119
5120 case OP_ANYNL:
5121 switch(c)
5122 {
5123 default: RRETURN(MATCH_NOMATCH);
5124 case CHAR_CR:
5125 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5126 break;
5127
5128 case CHAR_LF:
5129 break;
5130
5131 case CHAR_VT:
5132 case CHAR_FF:
5133 case CHAR_NEL:
5134 #ifndef EBCDIC
5135 case 0x2028:
5136 case 0x2029:
5137 #endif /* Not EBCDIC */
5138 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5139 break;
5140 }
5141 break;
5142
5143 case OP_NOT_HSPACE:
5144 switch(c)
5145 {
5146 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5147 default: break;
5148 }
5149 break;
5150
5151 case OP_HSPACE:
5152 switch(c)
5153 {
5154 HSPACE_CASES: break;
5155 default: RRETURN(MATCH_NOMATCH);
5156 }
5157 break;
5158
5159 case OP_NOT_VSPACE:
5160 switch(c)
5161 {
5162 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5163 default: break;
5164 }
5165 break;
5166
5167 case OP_VSPACE:
5168 switch(c)
5169 {
5170 VSPACE_CASES: break;
5171 default: RRETURN(MATCH_NOMATCH);
5172 }
5173 break;
5174
5175 case OP_NOT_DIGIT:
5176 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5177 RRETURN(MATCH_NOMATCH);
5178 break;
5179
5180 case OP_DIGIT:
5181 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5182 RRETURN(MATCH_NOMATCH);
5183 break;
5184
5185 case OP_NOT_WHITESPACE:
5186 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5187 RRETURN(MATCH_NOMATCH);
5188 break;
5189
5190 case OP_WHITESPACE:
5191 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5192 RRETURN(MATCH_NOMATCH);
5193 break;
5194
5195 case OP_NOT_WORDCHAR:
5196 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5197 RRETURN(MATCH_NOMATCH);
5198 break;
5199
5200 case OP_WORDCHAR:
5201 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5202 RRETURN(MATCH_NOMATCH);
5203 break;
5204
5205 default:
5206 RRETURN(PCRE_ERROR_INTERNAL);
5207 }
5208 }
5209 }
5210 else
5211 #endif
5212 /* Not UTF mode */
5213 {
5214 for (fi = min;; fi++)
5215 {
5216 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5217 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5218 if (fi >= max) RRETURN(MATCH_NOMATCH);
5219 if (eptr >= md->end_subject)
5220 {
5221 SCHECK_PARTIAL();
5222 RRETURN(MATCH_NOMATCH);
5223 }
5224 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5225 RRETURN(MATCH_NOMATCH);
5226 c = *eptr++;
5227 switch(ctype)
5228 {
5229 case OP_ANY: /* This is the non-NL case */
5230 if (md->partial != 0 && /* Take care with CRLF partial */
5231 eptr >= md->end_subject &&
5232 NLBLOCK->nltype == NLTYPE_FIXED &&
5233 NLBLOCK->nllen == 2 &&
5234 c == NLBLOCK->nl[0])
5235 {
5236 md->hitend = TRUE;
5237 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5238 }
5239 break;
5240
5241 case OP_ALLANY:
5242 case OP_ANYBYTE:
5243 break;
5244
5245 case OP_ANYNL:
5246 switch(c)
5247 {
5248 default: RRETURN(MATCH_NOMATCH);
5249 case CHAR_CR:
5250 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5251 break;
5252
5253 case CHAR_LF:
5254 break;
5255
5256 case CHAR_VT:
5257 case CHAR_FF:
5258 case CHAR_NEL:
5259 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5260 case 0x2028:
5261 case 0x2029:
5262 #endif
5263 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5264 break;
5265 }
5266 break;
5267
5268 case OP_NOT_HSPACE:
5269 switch(c)
5270 {
5271 default: break;
5272 HSPACE_BYTE_CASES:
5273 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5274 HSPACE_MULTIBYTE_CASES:
5275 #endif
5276 RRETURN(MATCH_NOMATCH);
5277 }
5278 break;
5279
5280 case OP_HSPACE:
5281 switch(c)
5282 {
5283 default: RRETURN(MATCH_NOMATCH);
5284 HSPACE_BYTE_CASES:
5285 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5286 HSPACE_MULTIBYTE_CASES:
5287 #endif
5288 break;
5289 }
5290 break;
5291
5292 case OP_NOT_VSPACE:
5293 switch(c)
5294 {
5295 default: break;
5296 VSPACE_BYTE_CASES:
5297 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5298 VSPACE_MULTIBYTE_CASES:
5299 #endif
5300 RRETURN(MATCH_NOMATCH);
5301 }
5302 break;
5303
5304 case OP_VSPACE:
5305 switch(c)
5306 {
5307 default: RRETURN(MATCH_NOMATCH);
5308 VSPACE_BYTE_CASES:
5309 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5310 VSPACE_MULTIBYTE_CASES:
5311 #endif
5312 break;
5313 }
5314 break;
5315
5316 case OP_NOT_DIGIT:
5317 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5318 break;
5319
5320 case OP_DIGIT:
5321 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5322 break;
5323
5324 case OP_NOT_WHITESPACE:
5325 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5326 break;
5327
5328 case OP_WHITESPACE:
5329 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5330 break;
5331
5332 case OP_NOT_WORDCHAR:
5333 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5334 break;
5335
5336 case OP_WORDCHAR:
5337 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5338 break;
5339
5340 default:
5341 RRETURN(PCRE_ERROR_INTERNAL);
5342 }
5343 }
5344 }
5345 /* Control never gets here */
5346 }
5347
5348 /* If maximizing, it is worth using inline code for speed, doing the type
5349 test once at the start (i.e. keep it out of the loop). Again, keep the
5350 UTF and UCP code separate. */
5351
5352 else
5353 {
5354 pp = eptr; /* Remember where we started */
5355
5356 #ifdef SUPPORT_UCP
5357 if (prop_type >= 0)
5358 {
5359 switch(prop_type)
5360 {
5361 case PT_ANY:
5362 for (i = min; i < max; i++)
5363 {
5364 int len = 1;
5365 if (eptr >= md->end_subject)
5366 {
5367 SCHECK_PARTIAL();
5368 break;
5369 }
5370 GETCHARLENTEST(c, eptr, len);
5371 if (prop_fail_result) break;
5372 eptr+= len;
5373 }
5374 break;
5375
5376 case PT_LAMP:
5377 for (i = min; i < max; i++)
5378 {
5379 int chartype;
5380 int len = 1;
5381 if (eptr >= md->end_subject)
5382 {
5383 SCHECK_PARTIAL();
5384 break;
5385 }
5386 GETCHARLENTEST(c, eptr, len);
5387 chartype = UCD_CHARTYPE(c);
5388 if ((chartype == ucp_Lu ||
5389 chartype == ucp_Ll ||
5390 chartype == ucp_Lt) == prop_fail_result)
5391 break;
5392 eptr+= len;
5393 }
5394 break;
5395
5396 case PT_GC:
5397 for (i = min; i < max; i++)
5398 {
5399 int len = 1;
5400 if (eptr >= md->end_subject)
5401 {
5402 SCHECK_PARTIAL();
5403 break;
5404 }
5405 GETCHARLENTEST(c, eptr, len);
5406 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5407 eptr+= len;
5408 }
5409 break;
5410
5411 case PT_PC:
5412 for (i = min; i < max; i++)
5413 {
5414 int len = 1;
5415 if (eptr >= md->end_subject)
5416 {
5417 SCHECK_PARTIAL();
5418 break;
5419 }
5420 GETCHARLENTEST(c, eptr, len);
5421 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5422 eptr+= len;
5423 }
5424 break;
5425
5426 case PT_SC:
5427 for (i = min; i < max; i++)
5428 {
5429 int len = 1;
5430 if (eptr >= md->end_subject)
5431 {
5432 SCHECK_PARTIAL();
5433 break;
5434 }
5435 GETCHARLENTEST(c, eptr, len);
5436 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5437 eptr+= len;
5438 }
5439 break;
5440
5441 case PT_ALNUM:
5442 for (i = min; i < max; i++)
5443 {
5444 int category;
5445 int len = 1;
5446 if (eptr >= md->end_subject)
5447 {
5448 SCHECK_PARTIAL();
5449 break;
5450 }
5451 GETCHARLENTEST(c, eptr, len);
5452 category = UCD_CATEGORY(c);
5453 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5454 break;
5455 eptr+= len;
5456 }
5457 break;
5458
5459 case PT_SPACE: /* Perl space */
5460 for (i = min; i < max; i++)
5461 {
5462 int len = 1;
5463 if (eptr >= md->end_subject)
5464 {
5465 SCHECK_PARTIAL();
5466 break;
5467 }
5468 GETCHARLENTEST(c, eptr, len);
5469 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5470 c == CHAR_FF || c == CHAR_CR)
5471 == prop_fail_result)
5472 break;
5473 eptr+= len;
5474 }
5475 break;
5476
5477 case PT_PXSPACE: /* POSIX space */
5478 for (i = min; i < max; i++)
5479 {
5480 int len = 1;
5481 if (eptr >= md->end_subject)
5482 {
5483 SCHECK_PARTIAL();
5484 break;
5485 }
5486 GETCHARLENTEST(c, eptr, len);
5487 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5488 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5489 == prop_fail_result)
5490 break;
5491 eptr+= len;
5492 }
5493 break;
5494
5495 case PT_WORD:
5496 for (i = min; i < max; i++)
5497 {
5498 int category;
5499 int len = 1;
5500 if (eptr >= md->end_subject)
5501 {
5502 SCHECK_PARTIAL();
5503 break;
5504 }
5505 GETCHARLENTEST(c, eptr, len);
5506 category = UCD_CATEGORY(c);
5507 if ((category == ucp_L || category == ucp_N ||
5508 c == CHAR_UNDERSCORE) == prop_fail_result)
5509 break;
5510 eptr+= len;
5511 }
5512 break;
5513
5514 case PT_CLIST:
5515 for (i = min; i < max; i++)
5516 {
5517 const pcre_uint32 *cp;
5518 int len = 1;
5519 if (eptr >= md->end_subject)
5520 {
5521 SCHECK_PARTIAL();
5522 break;
5523 }
5524 GETCHARLENTEST(c, eptr, len);
5525 cp = PRIV(ucd_caseless_sets) + prop_value;
5526 for (;;)
5527 {
5528 if (c < *cp)
5529 { if (prop_fail_result) break; else goto GOT_MAX; }
5530 if (c == *cp++)
5531 { if (prop_fail_result) goto GOT_MAX; else break; }
5532 }
5533 eptr += len;
5534 }
5535 GOT_MAX:
5536 break;
5537
5538 case PT_UCNC:
5539 for (i = min; i < max; i++)
5540 {
5541 int len = 1;
5542 if (eptr >= md->end_subject)
5543 {
5544 SCHECK_PARTIAL();
5545 break;
5546 }
5547 GETCHARLENTEST(c, eptr, len);
5548 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5549 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5550 c >= 0xe000) == prop_fail_result)
5551 break;
5552 eptr += len;
5553 }
5554 break;
5555
5556 default:
5557 RRETURN(PCRE_ERROR_INTERNAL);
5558 }
5559
5560 /* eptr is now past the end of the maximum run */
5561
5562 if (possessive) continue;
5563 for(;;)
5564 {
5565 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5566 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5567 if (eptr-- == pp) break; /* Stop if tried at original pos */
5568 if (utf) BACKCHAR(eptr);
5569 }
5570 }
5571
5572 /* Match extended Unicode sequences. We will get here only if the
5573 support is in the binary; otherwise a compile-time error occurs. */
5574
5575 else if (ctype == OP_EXTUNI)
5576 {
5577 for (i = min; i < max; i++)
5578 {
5579 if (eptr >= md->end_subject)
5580 {
5581 SCHECK_PARTIAL();
5582 break;
5583 }
5584 else
5585 {
5586 int lgb, rgb;
5587 GETCHARINCTEST(c, eptr);
5588 lgb = UCD_GRAPHBREAK(c);
5589 while (eptr < md->end_subject)
5590 {
5591 int len = 1;
5592 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5593 rgb = UCD_GRAPHBREAK(c);
5594 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5595 lgb = rgb;
5596 eptr += len;
5597 }
5598 }
5599 CHECK_PARTIAL();
5600 }
5601
5602 /* eptr is now past the end of the maximum run */
5603
5604 if (possessive) continue;
5605
5606 for(;;)
5607 {
5608 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5609 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5610 if (eptr-- == pp) break; /* Stop if tried at original pos */
5611 for (;;) /* Move back over one extended */
5612 {
5613 if (!utf) c = *eptr; else
5614 {
5615 BACKCHAR(eptr);
5616 GETCHAR(c, eptr);
5617 }
5618 if (UCD_CATEGORY(c) != ucp_M) break;
5619 eptr--;
5620 }
5621 }
5622 }
5623
5624 else
5625 #endif /* SUPPORT_UCP */
5626
5627 #ifdef SUPPORT_UTF
5628 if (utf)
5629 {
5630 switch(ctype)
5631 {
5632 case OP_ANY:
5633 if (max < INT_MAX)
5634 {
5635 for (i = min; i < max; i++)
5636 {
5637 if (eptr >= md->end_subject)
5638 {
5639 SCHECK_PARTIAL();
5640 break;
5641 }
5642 if (IS_NEWLINE(eptr)) break;
5643 if (md->partial != 0 && /* Take care with CRLF partial */
5644 eptr + 1 >= md->end_subject &&
5645 NLBLOCK->nltype == NLTYPE_FIXED &&
5646 NLBLOCK->nllen == 2 &&
5647 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5648 {
5649 md->hitend = TRUE;
5650 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5651 }
5652 eptr++;
5653 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5654 }
5655 }
5656
5657 /* Handle unlimited UTF-8 repeat */
5658
5659 else
5660 {
5661 for (i = min; i < max; i++)
5662 {
5663 if (eptr >= md->end_subject)
5664 {
5665 SCHECK_PARTIAL();
5666 break;
5667 }
5668 if (IS_NEWLINE(eptr)) break;
5669 if (md->partial != 0 && /* Take care with CRLF partial */
5670 eptr + 1 >= md->end_subject &&
5671 NLBLOCK->nltype == NLTYPE_FIXED &&
5672 NLBLOCK->nllen == 2 &&
5673 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5674 {
5675 md->hitend = TRUE;
5676 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5677 }
5678 eptr++;
5679 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5680 }
5681 }
5682 break;
5683
5684 case OP_ALLANY:
5685 if (max < INT_MAX)
5686 {
5687 for (i = min; i < max; i++)
5688 {
5689 if (eptr >= md->end_subject)
5690 {
5691 SCHECK_PARTIAL();
5692 break;
5693 }
5694 eptr++;
5695 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5696 }
5697 }
5698 else
5699 {
5700 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5701 SCHECK_PARTIAL();
5702 }
5703 break;
5704
5705 /* The byte case is the same as non-UTF8 */
5706
5707 case OP_ANYBYTE:
5708 c = max - min;
5709 if (c > (unsigned int)(md->end_subject - eptr))
5710 {
5711 eptr = md->end_subject;
5712 SCHECK_PARTIAL();
5713 }
5714 else eptr += c;
5715 break;
5716
5717 case OP_ANYNL:
5718 for (i = min; i < max; i++)
5719 {
5720 int len = 1;
5721 if (eptr >= md->end_subject)
5722 {
5723 SCHECK_PARTIAL();
5724 break;
5725 }
5726 GETCHARLEN(c, eptr, len);
5727 if (c == CHAR_CR)
5728 {
5729 if (++eptr >= md->end_subject) break;
5730 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5731 }
5732 else
5733 {
5734 if (c != CHAR_LF &&
5735 (md->bsr_anycrlf ||
5736 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5737 #ifndef EBCDIC
5738 && c != 0x2028 && c != 0x2029
5739 #endif /* Not EBCDIC */
5740 )))
5741 break;
5742 eptr += len;
5743 }
5744 }
5745 break;
5746
5747 case OP_NOT_HSPACE:
5748 case OP_HSPACE:
5749 for (i = min; i < max; i++)
5750 {
5751 BOOL gotspace;
5752 int len = 1;
5753 if (eptr >= md->end_subject)
5754 {
5755 SCHECK_PARTIAL();
5756 break;
5757 }
5758 GETCHARLEN(c, eptr, len);
5759 switch(c)
5760 {
5761 HSPACE_CASES: gotspace = TRUE; break;
5762 default: gotspace = FALSE; break;
5763 }
5764 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5765 eptr += len;
5766 }
5767 break;
5768
5769 case OP_NOT_VSPACE:
5770 case OP_VSPACE:
5771 for (i = min; i < max; i++)
5772 {
5773 BOOL gotspace;
5774 int len = 1;
5775 if (eptr >= md->end_subject)
5776 {
5777 SCHECK_PARTIAL();
5778 break;
5779 }
5780 GETCHARLEN(c, eptr, len);
5781 switch(c)
5782 {
5783 VSPACE_CASES: gotspace = TRUE; break;
5784 default: gotspace = FALSE; break;
5785 }
5786 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5787 eptr += len;
5788 }
5789 break;
5790
5791 case OP_NOT_DIGIT:
5792 for (i = min; i < max; i++)
5793 {
5794 int len = 1;
5795 if (eptr >= md->end_subject)
5796 {
5797 SCHECK_PARTIAL();
5798 break;
5799 }
5800 GETCHARLEN(c, eptr, len);
5801 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5802 eptr+= len;
5803 }
5804 break;
5805
5806 case OP_DIGIT:
5807 for (i = min; i < max; i++)
5808 {
5809 int len = 1;
5810 if (eptr >= md->end_subject)
5811 {
5812 SCHECK_PARTIAL();
5813 break;
5814 }
5815 GETCHARLEN(c, eptr, len);
5816 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5817 eptr+= len;
5818 }
5819 break;
5820
5821 case OP_NOT_WHITESPACE:
5822 for (i = min; i < max; i++)
5823 {
5824 int len = 1;
5825 if (eptr >= md->end_subject)
5826 {
5827 SCHECK_PARTIAL();
5828 break;
5829 }
5830 GETCHARLEN(c, eptr, len);
5831 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5832 eptr+= len;
5833 }
5834 break;
5835
5836 case OP_WHITESPACE:
5837 for (i = min; i < max; i++)
5838 {
5839 int len = 1;
5840 if (eptr >= md->end_subject)
5841 {
5842 SCHECK_PARTIAL();
5843 break;
5844 }
5845 GETCHARLEN(c, eptr, len);
5846 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5847 eptr+= len;
5848 }
5849 break;
5850
5851 case OP_NOT_WORDCHAR:
5852 for (i = min; i < max; i++)
5853 {
5854 int len = 1;
5855 if (eptr >= md->end_subject)
5856 {
5857 SCHECK_PARTIAL();
5858 break;
5859 }
5860 GETCHARLEN(c, eptr, len);
5861 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5862 eptr+= len;
5863 }
5864 break;
5865
5866 case OP_WORDCHAR:
5867 for (i = min; i < max; i++)
5868 {
5869 int len = 1;
5870 if (eptr >= md->end_subject)
5871 {
5872 SCHECK_PARTIAL();
5873 break;
5874 }
5875 GETCHARLEN(c, eptr, len);
5876 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5877 eptr+= len;
5878 }
5879 break;
5880
5881 default:
5882 RRETURN(PCRE_ERROR_INTERNAL);
5883 }
5884
5885 /* eptr is now past the end of the maximum run. If possessive, we are
5886 done (no backing up). Otherwise, match at this position; anything other
5887 than no match is immediately returned. For nomatch, back up one
5888 character, unless we are matching \R and the last thing matched was
5889 \r\n, in which case, back up two bytes. */
5890
5891 if (possessive) continue;
5892 for(;;)
5893 {
5894 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5896 if (eptr-- == pp) break; /* Stop if tried at original pos */
5897 BACKCHAR(eptr);
5898 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5899 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5900 }
5901 }
5902 else
5903 #endif /* SUPPORT_UTF */
5904 /* Not UTF mode */
5905 {
5906 switch(ctype)
5907 {
5908 case OP_ANY:
5909 for (i = min; i < max; i++)
5910 {
5911 if (eptr >= md->end_subject)
5912 {
5913 SCHECK_PARTIAL();
5914 break;
5915 }
5916 if (IS_NEWLINE(eptr)) break;
5917 if (md->partial != 0 && /* Take care with CRLF partial */
5918 eptr + 1 >= md->end_subject &&
5919 NLBLOCK->nltype == NLTYPE_FIXED &&
5920 NLBLOCK->nllen == 2 &&
5921 *eptr == NLBLOCK->nl[0])
5922 {
5923 md->hitend = TRUE;
5924 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5925 }
5926 eptr++;
5927 }
5928 break;
5929
5930 case OP_ALLANY:
5931 case OP_ANYBYTE:
5932 c = max - min;
5933 if (c > (unsigned int)(md->end_subject - eptr))
5934 {
5935 eptr = md->end_subject;
5936 SCHECK_PARTIAL();
5937 }
5938 else eptr += c;
5939 break;
5940
5941 case OP_ANYNL:
5942 for (i = min; i < max; i++)
5943 {
5944 if (eptr >= md->end_subject)
5945 {
5946 SCHECK_PARTIAL();
5947 break;
5948 }
5949 c = *eptr;
5950 if (c == CHAR_CR)
5951 {
5952 if (++eptr >= md->end_subject) break;
5953 if (*eptr == CHAR_LF) eptr++;
5954 }
5955 else
5956 {
5957 if (c != CHAR_LF && (md->bsr_anycrlf ||
5958 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5959 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5960 && c != 0x2028 && c != 0x2029
5961 #endif
5962 ))) break;
5963 eptr++;
5964 }
5965 }
5966 break;
5967
5968 case OP_NOT_HSPACE:
5969 for (i = min; i < max; i++)
5970 {
5971 if (eptr >= md->end_subject)
5972 {
5973 SCHECK_PARTIAL();
5974 break;
5975 }
5976 switch(*eptr)
5977 {
5978 default: eptr++; break;
5979 HSPACE_BYTE_CASES:
5980 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5981 HSPACE_MULTIBYTE_CASES:
5982 #endif
5983 goto ENDLOOP00;
5984 }
5985 }
5986 ENDLOOP00:
5987 break;
5988
5989 case OP_HSPACE:
5990 for (i = min; i < max; i++)
5991 {
5992 if (eptr >= md->end_subject)
5993 {
5994 SCHECK_PARTIAL();
5995 break;
5996 }
5997 switch(*eptr)
5998 {
5999 default: goto ENDLOOP01;
6000 HSPACE_BYTE_CASES:
6001 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6002 HSPACE_MULTIBYTE_CASES:
6003 #endif
6004 eptr++; break;
6005 }
6006 }
6007 ENDLOOP01:
6008 break;
6009
6010 case OP_NOT_VSPACE:
6011 for (i = min; i < max; i++)
6012 {
6013 if (eptr >= md->end_subject)
6014 {
6015 SCHECK_PARTIAL();
6016 break;
6017 }
6018 switch(*eptr)
6019 {
6020 default: eptr++; break;
6021 VSPACE_BYTE_CASES:
6022 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6023 VSPACE_MULTIBYTE_CASES:
6024 #endif
6025 goto ENDLOOP02;
6026 }
6027 }
6028 ENDLOOP02:
6029 break;
6030
6031 case OP_VSPACE:
6032 for (i = min; i < max; i++)
6033 {
6034 if (eptr >= md->end_subject)
6035 {
6036 SCHECK_PARTIAL();
6037 break;
6038 }
6039 switch(*eptr)
6040 {
6041 default: goto ENDLOOP03;
6042 VSPACE_BYTE_CASES:
6043 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6044 VSPACE_MULTIBYTE_CASES:
6045 #endif
6046 eptr++; break;
6047 }
6048 }
6049 ENDLOOP03:
6050 break;
6051
6052 case OP_NOT_DIGIT:
6053 for (i = min; i < max; i++)
6054 {
6055 if (eptr >= md->end_subject)
6056 {
6057 SCHECK_PARTIAL();
6058 break;
6059 }
6060 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6061 eptr++;
6062 }
6063 break;
6064
6065 case OP_DIGIT:
6066 for (i = min; i < max; i++)
6067 {
6068 if (eptr >= md->end_subject)
6069 {
6070 SCHECK_PARTIAL();
6071 break;
6072 }
6073 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6074 eptr++;
6075 }
6076 break;
6077
6078 case OP_NOT_WHITESPACE:
6079 for (i = min; i < max; i++)
6080 {
6081 if (eptr >= md->end_subject)
6082 {
6083 SCHECK_PARTIAL();
6084 break;
6085 }
6086 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6087 eptr++;
6088 }
6089 break;
6090
6091 case OP_WHITESPACE:
6092 for (i = min; i < max; i++)
6093 {
6094 if (eptr >= md->end_subject)
6095 {
6096 SCHECK_PARTIAL();
6097 break;
6098 }
6099 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6100 eptr++;
6101 }
6102 break;
6103
6104 case OP_NOT_WORDCHAR:
6105 for (i = min; i < max; i++)
6106 {
6107 if (eptr >= md->end_subject)
6108 {
6109 SCHECK_PARTIAL();
6110 break;
6111 }
6112 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6113 eptr++;
6114 }
6115 break;
6116
6117 case OP_WORDCHAR:
6118 for (i = min; i < max; i++)
6119 {
6120 if (eptr >= md->end_subject)
6121 {
6122 SCHECK_PARTIAL();
6123 break;
6124 }
6125 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6126 eptr++;
6127 }
6128 break;
6129
6130 default:
6131 RRETURN(PCRE_ERROR_INTERNAL);
6132 }
6133
6134 /* eptr is now past the end of the maximum run. If possessive, we are
6135 done (no backing up). Otherwise, match at this position; anything other
6136 than no match is immediately returned. For nomatch, back up one
6137 character (byte), unless we are matching \R and the last thing matched
6138 was \r\n, in which case, back up two bytes. */
6139
6140 if (possessive) continue;
6141 while (eptr >= pp)
6142 {
6143 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6144 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6145 eptr--;
6146 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6147 eptr[-1] == CHAR_CR) eptr--;
6148 }
6149 }
6150
6151 /* Get here if we can't make it match with any permitted repetitions */
6152
6153 RRETURN(MATCH_NOMATCH);
6154 }
6155 /* Control never gets here */
6156
6157 /* There's been some horrible disaster. Arrival here can only mean there is
6158 something seriously wrong in the code above or the OP_xxx definitions. */
6159
6160 default:
6161 DPRINTF(("Unknown opcode %d\n", *ecode));
6162 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6163 }
6164
6165 /* Do not stick any code in here without much thought; it is assumed
6166 that "continue" in the code above comes out to here to repeat the main
6167 loop. */
6168
6169 } /* End of main loop */
6170 /* Control never reaches here */
6171
6172
6173 /* When compiling to use the heap rather than the stack for recursive calls to
6174 match(), the RRETURN() macro jumps here. The number that is saved in
6175 frame->Xwhere indicates which label we actually want to return to. */
6176
6177 #ifdef NO_RECURSE
6178 #define LBL(val) case val: goto L_RM##val;
6179 HEAP_RETURN:
6180 switch (frame->Xwhere)
6181 {
6182 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6183 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6184 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6185 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6186 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6187 LBL(65) LBL(66)
6188 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6189 LBL(21)
6190 #endif
6191 #ifdef SUPPORT_UTF
6192 LBL(16) LBL(18) LBL(20)
6193 LBL(22) LBL(23) LBL(28) LBL(30)
6194 LBL(32) LBL(34) LBL(42) LBL(46)
6195 #ifdef SUPPORT_UCP
6196 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6197 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6198 #endif /* SUPPORT_UCP */
6199 #endif /* SUPPORT_UTF */
6200 default:
6201 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6202 return PCRE_ERROR_INTERNAL;
6203 }
6204 #undef LBL
6205 #endif /* NO_RECURSE */
6206 }
6207
6208
6209 /***************************************************************************
6210 ****************************************************************************
6211 RECURSION IN THE match() FUNCTION
6212
6213 Undefine all the macros that were defined above to handle this. */
6214
/* The names in this #ifdef are undefined only in the NO_RECURSE build. Removing
the macros leaves the names free for ordinary use further down the file; for
example, pcre_exec() has a parameter called "length" and a local variable
called "flags". */
6215 #ifdef NO_RECURSE
6216 #undef eptr
6217 #undef ecode
6218 #undef mstart
6219 #undef offset_top
6220 #undef eptrb
6221 #undef flags
6222
6223 #undef callpat
6224 #undef charptr
6225 #undef data
6226 #undef next
6227 #undef pp
6228 #undef prev
6229 #undef saved_eptr
6230
6231 #undef new_recursive
6232
6233 #undef cur_is_word
6234 #undef condition
6235 #undef prev_is_word
6236
6237 #undef ctype
6238 #undef length
6239 #undef max
6240 #undef min
6241 #undef number
6242 #undef offset
6243 #undef op
6244 #undef save_capture_last
6245 #undef save_offset1
6246 #undef save_offset2
6247 #undef save_offset3
6248 #undef stacksave
6249
6250 #undef newptrb
6251
6252 #endif
6253
6254 /* These two are defined as macros in both cases */
6255
6256 #undef fc
6257 #undef fi
6258
6259 /***************************************************************************
6260 ***************************************************************************/
6261
6262
6263 #ifdef NO_RECURSE
6264 /*************************************************
6265 * Release allocated heap frames *
6266 *************************************************/
6267
6268 /* This function releases all the allocated frames. The base frame is on the
6269 machine stack, and so must not be freed.
6270
6271 Argument: the address of the base frame
6272 Returns: nothing
6273 */
6274
6275 static void
6276 release_match_heapframes (heapframe *frame_base)
6277 {
6278 heapframe *nextframe = frame_base->Xnextframe;
6279 while (nextframe != NULL)
6280 {
6281 heapframe *oldframe = nextframe;
6282 nextframe = nextframe->Xnextframe;
6283 (PUBL(stack_free))(oldframe);
6284 }
6285 }
6286 #endif
6287
6288
6289 /*************************************************
6290 * Execute a Regular Expression *
6291 *************************************************/
6292
6293 /* This function applies a compiled re to a subject string and picks out
6294 portions of the string if it matches. Two elements in the vector are set for
6295 each substring: the offsets to the start and end of the substring.
6296
6297 Arguments:
6298 argument_re points to the compiled expression
6299 extra_data points to extra data or is NULL
6300 subject points to the subject string
6301 length length of subject string (may contain binary zeros)
6302 start_offset where to start in the subject string
6303 options option bits
6304 offsets points to a vector of ints to be filled in with offsets
6305 offsetcount the number of elements in the vector
6306
6307 Returns: > 0 => success; value is the number of elements filled in
6308 = 0 => success, but offsets is not big enough
6309 -1 => failed to match
6310 < -1 => some kind of unexpected problem
6311 */
6312
6313 #if defined COMPILE_PCRE8
6314 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6315 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6316 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6317 int offsetcount)
6318 #elif defined COMPILE_PCRE16
6319 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6320 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6321 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6322 int offsetcount)
6323 #elif defined COMPILE_PCRE32
6324 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6325 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6326 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6327 int offsetcount)
6328 #endif
6329 {
6330 int rc, ocount, arg_offset_max;
6331 int newline;
6332 BOOL using_temporary_offsets = FALSE;
6333 BOOL anchored;
6334 BOOL startline;
6335 BOOL firstline;
6336 BOOL utf;
6337 BOOL has_first_char = FALSE;
6338 BOOL has_req_char = FALSE;
6339 pcre_uchar first_char = 0;
6340 pcre_uchar first_char2 = 0;
6341 pcre_uchar req_char = 0;
6342 pcre_uchar req_char2 = 0;
6343 match_data match_block;
6344 match_data *md = &match_block;
6345 const pcre_uint8 *tables;
6346 const pcre_uint8 *start_bits = NULL;
6347 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6348 PCRE_PUCHAR end_subject;
6349 PCRE_PUCHAR start_partial = NULL;
6350 PCRE_PUCHAR match_partial;
6351 PCRE_PUCHAR req_char_ptr = start_match - 1;
6352
6353 const pcre_study_data *study;
6354 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6355
6356 #ifdef NO_RECURSE
6357 heapframe frame_zero;
6358 frame_zero.Xprevframe = NULL; /* Marks the top level */
6359 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6360 md->match_frames_base = &frame_zero;
6361 #endif
6362
6363 /* Check for the special magic call that measures the size of the stack used
6364 per recursive call of match(). Without the funny casting for sizeof, a Windows
6365 compiler gave this error: "unary minus operator applied to unsigned type,
6366 result still unsigned". Hopefully the cast fixes that. */
6367
6368 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6369 start_offset == -999)
6370 #ifdef NO_RECURSE
6371 return -((int)sizeof(heapframe));
6372 #else
6373 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6374 #endif
6375
6376 /* Plausibility checks */
6377
6378 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6379 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6380 return PCRE_ERROR_NULL;
6381 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6382 if (length < 0) return PCRE_ERROR_BADLENGTH;
6383 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6384
6385 /* Check that the first field in the block is the magic number. If it is not,
6386 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6387 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6388 means that the pattern is likely compiled with different endianness. */
6389
6390 if (re->magic_number != MAGIC_NUMBER)
6391 return re->magic_number == REVERSED_MAGIC_NUMBER?
6392 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6393 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6394
6395 /* These two settings are used in the code for checking a UTF-8 string that
6396 follows immediately afterwards. Other values in the md block are used only
6397 during "normal" pcre_exec() processing, not when the JIT support is in use,
6398 so they are set up later. */
6399
6400 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6401 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6402 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6403 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6404
6405 /* Check a UTF-8 string if required. Pass back the character offset and error
6406 code for an invalid string if a results vector is available. */
6407
6408 #ifdef SUPPORT_UTF
6409 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6410 {
6411 int erroroffset;
6412 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6413 if (errorcode != 0)
6414 {
6415 if (offsetcount >= 2)
6416 {
6417 offsets[0] = erroroffset;
6418 offsets[1] = errorcode;
6419 }
6420 #if defined COMPILE_PCRE8
6421 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6422 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6423 #elif defined COMPILE_PCRE16
6424 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6425 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6426 #elif defined COMPILE_PCRE32
6427 return PCRE_ERROR_BADUTF32;
6428 #endif
6429 }
6430 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6431 /* Check that a start_offset points to the start of a UTF character. */
6432 if (start_offset > 0 && start_offset < length &&
6433 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6434 return PCRE_ERROR_BADUTF8_OFFSET;
6435 #endif
6436 }
6437 #endif
6438
6439 /* If the pattern was successfully studied with JIT support, run the JIT
6440 executable instead of the rest of this function. Most options must be set at
6441 compile time for the JIT code to be usable. Fall back to the normal code path if
6442 an unsupported flag is set. */
6443
6444 #ifdef SUPPORT_JIT
6445 if (extra_data != NULL
6446 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6447 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6448 && extra_data->executable_jit != NULL
6449 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6450 {
6451 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6452 start_offset, options, offsets, offsetcount);
6453
6454 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6455 mode is not compiled. In this case we simply fall back to the interpreter. */
6456
6457 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6458 }
6459 #endif
6460
6461 /* Carry on with non-JIT matching. This information is for finding all the
6462 numbers associated with a given name, for condition testing. */
6463
6464 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6465 md->name_count = re->name_count;
6466 md->name_entry_size = re->name_entry_size;
6467
6468 /* Fish out the optional data from the extra_data structure, first setting
6469 the default values. */
6470
6471 study = NULL;
6472 md->match_limit = MATCH_LIMIT;
6473 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6474 md->callout_data = NULL;
6475
6476 /* The table pointer is always in native byte order. */
6477
6478 tables = re->tables;
6479
6480 if (extra_data != NULL)
6481 {
6482 register unsigned int flags = extra_data->flags;
6483 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6484 study = (const pcre_study_data *)extra_data->study_data;
6485 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6486 md->match_limit = extra_data->match_limit;
6487 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6488 md->match_limit_recursion = extra_data->match_limit_recursion;
6489 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6490 md->callout_data = extra_data->callout_data;
6491 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6492 }
6493
6494 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6495 is a feature that makes it possible to save compiled regexes and re-use them
6496 in other programs later. */
6497
6498 if (tables == NULL) tables = PRIV(default_tables);
6499
6500 /* Set up other data */
6501
6502 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6503 startline = (re->flags & PCRE_STARTLINE) != 0;
6504 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6505
6506 /* The code starts after the real_pcre block and the capture name table. */
6507
6508 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6509 re->name_count * re->name_entry_size;
6510
6511 md->start_subject = (PCRE_PUCHAR)subject;
6512 md->start_offset = start_offset;
6513 md->end_subject = md->start_subject + length;
6514 end_subject = md->end_subject;
6515
6516 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6517 md->use_ucp = (re->options & PCRE_UCP) != 0;
6518 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6519 md->ignore_skip_arg = FALSE;
6520
6521 /* Some options are unpacked into BOOL variables in the hope that testing
6522 them will be faster than individual option bits. */
6523
6524 md->notbol = (options & PCRE_NOTBOL) != 0;
6525 md->noteol = (options & PCRE_NOTEOL) != 0;
6526 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6527 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6528
6529 md->hitend = FALSE;
6530 md->mark = md->nomatch_mark = NULL; /* In case never set */
6531
6532 md->recursive = NULL; /* No recursion at top level */
6533 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6534
6535 md->lcc = tables + lcc_offset;
6536 md->fcc = tables + fcc_offset;
6537 md->ctypes = tables + ctypes_offset;
6538
6539 /* Handle different \R options. */
6540
6541 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6542 {
6543 case 0:
6544 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6545 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6546 else
6547 #ifdef BSR_ANYCRLF
6548 md->bsr_anycrlf = TRUE;
6549 #else
6550 md->bsr_anycrlf = FALSE;
6551 #endif
6552 break;
6553
6554 case PCRE_BSR_ANYCRLF:
6555 md->bsr_anycrlf = TRUE;
6556 break;
6557
6558 case PCRE_BSR_UNICODE:
6559 md->bsr_anycrlf = FALSE;
6560 break;
6561
6562 default: return PCRE_ERROR_BADNEWLINE;
6563 }
6564
6565 /* Handle different types of newline. The three bits give eight cases. If
6566 nothing is set at run time, whatever was used at compile time applies. */
6567
6568 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6569 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6570 {
6571 case 0: newline = NEWLINE; break; /* Compile-time default */
6572 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6573 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6574 case PCRE_NEWLINE_CR+
6575 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6576 case PCRE_NEWLINE_ANY: newline = -1; break;
6577 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6578 default: return PCRE_ERROR_BADNEWLINE;
6579 }
6580
6581 if (newline == -2)
6582 {
6583 md->nltype = NLTYPE_ANYCRLF;
6584 }
6585 else if (newline < 0)
6586 {
6587 md->nltype = NLTYPE_ANY;
6588 }
6589 else
6590 {
6591 md->nltype = NLTYPE_FIXED;
6592 if (newline > 255)
6593 {
6594 md->nllen = 2;
6595 md->nl[0] = (newline >> 8) & 255;
6596 md->nl[1] = newline & 255;
6597 }
6598 else
6599 {
6600 md->nllen = 1;
6601 md->nl[0] = newline;
6602 }
6603 }
6604
6605 /* Partial matching was originally supported only for a restricted set of
6606 regexes; from release 8.00 there are no restrictions, but the bits are still
6607 defined (though never set). So there's no harm in leaving this code. */
6608
6609 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6610 return PCRE_ERROR_BADPARTIAL;
6611
6612 /* If the expression has got more back references than the offsets supplied can
6613 hold, we get a temporary chunk of working store to use during the matching.
6614 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6615 of 3. */
6616
6617 ocount = offsetcount - (offsetcount % 3);
6618 arg_offset_max = (2*ocount)/3;
6619
6620 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6621 {
6622 ocount = re->top_backref * 3 + 3;
6623 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6624 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6625 using_temporary_offsets = TRUE;
6626 DPRINTF(("Got memory to hold back references\n"));
6627 }
6628 else md->offset_vector = offsets;
6629 md->offset_end = ocount;
6630 md->offset_max = (2*ocount)/3;
6631 md->capture_last = 0;
6632
6633 /* Reset the working variable associated with each extraction. These should
6634 never be used unless previously set, but they get saved and restored, and so we
6635 initialize them to avoid reading uninitialized locations. Also, unset the
6636 offsets for the matched string. This is really just for tidiness with callouts,
6637 in case they inspect these fields. */
6638
6639 if (md->offset_vector != NULL)
6640 {
6641 register int *iptr = md->offset_vector + ocount;
6642 register int *iend = iptr - re->top_bracket;
6643 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6644 while (--iptr >= iend) *iptr = -1;
6645 md->offset_vector[0] = md->offset_vector[1] = -1;
6646 }
6647
6648 /* Set up the first character to match, if available. The first_char value is
6649 never set for an anchored regular expression, but the anchoring may be forced
6650 at run time, so we have to test for anchoring. The first char may be unset for
6651 an unanchored pattern, of course. If there's no first char and the pattern was
6652 studied, there may be a bitmap of possible first characters. */
6653
6654 if (!anchored)
6655 {
6656 if ((re->flags & PCRE_FIRSTSET) != 0)
6657 {
6658 has_first_char = TRUE;
6659 first_char = first_char2 = (pcre_uchar)(re->first_char);
6660 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6661 {
6662 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6663 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6664 if (utf && first_char > 127)
6665 first_char2 = UCD_OTHERCASE(first_char);
6666 #endif
6667 }
6668 }
6669 else
6670 if (!startline && study != NULL &&
6671 (study->flags & PCRE_STUDY_MAPPED) != 0)
6672 start_bits = study->start_bits;
6673 }
6674
6675 /* For anchored or unanchored matches, there may be a "last known required
6676 character" set. */
6677
6678 if ((re->flags & PCRE_REQCHSET) != 0)
6679 {
6680 has_req_char = TRUE;
6681 req_char = req_char2 = (pcre_uchar)(re->req_char);
6682 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6683 {
6684 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6685 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6686 if (utf && req_char > 127)
6687 req_char2 = UCD_OTHERCASE(req_char);
6688 #endif
6689 }
6690 }
6691
6692
6693 /* ==========================================================================*/
6694
6695 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6696 the loop runs just once. */
6697
6698 for(;;)
6699 {
6700 PCRE_PUCHAR save_end_subject = end_subject;
6701 PCRE_PUCHAR new_start_match;
6702
6703 /* If firstline is TRUE, the start of the match is constrained to the first
6704 line of a multiline string. That is, the match must be before or at the first
6705 newline. Implement this by temporarily adjusting end_subject so that we stop
6706 scanning at a newline. If the match fails at the newline, later code breaks
6707 this loop. */
6708
6709 if (firstline)
6710 {
6711 PCRE_PUCHAR t = start_match;
6712 #ifdef SUPPORT_UTF
6713 if (utf)
6714 {
6715 while (t < md->end_subject && !IS_NEWLINE(t))
6716 {
6717 t++;
6718 ACROSSCHAR(t < end_subject, *t, t++);
6719 }
6720 }
6721 else
6722 #endif
6723 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6724 end_subject = t;
6725 }
6726
6727 /* There are some optimizations that avoid running the match if a known
6728 starting point is not found, or if a known later character is not present.
6729 However, there is an option that disables these, for testing and for ensuring
6730 that all callouts do actually occur. The option can be set in the regex by
6731 (*NO_START_OPT) or passed in match-time options. */
6732
6733 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6734 {
6735 /* Advance to a unique first char if there is one. */
6736
6737 if (has_first_char)
6738 {
6739 pcre_uchar smc;
6740
6741 if (first_char != first_char2)
6742 while (start_match < end_subject &&
6743 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6744 start_match++;
6745 else
6746 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6747 start_match++;
6748 }
6749
6750 /* Or to just after a linebreak for a multiline match */
6751
6752 else if (startline)
6753 {
6754 if (start_match > md->start_subject + start_offset)
6755 {
6756 #ifdef SUPPORT_UTF
6757 if (utf)
6758 {
6759