/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1296 - (show annotations)
Tue Mar 19 16:29:12 2013 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 217027 byte(s)
Code changes for simpler backtracking handling (docs to follow).
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996)
95 #define MATCH_PRUNE (-995)
96 #define MATCH_SKIP (-994)
97 #define MATCH_SKIP_ARG (-993)
98 #define MATCH_THEN (-992)
99 #define MATCH_BACKTRACK_MAX MATCH_THEN
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
109
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if requested. Non-printable code units are shown as \x{hh} escapes.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block; always read for the utf flag
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf; /* NOTE(review): presumably read by RAWUCHARINCTEST - confirm */
135 if (is_subject && length > md->end_subject - p) length = (int)(md->end_subject - p);
136 while (length-- > 0) /* isprint() is undefined beyond unsigned char, so range-check first */
137 if ((c = RAWUCHARINCTEST(p)) <= 255 && isprint(c)) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr; /* Remembered so the matched length can be returned */
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; /* Start of the reference text */
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf; /* Selects the UCP-aware caseless loop below */
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d); /* Record looked up for the reference character d */
220 if (c != d && c != d + ur->other_case) /* Not d itself, nor d plus its other_case delta */
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1; /* NOTE(review): assumes each set is in ascending order - confirm */
226 if (c == *pp++) break; /* Found c in the caseless set */
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; /* NOTE(review): md->lcc looks like a lower-casing table - confirm */
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start); /* Distance advanced along the subject */
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
306 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
307 below must be updated in sync. */
308
309 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
310 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
311 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
312 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
313 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
314 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
315 RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
316
317 /* These versions of the macros use the stack, as normal. There are debugging
318 versions and production versions. Note that the "rw" argument of RMATCH isn't
319 actually used in this definition. */
320
321 #ifndef NO_RECURSE
322 #define REGISTER register
323
324 #ifdef PCRE_DEBUG
325 #define RMATCH(ra,rb,rc,rd,re,rw) \
326 { \
327 printf("match() called in line %d\n", __LINE__); \
328 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
329 printf("to line %d\n", __LINE__); \
330 }
331 #define RRETURN(ra) \
332 { \
333 printf("match() returned %d from line %d\n", ra, __LINE__); \
334 return ra; \
335 }
336 #else
337 #define RMATCH(ra,rb,rc,rd,re,rw) \
338 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
339 #define RRETURN(ra) return ra
340 #endif
341
342 #else
343
344
345 /* These versions of the macros manage a private stack on the heap. Note that
346 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347 argument of match(), which never changes. */
348
349 #define REGISTER
350
351 #define RMATCH(ra,rb,rc,rd,re,rw)\
352 {\
353 heapframe *newframe = frame->Xnextframe;\
354 if (newframe == NULL)\
355 {\
356 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
357 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
358 newframe->Xnextframe = NULL;\
359 frame->Xnextframe = newframe;\
360 }\
361 frame->Xwhere = rw;\
362 newframe->Xeptr = ra;\
363 newframe->Xecode = rb;\
364 newframe->Xmstart = mstart;\
365 newframe->Xoffset_top = rc;\
366 newframe->Xeptrb = re;\
367 newframe->Xrdepth = frame->Xrdepth + 1;\
368 newframe->Xprevframe = frame;\
369 frame = newframe;\
370 DPRINTF(("restarting from line %d\n", __LINE__));\
371 goto HEAP_RECURSE;\
372 L_##rw:\
373 DPRINTF(("jumped back to line %d\n", __LINE__));\
374 }
375
376 #define RRETURN(ra)\
377 {\
378 heapframe *oldframe = frame;\
379 frame = oldframe->Xprevframe;\
380 if (frame != NULL)\
381 {\
382 rrc = ra;\
383 goto HEAP_RETURN;\
384 }\
385 return ra;\
386 }
387
388
389 /* Structure for remembering the local variables in a private frame */
390
391 typedef struct heapframe {
392 struct heapframe *Xprevframe; /* Frame RRETURN switches back to; when NULL it does a real return */
393 struct heapframe *Xnextframe; /* Frame used by the next RMATCH; allocated on demand when NULL */
394
395 /* Function arguments that may change */
396
397 PCRE_PUCHAR Xeptr;
398 const pcre_uchar *Xecode;
399 PCRE_PUCHAR Xmstart;
400 int Xoffset_top;
401 eptrblock *Xeptrb;
402 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's value plus one */
403
404 /* Function local variables */
405
406 PCRE_PUCHAR Xcallpat;
407 #ifdef SUPPORT_UTF
408 PCRE_PUCHAR Xcharptr;
409 #endif
410 PCRE_PUCHAR Xdata;
411 PCRE_PUCHAR Xnext;
412 PCRE_PUCHAR Xpp;
413 PCRE_PUCHAR Xprev;
414 PCRE_PUCHAR Xsaved_eptr;
415
416 recursion_info Xnew_recursive;
417
418 BOOL Xcur_is_word;
419 BOOL Xcondition;
420 BOOL Xprev_is_word;
421
422 #ifdef SUPPORT_UCP
423 int Xprop_type;
424 unsigned int Xprop_value;
425 int Xprop_fail_result;
426 int Xoclength;
427 pcre_uchar Xocchars[6];
428 #endif
429
430 int Xcodelink;
431 int Xctype;
432 unsigned int Xfc;
433 int Xfi;
434 int Xlength;
435 int Xmax;
436 int Xmin;
437 unsigned int Xnumber;
438 int Xoffset;
439 unsigned int Xop;
440 pcre_int32 Xsave_capture_last;
441 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442 int Xstacksave[REC_STACK_SAVE_MAX];
443
444 eptrblock Xnewptrb;
445
446 /* Where to jump back to */
447
448 int Xwhere; /* The rw argument of the RMATCH that left this frame */
449
450 } heapframe;
451
452 #endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
468 /* These macros pack up tests that are used for partial matching, and which
469 appear several times in the code. We set the "hit end" flag if the pointer is
470 at the end of the subject and also past the start of the subject (i.e.
471 something has been matched). For hard partial matching, we then return
472 immediately. The second one is used when we already know we are past the end of
473 the subject. */
474
475 #define CHECK_PARTIAL()\
476 if (md->partial != 0 && eptr >= md->end_subject && \
477 eptr > md->start_used_ptr) \
478 { \
479 md->hitend = TRUE; \
480 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
481 }
482
483 #define SCHECK_PARTIAL()\
484 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 { \
486 md->hitend = TRUE; \
487 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
488 }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to in order to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 case OP_COMMIT:
785 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
786 eptrb, RM52);
787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
788 RRETURN(MATCH_COMMIT);
789
790 case OP_PRUNE:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
792 eptrb, RM51);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 RRETURN(MATCH_PRUNE);
795
796 case OP_PRUNE_ARG:
797 md->nomatch_mark = ecode + 2;
798 md->mark = NULL; /* In case previously set by assertion */
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM56);
801 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
802 md->mark == NULL) md->mark = ecode + 2;
803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
804 RRETURN(MATCH_PRUNE);
805
806 case OP_SKIP:
807 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
808 eptrb, RM53);
809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
810 md->start_match_ptr = eptr; /* Pass back current position */
811 RRETURN(MATCH_SKIP);
812
813 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
814 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
815 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
816 that failed and any that precede it (either they also failed, or were not
817 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
818 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
819 set to the count of the one that failed. */
820
821 case OP_SKIP_ARG:
822 md->skip_arg_count++;
823 if (md->skip_arg_count <= md->ignore_skip_arg)
824 {
825 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 break;
827 }
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
829 eptrb, RM57);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831
832 /* Pass back the current skip name by overloading md->start_match_ptr and
833 returning the special MATCH_SKIP_ARG return code. This will either be
834 caught by a matching MARK, or get to the top, where it causes a rematch
835 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
836
837 md->start_match_ptr = ecode + 2;
838 RRETURN(MATCH_SKIP_ARG);
839
840 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
841 the branch in which it occurs can be determined. Overload the start of
842 match pointer to do this. */
843
844 case OP_THEN:
845 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
846 eptrb, RM54);
847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848 md->start_match_ptr = ecode;
849 RRETURN(MATCH_THEN);
850
851 case OP_THEN_ARG:
852 md->nomatch_mark = ecode + 2;
853 md->mark = NULL; /* In case previously set by assertion */
854 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
855 md, eptrb, RM58);
856 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
857 md->mark == NULL) md->mark = ecode + 2;
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 md->start_match_ptr = ecode;
860 RRETURN(MATCH_THEN);
861
862 /* Handle an atomic group that does not contain any capturing parentheses.
863 This can be handled like an assertion. Prior to 8.13, all atomic groups
864 were handled this way. In 8.13, the code was changed as below for ONCE, so
865 that backups pass through the group and thereby reset captured values.
866 However, this uses a lot more stack, so in 8.20, atomic groups that do not
867 contain any captures generate OP_ONCE_NC, which can be handled in the old,
868 less stack intensive way.
869
870 Check the alternative branches in turn - the matching won't pass the KET
871 for this kind of subpattern. If any one branch matches, we carry on as at
872 the end of a normal bracket, leaving the subject pointer, but resetting
873 the start-of-match value in case it was changed by \K. */
874
875 case OP_ONCE_NC:
876 prev = ecode;
877 saved_eptr = eptr;
878 save_mark = md->mark;
879 do
880 {
881 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
882 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
883 {
884 mstart = md->start_match_ptr;
885 break;
886 }
887 if (rrc == MATCH_THEN)
888 {
889 next = ecode + GET(ecode,1);
890 if (md->start_match_ptr < next &&
891 (*ecode == OP_ALT || *next == OP_ALT))
892 rrc = MATCH_NOMATCH;
893 }
894
895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896 ecode += GET(ecode,1);
897 md->mark = save_mark;
898 }
899 while (*ecode == OP_ALT);
900
901 /* If hit the end of the group (which could be repeated), fail */
902
903 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
904
905 /* Continue as from after the group, updating the offsets high water
906 mark, since extracts may have been taken. */
907
908 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
909
910 offset_top = md->end_offset_top;
911 eptr = md->end_match_ptr;
912
913 /* For a non-repeating ket, just continue at this level. This also
914 happens for a repeating ket if no characters were matched in the group.
915 This is the forcible breaking of infinite loops as implemented in Perl
916 5.005. */
917
918 if (*ecode == OP_KET || eptr == saved_eptr)
919 {
920 ecode += 1+LINK_SIZE;
921 break;
922 }
923
924 /* The repeating kets try the rest of the pattern or restart from the
925 preceding bracket, in the appropriate order. The second "call" of match()
926 uses tail recursion, to avoid using another stack frame. */
927
928 if (*ecode == OP_KETRMIN)
929 {
930 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
932 ecode = prev;
933 goto TAIL_RECURSE;
934 }
935 else /* OP_KETRMAX */
936 {
937 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939 ecode += 1 + LINK_SIZE;
940 goto TAIL_RECURSE;
941 }
942 /* Control never gets here */
943
944 /* Handle a capturing bracket, other than those that are possessive with an
945 unlimited repeat. If there is space in the offset vector, save the current
946 subject position in the working slot at the top of the vector. We mustn't
947 change the current values of the data slot, because they may be set from a
948 previous iteration of this group, and be referred to by a reference inside
949 the group. A failure to match might occur after the group has succeeded,
950 if something later on doesn't match. For this reason, we need to restore
951 the working value and also the values of the final offsets, in case they
952 were set by a previous iteration of the same bracket.
953
954 If there isn't enough space in the offset vector, treat this as if it were
955 a non-capturing bracket. Don't worry about setting the flag for the error
956 case here; that is handled in the code for KET. */
957
958 case OP_CBRA:
959 case OP_SCBRA:
960 number = GET2(ecode, 1+LINK_SIZE);
961 offset = number << 1;
962
963 #ifdef PCRE_DEBUG
964 printf("start bracket %d\n", number);
965 printf("subject=");
966 pchars(eptr, 16, TRUE, md);
967 printf("\n");
968 #endif
969
970 if (offset < md->offset_max)
971 {
972 save_offset1 = md->offset_vector[offset];
973 save_offset2 = md->offset_vector[offset+1];
974 save_offset3 = md->offset_vector[md->offset_end - number];
975 save_capture_last = md->capture_last;
976 save_mark = md->mark;
977
978 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
979 md->offset_vector[md->offset_end - number] =
980 (int)(eptr - md->start_subject);
981
982 for (;;)
983 {
984 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
986 eptrb, RM1);
987 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
988
989 /* If we backed up to a THEN, check whether it is within the current
990 branch by comparing the address of the THEN that is passed back with
991 the end of the branch. If it is within the current branch, and the
992 branch is one of two or more alternatives (it either starts or ends
993 with OP_ALT), we have reached the limit of THEN's action, so convert
994 the return code to NOMATCH, which will cause normal backtracking to
995 happen from now on. Otherwise, THEN is passed back to an outer
996 alternative. This implements Perl's treatment of parenthesized groups,
997 where a group not containing | does not affect the current alternative,
998 that is, (X) is NOT the same as (X|(*F)). */
999
1000 if (rrc == MATCH_THEN)
1001 {
1002 next = ecode + GET(ecode,1);
1003 if (md->start_match_ptr < next &&
1004 (*ecode == OP_ALT || *next == OP_ALT))
1005 rrc = MATCH_NOMATCH;
1006 }
1007
1008 /* Anything other than NOMATCH is passed back. */
1009
1010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011 md->capture_last = save_capture_last;
1012 ecode += GET(ecode, 1);
1013 md->mark = save_mark;
1014 if (*ecode != OP_ALT) break;
1015 }
1016
1017 DPRINTF(("bracket %d failed\n", number));
1018 md->offset_vector[offset] = save_offset1;
1019 md->offset_vector[offset+1] = save_offset2;
1020 md->offset_vector[md->offset_end - number] = save_offset3;
1021
1022 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1023
1024 RRETURN(rrc);
1025 }
1026
1027 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1028 as a non-capturing bracket. */
1029
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032
1033 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1034
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037
1038 /* Non-capturing or atomic group, except for possessive with unlimited
1039 repeat and ONCE group with no captures. Loop for all the alternatives.
1040
1041 When we get to the final alternative within the brackets, we used to return
1042 the result of a recursive call to match() whatever happened so it was
1043 possible to reduce stack usage by turning this into a tail recursion,
1044 except in the case of a possibly empty group. However, now that there is
1045 the possibility of (*THEN) occurring in the final alternative, this
1046 optimization is no longer always possible.
1047
1048 We can optimize if we know there are no (*THEN)s in the pattern; at present
1049 this is the best that can be done.
1050
1051 MATCH_ONCE is returned when the end of an atomic group is successfully
1052 reached, but subsequent matching fails. It passes back up the tree (causing
1053 captured values to be reset) until the original atomic group level is
1054 reached. This is tested by comparing md->once_target with the start of the
1055 group. At this point, the return is converted into MATCH_NOMATCH so that
1056 previous backup points can be taken. */
1057
1058 case OP_ONCE:
1059 case OP_BRA:
1060 case OP_SBRA:
1061 DPRINTF(("start non-capturing bracket\n"));
1062
1063 for (;;)
1064 {
1065 if (op >= OP_SBRA || op == OP_ONCE)
1066 md->match_function_type = MATCH_CBEGROUP;
1067
1068 /* If this is not a possibly empty group, and there are no (*THEN)s in
1069 the pattern, and this is the final alternative, optimize as described
1070 above. */
1071
1072 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1073 {
1074 ecode += PRIV(OP_lengths)[*ecode];
1075 goto TAIL_RECURSE;
1076 }
1077
1078 /* In all other cases, we have to make another call to match(). */
1079
1080 save_mark = md->mark;
1081 save_capture_last = md->capture_last;
1082 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 RM2);
1084
1085 /* See comment in the code for capturing groups above about handling
1086 THEN. */
1087
1088 if (rrc == MATCH_THEN)
1089 {
1090 next = ecode + GET(ecode,1);
1091 if (md->start_match_ptr < next &&
1092 (*ecode == OP_ALT || *next == OP_ALT))
1093 rrc = MATCH_NOMATCH;
1094 }
1095
1096 if (rrc != MATCH_NOMATCH)
1097 {
1098 if (rrc == MATCH_ONCE)
1099 {
1100 const pcre_uchar *scode = ecode;
1101 if (*scode != OP_ONCE) /* If not at start, find it */
1102 {
1103 while (*scode == OP_ALT) scode += GET(scode, 1);
1104 scode -= GET(scode, 1);
1105 }
1106 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1107 }
1108 RRETURN(rrc);
1109 }
1110 ecode += GET(ecode, 1);
1111 md->mark = save_mark;
1112 if (*ecode != OP_ALT) break;
1113 md->capture_last = save_capture_last;
1114 }
1115
1116 RRETURN(MATCH_NOMATCH);
1117
1118 /* Handle possessive capturing brackets with an unlimited repeat. We come
1119 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1120 handled similarly to the normal case above. However, the matching is
1121 different. The end of these brackets will always be OP_KETRPOS, which
1122 returns MATCH_KETRPOS without going further in the pattern. By this means
1123 we can handle the group by iteration rather than recursion, thereby
1124 reducing the amount of stack needed. */
1125
1126 case OP_CBRAPOS:
1127 case OP_SCBRAPOS:
1128 allow_zero = FALSE;
1129
1130 POSSESSIVE_CAPTURE:
1131 number = GET2(ecode, 1+LINK_SIZE);
1132 offset = number << 1;
1133
1134 #ifdef PCRE_DEBUG
1135 printf("start possessive bracket %d\n", number);
1136 printf("subject=");
1137 pchars(eptr, 16, TRUE, md);
1138 printf("\n");
1139 #endif
1140
1141 if (offset < md->offset_max)
1142 {
1143 matched_once = FALSE;
1144 code_offset = (int)(ecode - md->start_code);
1145
1146 save_offset1 = md->offset_vector[offset];
1147 save_offset2 = md->offset_vector[offset+1];
1148 save_offset3 = md->offset_vector[md->offset_end - number];
1149 save_capture_last = md->capture_last;
1150
1151 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1152
1153 /* Each time round the loop, save the current subject position for use
1154 when the group matches. For MATCH_MATCH, the group has matched, so we
1155 restart it with a new subject starting position, remembering that we had
1156 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1157 usual. If we haven't matched any alternatives in any iteration, check to
1158 see if a previous iteration matched. If so, the group has matched;
1159 continue from afterwards. Otherwise it has failed; restore the previous
1160 capture values before returning NOMATCH. */
1161
1162 for (;;)
1163 {
1164 md->offset_vector[md->offset_end - number] =
1165 (int)(eptr - md->start_subject);
1166 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1167 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1168 eptrb, RM63);
1169 if (rrc == MATCH_KETRPOS)
1170 {
1171 offset_top = md->end_offset_top;
1172 eptr = md->end_match_ptr;
1173 ecode = md->start_code + code_offset;
1174 save_capture_last = md->capture_last;
1175 matched_once = TRUE;
1176 continue;
1177 }
1178
1179 /* See comment in the code for capturing groups above about handling
1180 THEN. */
1181
1182 if (rrc == MATCH_THEN)
1183 {
1184 next = ecode + GET(ecode,1);
1185 if (md->start_match_ptr < next &&
1186 (*ecode == OP_ALT || *next == OP_ALT))
1187 rrc = MATCH_NOMATCH;
1188 }
1189
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 md->capture_last = save_capture_last;
1192 ecode += GET(ecode, 1);
1193 if (*ecode != OP_ALT) break;
1194 }
1195
1196 if (!matched_once)
1197 {
1198 md->offset_vector[offset] = save_offset1;
1199 md->offset_vector[offset+1] = save_offset2;
1200 md->offset_vector[md->offset_end - number] = save_offset3;
1201 }
1202
1203 if (allow_zero || matched_once)
1204 {
1205 ecode += 1 + LINK_SIZE;
1206 break;
1207 }
1208
1209 RRETURN(MATCH_NOMATCH);
1210 }
1211
1212 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1213 as a non-capturing bracket. */
1214
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217
1218 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222
1223 /* Non-capturing possessive bracket with unlimited repeat. We come here
1224 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1225 without the capturing complication. It is written out separately for speed
1226 and cleanliness. */
1227
1228 case OP_BRAPOS:
1229 case OP_SBRAPOS:
1230 allow_zero = FALSE;
1231
1232 POSSESSIVE_NON_CAPTURE:
1233 matched_once = FALSE;
1234 code_offset = (int)(ecode - md->start_code);
1235 save_capture_last = md->capture_last;
1236
1237 for (;;)
1238 {
1239 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1240 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1241 eptrb, RM48);
1242 if (rrc == MATCH_KETRPOS)
1243 {
1244 offset_top = md->end_offset_top;
1245 eptr = md->end_match_ptr;
1246 ecode = md->start_code + code_offset;
1247 matched_once = TRUE;
1248 continue;
1249 }
1250
1251 /* See comment in the code for capturing groups above about handling
1252 THEN. */
1253
1254 if (rrc == MATCH_THEN)
1255 {
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1260 }
1261
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1266 }
1267
1268 if (matched_once || allow_zero)
1269 {
1270 ecode += 1 + LINK_SIZE;
1271 break;
1272 }
1273 RRETURN(MATCH_NOMATCH);
1274
1275 /* Control never reaches here. */
1276
1277 /* Conditional group: compilation checked that there are no more than
1278 two branches. If the condition is false, skipping the first branch takes us
1279 past the end if there is only one branch, but that's OK because that is
1280 exactly what going to the ket would do. */
1281
1282 case OP_COND:
1283 case OP_SCOND:
1284 codelink = GET(ecode, 1);
1285
1286 /* Because of the way auto-callout works during compile, a callout item is
1287 inserted between OP_COND and an assertion condition. */
1288
1289 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1290 {
1291 if (PUBL(callout) != NULL)
1292 {
1293 PUBL(callout_block) cb;
1294 cb.version = 2; /* Version 1 of the callout block */
1295 cb.callout_number = ecode[LINK_SIZE+2];
1296 cb.offset_vector = md->offset_vector;
1297 #if defined COMPILE_PCRE8
1298 cb.subject = (PCRE_SPTR)md->start_subject;
1299 #elif defined COMPILE_PCRE16
1300 cb.subject = (PCRE_SPTR16)md->start_subject;
1301 #elif defined COMPILE_PCRE32
1302 cb.subject = (PCRE_SPTR32)md->start_subject;
1303 #endif
1304 cb.subject_length = (int)(md->end_subject - md->start_subject);
1305 cb.start_match = (int)(mstart - md->start_subject);
1306 cb.current_position = (int)(eptr - md->start_subject);
1307 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1308 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1309 cb.capture_top = offset_top/2;
1310 cb.capture_last = md->capture_last & CAPLMASK;
1311 /* Internal change requires this for API compatibility. */
1312 if (cb.capture_last == 0) cb.capture_last = -1;
1313 cb.callout_data = md->callout_data;
1314 cb.mark = md->nomatch_mark;
1315 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1316 if (rrc < 0) RRETURN(rrc);
1317 }
1318 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1319 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1320 }
1321
1322 condcode = ecode[LINK_SIZE+1];
1323
1324 /* Now see what the actual condition is */
1325
1326 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1327 {
1328 if (md->recursive == NULL) /* Not recursing => FALSE */
1329 {
1330 condition = FALSE;
1331 ecode += GET(ecode, 1);
1332 }
1333 else
1334 {
1335 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1336 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1337
1338 /* If the test is for recursion into a specific subpattern, and it is
1339 false, but the test was set up by name, scan the table to see if the
1340 name refers to any other numbers, and test them. The condition is true
1341 if any one is set. */
1342
1343 if (!condition && condcode == OP_NRREF)
1344 {
1345 pcre_uchar *slotA = md->name_table;
1346 for (i = 0; i < md->name_count; i++)
1347 {
1348 if (GET2(slotA, 0) == recno) break;
1349 slotA += md->name_entry_size;
1350 }
1351
1352 /* Found a name for the number - there can be only one; duplicate
1353 names for different numbers are allowed, but not vice versa. First
1354 scan down for duplicates. */
1355
1356 if (i < md->name_count)
1357 {
1358 pcre_uchar *slotB = slotA;
1359 while (slotB > md->name_table)
1360 {
1361 slotB -= md->name_entry_size;
1362 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1363 {
1364 condition = GET2(slotB, 0) == md->recursive->group_num;
1365 if (condition) break;
1366 }
1367 else break;
1368 }
1369
1370 /* Scan up for duplicates */
1371
1372 if (!condition)
1373 {
1374 slotB = slotA;
1375 for (i++; i < md->name_count; i++)
1376 {
1377 slotB += md->name_entry_size;
1378 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1379 {
1380 condition = GET2(slotB, 0) == md->recursive->group_num;
1381 if (condition) break;
1382 }
1383 else break;
1384 }
1385 }
1386 }
1387 }
1388
1389 /* Choose branch according to the condition */
1390
1391 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1392 }
1393 }
1394
1395 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1396 {
1397 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1398 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1399
1400 /* If the numbered capture is unset, but the reference was by name,
1401 scan the table to see if the name refers to any other numbers, and test
1402 them. The condition is true if any one is set. This is tediously similar
1403 to the code above, but not close enough to try to amalgamate. */
1404
1405 if (!condition && condcode == OP_NCREF)
1406 {
1407 unsigned int refno = offset >> 1;
1408 pcre_uchar *slotA = md->name_table;
1409
1410 for (i = 0; i < md->name_count; i++)
1411 {
1412 if (GET2(slotA, 0) == refno) break;
1413 slotA += md->name_entry_size;
1414 }
1415
1416 /* Found a name for the number - there can be only one; duplicate names
1417 for different numbers are allowed, but not vice versa. First scan down
1418 for duplicates. */
1419
1420 if (i < md->name_count)
1421 {
1422 pcre_uchar *slotB = slotA;
1423 while (slotB > md->name_table)
1424 {
1425 slotB -= md->name_entry_size;
1426 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1427 {
1428 offset = GET2(slotB, 0) << 1;
1429 condition = offset < offset_top &&
1430 md->offset_vector[offset] >= 0;
1431 if (condition) break;
1432 }
1433 else break;
1434 }
1435
1436 /* Scan up for duplicates */
1437
1438 if (!condition)
1439 {
1440 slotB = slotA;
1441 for (i++; i < md->name_count; i++)
1442 {
1443 slotB += md->name_entry_size;
1444 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1445 {
1446 offset = GET2(slotB, 0) << 1;
1447 condition = offset < offset_top &&
1448 md->offset_vector[offset] >= 0;
1449 if (condition) break;
1450 }
1451 else break;
1452 }
1453 }
1454 }
1455 }
1456
1457 /* Choose branch according to the condition */
1458
1459 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1460 }
1461
1462 else if (condcode == OP_DEF) /* DEFINE - always false */
1463 {
1464 condition = FALSE;
1465 ecode += GET(ecode, 1);
1466 }
1467
1468 /* The condition is an assertion. Call match() to evaluate it - setting
1469 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1470 an assertion. */
1471
1472 else
1473 {
1474 md->match_function_type = MATCH_CONDASSERT;
1475 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1476 if (rrc == MATCH_MATCH)
1477 {
1478 if (md->end_offset_top > offset_top)
1479 offset_top = md->end_offset_top; /* Captures may have happened */
1480 condition = TRUE;
1481 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1482 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1483 }
1484
1485 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1486 assertion; it is therefore treated as NOMATCH. */
1487
1488 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1489 {
1490 RRETURN(rrc); /* Need braces because of following else */
1491 }
1492 else
1493 {
1494 condition = FALSE;
1495 ecode += codelink;
1496 }
1497 }
1498
1499 /* We are now at the branch that is to be obeyed. As there is only one, can
1500 use tail recursion to avoid using another stack frame, except when there is
1501 unlimited repeat of a possibly empty group. In the latter case, a recursive
1502 call to match() is always required, unless the second alternative doesn't
1503 exist, in which case we can just plough on. Note that, for compatibility
1504 with Perl, the | in a conditional group is NOT treated as creating two
1505 alternatives. If a THEN is encountered in the branch, it propagates out to
1506 the enclosing alternative (unless nested in a deeper set of alternatives,
1507 of course). */
1508
1509 if (condition || *ecode == OP_ALT)
1510 {
1511 if (op != OP_SCOND)
1512 {
1513 ecode += 1 + LINK_SIZE;
1514 goto TAIL_RECURSE;
1515 }
1516
1517 md->match_function_type = MATCH_CBEGROUP;
1518 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1519 RRETURN(rrc);
1520 }
1521
1522 /* Condition false & no alternative; continue after the group. */
1523
1524 else
1525 {
1526 ecode += 1 + LINK_SIZE;
1527 }
1528 break;
1529
1530
1531 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1532 to close any currently open capturing brackets. */
1533
1534 case OP_CLOSE:
1535 number = GET2(ecode, 1); /* Must be less than 65536 */
1536 offset = number << 1;
1537
1538 #ifdef PCRE_DEBUG
1539 printf("end bracket %d at *ACCEPT", number);
1540 printf("\n");
1541 #endif
1542
1543 md->capture_last = (md->capture_last & OVFLMASK) | number;
1544 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1545 {
1546 md->offset_vector[offset] =
1547 md->offset_vector[md->offset_end - number];
1548 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1549 if (offset_top <= offset) offset_top = offset + 2;
1550 }
1551 ecode += 1 + IMM2_SIZE;
1552 break;
1553
1554
1555 /* End of the pattern, either real or forced. */
1556
1557 case OP_END:
1558 case OP_ACCEPT:
1559 case OP_ASSERT_ACCEPT:
1560
1561 /* If we have matched an empty string, fail if not in an assertion and not
1562 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1563 is set and we have matched at the start of the subject. In both cases,
1564 backtracking will then try other alternatives, if any. */
1565
1566 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1567 md->recursive == NULL &&
1568 (md->notempty ||
1569 (md->notempty_atstart &&
1570 mstart == md->start_subject + md->start_offset)))
1571 RRETURN(MATCH_NOMATCH);
1572
1573 /* Otherwise, we have a match. */
1574
1575 md->end_match_ptr = eptr; /* Record where we ended */
1576 md->end_offset_top = offset_top; /* and how many extracts were taken */
1577 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1578
1579 /* For some reason, the macros don't work properly if an expression is
1580 given as the argument to RRETURN when the heap is in use. */
1581
1582 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1583 RRETURN(rrc);
1584
1585 /* Assertion brackets. Check the alternative branches in turn - the
1586 matching won't pass the KET for an assertion. If any one branch matches,
1587 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1588 start of each branch to move the current point backwards, so the code at
1589 this level is identical to the lookahead case. When the assertion is part
1590 of a condition, we want to return immediately afterwards. The caller of
1591 this incarnation of the match() function will have set MATCH_CONDASSERT in
1592 md->match_function_type, and one of these opcodes will be the first opcode
1593 that is processed. We use a local variable that is preserved over calls to
1594 match() to remember this case. */
1595
1596 case OP_ASSERT:
1597 case OP_ASSERTBACK:
1598 save_mark = md->mark;
1599 if (md->match_function_type == MATCH_CONDASSERT)
1600 {
1601 condassert = TRUE;
1602 md->match_function_type = 0;
1603 }
1604 else condassert = FALSE;
1605
1606 /* Loop for each branch */
1607
1608 do
1609 {
1610 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1611 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1612 {
1613 mstart = md->start_match_ptr; /* In case \K reset it */
1614 break;
1615 }
1616 md->mark = save_mark;
1617
1618 /* See comment in the code for capturing groups above about handling
1619 THEN. */
1620
1621 if (rrc == MATCH_THEN)
1622 {
1623 next = ecode + GET(ecode,1);
1624 if (md->start_match_ptr < next &&
1625 (*ecode == OP_ALT || *next == OP_ALT))
1626 rrc = MATCH_NOMATCH;
1627 }
1628
1629 /* Anything other than NOMATCH causes the assertion to fail. This
1630 includes COMMIT, SKIP, and PRUNE. However, this consistent approach does
1631 not always have exactly the same effect as in Perl. */
1632
1633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1634 ecode += GET(ecode, 1);
1635 }
1636 while (*ecode == OP_ALT);
1637
1638 /* If we have tried all the alternative branches, the assertion has
1639 failed. */
1640
1641 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1642
1643 /* If checking an assertion for a condition, return MATCH_MATCH. */
1644
1645 if (condassert) RRETURN(MATCH_MATCH);
1646
1647 /* Continue from after a successful assertion, updating the offsets high
1648 water mark, since extracts may have been taken during the assertion. */
1649
1650 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1651 ecode += 1 + LINK_SIZE;
1652 offset_top = md->end_offset_top;
1653 continue;
1654
1655 /* Negative assertion: all branches must fail to match for the assertion to
1656 succeed. */
1657
1658 case OP_ASSERT_NOT:
1659 case OP_ASSERTBACK_NOT:
1660 save_mark = md->mark;
1661 if (md->match_function_type == MATCH_CONDASSERT)
1662 {
1663 condassert = TRUE;
1664 md->match_function_type = 0;
1665 }
1666 else condassert = FALSE;
1667
1668 /* Loop for each alternative branch. */
1669
1670 do
1671 {
1672 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1673 md->mark = save_mark;
1674
1675 /* A successful match means the assertion has failed. */
1676
1677 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1678
1679 /* See comment in the code for capturing groups above about handling
1680 THEN. */
1681
1682 if (rrc == MATCH_THEN)
1683 {
1684 next = ecode + GET(ecode,1);
1685 if (md->start_match_ptr < next &&
1686 (*ecode == OP_ALT || *next == OP_ALT))
1687 rrc = MATCH_NOMATCH;
1688 }
1689
1690 /* No match on a branch means we must carry on and try the next branch.
1691 Anything else, in particular, SKIP, PRUNE, etc. causes a failure in the
1692 enclosing branch. This is a consistent approach, but does not always have
1693 the same effect as in Perl. */
1694
1695 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1696 ecode += GET(ecode,1);
1697 }
1698 while (*ecode == OP_ALT);
1699
1700 /* All branches in the assertion failed to match. */
1701
1702 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1703 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1704 continue;
1705
1706 /* Move the subject pointer back. This occurs only at the start of
1707 each branch of a lookbehind assertion. If we are too close to the start to
1708 move back, this match function fails. When working with UTF-8 we move
1709 back a number of characters, not bytes. */
1710
1711 case OP_REVERSE:
1712 #ifdef SUPPORT_UTF
1713 if (utf)
1714 {
1715 i = GET(ecode, 1);
1716 while (i-- > 0)
1717 {
1718 eptr--;
1719 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1720 BACKCHAR(eptr);
1721 }
1722 }
1723 else
1724 #endif
1725
1726 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1727
1728 {
1729 eptr -= GET(ecode, 1);
1730 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1731 }
1732
1733 /* Save the earliest consulted character, then skip to next op code */
1734
1735 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1736 ecode += 1 + LINK_SIZE;
1737 break;
1738
1739 /* The callout item calls an external function, if one is provided, passing
1740 details of the match so far. This is mainly for debugging, though the
1741 function is able to force a failure. */
1742
1743 case OP_CALLOUT:
1744 if (PUBL(callout) != NULL)
1745 {
1746 PUBL(callout_block) cb;
1747 cb.version = 2; /* Version 1 of the callout block */
1748 cb.callout_number = ecode[1];
1749 cb.offset_vector = md->offset_vector;
1750 #if defined COMPILE_PCRE8
1751 cb.subject = (PCRE_SPTR)md->start_subject;
1752 #elif defined COMPILE_PCRE16
1753 cb.subject = (PCRE_SPTR16)md->start_subject;
1754 #elif defined COMPILE_PCRE32
1755 cb.subject = (PCRE_SPTR32)md->start_subject;
1756 #endif
1757 cb.subject_length = (int)(md->end_subject - md->start_subject);
1758 cb.start_match = (int)(mstart - md->start_subject);
1759 cb.current_position = (int)(eptr - md->start_subject);
1760 cb.pattern_position = GET(ecode, 2);
1761 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1762 cb.capture_top = offset_top/2;
1763 cb.capture_last = md->capture_last & CAPLMASK;
1764 /* Internal change requires this for API compatibility. */
1765 if (cb.capture_last == 0) cb.capture_last = -1;
1766 cb.callout_data = md->callout_data;
1767 cb.mark = md->nomatch_mark;
1768 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1769 if (rrc < 0) RRETURN(rrc);
1770 }
1771 ecode += 2 + 2*LINK_SIZE;
1772 break;
1773
1774 /* Recursion either matches the current regex, or some subexpression. The
1775 offset data is the offset to the starting bracket from the start of the
1776 whole pattern. (This is so that it works from duplicated subpatterns.)
1777
1778 The state of the capturing groups is preserved over recursion, and
1779 re-instated afterwards. We don't know how many are started and not yet
1780 finished (offset_top records the completed total) so we just have to save
1781 all the potential data. There may be up to 65535 such values, which is too
1782 large to put on the stack, but using malloc for small numbers seems
1783 expensive. As a compromise, the stack is used when there are no more than
1784 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1785
1786 There are also other values that have to be saved. We use a chained
1787 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1788 for the original version of this logic. It has, however, been hacked around
1789 a lot, so he is not to blame for the current way it works. */
1790
1791 case OP_RECURSE:
1792 {
1793 recursion_info *ri;
1794 unsigned int recno;
1795
1796 callpat = md->start_code + GET(ecode, 1);
1797 recno = (callpat == md->start_code)? 0 :
1798 GET2(callpat, 1 + LINK_SIZE);
1799
1800 /* Check for repeating a recursion without advancing the subject pointer.
1801 This should catch convoluted mutual recursions. (Some simple cases are
1802 caught at compile time.) */
1803
1804 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1805 if (recno == ri->group_num && eptr == ri->subject_position)
1806 RRETURN(PCRE_ERROR_RECURSELOOP);
1807
1808 /* Add to "recursing stack" */
1809
1810 new_recursive.group_num = recno;
1811 new_recursive.saved_capture_last = md->capture_last;
1812 new_recursive.subject_position = eptr;
1813 new_recursive.prevrec = md->recursive;
1814 md->recursive = &new_recursive;
1815
1816 /* Where to continue from afterwards */
1817
1818 ecode += 1 + LINK_SIZE;
1819
1820 /* Now save the offset data */
1821
1822 new_recursive.saved_max = md->offset_end;
1823 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1824 new_recursive.offset_save = stacksave;
1825 else
1826 {
1827 new_recursive.offset_save =
1828 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1829 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1830 }
1831 memcpy(new_recursive.offset_save, md->offset_vector,
1832 new_recursive.saved_max * sizeof(int));
1833
1834 /* OK, now we can do the recursion. After processing each alternative,
1835 restore the offset data and the last captured value. If there were nested
1836 recursions, md->recursive might be changed, so reset it before looping.
1837 */
1838
1839 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1840 cbegroup = (*callpat >= OP_SBRA);
1841 do
1842 {
1843 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1844 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1845 md, eptrb, RM6);
1846 memcpy(md->offset_vector, new_recursive.offset_save,
1847 new_recursive.saved_max * sizeof(int));
1848 md->capture_last = new_recursive.saved_capture_last;
1849 md->recursive = new_recursive.prevrec;
1850 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1851 {
1852 DPRINTF(("Recursion matched\n"));
1853 if (new_recursive.offset_save != stacksave)
1854 (PUBL(free))(new_recursive.offset_save);
1855
1856 /* Set where we got to in the subject, and reset the start in case
1857 it was changed by \K. This *is* propagated back out of a recursion,
1858 for Perl compatibility. */
1859
1860 eptr = md->end_match_ptr;
1861 mstart = md->start_match_ptr;
1862 goto RECURSION_MATCHED; /* Exit loop; end processing */
1863 }
1864
1865 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1866 recursion; they are treated as NOMATCH. These codes are defined in a
1867 range that can be tested for. Any other return code is an error. */
1868
1869 else if (rrc != MATCH_NOMATCH &&
1870 (rrc < MATCH_BACKTRACK_MIN || rrc > MATCH_BACKTRACK_MAX))
1871 {
1872 DPRINTF(("Recursion gave error %d\n", rrc));
1873 if (new_recursive.offset_save != stacksave)
1874 (PUBL(free))(new_recursive.offset_save);
1875 RRETURN(rrc);
1876 }
1877
1878 md->recursive = &new_recursive;
1879 callpat += GET(callpat, 1);
1880 }
1881 while (*callpat == OP_ALT);
1882
1883 DPRINTF(("Recursion didn't match\n"));
1884 md->recursive = new_recursive.prevrec;
1885 if (new_recursive.offset_save != stacksave)
1886 (PUBL(free))(new_recursive.offset_save);
1887 RRETURN(MATCH_NOMATCH);
1888 }
1889
1890 RECURSION_MATCHED:
1891 break;
1892
1893 /* An alternation is the end of a branch; scan along to find the end of the
1894 bracketed group and go to there. */
1895
1896 case OP_ALT:
1897 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1898 break;
1899
1900 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1901 indicating that it may occur zero times. It may repeat infinitely, or not
1902 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1903 with fixed upper repeat limits are compiled as a number of copies, with the
1904 optional ones preceded by BRAZERO or BRAMINZERO. */
1905
1906 case OP_BRAZERO:
1907 next = ecode + 1;
1908 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1910 do next += GET(next, 1); while (*next == OP_ALT);
1911 ecode = next + 1 + LINK_SIZE;
1912 break;
1913
1914 case OP_BRAMINZERO:
1915 next = ecode + 1;
1916 do next += GET(next, 1); while (*next == OP_ALT);
1917 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1919 ecode++;
1920 break;
1921
1922 case OP_SKIPZERO:
1923 next = ecode+1;
1924 do next += GET(next,1); while (*next == OP_ALT);
1925 ecode = next + 1 + LINK_SIZE;
1926 break;
1927
1928 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1929 here; just jump to the group, with allow_zero set TRUE. */
1930
1931 case OP_BRAPOSZERO:
1932 op = *(++ecode);
1933 allow_zero = TRUE;
1934 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1935 goto POSSESSIVE_NON_CAPTURE;
1936
1937 /* End of a group, repeated or non-repeating. */
1938
1939 case OP_KET:
1940 case OP_KETRMIN:
1941 case OP_KETRMAX:
1942 case OP_KETRPOS:
1943 prev = ecode - GET(ecode, 1);
1944
1945 /* If this was a group that remembered the subject start, in order to break
1946 infinite repeats of empty string matches, retrieve the subject start from
1947 the chain. Otherwise, set it NULL. */
1948
1949 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1950 {
1951 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1952 eptrb = eptrb->epb_prev; /* Backup to previous group */
1953 }
1954 else saved_eptr = NULL;
1955
1956 /* If we are at the end of an assertion group or a non-capturing atomic
1957 group, stop matching and return MATCH_MATCH, but record the current high
1958 water mark for use by positive assertions. We also need to record the match
1959 start in case it was changed by \K. */
1960
1961 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1962 *prev == OP_ONCE_NC)
1963 {
1964 md->end_match_ptr = eptr; /* For ONCE_NC */
1965 md->end_offset_top = offset_top;
1966 md->start_match_ptr = mstart;
1967 RRETURN(MATCH_MATCH); /* Sets md->mark */
1968 }
1969
1970 /* For capturing groups we have to check the group number back at the start
1971 and if necessary complete handling an extraction by setting the offsets and
1972 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1973 into group 0, so it won't be picked up here. Instead, we catch it when the
1974 OP_END is reached. Other recursion is handled here. We just have to record
1975 the current subject position and start match pointer and give a MATCH
1976 return. */
1977
1978 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1979 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1980 {
1981 number = GET2(prev, 1+LINK_SIZE);
1982 offset = number << 1;
1983
1984 #ifdef PCRE_DEBUG
1985 printf("end bracket %d", number);
1986 printf("\n");
1987 #endif
1988
1989 /* Handle a recursively called group. */
1990
1991 if (md->recursive != NULL && md->recursive->group_num == number)
1992 {
1993 md->end_match_ptr = eptr;
1994 md->start_match_ptr = mstart;
1995 RRETURN(MATCH_MATCH);
1996 }
1997
1998 /* Deal with capturing */
1999
2000 md->capture_last = (md->capture_last & OVFLMASK) | number;
2001 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
2002 {
2003 /* If offset is greater than offset_top, it means that we are
2004 "skipping" a capturing group, and that group's offsets must be marked
2005 unset. In earlier versions of PCRE, all the offsets were unset at the
2006 start of matching, but this doesn't work because atomic groups and
2007 assertions can cause a value to be set that should later be unset.
2008 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
2009 part of the atomic group, but this is not on the final matching path,
2010 so must be unset when 2 is set. (If there is no group 2, there is no
2011 problem, because offset_top will then be 2, indicating no capture.) */
2012
2013 if (offset > offset_top)
2014 {
2015 register int *iptr = md->offset_vector + offset_top;
2016 register int *iend = md->offset_vector + offset;
2017 while (iptr < iend) *iptr++ = -1;
2018 }
2019
2020 /* Now make the extraction */
2021
2022 md->offset_vector[offset] =
2023 md->offset_vector[md->offset_end - number];
2024 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2025 if (offset_top <= offset) offset_top = offset + 2;
2026 }
2027 }
2028
2029 /* For an ordinary non-repeating ket, just continue at this level. This
2030 also happens for a repeating ket if no characters were matched in the
2031 group. This is the forcible breaking of infinite loops as implemented in
2032 Perl 5.005. For a non-repeating atomic group that includes captures,
2033 establish a backup point by processing the rest of the pattern at a lower
2034 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2035 original OP_ONCE level, thereby bypassing intermediate backup points, but
2036 resetting any captures that happened along the way. */
2037
2038 if (*ecode == OP_KET || eptr == saved_eptr)
2039 {
2040 if (*prev == OP_ONCE)
2041 {
2042 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2044 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2045 RRETURN(MATCH_ONCE);
2046 }
2047 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2048 break;
2049 }
2050
2051 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2052 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2053 at a time from the outer level, thus saving stack. */
2054
2055 if (*ecode == OP_KETRPOS)
2056 {
2057 md->end_match_ptr = eptr;
2058 md->end_offset_top = offset_top;
2059 RRETURN(MATCH_KETRPOS);
2060 }
2061
2062 /* The normal repeating kets try the rest of the pattern or restart from
2063 the preceding bracket, in the appropriate order. In the second case, we can
2064 use tail recursion to avoid using another stack frame, unless we have
2065 an atomic group or an unlimited repeat of a group that can match an empty
2066 string. */
2067
2068 if (*ecode == OP_KETRMIN)
2069 {
2070 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2072 if (*prev == OP_ONCE)
2073 {
2074 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2075 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2076 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2077 RRETURN(MATCH_ONCE);
2078 }
2079 if (*prev >= OP_SBRA) /* Could match an empty string */
2080 {
2081 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2082 RRETURN(rrc);
2083 }
2084 ecode = prev;
2085 goto TAIL_RECURSE;
2086 }
2087 else /* OP_KETRMAX */
2088 {
2089 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2090 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2091 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2092 if (*prev == OP_ONCE)
2093 {
2094 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2096 md->once_target = prev;
2097 RRETURN(MATCH_ONCE);
2098 }
2099 ecode += 1 + LINK_SIZE;
2100 goto TAIL_RECURSE;
2101 }
2102 /* Control never gets here */
2103
2104 /* Not multiline mode: start of subject assertion, unless notbol. */
2105
2106 case OP_CIRC:
2107 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2108
2109 /* Start of subject assertion */
2110
2111 case OP_SOD:
2112 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2113 ecode++;
2114 break;
2115
2116 /* Multiline mode: start of subject unless notbol, or after any newline. */
2117
2118 case OP_CIRCM:
2119 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2120 if (eptr != md->start_subject &&
2121 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2122 RRETURN(MATCH_NOMATCH);
2123 ecode++;
2124 break;
2125
2126 /* Start of match assertion */
2127
2128 case OP_SOM:
2129 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2130 ecode++;
2131 break;
2132
2133 /* Reset the start of match point */
2134
2135 case OP_SET_SOM:
2136 mstart = eptr;
2137 ecode++;
2138 break;
2139
2140 /* Multiline mode: assert before any newline, or before end of subject
2141 unless noteol is set. */
2142
2143 case OP_DOLLM:
2144 if (eptr < md->end_subject)
2145 {
2146 if (!IS_NEWLINE(eptr))
2147 {
2148 if (md->partial != 0 &&
2149 eptr + 1 >= md->end_subject &&
2150 NLBLOCK->nltype == NLTYPE_FIXED &&
2151 NLBLOCK->nllen == 2 &&
2152 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2153 {
2154 md->hitend = TRUE;
2155 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2156 }
2157 RRETURN(MATCH_NOMATCH);
2158 }
2159 }
2160 else
2161 {
2162 if (md->noteol) RRETURN(MATCH_NOMATCH);
2163 SCHECK_PARTIAL();
2164 }
2165 ecode++;
2166 break;
2167
2168 /* Not multiline mode: assert before a terminating newline or before end of
2169 subject unless noteol is set. */
2170
2171 case OP_DOLL:
2172 if (md->noteol) RRETURN(MATCH_NOMATCH);
2173 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2174
2175 /* ... else fall through for endonly */
2176
2177 /* End of subject assertion (\z) */
2178
2179 case OP_EOD:
2180 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2181 SCHECK_PARTIAL();
2182 ecode++;
2183 break;
2184
2185 /* End of subject or ending \n assertion (\Z) */
2186
2187 case OP_EODN:
2188 ASSERT_NL_OR_EOS:
2189 if (eptr < md->end_subject &&
2190 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2191 {
2192 if (md->partial != 0 &&
2193 eptr + 1 >= md->end_subject &&
2194 NLBLOCK->nltype == NLTYPE_FIXED &&
2195 NLBLOCK->nllen == 2 &&
2196 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2197 {
2198 md->hitend = TRUE;
2199 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2200 }
2201 RRETURN(MATCH_NOMATCH);
2202 }
2203
2204 /* Either at end of string or \n before end. */
2205
2206 SCHECK_PARTIAL();
2207 ecode++;
2208 break;
2209
2210 /* Word boundary assertions */
2211
2212 case OP_NOT_WORD_BOUNDARY:
2213 case OP_WORD_BOUNDARY:
2214 {
2215
2216 /* Find out if the previous and current characters are "word" characters.
2217 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2218 be "non-word" characters. Remember the earliest consulted character for
2219 partial matching. */
2220
2221 #ifdef SUPPORT_UTF
2222 if (utf)
2223 {
2224 /* Get status of previous character */
2225
2226 if (eptr == md->start_subject) prev_is_word = FALSE; else
2227 {
2228 PCRE_PUCHAR lastptr = eptr - 1;
2229 BACKCHAR(lastptr);
2230 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2231 GETCHAR(c, lastptr);
2232 #ifdef SUPPORT_UCP
2233 if (md->use_ucp)
2234 {
2235 if (c == '_') prev_is_word = TRUE; else
2236 {
2237 int cat = UCD_CATEGORY(c);
2238 prev_is_word = (cat == ucp_L || cat == ucp_N);
2239 }
2240 }
2241 else
2242 #endif
2243 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2244 }
2245
2246 /* Get status of next character */
2247
2248 if (eptr >= md->end_subject)
2249 {
2250 SCHECK_PARTIAL();
2251 cur_is_word = FALSE;
2252 }
2253 else
2254 {
2255 GETCHAR(c, eptr);
2256 #ifdef SUPPORT_UCP
2257 if (md->use_ucp)
2258 {
2259 if (c == '_') cur_is_word = TRUE; else
2260 {
2261 int cat = UCD_CATEGORY(c);
2262 cur_is_word = (cat == ucp_L || cat == ucp_N);
2263 }
2264 }
2265 else
2266 #endif
2267 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2268 }
2269 }
2270 else
2271 #endif
2272
2273 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2274 consistency with the behaviour of \w we do use it in this case. */
2275
2276 {
2277 /* Get status of previous character */
2278
2279 if (eptr == md->start_subject) prev_is_word = FALSE; else
2280 {
2281 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2282 #ifdef SUPPORT_UCP
2283 if (md->use_ucp)
2284 {
2285 c = eptr[-1];
2286 if (c == '_') prev_is_word = TRUE; else
2287 {
2288 int cat = UCD_CATEGORY(c);
2289 prev_is_word = (cat == ucp_L || cat == ucp_N);
2290 }
2291 }
2292 else
2293 #endif
2294 prev_is_word = MAX_255(eptr[-1])
2295 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2296 }
2297
2298 /* Get status of next character */
2299
2300 if (eptr >= md->end_subject)
2301 {
2302 SCHECK_PARTIAL();
2303 cur_is_word = FALSE;
2304 }
2305 else
2306 #ifdef SUPPORT_UCP
2307 if (md->use_ucp)
2308 {
2309 c = *eptr;
2310 if (c == '_') cur_is_word = TRUE; else
2311 {
2312 int cat = UCD_CATEGORY(c);
2313 cur_is_word = (cat == ucp_L || cat == ucp_N);
2314 }
2315 }
2316 else
2317 #endif
2318 cur_is_word = MAX_255(*eptr)
2319 && ((md->ctypes[*eptr] & ctype_word) != 0);
2320 }
2321
2322 /* Now see if the situation is what we want */
2323
2324 if ((*ecode++ == OP_WORD_BOUNDARY)?
2325 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2326 RRETURN(MATCH_NOMATCH);
2327 }
2328 break;
2329
2330 /* Match any single character type except newline; have to take care with
2331 CRLF newlines and partial matching. */
2332
2333 case OP_ANY:
2334 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2335 if (md->partial != 0 &&
2336 eptr + 1 >= md->end_subject &&
2337 NLBLOCK->nltype == NLTYPE_FIXED &&
2338 NLBLOCK->nllen == 2 &&
2339 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2340 {
2341 md->hitend = TRUE;
2342 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2343 }
2344
2345 /* Fall through */
2346
2347 /* Match any single character whatsoever. */
2348
2349 case OP_ALLANY:
2350 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2351 { /* not be updated before SCHECK_PARTIAL. */
2352 SCHECK_PARTIAL();
2353 RRETURN(MATCH_NOMATCH);
2354 }
2355 eptr++;
2356 #ifdef SUPPORT_UTF
2357 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2358 #endif
2359 ecode++;
2360 break;
2361
2362 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2363 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2364
2365 case OP_ANYBYTE:
2366 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2367 { /* not be updated before SCHECK_PARTIAL. */
2368 SCHECK_PARTIAL();
2369 RRETURN(MATCH_NOMATCH);
2370 }
2371 eptr++;
2372 ecode++;
2373 break;
2374
2375 case OP_NOT_DIGIT:
2376 if (eptr >= md->end_subject)
2377 {
2378 SCHECK_PARTIAL();
2379 RRETURN(MATCH_NOMATCH);
2380 }
2381 GETCHARINCTEST(c, eptr);
2382 if (
2383 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2384 c < 256 &&
2385 #endif
2386 (md->ctypes[c] & ctype_digit) != 0
2387 )
2388 RRETURN(MATCH_NOMATCH);
2389 ecode++;
2390 break;
2391
2392 case OP_DIGIT:
2393 if (eptr >= md->end_subject)
2394 {
2395 SCHECK_PARTIAL();
2396 RRETURN(MATCH_NOMATCH);
2397 }
2398 GETCHARINCTEST(c, eptr);
2399 if (
2400 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2401 c > 255 ||
2402 #endif
2403 (md->ctypes[c] & ctype_digit) == 0
2404 )
2405 RRETURN(MATCH_NOMATCH);
2406 ecode++;
2407 break;
2408
2409 case OP_NOT_WHITESPACE:
2410 if (eptr >= md->end_subject)
2411 {
2412 SCHECK_PARTIAL();
2413 RRETURN(MATCH_NOMATCH);
2414 }
2415 GETCHARINCTEST(c, eptr);
2416 if (
2417 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2418 c < 256 &&
2419 #endif
2420 (md->ctypes[c] & ctype_space) != 0
2421 )
2422 RRETURN(MATCH_NOMATCH);
2423 ecode++;
2424 break;
2425
2426 case OP_WHITESPACE:
2427 if (eptr >= md->end_subject)
2428 {
2429 SCHECK_PARTIAL();
2430 RRETURN(MATCH_NOMATCH);
2431 }
2432 GETCHARINCTEST(c, eptr);
2433 if (
2434 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2435 c > 255 ||
2436 #endif
2437 (md->ctypes[c] & ctype_space) == 0
2438 )
2439 RRETURN(MATCH_NOMATCH);
2440 ecode++;
2441 break;
2442
2443 case OP_NOT_WORDCHAR:
2444 if (eptr >= md->end_subject)
2445 {
2446 SCHECK_PARTIAL();
2447 RRETURN(MATCH_NOMATCH);
2448 }
2449 GETCHARINCTEST(c, eptr);
2450 if (
2451 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2452 c < 256 &&
2453 #endif
2454 (md->ctypes[c] & ctype_word) != 0
2455 )
2456 RRETURN(MATCH_NOMATCH);
2457 ecode++;
2458 break;
2459
2460 case OP_WORDCHAR:
2461 if (eptr >= md->end_subject)
2462 {
2463 SCHECK_PARTIAL();
2464 RRETURN(MATCH_NOMATCH);
2465 }
2466 GETCHARINCTEST(c, eptr);
2467 if (
2468 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2469 c > 255 ||
2470 #endif
2471 (md->ctypes[c] & ctype_word) == 0
2472 )
2473 RRETURN(MATCH_NOMATCH);
2474 ecode++;
2475 break;
2476
2477 case OP_ANYNL:
2478 if (eptr >= md->end_subject)
2479 {
2480 SCHECK_PARTIAL();
2481 RRETURN(MATCH_NOMATCH);
2482 }
2483 GETCHARINCTEST(c, eptr);
2484 switch(c)
2485 {
2486 default: RRETURN(MATCH_NOMATCH);
2487
2488 case CHAR_CR:
2489 if (eptr >= md->end_subject)
2490 {
2491 SCHECK_PARTIAL();
2492 }
2493 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2494 break;
2495
2496 case CHAR_LF:
2497 break;
2498
2499 case CHAR_VT:
2500 case CHAR_FF:
2501 case CHAR_NEL:
2502 #ifndef EBCDIC
2503 case 0x2028:
2504 case 0x2029:
2505 #endif /* Not EBCDIC */
2506 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2507 break;
2508 }
2509 ecode++;
2510 break;
2511
2512 case OP_NOT_HSPACE:
2513 if (eptr >= md->end_subject)
2514 {
2515 SCHECK_PARTIAL();
2516 RRETURN(MATCH_NOMATCH);
2517 }
2518 GETCHARINCTEST(c, eptr);
2519 switch(c)
2520 {
2521 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2522 default: break;
2523 }
2524 ecode++;
2525 break;
2526
2527 case OP_HSPACE:
2528 if (eptr >= md->end_subject)
2529 {
2530 SCHECK_PARTIAL();
2531 RRETURN(MATCH_NOMATCH);
2532 }
2533 GETCHARINCTEST(c, eptr);
2534 switch(c)
2535 {
2536 HSPACE_CASES: break; /* Byte and multibyte cases */
2537 default: RRETURN(MATCH_NOMATCH);
2538 }
2539 ecode++;
2540 break;
2541
2542 case OP_NOT_VSPACE:
2543 if (eptr >= md->end_subject)
2544 {
2545 SCHECK_PARTIAL();
2546 RRETURN(MATCH_NOMATCH);
2547 }
2548 GETCHARINCTEST(c, eptr);
2549 switch(c)
2550 {
2551 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2552 default: break;
2553 }
2554 ecode++;
2555 break;
2556
2557 case OP_VSPACE:
2558 if (eptr >= md->end_subject)
2559 {
2560 SCHECK_PARTIAL();
2561 RRETURN(MATCH_NOMATCH);
2562 }
2563 GETCHARINCTEST(c, eptr);
2564 switch(c)
2565 {
2566 VSPACE_CASES: break;
2567 default: RRETURN(MATCH_NOMATCH);
2568 }
2569 ecode++;
2570 break;
2571
2572 #ifdef SUPPORT_UCP
2573 /* Check the next character by Unicode property. We will get here only
2574 if the support is in the binary; otherwise a compile-time error occurs. */
2575
2576 case OP_PROP:
2577 case OP_NOTPROP:
2578 if (eptr >= md->end_subject)
2579 {
2580 SCHECK_PARTIAL();
2581 RRETURN(MATCH_NOMATCH);
2582 }
2583 GETCHARINCTEST(c, eptr);
2584 {
2585 const pcre_uint32 *cp;
2586 const ucd_record *prop = GET_UCD(c);
2587
2588 switch(ecode[1])
2589 {
2590 case PT_ANY:
2591 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2592 break;
2593
2594 case PT_LAMP:
2595 if ((prop->chartype == ucp_Lu ||
2596 prop->chartype == ucp_Ll ||
2597 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2598 RRETURN(MATCH_NOMATCH);
2599 break;
2600
2601 case PT_GC:
2602 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2603 RRETURN(MATCH_NOMATCH);
2604 break;
2605
2606 case PT_PC:
2607 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2608 RRETURN(MATCH_NOMATCH);
2609 break;
2610
2611 case PT_SC:
2612 if ((ecode[2] != prop->script) == (op == OP_PROP))
2613 RRETURN(MATCH_NOMATCH);
2614 break;
2615
2616 /* These are specials */
2617
2618 case PT_ALNUM:
2619 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2620 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2621 RRETURN(MATCH_NOMATCH);
2622 break;
2623
2624 case PT_SPACE: /* Perl space */
2625 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2626 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2627 == (op == OP_NOTPROP))
2628 RRETURN(MATCH_NOMATCH);
2629 break;
2630
2631 case PT_PXSPACE: /* POSIX space */
2632 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2633 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2634 c == CHAR_FF || c == CHAR_CR)
2635 == (op == OP_NOTPROP))
2636 RRETURN(MATCH_NOMATCH);
2637 break;
2638
2639 case PT_WORD:
2640 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2641 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2642 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2643 RRETURN(MATCH_NOMATCH);
2644 break;
2645
2646 case PT_CLIST:
2647 cp = PRIV(ucd_caseless_sets) + ecode[2];
2648 for (;;)
2649 {
2650 if (c < *cp)
2651 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2652 if (c == *cp++)
2653 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2654 }
2655 break;
2656
2657 case PT_UCNC:
2658 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2659 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2660 c >= 0xe000) == (op == OP_NOTPROP))
2661 RRETURN(MATCH_NOMATCH);
2662 break;
2663
2664 /* This should never occur */
2665
2666 default:
2667 RRETURN(PCRE_ERROR_INTERNAL);
2668 }
2669
2670 ecode += 3;
2671 }
2672 break;
2673
2674 /* Match an extended Unicode sequence. We will get here only if the support
2675 is in the binary; otherwise a compile-time error occurs. */
2676
2677 case OP_EXTUNI:
2678 if (eptr >= md->end_subject)
2679 {
2680 SCHECK_PARTIAL();
2681 RRETURN(MATCH_NOMATCH);
2682 }
2683 else
2684 {
2685 int lgb, rgb;
2686 GETCHARINCTEST(c, eptr);
2687 lgb = UCD_GRAPHBREAK(c);
2688 while (eptr < md->end_subject)
2689 {
2690 int len = 1;
2691 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2692 rgb = UCD_GRAPHBREAK(c);
2693 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2694 lgb = rgb;
2695 eptr += len;
2696 }
2697 }
2698 CHECK_PARTIAL();
2699 ecode++;
2700 break;
2701 #endif /* SUPPORT_UCP */
2702
2703
2704 /* Match a back reference, possibly repeatedly. Look past the end of the
2705 item to see if there is repeat information following. The code is similar
2706 to that for character classes, but repeated for efficiency. Then obey
2707 similar code to character type repeats - written out again for speed.
2708 However, if the referenced string is the empty string, always treat
2709 it as matched, any number of times (otherwise there could be infinite
2710 loops). */
2711
2712 case OP_REF:
2713 case OP_REFI:
2714 caseless = op == OP_REFI;
2715 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2716 ecode += 1 + IMM2_SIZE;
2717
2718 /* If the reference is unset, there are two possibilities:
2719
2720 (a) In the default, Perl-compatible state, set the length negative;
2721 this ensures that every attempt at a match fails. We can't just fail
2722 here, because of the possibility of quantifiers with zero minima.
2723
2724 (b) If the JavaScript compatibility flag is set, set the length to zero
2725 so that the back reference matches an empty string.
2726
2727 Otherwise, set the length to the length of what was matched by the
2728 referenced subpattern. */
2729
2730 if (offset >= offset_top || md->offset_vector[offset] < 0)
2731 length = (md->jscript_compat)? 0 : -1;
2732 else
2733 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2734
2735 /* Set up for repetition, or handle the non-repeated case */
2736
2737 switch (*ecode)
2738 {
2739 case OP_CRSTAR:
2740 case OP_CRMINSTAR:
2741 case OP_CRPLUS:
2742 case OP_CRMINPLUS:
2743 case OP_CRQUERY:
2744 case OP_CRMINQUERY:
2745 c = *ecode++ - OP_CRSTAR;
2746 minimize = (c & 1) != 0;
2747 min = rep_min[c]; /* Pick up values from tables; */
2748 max = rep_max[c]; /* zero for max => infinity */
2749 if (max == 0) max = INT_MAX;
2750 break;
2751
2752 case OP_CRRANGE:
2753 case OP_CRMINRANGE:
2754 minimize = (*ecode == OP_CRMINRANGE);
2755 min = GET2(ecode, 1);
2756 max = GET2(ecode, 1 + IMM2_SIZE);
2757 if (max == 0) max = INT_MAX;
2758 ecode += 1 + 2 * IMM2_SIZE;
2759 break;
2760
2761 default: /* No repeat follows */
2762 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2763 {
2764 if (length == -2) eptr = md->end_subject; /* Partial match */
2765 CHECK_PARTIAL();
2766 RRETURN(MATCH_NOMATCH);
2767 }
2768 eptr += length;
2769 continue; /* With the main loop */
2770 }
2771
2772 /* Handle repeated back references. If the length of the reference is
2773 zero, just continue with the main loop. If the length is negative, it
2774 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2775 zero, we can continue at the same level without recursion. For any other
2776 minimum, carrying on will result in NOMATCH. */
2777
2778 if (length == 0) continue;
2779 if (length < 0 && min == 0) continue;
2780
2781 /* First, ensure the minimum number of matches are present. We get back
2782 the length of the reference string explicitly rather than passing the
2783 address of eptr, so that eptr can be a register variable. */
2784
2785 for (i = 1; i <= min; i++)
2786 {
2787 int slength;
2788 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2789 {
2790 if (slength == -2) eptr = md->end_subject; /* Partial match */
2791 CHECK_PARTIAL();
2792 RRETURN(MATCH_NOMATCH);
2793 }
2794 eptr += slength;
2795 }
2796
2797 /* If min = max, continue at the same level without recursion.
2798 They are not both allowed to be zero. */
2799
2800 if (min == max) continue;
2801
2802 /* If minimizing, keep trying and advancing the pointer */
2803
2804 if (minimize)
2805 {
2806 for (fi = min;; fi++)
2807 {
2808 int slength;
2809 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2811 if (fi >= max) RRETURN(MATCH_NOMATCH);
2812 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2813 {
2814 if (slength == -2) eptr = md->end_subject; /* Partial match */
2815 CHECK_PARTIAL();
2816 RRETURN(MATCH_NOMATCH);
2817 }
2818 eptr += slength;
2819 }
2820 /* Control never gets here */
2821 }
2822
2823 /* If maximizing, find the longest string and work backwards */
2824
2825 else
2826 {
2827 pp = eptr;
2828 for (i = min; i < max; i++)
2829 {
2830 int slength;
2831 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2832 {
2833 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2834 the soft partial matching case. */
2835
2836 if (slength == -2 && md->partial != 0 &&
2837 md->end_subject > md->start_used_ptr)
2838 {
2839 md->hitend = TRUE;
2840 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2841 }
2842 break;
2843 }
2844 eptr += slength;
2845 }
2846
2847 while (eptr >= pp)
2848 {
2849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2851 eptr -= length;
2852 }
2853 RRETURN(MATCH_NOMATCH);
2854 }
2855 /* Control never gets here */
2856
2857 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2858 used when all the characters in the class have values in the range 0-255,
2859 and either the matching is caseful, or the characters are in the range
2860 0-127 when UTF-8 processing is enabled. The only difference between
2861 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2862 encountered.
2863
2864 First, look past the end of the item to see if there is repeat information
2865 following. Then obey similar code to character type repeats - written out
2866 again for speed. */
2867
2868 case OP_NCLASS:
2869 case OP_CLASS:
2870 {
2871 /* The data variable is saved across frames, so the byte map needs to
2872 be stored there. */
2873 #define BYTE_MAP ((pcre_uint8 *)data)
2874 data = ecode + 1; /* Save for matching */
2875 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2876
2877 switch (*ecode)
2878 {
2879 case OP_CRSTAR:
2880 case OP_CRMINSTAR:
2881 case OP_CRPLUS:
2882 case OP_CRMINPLUS:
2883 case OP_CRQUERY:
2884 case OP_CRMINQUERY:
2885 c = *ecode++ - OP_CRSTAR;
2886 minimize = (c & 1) != 0;
2887 min = rep_min[c]; /* Pick up values from tables; */
2888 max = rep_max[c]; /* zero for max => infinity */
2889 if (max == 0) max = INT_MAX;
2890 break;
2891
2892 case OP_CRRANGE:
2893 case OP_CRMINRANGE:
2894 minimize = (*ecode == OP_CRMINRANGE);
2895 min = GET2(ecode, 1);
2896 max = GET2(ecode, 1 + IMM2_SIZE);
2897 if (max == 0) max = INT_MAX;
2898 ecode += 1 + 2 * IMM2_SIZE;
2899 break;
2900
2901 default: /* No repeat follows */
2902 min = max = 1;
2903 break;
2904 }
2905
2906 /* First, ensure the minimum number of matches are present. */
2907
2908 #ifdef SUPPORT_UTF
2909 if (utf)
2910 {
2911 for (i = 1; i <= min; i++)
2912 {
2913 if (eptr >= md->end_subject)
2914 {
2915 SCHECK_PARTIAL();
2916 RRETURN(MATCH_NOMATCH);
2917 }
2918 GETCHARINC(c, eptr);
2919 if (c > 255)
2920 {
2921 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2922 }
2923 else
2924 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2925 }
2926 }
2927 else
2928 #endif
2929 /* Not UTF mode */
2930 {
2931 for (i = 1; i <= min; i++)
2932 {
2933 if (eptr >= md->end_subject)
2934 {
2935 SCHECK_PARTIAL();
2936 RRETURN(MATCH_NOMATCH);
2937 }
2938 c = *eptr++;
2939 #ifndef COMPILE_PCRE8
2940 if (c > 255)
2941 {
2942 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2943 }
2944 else
2945 #endif
2946 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2947 }
2948 }
2949
2950 /* If max == min we can continue with the main loop without the
2951 need to recurse. */
2952
2953 if (min == max) continue;
2954
2955 /* If minimizing, keep testing the rest of the expression and advancing
2956 the pointer while it matches the class. */
2957
2958 if (minimize)
2959 {
2960 #ifdef SUPPORT_UTF
2961 if (utf)
2962 {
2963 for (fi = min;; fi++)
2964 {
2965 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2967 if (fi >= max) RRETURN(MATCH_NOMATCH);
2968 if (eptr >= md->end_subject)
2969 {
2970 SCHECK_PARTIAL();
2971 RRETURN(MATCH_NOMATCH);
2972 }
2973 GETCHARINC(c, eptr);
2974 if (c > 255)
2975 {
2976 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2977 }
2978 else
2979 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2980 }
2981 }
2982 else
2983 #endif
2984 /* Not UTF mode */
2985 {
2986 for (fi = min;; fi++)
2987 {
2988 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2990 if (fi >= max) RRETURN(MATCH_NOMATCH);
2991 if (eptr >= md->end_subject)
2992 {
2993 SCHECK_PARTIAL();
2994 RRETURN(MATCH_NOMATCH);
2995 }
2996 c = *eptr++;
2997 #ifndef COMPILE_PCRE8
2998 if (c > 255)
2999 {
3000 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3001 }
3002 else
3003 #endif
3004 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3005 }
3006 }
3007 /* Control never gets here */
3008 }
3009
3010 /* If maximizing, find the longest possible run, then work backwards. */
3011
3012 else
3013 {
3014 pp = eptr;
3015
3016 #ifdef SUPPORT_UTF
3017 if (utf)
3018 {
3019 for (i = min; i < max; i++)
3020 {
3021 int len = 1;
3022 if (eptr >= md->end_subject)
3023 {
3024 SCHECK_PARTIAL();
3025 break;
3026 }
3027 GETCHARLEN(c, eptr, len);
3028 if (c > 255)
3029 {
3030 if (op == OP_CLASS) break;
3031 }
3032 else
3033 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3034 eptr += len;
3035 }
3036 for (;;)
3037 {
3038 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3039 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3040 if (eptr-- == pp) break; /* Stop if tried at original pos */
3041 BACKCHAR(eptr);
3042 }
3043 }
3044 else
3045 #endif
3046 /* Not UTF mode */
3047 {
3048 for (i = min; i < max; i++)
3049 {
3050 if (eptr >= md->end_subject)
3051 {
3052 SCHECK_PARTIAL();
3053 break;
3054 }
3055 c = *eptr;
3056 #ifndef COMPILE_PCRE8
3057 if (c > 255)
3058 {
3059 if (op == OP_CLASS) break;
3060 }
3061 else
3062 #endif
3063 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3064 eptr++;
3065 }
3066 while (eptr >= pp)
3067 {
3068 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3070 eptr--;
3071 }
3072 }
3073
3074 RRETURN(MATCH_NOMATCH);
3075 }
3076 #undef BYTE_MAP
3077 }
3078 /* Control never gets here */
3079
3080
3081 /* Match an extended character class. This opcode is encountered only
3082 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3083 mode, because Unicode properties are supported in non-UTF-8 mode. */
3084
3085 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3086 case OP_XCLASS:
3087 {
3088 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3089 ecode += GET(ecode, 1); /* Advance past the item */
3090
3091 switch (*ecode)
3092 {
3093 case OP_CRSTAR:
3094 case OP_CRMINSTAR:
3095 case OP_CRPLUS:
3096 case OP_CRMINPLUS:
3097 case OP_CRQUERY:
3098 case OP_CRMINQUERY:
3099 c = *ecode++ - OP_CRSTAR;
3100 minimize = (c & 1) != 0;
3101 min = rep_min[c]; /* Pick up values from tables; */
3102 max = rep_max[c]; /* zero for max => infinity */
3103 if (max == 0) max = INT_MAX;
3104 break;
3105
3106 case OP_CRRANGE:
3107 case OP_CRMINRANGE:
3108 minimize = (*ecode == OP_CRMINRANGE);
3109 min = GET2(ecode, 1);
3110 max = GET2(ecode, 1 + IMM2_SIZE);
3111 if (max == 0) max = INT_MAX;
3112 ecode += 1 + 2 * IMM2_SIZE;
3113 break;
3114
3115 default: /* No repeat follows */
3116 min = max = 1;
3117 break;
3118 }
3119
3120 /* First, ensure the minimum number of matches are present. */
3121
3122 for (i = 1; i <= min; i++)
3123 {
3124 if (eptr >= md->end_subject)
3125 {
3126 SCHECK_PARTIAL();
3127 RRETURN(MATCH_NOMATCH);
3128 }
3129 GETCHARINCTEST(c, eptr);
3130 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3131 }
3132
3133 /* If max == min we can continue with the main loop without the
3134 need to recurse. */
3135
3136 if (min == max) continue;
3137
3138 /* If minimizing, keep testing the rest of the expression and advancing
3139 the pointer while it matches the class. */
3140
3141 if (minimize)
3142 {
3143 for (fi = min;; fi++)
3144 {
3145 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3146 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3147 if (fi >= max) RRETURN(MATCH_NOMATCH);
3148 if (eptr >= md->end_subject)
3149 {
3150 SCHECK_PARTIAL();
3151 RRETURN(MATCH_NOMATCH);
3152 }
3153 GETCHARINCTEST(c, eptr);
3154 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3155 }
3156 /* Control never gets here */
3157 }
3158
3159 /* If maximizing, find the longest possible run, then work backwards. */
3160
3161 else
3162 {
3163 pp = eptr;
3164 for (i = min; i < max; i++)
3165 {
3166 int len = 1;
3167 if (eptr >= md->end_subject)
3168 {
3169 SCHECK_PARTIAL();
3170 break;
3171 }
3172 #ifdef SUPPORT_UTF
3173 GETCHARLENTEST(c, eptr, len);
3174 #else
3175 c = *eptr;
3176 #endif
3177 if (!PRIV(xclass)(c, data, utf)) break;
3178 eptr += len;
3179 }
3180 for(;;)
3181 {
3182 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3183 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3184 if (eptr-- == pp) break; /* Stop if tried at original pos */
3185 #ifdef SUPPORT_UTF
3186 if (utf) BACKCHAR(eptr);
3187 #endif
3188 }
3189 RRETURN(MATCH_NOMATCH);
3190 }
3191
3192 /* Control never gets here */
3193 }
3194 #endif /* End of XCLASS */
3195
3196 /* Match a single character, casefully */
3197
3198 case OP_CHAR:
3199 #ifdef SUPPORT_UTF
3200 if (utf)
3201 {
3202 length = 1;
3203 ecode++;
3204 GETCHARLEN(fc, ecode, length);
3205 if (length > md->end_subject - eptr)
3206 {
3207 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3208 RRETURN(MATCH_NOMATCH);
3209 }
3210 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3211 }
3212 else
3213 #endif
3214 /* Not UTF mode */
3215 {
3216 if (md->end_subject - eptr < 1)
3217 {
3218 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3219 RRETURN(MATCH_NOMATCH);
3220 }
3221 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3222 ecode += 2;
3223 }
3224 break;
3225
3226 /* Match a single character, caselessly. If we are at the end of the
3227 subject, give up immediately. */
3228
3229 case OP_CHARI:
3230 if (eptr >= md->end_subject)
3231 {
3232 SCHECK_PARTIAL();
3233 RRETURN(MATCH_NOMATCH);
3234 }
3235
3236 #ifdef SUPPORT_UTF
3237 if (utf)
3238 {
3239 length = 1;
3240 ecode++;
3241 GETCHARLEN(fc, ecode, length);
3242
3243 /* If the pattern character's value is < 128, we have only one byte, and
3244 we know that its other case must also be one byte long, so we can use the
3245 fast lookup table. We know that there is at least one byte left in the
3246 subject. */
3247
3248 if (fc < 128)
3249 {
3250 pcre_uint32 cc = RAWUCHAR(eptr);
3251 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3252 ecode++;
3253 eptr++;
3254 }
3255
3256 /* Otherwise we must pick up the subject character. Note that we cannot
3257 use the value of "length" to check for sufficient bytes left, because the
3258 other case of the character may have more or fewer bytes. */
3259
3260 else
3261 {
3262 pcre_uint32 dc;
3263 GETCHARINC(dc, eptr);
3264 ecode += length;
3265
3266 /* If we have Unicode property support, we can use it to test the other
3267 case of the character, if there is one. */
3268
3269 if (fc != dc)
3270 {
3271 #ifdef SUPPORT_UCP
3272 if (dc != UCD_OTHERCASE(fc))
3273 #endif
3274 RRETURN(MATCH_NOMATCH);
3275 }
3276 }
3277 }
3278 else
3279 #endif /* SUPPORT_UTF */
3280
3281 /* Not UTF mode */
3282 {
3283 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3284 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3285 eptr++;
3286 ecode += 2;
3287 }
3288 break;
3289
3290 /* Match a single character repeatedly. */
3291
3292 case OP_EXACT:
3293 case OP_EXACTI:
3294 min = max = GET2(ecode, 1);
3295 ecode += 1 + IMM2_SIZE;
3296 goto REPEATCHAR;
3297
3298 case OP_POSUPTO:
3299 case OP_POSUPTOI:
3300 possessive = TRUE;
3301 /* Fall through */
3302
3303 case OP_UPTO:
3304 case OP_UPTOI:
3305 case OP_MINUPTO:
3306 case OP_MINUPTOI:
3307 min = 0;
3308 max = GET2(ecode, 1);
3309 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3310 ecode += 1 + IMM2_SIZE;
3311 goto REPEATCHAR;
3312
3313 case OP_POSSTAR:
3314 case OP_POSSTARI:
3315 possessive = TRUE;
3316 min = 0;
3317 max = INT_MAX;
3318 ecode++;
3319 goto REPEATCHAR;
3320
3321 case OP_POSPLUS:
3322 case OP_POSPLUSI:
3323 possessive = TRUE;
3324 min = 1;
3325 max = INT_MAX;
3326 ecode++;
3327 goto REPEATCHAR;
3328
3329 case OP_POSQUERY:
3330 case OP_POSQUERYI:
3331 possessive = TRUE;
3332 min = 0;
3333 max = 1;
3334 ecode++;
3335 goto REPEATCHAR;
3336
3337 case OP_STAR:
3338 case OP_STARI:
3339 case OP_MINSTAR:
3340 case OP_MINSTARI:
3341 case OP_PLUS:
3342 case OP_PLUSI:
3343 case OP_MINPLUS:
3344 case OP_MINPLUSI:
3345 case OP_QUERY:
3346 case OP_QUERYI:
3347 case OP_MINQUERY:
3348 case OP_MINQUERYI:
3349 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3350 minimize = (c & 1) != 0;
3351 min = rep_min[c]; /* Pick up values from tables; */
3352 max = rep_max[c]; /* zero for max => infinity */
3353 if (max == 0) max = INT_MAX;
3354
3355 /* Common code for all repeated single-character matches. */
3356
3357 REPEATCHAR:
3358 #ifdef SUPPORT_UTF
3359 if (utf)
3360 {
3361 length = 1;
3362 charptr = ecode;
3363 GETCHARLEN(fc, ecode, length);
3364 ecode += length;
3365
3366 /* Handle multibyte character matching specially here. There is
3367 support for caseless matching if UCP support is present. */
3368
3369 if (length > 1)
3370 {
3371 #ifdef SUPPORT_UCP
3372 pcre_uint32 othercase;
3373 if (op >= OP_STARI && /* Caseless */
3374 (othercase = UCD_OTHERCASE(fc)) != fc)
3375 oclength = PRIV(ord2utf)(othercase, occhars);
3376 else oclength = 0;
3377 #endif /* SUPPORT_UCP */
3378
3379 for (i = 1; i <= min; i++)
3380 {
3381 if (eptr <= md->end_subject - length &&
3382 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3383 #ifdef SUPPORT_UCP
3384 else if (oclength > 0 &&
3385 eptr <= md->end_subject - oclength &&
3386 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3387 #endif /* SUPPORT_UCP */
3388 else
3389 {
3390 CHECK_PARTIAL();
3391 RRETURN(MATCH_NOMATCH);
3392 }
3393 }
3394
3395 if (min == max) continue;
3396
3397 if (minimize)
3398 {
3399 for (fi = min;; fi++)
3400 {
3401 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 if (fi >= max) RRETURN(MATCH_NOMATCH);
3404 if (eptr <= md->end_subject - length &&
3405 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3406 #ifdef SUPPORT_UCP
3407 else if (oclength > 0 &&
3408 eptr <= md->end_subject - oclength &&
3409 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3410 #endif /* SUPPORT_UCP */
3411 else
3412 {
3413 CHECK_PARTIAL();
3414 RRETURN(MATCH_NOMATCH);
3415 }
3416 }
3417 /* Control never gets here */
3418 }
3419
3420 else /* Maximize */
3421 {
3422 pp = eptr;
3423 for (i = min; i < max; i++)
3424 {
3425 if (eptr <= md->end_subject - length &&
3426 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3427 #ifdef SUPPORT_UCP
3428 else if (oclength > 0 &&
3429 eptr <= md->end_subject - oclength &&
3430 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3431 #endif /* SUPPORT_UCP */
3432 else
3433 {
3434 CHECK_PARTIAL();
3435 break;
3436 }
3437 }
3438
3439 if (possessive) continue;
3440
3441 for(;;)
3442 {
3443 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3446 #ifdef SUPPORT_UCP
3447 eptr--;
3448 BACKCHAR(eptr);
3449 #else /* without SUPPORT_UCP */
3450 eptr -= length;
3451 #endif /* SUPPORT_UCP */
3452 }
3453 }
3454 /* Control never gets here */
3455 }
3456
3457 /* If the length of a UTF-8 character is 1, we fall through here, and
3458 obey the code as for non-UTF-8 characters below, though in this case the
3459 value of fc will always be < 128. */
3460 }
3461 else
3462 #endif /* SUPPORT_UTF */
3463 /* When not in UTF-8 mode, load a single-byte character. */
3464 fc = *ecode++;
3465
3466 /* The value of fc at this point is always one character, though we may
3467 or may not be in UTF mode. The code is duplicated for the caseless and
3468 caseful cases, for speed, since matching characters is likely to be quite
3469 common. First, ensure the minimum number of matches are present. If min =
3470 max, continue at the same level without recursing. Otherwise, if
3471 minimizing, keep trying the rest of the expression and advancing one
3472 matching character if failing, up to the maximum. Alternatively, if
3473 maximizing, find the maximum number of characters and work backwards. */
3474
3475 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3476 max, (char *)eptr));
3477
3478 if (op >= OP_STARI) /* Caseless */
3479 {
3480 #ifdef COMPILE_PCRE8
3481 /* fc must be < 128 if UTF is enabled. */
3482 foc = md->fcc[fc];
3483 #else
3484 #ifdef SUPPORT_UTF
3485 #ifdef SUPPORT_UCP
3486 if (utf && fc > 127)
3487 foc = UCD_OTHERCASE(fc);
3488 #else
3489 if (utf && fc > 127)
3490 foc = fc;
3491 #endif /* SUPPORT_UCP */
3492 else
3493 #endif /* SUPPORT_UTF */
3494 foc = TABLE_GET(fc, md->fcc, fc);
3495 #endif /* COMPILE_PCRE8 */
3496
3497 for (i = 1; i <= min; i++)
3498 {
3499 pcre_uint32 cc; /* Faster than pcre_uchar */
3500 if (eptr >= md->end_subject)
3501 {
3502 SCHECK_PARTIAL();
3503 RRETURN(MATCH_NOMATCH);
3504 }
3505 cc = RAWUCHARTEST(eptr);
3506 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3507 eptr++;
3508 }
3509 if (min == max) continue;
3510 if (minimize)
3511 {
3512 for (fi = min;; fi++)
3513 {
3514 pcre_uint32 cc; /* Faster than pcre_uchar */
3515 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3517 if (fi >= max) RRETURN(MATCH_NOMATCH);
3518 if (eptr >= md->end_subject)
3519 {
3520 SCHECK_PARTIAL();
3521 RRETURN(MATCH_NOMATCH);
3522 }
3523 cc = RAWUCHARTEST(eptr);
3524 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3525 eptr++;
3526 }
3527 /* Control never gets here */
3528 }
3529 else /* Maximize */
3530 {
3531 pp = eptr;
3532 for (i = min; i < max; i++)
3533 {
3534 pcre_uint32 cc; /* Faster than pcre_uchar */
3535 if (eptr >= md->end_subject)
3536 {
3537 SCHECK_PARTIAL();
3538 break;
3539 }
3540 cc = RAWUCHARTEST(eptr);
3541 if (fc != cc && foc != cc) break;
3542 eptr++;
3543 }
3544
3545 if (possessive) continue;
3546
3547 while (eptr >= pp)
3548 {
3549 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3550 eptr--;
3551 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3552 }
3553 RRETURN(MATCH_NOMATCH);
3554 }
3555 /* Control never gets here */
3556 }
3557
3558 /* Caseful comparisons (includes all multi-byte characters) */
3559
3560 else
3561 {
3562 for (i = 1; i <= min; i++)
3563 {
3564 if (eptr >= md->end_subject)
3565 {
3566 SCHECK_PARTIAL();
3567 RRETURN(MATCH_NOMATCH);
3568 }
3569 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3570 }
3571
3572 if (min == max) continue;
3573
3574 if (minimize)
3575 {
3576 for (fi = min;; fi++)
3577 {
3578 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3579 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3580 if (fi >= max) RRETURN(MATCH_NOMATCH);
3581 if (eptr >= md->end_subject)
3582 {
3583 SCHECK_PARTIAL();
3584 RRETURN(MATCH_NOMATCH);
3585 }
3586 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3587 }
3588 /* Control never gets here */
3589 }
3590 else /* Maximize */
3591 {
3592 pp = eptr;
3593 for (i = min; i < max; i++)
3594 {
3595 if (eptr >= md->end_subject)
3596 {
3597 SCHECK_PARTIAL();
3598 break;
3599 }
3600 if (fc != RAWUCHARTEST(eptr)) break;
3601 eptr++;
3602 }
3603 if (possessive) continue;
3604
3605 while (eptr >= pp)
3606 {
3607 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3608 eptr--;
3609 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3610 }
3611 RRETURN(MATCH_NOMATCH);
3612 }
3613 }
3614 /* Control never gets here */
3615
3616 /* Match a negated single one-byte character. The character we are
3617 checking can be multibyte. */
3618
3619 case OP_NOT:
3620 case OP_NOTI:
3621 if (eptr >= md->end_subject)
3622 {
3623 SCHECK_PARTIAL();
3624 RRETURN(MATCH_NOMATCH);
3625 }
3626 #ifdef SUPPORT_UTF
3627 if (utf)
3628 {
3629 register pcre_uint32 ch, och;
3630
3631 ecode++;
3632 GETCHARINC(ch, ecode);
3633 GETCHARINC(c, eptr);
3634
3635 if (op == OP_NOT)
3636 {
3637 if (ch == c) RRETURN(MATCH_NOMATCH);
3638 }
3639 else
3640 {
3641 #ifdef SUPPORT_UCP
3642 if (ch > 127)
3643 och = UCD_OTHERCASE(ch);
3644 #else
3645 if (ch > 127)
3646 och = ch;
3647 #endif /* SUPPORT_UCP */
3648 else
3649 och = TABLE_GET(ch, md->fcc, ch);
3650 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3651 }
3652 }
3653 else
3654 #endif
3655 {
3656 register pcre_uint32 ch = ecode[1];
3657 c = *eptr++;
3658 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3659 RRETURN(MATCH_NOMATCH);
3660 ecode += 2;
3661 }
3662 break;
3663
3664 /* Match a negated single one-byte character repeatedly. This is almost a
3665 repeat of the code for a repeated single character, but I haven't found a
3666 nice way of commoning these up that doesn't require a test of the
3667 positive/negative option for each character match. Maybe that wouldn't add
3668 very much to the time taken, but character matching *is* what this is all
3669 about... */
3670
3671 case OP_NOTEXACT:
3672 case OP_NOTEXACTI:
3673 min = max = GET2(ecode, 1);
3674 ecode += 1 + IMM2_SIZE;
3675 goto REPEATNOTCHAR;
3676
3677 case OP_NOTUPTO:
3678 case OP_NOTUPTOI:
3679 case OP_NOTMINUPTO:
3680 case OP_NOTMINUPTOI:
3681 min = 0;
3682 max = GET2(ecode, 1);
3683 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3684 ecode += 1 + IMM2_SIZE;
3685 goto REPEATNOTCHAR;
3686
3687 case OP_NOTPOSSTAR:
3688 case OP_NOTPOSSTARI:
3689 possessive = TRUE;
3690 min = 0;
3691 max = INT_MAX;
3692 ecode++;
3693 goto REPEATNOTCHAR;
3694
3695 case OP_NOTPOSPLUS:
3696 case OP_NOTPOSPLUSI:
3697 possessive = TRUE;
3698 min = 1;
3699 max = INT_MAX;
3700 ecode++;
3701 goto REPEATNOTCHAR;
3702
3703 case OP_NOTPOSQUERY:
3704 case OP_NOTPOSQUERYI:
3705 possessive = TRUE;
3706 min = 0;
3707 max = 1;
3708 ecode++;
3709 goto REPEATNOTCHAR;
3710
3711 case OP_NOTPOSUPTO:
3712 case OP_NOTPOSUPTOI:
3713 possessive = TRUE;
3714 min = 0;
3715 max = GET2(ecode, 1);
3716 ecode += 1 + IMM2_SIZE;
3717 goto REPEATNOTCHAR;
3718
3719 case OP_NOTSTAR:
3720 case OP_NOTSTARI:
3721 case OP_NOTMINSTAR:
3722 case OP_NOTMINSTARI:
3723 case OP_NOTPLUS:
3724 case OP_NOTPLUSI:
3725 case OP_NOTMINPLUS:
3726 case OP_NOTMINPLUSI:
3727 case OP_NOTQUERY:
3728 case OP_NOTQUERYI:
3729 case OP_NOTMINQUERY:
3730 case OP_NOTMINQUERYI:
3731 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3732 minimize = (c & 1) != 0;
3733 min = rep_min[c]; /* Pick up values from tables; */
3734 max = rep_max[c]; /* zero for max => infinity */
3735 if (max == 0) max = INT_MAX;
3736
3737 /* Common code for all repeated single-byte matches. */
3738
3739 REPEATNOTCHAR:
3740 GETCHARINCTEST(fc, ecode);
3741
3742 /* The code is duplicated for the caseless and caseful cases, for speed,
3743 since matching characters is likely to be quite common. First, ensure the
3744 minimum number of matches are present. If min = max, continue at the same
3745 level without recursing. Otherwise, if minimizing, keep trying the rest of
3746 the expression and advancing one matching character if failing, up to the
3747 maximum. Alternatively, if maximizing, find the maximum number of
3748 characters and work backwards. */
3749
3750 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3751 max, (char *)eptr));
3752
3753 if (op >= OP_NOTSTARI) /* Caseless */
3754 {
3755 #ifdef SUPPORT_UTF
3756 #ifdef SUPPORT_UCP
3757 if (utf && fc > 127)
3758 foc = UCD_OTHERCASE(fc);
3759 #else
3760 if (utf && fc > 127)
3761 foc = fc;
3762 #endif /* SUPPORT_UCP */
3763 else
3764 #endif /* SUPPORT_UTF */
3765 foc = TABLE_GET(fc, md->fcc, fc);
3766
3767 #ifdef SUPPORT_UTF
3768 if (utf)
3769 {
3770 register pcre_uint32 d;
3771 for (i = 1; i <= min; i++)
3772 {
3773 if (eptr >= md->end_subject)
3774 {
3775 SCHECK_PARTIAL();
3776 RRETURN(MATCH_NOMATCH);
3777 }
3778 GETCHARINC(d, eptr);
3779 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3780 }
3781 }
3782 else
3783 #endif
3784 /* Not UTF mode */
3785 {
3786 for (i = 1; i <= min; i++)
3787 {
3788 if (eptr >= md->end_subject)
3789 {
3790 SCHECK_PARTIAL();
3791 RRETURN(MATCH_NOMATCH);
3792 }
3793 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3794 eptr++;
3795 }
3796 }
3797
3798 if (min == max) continue;
3799
3800 if (minimize)
3801 {
3802 #ifdef SUPPORT_UTF
3803 if (utf)
3804 {
3805 register pcre_uint32 d;
3806 for (fi = min;; fi++)
3807 {
3808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3810 if (fi >= max) RRETURN(MATCH_NOMATCH);
3811 if (eptr >= md->end_subject)
3812 {
3813 SCHECK_PARTIAL();
3814 RRETURN(MATCH_NOMATCH);
3815 }
3816 GETCHARINC(d, eptr);
3817 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3818 }
3819 }
3820 else
3821 #endif
3822 /* Not UTF mode */
3823 {
3824 for (fi = min;; fi++)
3825 {
3826 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3828 if (fi >= max) RRETURN(MATCH_NOMATCH);
3829 if (eptr >= md->end_subject)
3830 {
3831 SCHECK_PARTIAL();
3832 RRETURN(MATCH_NOMATCH);
3833 }
3834 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3835 eptr++;
3836 }
3837 }
3838 /* Control never gets here */
3839 }
3840
3841 /* Maximize case */
3842
3843 else
3844 {
3845 pp = eptr;
3846
3847 #ifdef SUPPORT_UTF
3848 if (utf)
3849 {
3850 register pcre_uint32 d;
3851 for (i = min; i < max; i++)
3852 {
3853 int len = 1;
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 break;
3858 }
3859 GETCHARLEN(d, eptr, len);
3860 if (fc == d || (unsigned int)foc == d) break;
3861 eptr += len;
3862 }
3863 if (possessive) continue;
3864 for(;;)
3865 {
3866 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3868 if (eptr-- == pp) break; /* Stop if tried at original pos */
3869 BACKCHAR(eptr);
3870 }
3871 }
3872 else
3873 #endif
3874 /* Not UTF mode */
3875 {
3876 for (i = min; i < max; i++)
3877 {
3878 if (eptr >= md->end_subject)
3879 {
3880 SCHECK_PARTIAL();
3881 break;
3882 }
3883 if (fc == *eptr || foc == *eptr) break;
3884 eptr++;
3885 }
3886 if (possessive) continue;
3887 while (eptr >= pp)
3888 {
3889 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3890 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3891 eptr--;
3892 }
3893 }
3894
3895 RRETURN(MATCH_NOMATCH);
3896 }
3897 /* Control never gets here */
3898 }
3899
3900 /* Caseful comparisons */
3901
3902 else
3903 {
3904 #ifdef SUPPORT_UTF
3905 if (utf)
3906 {
3907 register pcre_uint32 d;
3908 for (i = 1; i <= min; i++)
3909 {
3910 if (eptr >= md->end_subject)
3911 {
3912 SCHECK_PARTIAL();
3913 RRETURN(MATCH_NOMATCH);
3914 }
3915 GETCHARINC(d, eptr);
3916 if (fc == d) RRETURN(MATCH_NOMATCH);
3917 }
3918 }
3919 else
3920 #endif
3921 /* Not UTF mode */
3922 {
3923 for (i = 1; i <= min; i++)
3924 {
3925 if (eptr >= md->end_subject)
3926 {
3927 SCHECK_PARTIAL();
3928 RRETURN(MATCH_NOMATCH);
3929 }
3930 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3931 }
3932 }
3933
3934 if (min == max) continue;
3935
3936 if (minimize)
3937 {
3938 #ifdef SUPPORT_UTF
3939 if (utf)
3940 {
3941 register pcre_uint32 d;
3942 for (fi = min;; fi++)
3943 {
3944 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3946 if (fi >= max) RRETURN(MATCH_NOMATCH);
3947 if (eptr >= md->end_subject)
3948 {
3949 SCHECK_PARTIAL();
3950 RRETURN(MATCH_NOMATCH);
3951 }
3952 GETCHARINC(d, eptr);
3953 if (fc == d) RRETURN(MATCH_NOMATCH);
3954 }
3955 }
3956 else
3957 #endif
3958 /* Not UTF mode */
3959 {
3960 for (fi = min;; fi++)
3961 {
3962 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3964 if (fi >= max) RRETURN(MATCH_NOMATCH);
3965 if (eptr >= md->end_subject)
3966 {
3967 SCHECK_PARTIAL();
3968 RRETURN(MATCH_NOMATCH);
3969 }
3970 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3971 }
3972 }
3973 /* Control never gets here */
3974 }
3975
3976 /* Maximize case */
3977
3978 else
3979 {
3980 pp = eptr;
3981
3982 #ifdef SUPPORT_UTF
3983 if (utf)
3984 {
3985 register pcre_uint32 d;
3986 for (i = min; i < max; i++)
3987 {
3988 int len = 1;
3989 if (eptr >= md->end_subject)
3990 {
3991 SCHECK_PARTIAL();
3992 break;
3993 }
3994 GETCHARLEN(d, eptr, len);
3995 if (fc == d) break;
3996 eptr += len;
3997 }
3998 if (possessive) continue;
3999 for(;;)
4000 {
4001 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4002 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4003 if (eptr-- == pp) break; /* Stop if tried at original pos */
4004 BACKCHAR(eptr);
4005 }
4006 }
4007 else
4008 #endif
4009 /* Not UTF mode */
4010 {
4011 for (i = min; i < max; i++)
4012 {
4013 if (eptr >= md->end_subject)
4014 {
4015 SCHECK_PARTIAL();
4016 break;
4017 }
4018 if (fc == *eptr) break;
4019 eptr++;
4020 }
4021 if (possessive) continue;
4022 while (eptr >= pp)
4023 {
4024 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4026 eptr--;
4027 }
4028 }
4029
4030 RRETURN(MATCH_NOMATCH);
4031 }
4032 }
4033 /* Control never gets here */
4034
4035 /* Match a single character type repeatedly; several different opcodes
4036 share code. This is very similar to the code for single characters, but we
4037 repeat it in the interests of efficiency. */
4038
4039 case OP_TYPEEXACT:
4040 min = max = GET2(ecode, 1);
4041 minimize = TRUE;
4042 ecode += 1 + IMM2_SIZE;
4043 goto REPEATTYPE;
4044
4045 case OP_TYPEUPTO:
4046 case OP_TYPEMINUPTO:
4047 min = 0;
4048 max = GET2(ecode, 1);
4049 minimize = *ecode == OP_TYPEMINUPTO;
4050 ecode += 1 + IMM2_SIZE;
4051 goto REPEATTYPE;
4052
4053 case OP_TYPEPOSSTAR:
4054 possessive = TRUE;
4055 min = 0;
4056 max = INT_MAX;
4057 ecode++;
4058 goto REPEATTYPE;
4059
4060 case OP_TYPEPOSPLUS:
4061 possessive = TRUE;
4062 min = 1;
4063 max = INT_MAX;
4064 ecode++;
4065 goto REPEATTYPE;
4066
4067 case OP_TYPEPOSQUERY:
4068 possessive = TRUE;
4069 min = 0;
4070 max = 1;
4071 ecode++;
4072 goto REPEATTYPE;
4073
4074 case OP_TYPEPOSUPTO:
4075 possessive = TRUE;
4076 min = 0;
4077 max = GET2(ecode, 1);
4078 ecode += 1 + IMM2_SIZE;
4079 goto REPEATTYPE;
4080
4081 case OP_TYPESTAR:
4082 case OP_TYPEMINSTAR:
4083 case OP_TYPEPLUS:
4084 case OP_TYPEMINPLUS:
4085 case OP_TYPEQUERY:
4086 case OP_TYPEMINQUERY:
4087 c = *ecode++ - OP_TYPESTAR;
4088 minimize = (c & 1) != 0;
4089 min = rep_min[c]; /* Pick up values from tables; */
4090 max = rep_max[c]; /* zero for max => infinity */
4091 if (max == 0) max = INT_MAX;
4092
4093 /* Common code for all repeated single character type matches. Note that
4094 in UTF-8 mode, '.' matches a character of any length, but for the other
4095 character types, the valid characters are all one-byte long. */
4096
4097 REPEATTYPE:
4098 ctype = *ecode++; /* Code for the character type */
4099
4100 #ifdef SUPPORT_UCP
4101 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4102 {
4103 prop_fail_result = ctype == OP_NOTPROP;
4104 prop_type = *ecode++;
4105 prop_value = *ecode++;
4106 }
4107 else prop_type = -1;
4108 #endif
4109
4110 /* First, ensure the minimum number of matches are present. Use inline
4111 code for maximizing the speed, and do the type test once at the start
4112 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4113 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4114 and single-bytes. */
4115
4116 if (min > 0)
4117 {
4118 #ifdef SUPPORT_UCP
4119 if (prop_type >= 0)
4120 {
4121 switch(prop_type)
4122 {
4123 case PT_ANY:
4124 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4125 for (i = 1; i <= min; i++)
4126 {
4127 if (eptr >= md->end_subject)
4128 {
4129 SCHECK_PARTIAL();
4130 RRETURN(MATCH_NOMATCH);
4131 }
4132 GETCHARINCTEST(c, eptr);
4133 }
4134 break;
4135
4136 case PT_LAMP:
4137 for (i = 1; i <= min; i++)
4138 {
4139 int chartype;
4140 if (eptr >= md->end_subject)
4141 {
4142 SCHECK_PARTIAL();
4143 RRETURN(MATCH_NOMATCH);
4144 }
4145 GETCHARINCTEST(c, eptr);
4146 chartype = UCD_CHARTYPE(c);
4147 if ((chartype == ucp_Lu ||
4148 chartype == ucp_Ll ||
4149 chartype == ucp_Lt) == prop_fail_result)
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 break;
4153
4154 case PT_GC:
4155 for (i = 1; i <= min; i++)
4156 {
4157 if (eptr >= md->end_subject)
4158 {
4159 SCHECK_PARTIAL();
4160 RRETURN(MATCH_NOMATCH);
4161 }
4162 GETCHARINCTEST(c, eptr);
4163 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4164 RRETURN(MATCH_NOMATCH);
4165 }
4166 break;
4167
4168 case PT_PC:
4169 for (i = 1; i <= min; i++)
4170 {
4171 if (eptr >= md->end_subject)
4172 {
4173 SCHECK_PARTIAL();
4174 RRETURN(MATCH_NOMATCH);
4175 }
4176 GETCHARINCTEST(c, eptr);
4177 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 break;
4181
4182 case PT_SC:
4183 for (i = 1; i <= min; i++)
4184 {
4185 if (eptr >= md->end_subject)
4186 {
4187 SCHECK_PARTIAL();
4188 RRETURN(MATCH_NOMATCH);
4189 }
4190 GETCHARINCTEST(c, eptr);
4191 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 break;
4195
4196 case PT_ALNUM:
4197 for (i = 1; i <= min; i++)
4198 {
4199 int category;
4200 if (eptr >= md->end_subject)
4201 {
4202 SCHECK_PARTIAL();
4203 RRETURN(MATCH_NOMATCH);
4204 }
4205 GETCHARINCTEST(c, eptr);
4206 category = UCD_CATEGORY(c);
4207 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4208 RRETURN(MATCH_NOMATCH);
4209 }
4210 break;
4211
4212 case PT_SPACE: /* Perl space */
4213 for (i = 1; i <= min; i++)
4214 {
4215 if (eptr >= md->end_subject)
4216 {
4217 SCHECK_PARTIAL();
4218 RRETURN(MATCH_NOMATCH);
4219 }
4220 GETCHARINCTEST(c, eptr);
4221 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4222 c == CHAR_FF || c == CHAR_CR)
4223 == prop_fail_result)
4224 RRETURN(MATCH_NOMATCH);
4225 }
4226 break;
4227
4228 case PT_PXSPACE: /* POSIX space */
4229 for (i = 1; i <= min; i++)
4230 {
4231 if (eptr >= md->end_subject)
4232 {
4233 SCHECK_PARTIAL();
4234 RRETURN(MATCH_NOMATCH);
4235 }
4236 GETCHARINCTEST(c, eptr);
4237 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4238 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4239 == prop_fail_result)
4240 RRETURN(MATCH_NOMATCH);
4241 }
4242 break;
4243
4244 case PT_WORD:
4245 for (i = 1; i <= min; i++)
4246 {
4247 int category;
4248 if (eptr >= md->end_subject)
4249 {
4250 SCHECK_PARTIAL();
4251 RRETURN(MATCH_NOMATCH);
4252 }
4253 GETCHARINCTEST(c, eptr);
4254 category = UCD_CATEGORY(c);
4255 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4256 == prop_fail_result)
4257 RRETURN(MATCH_NOMATCH);
4258 }
4259 break;
4260
4261 case PT_CLIST:
4262 for (i = 1; i <= min; i++)
4263 {
4264 const pcre_uint32 *cp;
4265 if (eptr >= md->end_subject)
4266 {
4267 SCHECK_PARTIAL();
4268 RRETURN(MATCH_NOMATCH);
4269 }
4270 GETCHARINCTEST(c, eptr);
4271 cp = PRIV(ucd_caseless_sets) + prop_value;
4272 for (;;)
4273 {
4274 if (c < *cp)
4275 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4276 if (c == *cp++)
4277 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4278 }
4279 }
4280 break;
4281
4282 case PT_UCNC:
4283 for (i = 1; i <= min; i++)
4284 {
4285 if (eptr >= md->end_subject)
4286 {
4287 SCHECK_PARTIAL();
4288 RRETURN(MATCH_NOMATCH);
4289 }
4290 GETCHARINCTEST(c, eptr);
4291 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4292 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4293 c >= 0xe000) == prop_fail_result)
4294 RRETURN(MATCH_NOMATCH);
4295 }
4296 break;
4297
4298 /* This should not occur */
4299
4300 default:
4301 RRETURN(PCRE_ERROR_INTERNAL);
4302 }
4303 }
4304
4305 /* Match extended Unicode sequences. We will get here only if the
4306 support is in the binary; otherwise a compile-time error occurs. */
4307
4308 else if (ctype == OP_EXTUNI)
4309 {
4310 for (i = 1; i <= min; i++)
4311 {
4312 if (eptr >= md->end_subject)
4313 {
4314 SCHECK_PARTIAL();
4315 RRETURN(MATCH_NOMATCH);
4316 }
4317 else
4318 {
4319 int lgb, rgb;
4320 GETCHARINCTEST(c, eptr);
4321 lgb = UCD_GRAPHBREAK(c);
4322 while (eptr < md->end_subject)
4323 {
4324 int len = 1;
4325 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4326 rgb = UCD_GRAPHBREAK(c);
4327 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4328 lgb = rgb;
4329 eptr += len;
4330 }
4331 }
4332 CHECK_PARTIAL();
4333 }
4334 }
4335
4336 else
4337 #endif /* SUPPORT_UCP */
4338
4339 /* Handle all other cases when the coding is UTF-8 */
4340
4341 #ifdef SUPPORT_UTF
4342 if (utf) switch(ctype)
4343 {
4344 case OP_ANY:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 RRETURN(MATCH_NOMATCH);
4351 }
4352 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4353 if (md->partial != 0 &&
4354 eptr + 1 >= md->end_subject &&
4355 NLBLOCK->nltype == NLTYPE_FIXED &&
4356 NLBLOCK->nllen == 2 &&
4357 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4358 {
4359 md->hitend = TRUE;
4360 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4361 }
4362 eptr++;
4363 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4364 }
4365 break;
4366
4367 case OP_ALLANY:
4368 for (i = 1; i <= min; i++)
4369 {
4370 if (eptr >= md->end_subject)
4371 {
4372 SCHECK_PARTIAL();
4373 RRETURN(MATCH_NOMATCH);
4374 }
4375 eptr++;
4376 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4377 }
4378 break;
4379
4380 case OP_ANYBYTE:
4381 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4382 eptr += min;
4383 break;
4384
4385 case OP_ANYNL:
4386 for (i = 1; i <= min; i++)
4387 {
4388 if (eptr >= md->end_subject)
4389 {
4390 SCHECK_PARTIAL();
4391 RRETURN(MATCH_NOMATCH);
4392 }
4393 GETCHARINC(c, eptr);
4394 switch(c)
4395 {
4396 default: RRETURN(MATCH_NOMATCH);
4397
4398 case CHAR_CR:
4399 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4400 break;
4401
4402 case CHAR_LF:
4403 break;
4404
4405 case CHAR_VT:
4406 case CHAR_FF:
4407 case CHAR_NEL:
4408 #ifndef EBCDIC
4409 case 0x2028:
4410 case 0x2029:
4411 #endif /* Not EBCDIC */
4412 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4413 break;
4414 }
4415 }
4416 break;
4417
4418 case OP_NOT_HSPACE:
4419 for (i = 1; i <= min; i++)
4420 {
4421 if (eptr >= md->end_subject)
4422 {
4423 SCHECK_PARTIAL();
4424 RRETURN(MATCH_NOMATCH);
4425 }
4426 GETCHARINC(c, eptr);
4427 switch(c)
4428 {
4429 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4430 default: break;
4431 }
4432 }
4433 break;
4434
4435 case OP_HSPACE:
4436 for (i = 1; i <= min; i++)
4437 {
4438 if (eptr >= md->end_subject)
4439 {
4440 SCHECK_PARTIAL();
4441 RRETURN(MATCH_NOMATCH);
4442 }
4443 GETCHARINC(c, eptr);
4444 switch(c)
4445 {
4446 HSPACE_CASES: break; /* Byte and multibyte cases */
4447 default: RRETURN(MATCH_NOMATCH);
4448 }
4449 }
4450 break;
4451
4452 case OP_NOT_VSPACE:
4453 for (i = 1; i <= min; i++)
4454 {
4455 if (eptr >= md->end_subject)
4456 {
4457 SCHECK_PARTIAL();
4458 RRETURN(MATCH_NOMATCH);
4459 }
4460 GETCHARINC(c, eptr);
4461 switch(c)
4462 {
4463 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4464 default: break;
4465 }
4466 }
4467 break;
4468
4469 case OP_VSPACE:
4470 for (i = 1; i <= min; i++)
4471 {
4472 if (eptr >= md->end_subject)
4473 {
4474 SCHECK_PARTIAL();
4475 RRETURN(MATCH_NOMATCH);
4476 }
4477 GETCHARINC(c, eptr);
4478 switch(c)
4479 {
4480 VSPACE_CASES: break;
4481 default: RRETURN(MATCH_NOMATCH);
4482 }
4483 }
4484 break;
4485
4486 case OP_NOT_DIGIT:
4487 for (i = 1; i <= min; i++)
4488 {
4489 if (eptr >= md->end_subject)
4490 {
4491 SCHECK_PARTIAL();
4492 RRETURN(MATCH_NOMATCH);
4493 }
4494 GETCHARINC(c, eptr);
4495 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4496 RRETURN(MATCH_NOMATCH);
4497 }
4498 break;
4499
4500 case OP_DIGIT:
4501 for (i = 1; i <= min; i++)
4502 {
4503 pcre_uint32 cc;
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 RRETURN(MATCH_NOMATCH);
4508 }
4509 cc = RAWUCHAR(eptr);
4510 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4511 RRETURN(MATCH_NOMATCH);
4512 eptr++;
4513 /* No need to skip more bytes - we know it's a 1-byte character */
4514 }
4515 break;
4516
4517 case OP_NOT_WHITESPACE:
4518 for (i = 1; i <= min; i++)
4519 {
4520 pcre_uint32 cc;
4521 if (eptr >= md->end_subject)
4522 {
4523 SCHECK_PARTIAL();
4524 RRETURN(MATCH_NOMATCH);
4525 }
4526 cc = RAWUCHAR(eptr);
4527 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4528 RRETURN(MATCH_NOMATCH);
4529 eptr++;
4530 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4531 }
4532 break;
4533
4534 case OP_WHITESPACE:
4535 for (i = 1; i <= min; i++)
4536 {
4537 pcre_uint32 cc;
4538 if (eptr >= md->end_subject)
4539 {
4540 SCHECK_PARTIAL();
4541 RRETURN(MATCH_NOMATCH);
4542 }
4543 cc = RAWUCHAR(eptr);
4544 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4545 RRETURN(MATCH_NOMATCH);
4546 eptr++;
4547 /* No need to skip more bytes - we know it's a 1-byte character */
4548 }
4549 break;
4550
4551 case OP_NOT_WORDCHAR:
4552 for (i = 1; i <= min; i++)
4553 {
4554 pcre_uint32 cc;
4555 if (eptr >= md->end_subject)
4556 {
4557 SCHECK_PARTIAL();
4558 RRETURN(MATCH_NOMATCH);
4559 }
4560 cc = RAWUCHAR(eptr);
4561 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4562 RRETURN(MATCH_NOMATCH);
4563 eptr++;
4564 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4565 }
4566 break;
4567
4568 case OP_WORDCHAR:
4569 for (i = 1; i <= min; i++)
4570 {
4571 pcre_uint32 cc;
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 RRETURN(MATCH_NOMATCH);
4576 }
4577 cc = RAWUCHAR(eptr);
4578 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4579 RRETURN(MATCH_NOMATCH);
4580 eptr++;
4581 /* No need to skip more bytes - we know it's a 1-byte character */
4582 }
4583 break;
4584
4585 default:
4586 RRETURN(PCRE_ERROR_INTERNAL);
4587 } /* End switch(ctype) */
4588
4589 else
4590 #endif /* SUPPORT_UTF */
4591
4592 /* Code for the non-UTF-8 case for minimum matching of operators other
4593 than OP_PROP and OP_NOTPROP. */
4594
4595 switch(ctype)
4596 {
4597 case OP_ANY:
4598 for (i = 1; i <= min; i++)
4599 {
4600 if (eptr >= md->end_subject)
4601 {
4602 SCHECK_PARTIAL();
4603 RRETURN(MATCH_NOMATCH);
4604 }
4605 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4606 if (md->partial != 0 &&
4607 eptr + 1 >= md->end_subject &&
4608 NLBLOCK->nltype == NLTYPE_FIXED &&
4609 NLBLOCK->nllen == 2 &&
4610 *eptr == NLBLOCK->nl[0])
4611 {
4612 md->hitend = TRUE;
4613 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4614 }
4615 eptr++;
4616 }
4617 break;
4618
4619 case OP_ALLANY:
4620 if (eptr > md->end_subject - min)
4621 {
4622 SCHECK_PARTIAL();
4623 RRETURN(MATCH_NOMATCH);
4624 }
4625 eptr += min;
4626 break;
4627
4628 case OP_ANYBYTE:
4629 if (eptr > md->end_subject - min)
4630 {
4631 SCHECK_PARTIAL();
4632 RRETURN(MATCH_NOMATCH);
4633 }
4634 eptr += min;
4635 break;
4636
4637 case OP_ANYNL:
4638 for (i = 1; i <= min; i++)
4639 {
4640 if (eptr >= md->end_subject)
4641 {
4642 SCHECK_PARTIAL();
4643 RRETURN(MATCH_NOMATCH);
4644 }
4645 switch(*eptr++)
4646 {
4647 default: RRETURN(MATCH_NOMATCH);
4648
4649 case CHAR_CR:
4650 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4651 break;
4652
4653 case CHAR_LF:
4654 break;
4655
4656 case CHAR_VT:
4657 case CHAR_FF:
4658 case CHAR_NEL:
4659 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4660 case 0x2028:
4661 case 0x2029:
4662 #endif
4663 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4664 break;
4665 }
4666 }
4667 break;
4668
4669 case OP_NOT_HSPACE:
4670 for (i = 1; i <= min; i++)
4671 {
4672 if (eptr >= md->end_subject)
4673 {
4674 SCHECK_PARTIAL();
4675 RRETURN(MATCH_NOMATCH);
4676 }
4677 switch(*eptr++)
4678 {
4679 default: break;
4680 HSPACE_BYTE_CASES:
4681 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4682 HSPACE_MULTIBYTE_CASES:
4683 #endif
4684 RRETURN(MATCH_NOMATCH);
4685 }
4686 }
4687 break;
4688
4689 case OP_HSPACE:
4690 for (i = 1; i <= min; i++)
4691 {
4692 if (eptr >= md->end_subject)
4693 {
4694 SCHECK_PARTIAL();
4695 RRETURN(MATCH_NOMATCH);
4696 }
4697 switch(*eptr++)
4698 {
4699 default: RRETURN(MATCH_NOMATCH);
4700 HSPACE_BYTE_CASES:
4701 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4702 HSPACE_MULTIBYTE_CASES:
4703 #endif
4704 break;
4705 }
4706 }
4707 break;
4708
4709 case OP_NOT_VSPACE:
4710 for (i = 1; i <= min; i++)
4711 {
4712 if (eptr >= md->end_subject)
4713 {
4714 SCHECK_PARTIAL();
4715 RRETURN(MATCH_NOMATCH);
4716 }
4717 switch(*eptr++)
4718 {
4719 VSPACE_BYTE_CASES:
4720 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4721 VSPACE_MULTIBYTE_CASES:
4722 #endif
4723 RRETURN(MATCH_NOMATCH);
4724 default: break;
4725 }
4726 }
4727 break;
4728
4729 case OP_VSPACE:
4730 for (i = 1; i <= min; i++)
4731 {
4732 if (eptr >= md->end_subject)
4733 {
4734 SCHECK_PARTIAL();
4735 RRETURN(MATCH_NOMATCH);
4736 }
4737 switch(*eptr++)
4738 {
4739 default: RRETURN(MATCH_NOMATCH);
4740 VSPACE_BYTE_CASES:
4741 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4742 VSPACE_MULTIBYTE_CASES:
4743 #endif
4744 break;
4745 }
4746 }
4747 break;
4748
4749 case OP_NOT_DIGIT:
4750 for (i = 1; i <= min; i++)
4751 {
4752 if (eptr >= md->end_subject)
4753 {
4754 SCHECK_PARTIAL();
4755 RRETURN(MATCH_NOMATCH);
4756 }
4757 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4758 RRETURN(MATCH_NOMATCH);
4759 eptr++;
4760 }
4761 break;
4762
4763 case OP_DIGIT:
4764 for (i = 1; i <= min; i++)
4765 {
4766 if (eptr >= md->end_subject)
4767 {
4768 SCHECK_PARTIAL();
4769 RRETURN(MATCH_NOMATCH);
4770 }
4771 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4772 RRETURN(MATCH_NOMATCH);
4773 eptr++;
4774 }
4775 break;
4776
4777 case OP_NOT_WHITESPACE:
4778 for (i = 1; i <= min; i++)
4779 {
4780 if (eptr >= md->end_subject)
4781 {
4782 SCHECK_PARTIAL();
4783 RRETURN(MATCH_NOMATCH);
4784 }
4785 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4786 RRETURN(MATCH_NOMATCH);
4787 eptr++;
4788 }
4789 break;
4790
4791 case OP_WHITESPACE:
4792 for (i = 1; i <= min; i++)
4793 {
4794 if (eptr >= md->end_subject)
4795 {
4796 SCHECK_PARTIAL();
4797 RRETURN(MATCH_NOMATCH);
4798 }
4799 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4800 RRETURN(MATCH_NOMATCH);
4801 eptr++;
4802 }
4803 break;
4804
4805 case OP_NOT_WORDCHAR:
4806 for (i = 1; i <= min; i++)
4807 {
4808 if (eptr >= md->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 RRETURN(MATCH_NOMATCH);
4812 }
4813 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4814 RRETURN(MATCH_NOMATCH);
4815 eptr++;
4816 }
4817 break;
4818
4819 case OP_WORDCHAR:
4820 for (i = 1; i <= min; i++)
4821 {
4822 if (eptr >= md->end_subject)
4823 {
4824 SCHECK_PARTIAL();
4825 RRETURN(MATCH_NOMATCH);
4826 }
4827 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4828 RRETURN(MATCH_NOMATCH);
4829 eptr++;
4830 }
4831 break;
4832
4833 default:
4834 RRETURN(PCRE_ERROR_INTERNAL);
4835 }
4836 }
4837
4838 /* If min = max, continue at the same level without recursing */
4839
4840 if (min == max) continue;
4841
4842 /* If minimizing, we have to test the rest of the pattern before each
4843 subsequent match. Again, separate the UTF-8 case for speed, and also
4844 separate the UCP cases. */
4845
4846 if (minimize)
4847 {
4848 #ifdef SUPPORT_UCP
4849 if (prop_type >= 0)
4850 {
4851 switch(prop_type)
4852 {
4853 case PT_ANY:
4854 for (fi = min;; fi++)
4855 {
4856 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4858 if (fi >= max) RRETURN(MATCH_NOMATCH);
4859 if (eptr >= md->end_subject)
4860 {
4861 SCHECK_PARTIAL();
4862 RRETURN(MATCH_NOMATCH);
4863 }
4864 GETCHARINCTEST(c, eptr);
4865 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4866 }
4867 /* Control never gets here */
4868
4869 case PT_LAMP:
4870 for (fi = min;; fi++)
4871 {
4872 int chartype;
4873 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4875 if (fi >= max) RRETURN(MATCH_NOMATCH);
4876 if (eptr >= md->end_subject)
4877 {
4878 SCHECK_PARTIAL();
4879 RRETURN(MATCH_NOMATCH);
4880 }
4881 GETCHARINCTEST(c, eptr);
4882 chartype = UCD_CHARTYPE(c);
4883 if ((chartype == ucp_Lu ||
4884 chartype == ucp_Ll ||
4885 chartype == ucp_Lt) == prop_fail_result)
4886 RRETURN(MATCH_NOMATCH);
4887 }
4888 /* Control never gets here */
4889
4890 case PT_GC:
4891 for (fi = min;; fi++)
4892 {
4893 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4895 if (fi >= max) RRETURN(MATCH_NOMATCH);
4896 if (eptr >= md->end_subject)
4897 {
4898 SCHECK_PARTIAL();
4899 RRETURN(MATCH_NOMATCH);
4900 }
4901 GETCHARINCTEST(c, eptr);
4902 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4903 RRETURN(MATCH_NOMATCH);
4904 }
4905 /* Control never gets here */
4906
4907 case PT_PC:
4908 for (fi = min;; fi++)
4909 {
4910 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4911 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4912 if (fi >= max) RRETURN(MATCH_NOMATCH);
4913 if (eptr >= md->end_subject)
4914 {
4915 SCHECK_PARTIAL();
4916 RRETURN(MATCH_NOMATCH);
4917 }
4918 GETCHARINCTEST(c, eptr);
4919 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4920 RRETURN(MATCH_NOMATCH);
4921 }
4922 /* Control never gets here */
4923
4924 case PT_SC:
4925 for (fi = min;; fi++)
4926 {
4927 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4928 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4929 if (fi >= max) RRETURN(MATCH_NOMATCH);
4930 if (eptr >= md->end_subject)
4931 {
4932 SCHECK_PARTIAL();
4933 RRETURN(MATCH_NOMATCH);
4934 }
4935 GETCHARINCTEST(c, eptr);
4936 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4937 RRETURN(MATCH_NOMATCH);
4938 }
4939 /* Control never gets here */
4940
4941 case PT_ALNUM:
4942 for (fi = min;; fi++)
4943 {
4944 int category;
4945 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4947 if (fi >= max) RRETURN(MATCH_NOMATCH);
4948 if (eptr >= md->end_subject)
4949 {
4950 SCHECK_PARTIAL();
4951 RRETURN(MATCH_NOMATCH);
4952 }
4953 GETCHARINCTEST(c, eptr);
4954 category = UCD_CATEGORY(c);
4955 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4956 RRETURN(MATCH_NOMATCH);
4957 }
4958 /* Control never gets here */
4959
4960 case PT_SPACE: /* Perl space */
4961 for (fi = min;; fi++)
4962 {
4963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4965 if (fi >= max) RRETURN(MATCH_NOMATCH);
4966 if (eptr >= md->end_subject)
4967 {
4968 SCHECK_PARTIAL();
4969 RRETURN(MATCH_NOMATCH);
4970 }
4971 GETCHARINCTEST(c, eptr);
4972 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4973 c == CHAR_FF || c == CHAR_CR)
4974 == prop_fail_result)
4975 RRETURN(MATCH_NOMATCH);
4976 }
4977 /* Control never gets here */
4978
4979 case PT_PXSPACE: /* POSIX space */
4980 for (fi = min;; fi++)
4981 {
4982 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4983 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4984 if (fi >= max) RRETURN(MATCH_NOMATCH);
4985 if (eptr >= md->end_subject)
4986 {
4987 SCHECK_PARTIAL();
4988 RRETURN(MATCH_NOMATCH);
4989 }
4990 GETCHARINCTEST(c, eptr);
4991 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4992 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4993 == prop_fail_result)
4994 RRETURN(MATCH_NOMATCH);
4995 }
4996 /* Control never gets here */
4997
4998 case PT_WORD:
4999 for (fi = min;; fi++)
5000 {
5001 int category;
5002 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5004 if (fi >= max) RRETURN(MATCH_NOMATCH);
5005 if (eptr >= md->end_subject)
5006 {
5007 SCHECK_PARTIAL();
5008 RRETURN(MATCH_NOMATCH);
5009 }
5010 GETCHARINCTEST(c, eptr);
5011 category = UCD_CATEGORY(c);
5012 if ((category == ucp_L ||
5013 category == ucp_N ||
5014 c == CHAR_UNDERSCORE)
5015 == prop_fail_result)
5016 RRETURN(MATCH_NOMATCH);
5017 }
5018 /* Control never gets here */
5019
5020 case PT_CLIST:
5021 for (fi = min;; fi++)
5022 {
5023 const pcre_uint32 *cp;
5024 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5026 if (fi >= max) RRETURN(MATCH_NOMATCH);
5027 if (eptr >= md->end_subject)
5028 {
5029 SCHECK_PARTIAL();
5030 RRETURN(MATCH_NOMATCH);
5031 }
5032 GETCHARINCTEST(c, eptr);
5033 cp = PRIV(ucd_caseless_sets) + prop_value;
5034 for (;;)
5035 {
5036 if (c < *cp)
5037 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5038 if (c == *cp++)
5039 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5040 }
5041 }
5042 /* Control never gets here */
5043
5044 case PT_UCNC:
5045 for (fi = min;; fi++)
5046 {
5047 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5049 if (fi >= max) RRETURN(MATCH_NOMATCH);
5050 if (eptr >= md->end_subject)
5051 {
5052 SCHECK_PARTIAL();
5053 RRETURN(MATCH_NOMATCH);
5054 }
5055 GETCHARINCTEST(c, eptr);
5056 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5057 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5058 c >= 0xe000) == prop_fail_result)
5059 RRETURN(MATCH_NOMATCH);
5060 }
5061 /* Control never gets here */
5062
5063 /* This should never occur */
5064 default:
5065 RRETURN(PCRE_ERROR_INTERNAL);
5066 }
5067 }
5068
5069 /* Match extended Unicode sequences. We will get here only if the
5070 support is in the binary; otherwise a compile-time error occurs. */
5071
5072 else if (ctype == OP_EXTUNI)
5073 {
5074 for (fi = min;; fi++)
5075 {
5076 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5077 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5078 if (fi >= max) RRETURN(MATCH_NOMATCH);
5079 if (eptr >= md->end_subject)
5080 {
5081 SCHECK_PARTIAL();
5082 RRETURN(MATCH_NOMATCH);
5083 }
5084 else
5085 {
5086 int lgb, rgb;
5087 GETCHARINCTEST(c, eptr);
5088 lgb = UCD_GRAPHBREAK(c);
5089 while (eptr < md->end_subject)
5090 {
5091 int len = 1;
5092 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5093 rgb = UCD_GRAPHBREAK(c);
5094 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5095 lgb = rgb;
5096 eptr += len;
5097 }
5098 }
5099 CHECK_PARTIAL();
5100 }
5101 }
5102 else
5103 #endif /* SUPPORT_UCP */
5104
5105 #ifdef SUPPORT_UTF
5106 if (utf)
5107 {
5108 for (fi = min;; fi++)
5109 {
5110 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5111 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5112 if (fi >= max) RRETURN(MATCH_NOMATCH);
5113 if (eptr >= md->end_subject)
5114 {
5115 SCHECK_PARTIAL();
5116 RRETURN(MATCH_NOMATCH);
5117 }
5118 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5119 RRETURN(MATCH_NOMATCH);
5120 GETCHARINC(c, eptr);
5121 switch(ctype)
5122 {
5123 case OP_ANY: /* This is the non-NL case */
5124 if (md->partial != 0 && /* Take care with CRLF partial */
5125 eptr >= md->end_subject &&
5126 NLBLOCK->nltype == NLTYPE_FIXED &&
5127 NLBLOCK->nllen == 2 &&
5128 c == NLBLOCK->nl[0])
5129 {
5130 md->hitend = TRUE;
5131 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5132 }
5133 break;
5134
5135 case OP_ALLANY:
5136 case OP_ANYBYTE:
5137 break;
5138
5139 case OP_ANYNL:
5140 switch(c)
5141 {
5142 default: RRETURN(MATCH_NOMATCH);
5143 case CHAR_CR:
5144 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5145 break;
5146
5147 case CHAR_LF:
5148 break;
5149
5150 case CHAR_VT:
5151 case CHAR_FF:
5152 case CHAR_NEL:
5153 #ifndef EBCDIC
5154 case 0x2028:
5155 case 0x2029:
5156 #endif /* Not EBCDIC */
5157 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5158 break;
5159 }
5160 break;
5161
5162 case OP_NOT_HSPACE:
5163 switch(c)
5164 {
5165 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5166 default: break;
5167 }
5168 break;
5169
5170 case OP_HSPACE:
5171 switch(c)
5172 {
5173 HSPACE_CASES: break;
5174 default: RRETURN(MATCH_NOMATCH);
5175 }
5176 break;
5177
5178 case OP_NOT_VSPACE:
5179 switch(c)
5180 {
5181 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5182 default: break;
5183 }
5184 break;
5185
5186 case OP_VSPACE:
5187 switch(c)
5188 {
5189 VSPACE_CASES: break;
5190 default: RRETURN(MATCH_NOMATCH);
5191 }
5192 break;
5193
5194 case OP_NOT_DIGIT:
5195 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5196 RRETURN(MATCH_NOMATCH);
5197 break;
5198
5199 case OP_DIGIT:
5200 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5201 RRETURN(MATCH_NOMATCH);
5202 break;
5203
5204 case OP_NOT_WHITESPACE:
5205 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5206 RRETURN(MATCH_NOMATCH);
5207 break;
5208
5209 case OP_WHITESPACE:
5210 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5211 RRETURN(MATCH_NOMATCH);
5212 break;
5213
5214 case OP_NOT_WORDCHAR:
5215 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5216 RRETURN(MATCH_NOMATCH);
5217 break;
5218
5219 case OP_WORDCHAR:
5220 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5221 RRETURN(MATCH_NOMATCH);
5222 break;
5223
5224 default:
5225 RRETURN(PCRE_ERROR_INTERNAL);
5226 }
5227 }
5228 }
5229 else
5230 #endif
5231 /* Not UTF mode */
5232 {
5233 for (fi = min;; fi++)
5234 {
5235 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5236 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5237 if (fi >= max) RRETURN(MATCH_NOMATCH);
5238 if (eptr >= md->end_subject)
5239 {
5240 SCHECK_PARTIAL();
5241 RRETURN(MATCH_NOMATCH);
5242 }
5243 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5244 RRETURN(MATCH_NOMATCH);
5245 c = *eptr++;
5246 switch(ctype)
5247 {
5248 case OP_ANY: /* This is the non-NL case */
5249 if (md->partial != 0 && /* Take care with CRLF partial */
5250 eptr >= md->end_subject &&
5251 NLBLOCK->nltype == NLTYPE_FIXED &&
5252 NLBLOCK->nllen == 2 &&
5253 c == NLBLOCK->nl[0])
5254 {
5255 md->hitend = TRUE;
5256 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5257 }
5258 break;
5259
5260 case OP_ALLANY:
5261 case OP_ANYBYTE:
5262 break;
5263
5264 case OP_ANYNL:
5265 switch(c)
5266 {
5267 default: RRETURN(MATCH_NOMATCH);
5268 case CHAR_CR:
5269 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5270 break;
5271
5272 case CHAR_LF:
5273 break;
5274
5275 case CHAR_VT:
5276 case CHAR_FF:
5277 case CHAR_NEL:
5278 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5279 case 0x2028:
5280 case 0x2029:
5281 #endif
5282 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5283 break;
5284 }
5285 break;
5286
5287 case OP_NOT_HSPACE:
5288 switch(c)
5289 {
5290 default: break;
5291 HSPACE_BYTE_CASES:
5292 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5293 HSPACE_MULTIBYTE_CASES:
5294 #endif
5295 RRETURN(MATCH_NOMATCH);
5296 }
5297 break;
5298
5299 case OP_HSPACE:
5300 switch(c)
5301 {
5302 default: RRETURN(MATCH_NOMATCH);
5303 HSPACE_BYTE_CASES:
5304 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5305 HSPACE_MULTIBYTE_CASES:
5306 #endif
5307 break;
5308 }
5309 break;
5310
5311 case OP_NOT_VSPACE:
5312 switch(c)
5313 {
5314 default: break;
5315 VSPACE_BYTE_CASES:
5316 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5317 VSPACE_MULTIBYTE_CASES:
5318 #endif
5319 RRETURN(MATCH_NOMATCH);
5320 }
5321 break;
5322
5323 case OP_VSPACE:
5324 switch(c)
5325 {
5326 default: RRETURN(MATCH_NOMATCH);
5327 VSPACE_BYTE_CASES:
5328 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5329 VSPACE_MULTIBYTE_CASES:
5330 #endif
5331 break;
5332 }
5333 break;
5334
5335 case OP_NOT_DIGIT:
5336 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5337 break;
5338
5339 case OP_DIGIT:
5340 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5341 break;
5342
5343 case OP_NOT_WHITESPACE:
5344 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5345 break;
5346
5347 case OP_WHITESPACE:
5348 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5349 break;
5350
5351 case OP_NOT_WORDCHAR:
5352 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5353 break;
5354
5355 case OP_WORDCHAR:
5356 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5357 break;
5358
5359 default:
5360 RRETURN(PCRE_ERROR_INTERNAL);
5361 }
5362 }
5363 }
5364 /* Control never gets here */
5365 }
5366
5367 /* If maximizing, it is worth using inline code for speed, doing the type
5368 test once at the start (i.e. keep it out of the loop). Again, keep the
5369 UTF-8 and UCP stuff separate. */
5370
5371 else
5372 {
5373 pp = eptr; /* Remember where we started */
5374
5375 #ifdef SUPPORT_UCP
5376 if (prop_type >= 0)
5377 {
5378 switch(prop_type)
5379 {
5380 case PT_ANY:
5381 for (i = min; i < max; i++)
5382 {
5383 int len = 1;
5384 if (eptr >= md->end_subject)
5385 {
5386 SCHECK_PARTIAL();
5387 break;
5388 }
5389 GETCHARLENTEST(c, eptr, len);
5390 if (prop_fail_result) break;
5391 eptr+= len;
5392 }
5393 break;
5394
5395 case PT_LAMP:
5396 for (i = min; i < max; i++)
5397 {
5398 int chartype;
5399 int len = 1;
5400 if (eptr >= md->end_subject)
5401 {
5402 SCHECK_PARTIAL();
5403 break;
5404 }
5405 GETCHARLENTEST(c, eptr, len);
5406 chartype = UCD_CHARTYPE(c);
5407 if ((chartype == ucp_Lu ||
5408 chartype == ucp_Ll ||
5409 chartype == ucp_Lt) == prop_fail_result)
5410 break;
5411 eptr+= len;
5412 }
5413 break;
5414
5415 case PT_GC:
5416 for (i = min; i < max; i++)
5417 {
5418 int len = 1;
5419 if (eptr >= md->end_subject)
5420 {
5421 SCHECK_PARTIAL();
5422 break;
5423 }
5424 GETCHARLENTEST(c, eptr, len);
5425 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5426 eptr+= len;
5427 }
5428 break;
5429
5430 case PT_PC:
5431 for (i = min; i < max; i++)
5432 {
5433 int len = 1;
5434 if (eptr >= md->end_subject)
5435 {
5436 SCHECK_PARTIAL();
5437 break;
5438 }
5439 GETCHARLENTEST(c, eptr, len);
5440 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5441 eptr+= len;
5442 }
5443 break;
5444
5445 case PT_SC:
5446 for (i = min; i < max; i++)
5447 {
5448 int len = 1;
5449 if (eptr >= md->end_subject)
5450 {
5451 SCHECK_PARTIAL();
5452 break;
5453 }
5454 GETCHARLENTEST(c, eptr, len);
5455 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5456 eptr+= len;
5457 }
5458 break;
5459
5460 case PT_ALNUM:
5461 for (i = min; i < max; i++)
5462 {
5463 int category;
5464 int len = 1;
5465 if (eptr >= md->end_subject)
5466 {
5467 SCHECK_PARTIAL();
5468 break;
5469 }
5470 GETCHARLENTEST(c, eptr, len);
5471 category = UCD_CATEGORY(c);
5472 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5473 break;
5474 eptr+= len;
5475 }
5476 break;
5477
5478 case PT_SPACE: /* Perl space */
5479 for (i = min; i < max; i++)
5480 {
5481 int len = 1;
5482 if (eptr >= md->end_subject)
5483 {
5484 SCHECK_PARTIAL();
5485 break;
5486 }
5487 GETCHARLENTEST(c, eptr, len);
5488 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5489 c == CHAR_FF || c == CHAR_CR)
5490 == prop_fail_result)
5491 break;
5492 eptr+= len;
5493 }
5494 break;
5495
5496 case PT_PXSPACE: /* POSIX space */
5497 for (i = min; i < max; i++)
5498 {
5499 int len = 1;
5500 if (eptr >= md->end_subject)
5501 {
5502 SCHECK_PARTIAL();
5503 break;
5504 }
5505 GETCHARLENTEST(c, eptr, len);
5506 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5507 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5508 == prop_fail_result)
5509 break;
5510 eptr+= len;
5511 }
5512 break;
5513
5514 case PT_WORD:
5515 for (i = min; i < max; i++)
5516 {
5517 int category;
5518 int len = 1;
5519 if (eptr >= md->end_subject)
5520 {
5521 SCHECK_PARTIAL();
5522 break;
5523 }
5524 GETCHARLENTEST(c, eptr, len);
5525 category = UCD_CATEGORY(c);
5526 if ((category == ucp_L || category == ucp_N ||
5527 c == CHAR_UNDERSCORE) == prop_fail_result)
5528 break;
5529 eptr+= len;
5530 }
5531 break;
5532
5533 case PT_CLIST:
5534 for (i = min; i < max; i++)
5535 {
5536 const pcre_uint32 *cp;
5537 int len = 1;
5538 if (eptr >= md->end_subject)
5539 {
5540 SCHECK_PARTIAL();
5541 break;
5542 }
5543 GETCHARLENTEST(c, eptr, len);
5544 cp = PRIV(ucd_caseless_sets) + prop_value;
5545 for (;;)
5546 {
5547 if (c < *cp)
5548 { if (prop_fail_result) break; else goto GOT_MAX; }
5549 if (c == *cp++)
5550 { if (prop_fail_result) goto GOT_MAX; else break; }
5551 }
5552 eptr += len;
5553 }
5554 GOT_MAX:
5555 break;
5556
5557 case PT_UCNC:
5558 for (i = min; i < max; i++)
5559 {
5560 int len = 1;
5561 if (eptr >= md->end_subject)
5562 {
5563 SCHECK_PARTIAL();
5564 break;
5565 }
5566 GETCHARLENTEST(c, eptr, len);
5567 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5568 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5569 c >= 0xe000) == prop_fail_result)
5570 break;
5571 eptr += len;
5572 }
5573 break;
5574
5575 default:
5576 RRETURN(PCRE_ERROR_INTERNAL);
5577 }
5578
5579 /* eptr is now past the end of the maximum run */
5580
5581 if (possessive) continue;
5582 for(;;)
5583 {
5584 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5585 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5586 if (eptr-- == pp) break; /* Stop if tried at original pos */
5587 if (utf) BACKCHAR(eptr);
5588 }
5589 }
5590
5591 /* Match extended Unicode sequences. We will get here only if the
5592 support is in the binary; otherwise a compile-time error occurs. */
5593
5594 else if (ctype == OP_EXTUNI)
5595 {
5596 for (i = min; i < max; i++)
5597 {
5598 if (eptr >= md->end_subject)
5599 {
5600 SCHECK_PARTIAL();
5601 break;
5602 }
5603 else
5604 {
5605 int lgb, rgb;
5606 GETCHARINCTEST(c, eptr);
5607 lgb = UCD_GRAPHBREAK(c);
5608 while (eptr < md->end_subject)
5609 {
5610 int len = 1;
5611 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5612 rgb = UCD_GRAPHBREAK(c);
5613 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5614 lgb = rgb;
5615 eptr += len;
5616 }
5617 }
5618 CHECK_PARTIAL();
5619 }
5620
5621 /* eptr is now past the end of the maximum run */
5622
5623 if (possessive) continue;
5624
5625 for(;;)
5626 {
5627 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5628 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5629 if (eptr-- == pp) break; /* Stop if tried at original pos */
5630 for (;;) /* Move back over one extended */
5631 {
5632 if (!utf) c = *eptr; else
5633 {
5634 BACKCHAR(eptr);
5635 GETCHAR(c, eptr);
5636 }
5637 if (UCD_CATEGORY(c) != ucp_M) break;
5638 eptr--;
5639 }
5640 }
5641 }
5642
5643 else
5644 #endif /* SUPPORT_UCP */
5645
5646 #ifdef SUPPORT_UTF
5647 if (utf)
5648 {
5649 switch(ctype)
5650 {
5651 case OP_ANY:
5652 if (max < INT_MAX)
5653 {
5654 for (i = min; i < max; i++)
5655 {
5656 if (eptr >= md->end_subject)
5657 {
5658 SCHECK_PARTIAL();
5659 break;
5660 }
5661 if (IS_NEWLINE(eptr)) break;
5662 if (md->partial != 0 && /* Take care with CRLF partial */
5663 eptr + 1 >= md->end_subject &&
5664 NLBLOCK->nltype == NLTYPE_FIXED &&
5665 NLBLOCK->nllen == 2 &&
5666 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5667 {
5668 md->hitend = TRUE;
5669 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5670 }
5671 eptr++;
5672 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5673 }
5674 }
5675
5676 /* Handle unlimited UTF-8 repeat */
5677
5678 else
5679 {
5680 for (i = min; i < max; i++)
5681 {
5682 if (eptr >= md->end_subject)
5683 {
5684 SCHECK_PARTIAL();
5685 break;
5686 }
5687 if (IS_NEWLINE(eptr)) break;
5688 if (md->partial != 0 && /* Take care with CRLF partial */
5689 eptr + 1 >= md->end_subject &&
5690 NLBLOCK->nltype == NLTYPE_FIXED &&
5691 NLBLOCK->nllen == 2 &&
5692 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5693 {
5694 md->hitend = TRUE;
5695 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5696 }
5697 eptr++;
5698 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5699 }
5700 }
5701 break;
5702
5703 case OP_ALLANY:
5704 if (max < INT_MAX)
5705 {
5706 for (i = min; i < max; i++)
5707 {
5708 if (eptr >= md->end_subject)
5709 {
5710 SCHECK_PARTIAL();
5711 break;
5712 }
5713 eptr++;
5714 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5715 }
5716 }
5717 else
5718 {
5719 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5720 SCHECK_PARTIAL();
5721 }
5722 break;
5723
5724 /* The byte case is the same as non-UTF8 */
5725
5726 case OP_ANYBYTE:
5727 c = max - min;
5728 if (c > (unsigned int)(md->end_subject - eptr))
5729 {
5730 eptr = md->end_subject;
5731 SCHECK_PARTIAL();
5732 }
5733 else eptr += c;
5734 break;
5735
5736 case OP_ANYNL:
5737 for (i = min; i < max; i++)
5738 {
5739 int len = 1;
5740 if (eptr >= md->end_subject)
5741 {
5742 SCHECK_PARTIAL();
5743 break;
5744 }
5745 GETCHARLEN(c, eptr, len);
5746 if (c == CHAR_CR)
5747 {
5748 if (++eptr >= md->end_subject) break;
5749 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5750 }
5751 else
5752 {
5753 if (c != CHAR_LF &&
5754 (md->bsr_anycrlf ||
5755 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5756 #ifndef EBCDIC
5757 && c != 0x2028 && c != 0x2029
5758 #endif /* Not EBCDIC */
5759 )))
5760 break;
5761 eptr += len;
5762 }
5763 }
5764 break;
5765
5766 case OP_NOT_HSPACE:
5767 case OP_HSPACE:
5768 for (i = min; i < max; i++)
5769 {
5770 BOOL gotspace;
5771 int len = 1;
5772 if (eptr >= md->end_subject)
5773 {
5774 SCHECK_PARTIAL();
5775 break;
5776 }
5777 GETCHARLEN(c, eptr, len);
5778 switch(c)
5779 {
5780 HSPACE_CASES: gotspace = TRUE; break;
5781 default: gotspace = FALSE; break;
5782 }
5783 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5784 eptr += len;
5785 }
5786 break;
5787
5788 case OP_NOT_VSPACE:
5789 case OP_VSPACE:
5790 for (i = min; i < max; i++)
5791 {
5792 BOOL gotspace;
5793 int len = 1;
5794 if (eptr >= md->end_subject)
5795 {
5796 SCHECK_PARTIAL();
5797 break;
5798 }
5799 GETCHARLEN(c, eptr, len);
5800 switch(c)
5801 {
5802 VSPACE_CASES: gotspace = TRUE; break;
5803 default: gotspace = FALSE; break;
5804 }
5805 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5806 eptr += len;
5807 }
5808 break;
5809
5810 case OP_NOT_DIGIT:
5811 for (i = min; i < max; i++)
5812 {
5813 int len = 1;
5814 if (eptr >= md->end_subject)
5815 {
5816 SCHECK_PARTIAL();
5817 break;
5818 }
5819 GETCHARLEN(c, eptr, len);
5820 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5821 eptr+= len;
5822 }
5823 break;
5824
5825 case OP_DIGIT:
5826 for (i = min; i < max; i++)
5827 {
5828 int len = 1;
5829 if (eptr >= md->end_subject)
5830 {
5831 SCHECK_PARTIAL();
5832 break;
5833 }
5834 GETCHARLEN(c, eptr, len);
5835 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5836 eptr+= len;
5837 }
5838 break;
5839
5840 case OP_NOT_WHITESPACE:
5841 for (i = min; i < max; i++)
5842 {
5843 int len = 1;
5844 if (eptr >= md->end_subject)
5845 {
5846 SCHECK_PARTIAL();
5847 break;
5848 }
5849 GETCHARLEN(c, eptr, len);
5850 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5851 eptr+= len;
5852 }
5853 break;
5854
5855 case OP_WHITESPACE:
5856 for (i = min; i < max; i++)
5857 {
5858 int len = 1;
5859 if (eptr >= md->end_subject)
5860 {
5861 SCHECK_PARTIAL();
5862 break;
5863 }
5864 GETCHARLEN(c, eptr, len);
5865 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5866 eptr+= len;
5867 }
5868 break;
5869
5870 case OP_NOT_WORDCHAR:
5871 for (i = min; i < max; i++)
5872 {
5873 int len = 1;
5874 if (eptr >= md->end_subject)
5875 {
5876 SCHECK_PARTIAL();
5877 break;
5878 }
5879 GETCHARLEN(c, eptr, len);
5880 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5881 eptr+= len;
5882 }
5883 break;
5884
5885 case OP_WORDCHAR:
5886 for (i = min; i < max; i++)
5887 {
5888 int len = 1;
5889 if (eptr >= md->end_subject)
5890 {
5891 SCHECK_PARTIAL();
5892 break;
5893 }
5894 GETCHARLEN(c, eptr, len);
5895 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5896 eptr+= len;
5897 }
5898 break;
5899
5900 default:
5901 RRETURN(PCRE_ERROR_INTERNAL);
5902 }
5903
5904 /* eptr is now past the end of the maximum run. If possessive, we are
5905 done (no backing up). Otherwise, match at this position; anything other
5906 than no match is immediately returned. For nomatch, back up one
5907 character, unless we are matching \R and the last thing matched was
5908 \r\n, in which case, back up two bytes. */
5909
5910 if (possessive) continue;
5911 for(;;)
5912 {
5913 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5915 if (eptr-- == pp) break; /* Stop if tried at original pos */
5916 BACKCHAR(eptr);
5917 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5918 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5919 }
5920 }
5921 else
5922 #endif /* SUPPORT_UTF */
5923 /* Not UTF mode */
5924 {
5925 switch(ctype)
5926 {
5927 case OP_ANY:
5928 for (i = min; i < max; i++)
5929 {
5930 if (eptr >= md->end_subject)
5931 {
5932 SCHECK_PARTIAL();
5933 break;
5934 }
5935 if (IS_NEWLINE(eptr)) break;
5936 if (md->partial != 0 && /* Take care with CRLF partial */
5937 eptr + 1 >= md->end_subject &&
5938 NLBLOCK->nltype == NLTYPE_FIXED &&
5939 NLBLOCK->nllen == 2 &&
5940 *eptr == NLBLOCK->nl[0])
5941 {
5942 md->hitend = TRUE;
5943 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5944 }
5945 eptr++;
5946 }
5947 break;
5948
5949 case OP_ALLANY:
5950 case OP_ANYBYTE:
5951 c = max - min;
5952 if (c > (unsigned int)(md->end_subject - eptr))
5953 {
5954 eptr = md->end_subject;
5955 SCHECK_PARTIAL();
5956 }
5957 else eptr += c;
5958 break;
5959
5960 case OP_ANYNL:
5961 for (i = min; i < max; i++)
5962 {
5963 if (eptr >= md->end_subject)
5964 {
5965 SCHECK_PARTIAL();
5966 break;
5967 }
5968 c = *eptr;
5969 if (c == CHAR_CR)
5970 {
5971 if (++eptr >= md->end_subject) break;
5972 if (*eptr == CHAR_LF) eptr++;
5973 }
5974 else
5975 {
5976 if (c != CHAR_LF && (md->bsr_anycrlf ||
5977 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5978 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5979 && c != 0x2028 && c != 0x2029
5980 #endif
5981 ))) break;
5982 eptr++;
5983 }
5984 }
5985 break;
5986
5987 case OP_NOT_HSPACE:
5988 for (i = min; i < max; i++)
5989 {
5990 if (eptr >= md->end_subject)
5991 {
5992 SCHECK_PARTIAL();
5993 break;
5994 }
5995 switch(*eptr)
5996 {
5997 default: eptr++; break;
5998 HSPACE_BYTE_CASES:
5999 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6000 HSPACE_MULTIBYTE_CASES:
6001 #endif
6002 goto ENDLOOP00;
6003 }
6004 }
6005 ENDLOOP00:
6006 break;
6007
6008 case OP_HSPACE:
6009 for (i = min; i < max; i++)
6010 {
6011 if (eptr >= md->end_subject)
6012 {
6013 SCHECK_PARTIAL();
6014 break;
6015 }
6016 switch(*eptr)
6017 {
6018 default: goto ENDLOOP01;
6019 HSPACE_BYTE_CASES:
6020 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6021 HSPACE_MULTIBYTE_CASES:
6022 #endif
6023 eptr++; break;
6024 }
6025 }
6026 ENDLOOP01:
6027 break;
6028
6029 case OP_NOT_VSPACE:
6030 for (i = min; i < max; i++)
6031 {
6032 if (eptr >= md->end_subject)
6033 {
6034 SCHECK_PARTIAL();
6035 break;
6036 }
6037 switch(*eptr)
6038 {
6039 default: eptr++; break;
6040 VSPACE_BYTE_CASES:
6041 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6042 VSPACE_MULTIBYTE_CASES:
6043 #endif
6044 goto ENDLOOP02;
6045 }
6046 }
6047 ENDLOOP02:
6048 break;
6049
6050 case OP_VSPACE:
6051 for (i = min; i < max; i++)
6052 {
6053 if (eptr >= md->end_subject)
6054 {
6055 SCHECK_PARTIAL();
6056 break;
6057 }
6058 switch(*eptr)
6059 {
6060 default: goto ENDLOOP03;
6061 VSPACE_BYTE_CASES:
6062 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6063 VSPACE_MULTIBYTE_CASES:
6064 #endif
6065 eptr++; break;
6066 }
6067 }
6068 ENDLOOP03:
6069 break;
6070
6071 case OP_NOT_DIGIT:
6072 for (i = min; i < max; i++)
6073 {
6074 if (eptr >= md->end_subject)
6075 {
6076 SCHECK_PARTIAL();
6077 break;
6078 }
6079 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6080 eptr++;
6081 }
6082 break;
6083
6084 case OP_DIGIT:
6085 for (i = min; i < max; i++)
6086 {
6087 if (eptr >= md->end_subject)
6088 {
6089 SCHECK_PARTIAL();
6090 break;
6091 }
6092 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6093 eptr++;
6094 }
6095 break;
6096
6097 case OP_NOT_WHITESPACE:
6098 for (i = min; i < max; i++)
6099 {
6100 if (eptr >= md->end_subject)
6101 {
6102 SCHECK_PARTIAL();
6103 break;
6104 }
6105 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6106 eptr++;
6107 }
6108 break;
6109
6110 case OP_WHITESPACE:
6111 for (i = min; i < max; i++)
6112 {
6113 if (eptr >= md->end_subject)
6114 {
6115 SCHECK_PARTIAL();
6116 break;
6117 }
6118 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6119 eptr++;
6120 }
6121 break;
6122
6123 case OP_NOT_WORDCHAR:
6124 for (i = min; i < max; i++)
6125 {
6126 if (eptr >= md->end_subject)
6127 {
6128 SCHECK_PARTIAL();
6129 break;
6130 }
6131 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6132 eptr++;
6133 }
6134 break;
6135
6136 case OP_WORDCHAR:
6137 for (i = min; i < max; i++)
6138 {
6139 if (eptr >= md->end_subject)
6140 {
6141 SCHECK_PARTIAL();
6142 break;
6143 }
6144 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6145 eptr++;
6146 }
6147 break;
6148
6149 default:
6150 RRETURN(PCRE_ERROR_INTERNAL);
6151 }
6152
6153 /* eptr is now past the end of the maximum run. If possessive, we are
6154 done (no backing up). Otherwise, match at this position; anything other
6155 than no match is immediately returned. For nomatch, back up one
6156 character (byte), unless we are matching \R and the last thing matched
6157 was \r\n, in which case, back up two bytes. */
6158
6159 if (possessive) continue;
6160 while (eptr >= pp)
6161 {
6162 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6163 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6164 eptr--;
6165 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6166 eptr[-1] == CHAR_CR) eptr--;
6167 }
6168 }
6169
6170 /* Get here if we can't make it match with any permitted repetitions */
6171
6172 RRETURN(MATCH_NOMATCH);
6173 }
6174 /* Control never gets here */
6175
6176 /* There's been some horrible disaster. Arrival here can only mean there is
6177 something seriously wrong in the code above or the OP_xxx definitions. */
6178
6179 default:
6180 DPRINTF(("Unknown opcode %d\n", *ecode));
6181 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6182 }
6183
6184 /* Do not stick any code in here without much thought; it is assumed
6185 that "continue" in the code above comes out to here to repeat the main
6186 loop. */
6187
6188 } /* End of main loop */
6189 /* Control never reaches here */
6190
6191
6192 /* When compiling to use the heap rather than the stack for recursive calls to
6193 match(), the RRETURN() macro jumps here. The number that is saved in
6194 frame->Xwhere indicates which label we actually want to return to. */
6195
6196 #ifdef NO_RECURSE
6197 #define LBL(val) case val: goto L_RM##val;
6198 HEAP_RETURN:
6199 switch (frame->Xwhere)
6200 {
6201 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6202 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6203 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6204 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6205 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6206 LBL(65) LBL(66)
6207 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6208 LBL(21)
6209 #endif
6210 #ifdef SUPPORT_UTF
6211 LBL(16) LBL(18) LBL(20)
6212 LBL(22) LBL(23) LBL(28) LBL(30)
6213 LBL(32) LBL(34) LBL(42) LBL(46)
6214 #ifdef SUPPORT_UCP
6215 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6216 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6217 #endif /* SUPPORT_UCP */
6218 #endif /* SUPPORT_UTF */
6219 default:
6220 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6221 return PCRE_ERROR_INTERNAL;
6222 }
6223 #undef LBL
6224 #endif /* NO_RECURSE */
6225 }
6226
6227
6228 /***************************************************************************
6229 ****************************************************************************
6230 RECURSION IN THE match() FUNCTION
6231
6232 Undefine all the macros that were defined above to handle this. */
6233
6234 #ifdef NO_RECURSE
6235 #undef eptr
6236 #undef ecode
6237 #undef mstart
6238 #undef offset_top
6239 #undef eptrb
6240 #undef flags
6241
6242 #undef callpat
6243 #undef charptr
6244 #undef data
6245 #undef next
6246 #undef pp
6247 #undef prev
6248 #undef saved_eptr
6249
6250 #undef new_recursive
6251
6252 #undef cur_is_word
6253 #undef condition
6254 #undef prev_is_word
6255
6256 #undef ctype
6257 #undef length
6258 #undef max
6259 #undef min
6260 #undef number
6261 #undef offset
6262 #undef op
6263 #undef save_capture_last
6264 #undef save_offset1
6265 #undef save_offset2
6266 #undef save_offset3
6267 #undef stacksave
6268
6269 #undef newptrb
6270
6271 #endif /* NO_RECURSE */
6272
6273 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally, outside the conditional group above. */
6274
6275 #undef fc
6276 #undef fi
6277
6278 /***************************************************************************
6279 ***************************************************************************/
6280
6281
#ifdef NO_RECURSE
/*************************************************
*          Release allocated heap frames         *
*************************************************/

/* Walk the chain of frames that hangs off the base frame and hand each one
to the stack_free function. The base frame itself lives on the machine stack,
so it is never passed to the free function; the walk starts at its
Xnextframe successor.

Argument:   frame_base   the address of the base frame
Returns:    nothing
*/

static void
release_match_heapframes (heapframe *frame_base)
{
heapframe *frame;
heapframe *following;

for (frame = frame_base->Xnextframe; frame != NULL; frame = following)
  {
  following = frame->Xnextframe;   /* Fetch the link before the frame is freed */
  (PUBL(stack_free))(frame);
  }
}
#endif  /* NO_RECURSE */
6306
6307
6308 /*************************************************
6309 * Execute a Regular Expression *
6310 *************************************************/
6311
6312 /* This function applies a compiled re to a subject string and picks out
6313 portions of the string if it matches. Two elements in the vector are set for
6314 each substring: the offsets to the start and end of the substring.
6315
6316 Arguments:
6317 argument_re points to the compiled expression
6318 extra_data points to extra data or is NULL
6319 subject points to the subject string
6320 length length of subject string (may contain binary zeros)
6321 start_offset where to start in the subject string
6322 options option bits
6323 offsets points to a vector of ints to be filled in with offsets
6324 offsetcount the number of elements in the vector
6325
6326 Returns: > 0 => success; value is the number of elements filled in
6327 = 0 => success, but offsets is not big enough
6328 -1 => failed to match
6329 < -1 => some kind of unexpected problem
6330 */
6331
6332 #if defined COMPILE_PCRE8
6333 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6334 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6335 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6336 int offsetcount)
6337 #elif defined COMPILE_PCRE16
6338 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6339 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6340 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6341 int offsetcount)
6342 #elif defined COMPILE_PCRE32
6343 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6344 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6345 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6346 int offsetcount)
6347 #endif
6348 {
6349 int rc, ocount, arg_offset_max;
6350 int newline;
6351 BOOL using_temporary_offsets = FALSE;
6352 BOOL anchored;
6353 BOOL startline;
6354 BOOL firstline;
6355 BOOL utf;
6356 BOOL has_first_char = FALSE;
6357 BOOL has_req_char = FALSE;
6358 pcre_uchar first_char = 0;
6359 pcre_uchar first_char2 = 0;
6360 pcre_uchar req_char = 0;
6361 pcre_uchar req_char2 = 0;
6362 match_data match_block;
6363 match_data *md = &match_block;
6364 const pcre_uint8 *tables;
6365 const pcre_uint8 *start_bits = NULL;
6366 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6367 PCRE_PUCHAR end_subject;
6368 PCRE_PUCHAR start_partial = NULL;
6369 PCRE_PUCHAR match_partial;
6370 PCRE_PUCHAR req_char_ptr = start_match - 1;
6371
6372 const pcre_study_data *study;
6373 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6374
6375 #ifdef NO_RECURSE
6376 heapframe frame_zero;
6377 frame_zero.Xprevframe = NULL; /* Marks the top level */
6378 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6379 md->match_frames_base = &frame_zero;
6380 #endif
6381
6382 /* Check for the special magic call that measures the size of the stack used
6383 per recursive call of match(). Without the funny casting for sizeof, a Windows
6384 compiler gave this error: "unary minus operator applied to unsigned type,
6385 result still unsigned". Hopefully the cast fixes that. */
6386
6387 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6388 start_offset == -999)
6389 #ifdef NO_RECURSE
6390 return -((int)sizeof(heapframe));
6391 #else
6392 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6393 #endif
6394
6395 /* Plausibility checks */
6396
6397 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6398 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6399 return PCRE_ERROR_NULL;
6400 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6401 if (length < 0) return PCRE_ERROR_BADLENGTH;
6402 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6403
6404 /* Check that the first field in the block is the magic number. If it is not,
6405 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6406 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6407 means that the pattern is likely compiled with different endianness. */
6408
6409 if (re->magic_number != MAGIC_NUMBER)
6410 return re->magic_number == REVERSED_MAGIC_NUMBER?
6411 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6412 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6413
6414 /* These two settings are used in the code for checking a UTF-8 string that
6415 follows immediately afterwards. Other values in the md block are used only
6416 during "normal" pcre_exec() processing, not when the JIT support is in use,
6417 so they are set up later. */
6418
6419 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6420 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6421 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6422 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6423
6424 /* Check a UTF-8 string if required. Pass back the character offset and error
6425 code for an invalid string if a results vector is available. */
6426
6427 #ifdef SUPPORT_UTF
6428 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6429 {
6430 int erroroffset;
6431 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6432 if (errorcode != 0)
6433 {
6434 if (offsetcount >= 2)
6435 {
6436 offsets[0] = erroroffset;
6437 offsets[1] = errorcode;
6438 }
6439 #if defined COMPILE_PCRE8
6440 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6441 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6442 #elif defined COMPILE_PCRE16
6443 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6444 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6445 #elif defined COMPILE_PCRE32
6446 return PCRE_ERROR_BADUTF32;
6447 #endif
6448 }
6449 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6450 /* Check that a start_offset points to the start of a UTF character. */
6451 if (start_offset > 0 && start_offset < length &&
6452 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6453 return PCRE_ERROR_BADUTF8_OFFSET;
6454 #endif
6455 }
6456 #endif
6457
6458 /* If the pattern was successfully studied with JIT support, run the JIT
6459 executable instead of the rest of this function. Most options must be set at
6460 compile time for the JIT code to be usable. Fallback to the normal code path if
6461 an unsupported flag is set. */
6462
6463 #ifdef SUPPORT_JIT
6464 if (extra_data != NULL
6465 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6466 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6467 && extra_data->executable_jit != NULL
6468 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6469 {
6470 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6471 start_offset, options, offsets, offsetcount);
6472
6473 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial
6474 matching mode is not compiled. In this case we simply fall back to the interpreter. */
6475
6476 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6477 }
6478 #endif
6479
6480 /* Carry on with non-JIT matching. This information is for finding all the
6481 numbers associated with a given name, for condition testing. */
6482
6483 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6484 md->name_count = re->name_count;
6485 md->name_entry_size = re->name_entry_size;
6486
6487 /* Fish out the optional data from the extra_data structure, first setting
6488 the default values. */
6489
6490 study = NULL;
6491 md->match_limit = MATCH_LIMIT;
6492 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6493 md->callout_data = NULL;
6494
6495 /* The table pointer is always in native byte order. */
6496
6497 tables = re->tables;
6498
6499 if (extra_data != NULL)
6500 {
6501 register unsigned int flags = extra_data->flags;
6502 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6503 study = (const pcre_study_data *)extra_data->study_data;
6504 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6505 md->match_limit = extra_data->match_limit;
6506 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6507 md->match_limit_recursion = extra_data->match_limit_recursion;
6508 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6509 md->callout_data = extra_data->callout_data;
6510 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6511 }
6512
6513 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6514 is a feature that makes it possible to save compiled regex and re-use them
6515 in other programs later. */
6516
6517 if (tables == NULL) tables = PRIV(default_tables);
6518
6519 /* Set up other data */
6520
6521 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6522 startline = (re->flags & PCRE_STARTLINE) != 0;
6523 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6524
6525 /* The code starts after the real_pcre block and the capture name table. */
6526
6527 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6528 re->name_count * re->name_entry_size;
6529
6530 md->start_subject = (PCRE_PUCHAR)subject;
6531 md->start_offset = start_offset;
6532 md->end_subject = md->start_subject + length;
6533 end_subject = md->end_subject;
6534
6535 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6536 md->use_ucp = (re->options & PCRE_UCP) != 0;
6537 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6538 md->ignore_skip_arg = 0;
6539
6540 /* Some options are unpacked into BOOL variables in the hope that testing
6541 them will be faster than individual option bits. */
6542
6543 md->notbol = (options & PCRE_NOTBOL) != 0;
6544 md->noteol = (options & PCRE_NOTEOL) != 0;
6545 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6546 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6547
6548 md->hitend = FALSE;
6549 md->mark = md->nomatch_mark = NULL; /* In case never set */
6550
6551 md->recursive = NULL; /* No recursion at top level */
6552 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6553
6554 md->lcc = tables + lcc_offset;
6555 md->fcc = tables + fcc_offset;
6556 md->ctypes = tables + ctypes_offset;
6557
6558 /* Handle different \R options. */
6559
6560 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6561 {
6562 case 0:
6563 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6564 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6565 else
6566 #ifdef BSR_ANYCRLF
6567 md->bsr_anycrlf = TRUE;
6568 #else
6569 md->bsr_anycrlf = FALSE;
6570 #endif
6571 break;
6572
6573 case PCRE_BSR_ANYCRLF:
6574 md->bsr_anycrlf = TRUE;
6575 break;
6576
6577 case PCRE_BSR_UNICODE:
6578 md->bsr_anycrlf = FALSE;
6579 break;
6580
6581 default: return PCRE_ERROR_BADNEWLINE;
6582 }
6583
6584 /* Handle different types of newline. The three bits give eight cases. If
6585 nothing is set at run time, whatever was used at compile time applies. */
6586
6587 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6588 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6589 {
6590 case 0: newline = NEWLINE; break; /* Compile-time default */
6591 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6592 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6593 case PCRE_NEWLINE_CR+
6594 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6595 case PCRE_NEWLINE_ANY: newline = -1; break;
6596 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6597 default: return PCRE_ERROR_BADNEWLINE;
6598 }
6599
6600 if (newline == -2)
6601 {
6602 md->nltype = NLTYPE_ANYCRLF;
6603 }
6604 else if (newline < 0)
6605 {
6606 md->nltype = NLTYPE_ANY;
6607 }
6608 else
6609 {
6610 md->nltype = NLTYPE_FIXED;
6611 if (newline > 255)
6612 {
6613 md->nllen = 2;
6614 md->nl[0] = (newline >> 8) & 255;
6615 md->nl[1] = newline & 255;
6616 }
6617 else
6618 {
6619 md->nllen = 1;
6620 md->nl[0] = newline;
6621 }
6622 }
6623
6624 /* Partial matching was originally supported only for a restricted set of
6625 regexes; from release 8.00 there are no restrictions, but the bits are still
6626 defined (though never set). So there's no harm in leaving this code. */
6627
6628 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6629 return PCRE_ERROR_BADPARTIAL;
6630
6631 /* If the expression has got more back references than the offsets supplied can
6632 hold, we get a temporary chunk of working store to use during the matching.
6633 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6634 of 3. */
6635
6636 ocount = offsetcount - (offsetcount % 3);
6637 arg_offset_max = (2*ocount)/3;
6638
6639 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6640 {
6641 ocount = re->top_backref * 3 + 3;
6642 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6643 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6644 using_temporary_offsets = TRUE;
6645 DPRINTF(("Got memory to hold back references\n"));
6646 }
6647 else md->offset_vector = offsets;
6648 md->offset_end = ocount;
6649 md->offset_max = (2*ocount)/3;
6650 md->capture_last = 0;
6651
6652 /* Reset the working variable associated with each extraction. These should
6653 never be used unless previously set, but they get saved and restored, and so we
6654 initialize them to avoid reading uninitialized locations. Also, unset the
6655 offsets for the matched string. This is really just for tidiness with callouts,
6656 in case they inspect these fields. */
6657
6658 if (md->offset_vector != NULL)
6659 {
6660 register int *iptr = md->offset_vector + ocount;
6661 register int *iend = iptr - re->top_bracket;
6662 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6663 while (--iptr >= iend) *iptr = -1;
6664 md->offset_vector[0] = md->offset_vector[1] = -1;
6665 }
6666
6667 /* Set up the first character to match, if available. The first_char value is
6668 never set for an anchored regular expression, but the anchoring may be forced
6669 at run time, so we have to test for anchoring. The first char may be unset for
6670 an unanchored pattern, of course. If there's no first char and the pattern was
6671 studied, there may be a bitmap of possible first characters. */
6672
6673 if (!anchored)
6674 {
6675 if ((re->flags & PCRE_FIRSTSET) != 0)
6676 {
6677 has_first_char = TRUE;
6678 first_char = first_char2 = (pcre_uchar)(re->first_char);
6679 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6680 {
6681 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6682 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6683 if (utf && first_char > 127)
6684 first_char2 = UCD_OTHERCASE(first_char);
6685 #endif
6686 }
6687 }
6688 else
6689 if (!startline && study != NULL &&
6690 (study->flags & PCRE_STUDY_MAPPED) != 0)
6691 start_bits = study->start_bits;
6692 }
6693
6694 /* For anchored or unanchored matches, there may be a "last known required
6695 character" set. */
6696
6697 if ((re->flags & PCRE_REQCHSET) != 0)
6698 {
6699 has_req_char = TRUE;
6700 req_char = req_char2 = (pcre_uchar)(re->req_char);
6701 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6702 {
6703 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6704 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6705 if (utf && req_char > 127)
6706 req_char2 = UCD_OTHERCASE(req_char);
6707 #endif
6708 }
6709 }
6710
6711
6712 /* ==========================================================================*/
6713
6714 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6715 the loop runs just once. */
6716
6717 for(;;)
6718 {
6719 PCRE_PUCHAR save_end_subject = end_subject;
6720 PCRE_PUCHAR new_start_match;
6721
6722 /* If firstline is TRUE, the start of the match is constrained to the first
6723 line of a multiline string. That is, the match must be before or at the first
6724 newline. Implement this by temporarily adjusting end_subject so that we stop
6725 scanning at a newline. If the match fails at the newline, later code breaks
6726 this loop. */
6727
6728 if (firstline)
6729 {
6730 PCRE_PUCHAR t = start_match;
6731 #ifdef SUPPORT_UTF
6732 if (utf)
6733 {
6734 while (t < md->end_subject && !IS_NEWLINE(t))
6735 {
6736 t++;
6737 ACROSSCHAR(t < end_subject, *t, t++);
6738 }
6739 }
6740 else
6741 #endif
6742 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6743 end_subject = t;
6744 }
6745
6746 /* There are some optimizations that avoid running the match if a known
6747 starting point is not found, or if a known later character is not present.
6748 However, there is an option that disables these, for testing and for ensuring
6749 that all callouts do actually occur. The option can be set in the regex by
6750 (*NO_START_OPT) or passed in match-time options. */
6751
6752 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6753 {
6754 /* Advance to a unique first char if there is one. */
6755
6756 if (has_first_char)
6757 {