/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1274 - (show annotations)
Fri Mar 8 11:35:41 2013 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 216564 byte(s)
Error occurred while calculating annotation data.
Fix the case where there are two or more SKIPs that may have to be ignored.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for the last capture number in capture_last */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1 /* Matched */
84 #define MATCH_NOMATCH 0 /* Failed to match */
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996) /* Returned by the OP_COMMIT handler in match() */
95 #define MATCH_PRUNE (-995) /* Returned by the OP_PRUNE and OP_PRUNE_ARG handlers */
96 #define MATCH_SKIP (-994) /* Returned by OP_SKIP, and by OP_MARK when a SKIP name matches */
97 #define MATCH_SKIP_ARG (-993) /* Returned by OP_SKIP_ARG; the name travels in md->start_match_ptr */
98 #define MATCH_THEN (-992) /* Returned by OP_THEN and OP_THEN_ARG */
99 #define MATCH_BACKTRACK_MAX MATCH_THEN /* Upper end of the range of the 5 values above */
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lower end of the range of the 5 values above */
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30 /* 30 is a multiple of 3, as required above */
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
109
/* Two parallel six-entry tables, shown here in pairs. An entry of 0 in
rep_max stands for "no upper limit". NOTE(review): the pairs presumably
correspond to the greedy and minimizing forms of *, + and ? - confirm against
the opcode ordering. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions code have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case)
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start);
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
306 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
307 below must be updated in sync. */
308
/* One identifier per RMATCH call, numbered consecutively from 1 to 68. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65,    RM66, RM67, RM68
};
316
317 /* These versions of the macros use the stack, as normal. There are debugging
318 versions and production versions. Note that the "rw" argument of RMATCH isn't
319 actually used in this definition. */
320
321 #ifndef NO_RECURSE /* Real recursion: match() calls itself */
322 #define REGISTER register /* Storage class used on some of match()'s parameters */
323
324 #ifdef PCRE_DEBUG /* Debugging versions: trace each call and return with printf() */
325 #define RMATCH(ra,rb,rc,rd,re,rw) /* ra..re are passed on to match(); rw is ignored */ \
326 { \
327 printf("match() called in line %d\n", __LINE__); \
328 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); /* Result is left in rrc; depth goes up by one */ \
329 printf("to line %d\n", __LINE__); \
330 }
331 #define RRETURN(ra) /* Log the value and the line, then return it */ \
332 { \
333 printf("match() returned %d from line %d\n", ra, __LINE__); \
334 return ra; \
335 }
336 #else /* Production versions: just the call and the return */
337 #define RMATCH(ra,rb,rc,rd,re,rw) /* Expands to a single assignment to rrc */ \
338 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
339 #define RRETURN(ra) return ra
340 #endif /* PCRE_DEBUG */
341
342 #else
343
344
345 /* These versions of the macros manage a private stack on the heap. Note that
346 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347 argument of match(), which never changes. */
348
349 #define REGISTER /* Expands to nothing in the heap-frame build */
350
351 #define RMATCH(ra,rb,rc,rd,re,rw) /* "Call": push a frame and jump to HEAP_RECURSE */\
352 {\
353 heapframe *newframe = frame->Xnextframe; /* Reuse the frame already chained after this one, if any */\
354 if (newframe == NULL)\
355 {\
356 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
357 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY); /* Allocation failed: unwind with an error */\
358 newframe->Xnextframe = NULL;\
359 frame->Xnextframe = newframe; /* Chain it so that a later RMATCH from this frame finds it */\
360 }\
361 frame->Xwhere = rw; /* Remember which call site to resume in the calling frame */\
362 newframe->Xeptr = ra; /* Fill in the argument slots of the new frame */\
363 newframe->Xecode = rb;\
364 newframe->Xmstart = mstart;\
365 newframe->Xoffset_top = rc;\
366 newframe->Xeptrb = re;\
367 newframe->Xrdepth = frame->Xrdepth + 1;\
368 newframe->Xprevframe = frame;\
369 frame = newframe; /* The new frame becomes the current one */\
370 DPRINTF(("restarting from line %d\n", __LINE__));\
371 goto HEAP_RECURSE;\
372 L_##rw: /* Resume point for this call site; NOTE(review): presumably reached from the HEAP_RETURN code via Xwhere - confirm */\
373 DPRINTF(("jumped back to line %d\n", __LINE__));\
374 }
375
376 #define RRETURN(ra) /* "Return": pop to the previous frame, or really return at the top */\
377 {\
378 heapframe *oldframe = frame;\
379 frame = oldframe->Xprevframe; /* Step back to the calling frame; oldframe is not freed here */\
380 if (frame != NULL)\
381 {\
382 rrc = ra; /* Pass the result back in rrc */\
383 goto HEAP_RETURN;\
384 }\
385 return ra; /* No previous frame: this is the outermost level, so leave match() */\
386 }
387
388
389 /* Structure for remembering the local variables in a private frame */
390
391 typedef struct heapframe { /* One "stack frame" of match() when NO_RECURSE is defined */
392 struct heapframe *Xprevframe; /* Calling frame; RRETURN steps back to it, NULL means really return */
393 struct heapframe *Xnextframe; /* Next frame in the chain; RMATCH allocates it on first use and reuses it */
394
395 /* Function arguments that may change */
396
397 PCRE_PUCHAR Xeptr;
398 const pcre_uchar *Xecode;
399 PCRE_PUCHAR Xmstart;
400 int Xoffset_top;
401 eptrblock *Xeptrb;
402 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's value plus one */
403
404 /* Function local variables */
405
406 PCRE_PUCHAR Xcallpat;
407 #ifdef SUPPORT_UTF
408 PCRE_PUCHAR Xcharptr;
409 #endif
410 PCRE_PUCHAR Xdata; /* Also used under the name save_mark */
411 PCRE_PUCHAR Xnext;
412 PCRE_PUCHAR Xpp;
413 PCRE_PUCHAR Xprev;
414 PCRE_PUCHAR Xsaved_eptr;
415
416 recursion_info Xnew_recursive;
417
418 BOOL Xcur_is_word; /* Also used under the name allow_zero */
419 BOOL Xcondition; /* Also used under the names cbegroup and condassert */
420 BOOL Xprev_is_word; /* Also used under the name matched_once */
421
422 #ifdef SUPPORT_UCP
423 int Xprop_type;
424 unsigned int Xprop_value;
425 int Xprop_fail_result;
426 int Xoclength;
427 pcre_uchar Xocchars[6];
428 #endif
429
430 int Xcodelink; /* Also used under the name code_offset */
431 int Xctype;
432 unsigned int Xfc; /* Own slot here; the recursive build makes fc an alias of c */
433 int Xfi; /* Own slot here; the recursive build makes fi an alias of i */
434 int Xlength;
435 int Xmax;
436 int Xmin;
437 unsigned int Xnumber; /* Also used under the name foc */
438 int Xoffset;
439 unsigned int Xop;
440 pcre_int32 Xsave_capture_last;
441 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442 int Xstacksave[REC_STACK_SAVE_MAX];
443
444 eptrblock Xnewptrb; /* match() points eptrb at this block when called with MATCH_CBEGROUP */
445
446 /* Where to jump back to */
447
448 int Xwhere; /* The rw value (an RMn number) stored by RMATCH before it jumps to HEAP_RECURSE */
449
450 } heapframe;
451
452 #endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
468 /* These macros pack up tests that are used for partial matching, and which
469 appear several times in the code. We set the "hit end" flag if the pointer is
470 at the end of the subject and also past the start of the subject (i.e.
471 something has been matched). For hard partial matching, we then return
472 immediately. The second one is used when we already know we are past the end of
473 the subject. */
474
475 #define CHECK_PARTIAL() /* Use where eptr may or may not be at the end of the subject */\
476 if (md->partial != 0 && eptr >= md->end_subject && /* Partial matching requested and the end has been reached */ \
477 eptr > md->start_used_ptr) /* ...and eptr is beyond md->start_used_ptr */ \
478 { \
479 md->hitend = TRUE; /* Record that the end of the subject was hit */ \
480 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: give up immediately */ \
481 }
482
483 #define SCHECK_PARTIAL() /* Same, without the end-of-subject test: the caller already knows */\
484 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 { \
486 md->hitend = TRUE; \
487 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: give up immediately */ \
488 }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to in order to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 /* COMMIT overrides PRUNE, SKIP, and THEN */
785
786 case OP_COMMIT:
787 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
788 eptrb, RM52);
789 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
790 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
791 rrc != MATCH_THEN)
792 RRETURN(rrc);
793 RRETURN(MATCH_COMMIT);
794
795 /* PRUNE overrides THEN */
796
797 case OP_PRUNE:
798 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
799 eptrb, RM51);
800 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
801 RRETURN(MATCH_PRUNE);
802
803 case OP_PRUNE_ARG:
804 md->nomatch_mark = ecode + 2;
805 md->mark = NULL; /* In case previously set by assertion */
806 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
807 eptrb, RM56);
808 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
809 md->mark == NULL) md->mark = ecode + 2;
810 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
811 RRETURN(MATCH_PRUNE);
812
813 /* SKIP overrides PRUNE and THEN */
814
815 case OP_SKIP:
816 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
817 eptrb, RM53);
818 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
819 RRETURN(rrc);
820 md->start_match_ptr = eptr; /* Pass back current position */
821 RRETURN(MATCH_SKIP);
822
823 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
824 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
825 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
826 that failed and any that precede it (either they also failed, or were not
827 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
828 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
829 set to the count of the one that failed. */
830
831 case OP_SKIP_ARG:
832 md->skip_arg_count++;
833 if (md->skip_arg_count <= md->ignore_skip_arg)
834 {
835 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
836 break;
837 }
838 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
839 eptrb, RM57);
840 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
841 RRETURN(rrc);
842
843 /* Pass back the current skip name by overloading md->start_match_ptr and
844 returning the special MATCH_SKIP_ARG return code. This will either be
845 caught by a matching MARK, or get to the top, where it causes a rematch
846 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
847
848 md->start_match_ptr = ecode + 2;
849 RRETURN(MATCH_SKIP_ARG);
850
851 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
852 the branch in which it occurs can be determined. Overload the start of
853 match pointer to do this. */
854
855 case OP_THEN:
856 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
857 eptrb, RM54);
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 md->start_match_ptr = ecode;
860 RRETURN(MATCH_THEN);
861
862 case OP_THEN_ARG:
863 md->nomatch_mark = ecode + 2;
864 md->mark = NULL; /* In case previously set by assertion */
865 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
866 md, eptrb, RM58);
867 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
868 md->mark == NULL) md->mark = ecode + 2;
869 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
870 md->start_match_ptr = ecode;
871 RRETURN(MATCH_THEN);
872
873 /* Handle an atomic group that does not contain any capturing parentheses.
874 This can be handled like an assertion. Prior to 8.13, all atomic groups
875 were handled this way. In 8.13, the code was changed as below for ONCE, so
876 that backups pass through the group and thereby reset captured values.
877 However, this uses a lot more stack, so in 8.20, atomic groups that do not
878 contain any captures generate OP_ONCE_NC, which can be handled in the old,
879 less stack intensive way.
880
881 Check the alternative branches in turn - the matching won't pass the KET
882 for this kind of subpattern. If any one branch matches, we carry on as at
883 the end of a normal bracket, leaving the subject pointer, but resetting
884 the start-of-match value in case it was changed by \K. */
885
886 case OP_ONCE_NC:
887 prev = ecode;
888 saved_eptr = eptr;
889 save_mark = md->mark;
890 do
891 {
892 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
893 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
894 {
895 mstart = md->start_match_ptr;
896 break;
897 }
898 if (rrc == MATCH_THEN)
899 {
900 next = ecode + GET(ecode,1);
901 if (md->start_match_ptr < next &&
902 (*ecode == OP_ALT || *next == OP_ALT))
903 rrc = MATCH_NOMATCH;
904 }
905
906 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
907 ecode += GET(ecode,1);
908 md->mark = save_mark;
909 }
910 while (*ecode == OP_ALT);
911
912 /* If hit the end of the group (which could be repeated), fail */
913
914 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
915
916 /* Continue as from after the group, updating the offsets high water
917 mark, since extracts may have been taken. */
918
919 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
920
921 offset_top = md->end_offset_top;
922 eptr = md->end_match_ptr;
923
924 /* For a non-repeating ket, just continue at this level. This also
925 happens for a repeating ket if no characters were matched in the group.
926 This is the forcible breaking of infinite loops as implemented in Perl
927 5.005. */
928
929 if (*ecode == OP_KET || eptr == saved_eptr)
930 {
931 ecode += 1+LINK_SIZE;
932 break;
933 }
934
935 /* The repeating kets try the rest of the pattern or restart from the
936 preceding bracket, in the appropriate order. The second "call" of match()
937 uses tail recursion, to avoid using another stack frame. */
938
939 if (*ecode == OP_KETRMIN)
940 {
941 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
943 ecode = prev;
944 goto TAIL_RECURSE;
945 }
946 else /* OP_KETRMAX */
947 {
948 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
950 ecode += 1 + LINK_SIZE;
951 goto TAIL_RECURSE;
952 }
953 /* Control never gets here */
954
955 /* Handle a capturing bracket, other than those that are possessive with an
956 unlimited repeat. If there is space in the offset vector, save the current
957 subject position in the working slot at the top of the vector. We mustn't
958 change the current values of the data slot, because they may be set from a
959 previous iteration of this group, and be referred to by a reference inside
960 the group. A failure to match might occur after the group has succeeded,
961 if something later on doesn't match. For this reason, we need to restore
962 the working value and also the values of the final offsets, in case they
963 were set by a previous iteration of the same bracket.
964
965 If there isn't enough space in the offset vector, treat this as if it were
966 a non-capturing bracket. Don't worry about setting the flag for the error
967 case here; that is handled in the code for KET. */
968
969 case OP_CBRA:
970 case OP_SCBRA:
971 number = GET2(ecode, 1+LINK_SIZE);
972 offset = number << 1;
973
974 #ifdef PCRE_DEBUG
975 printf("start bracket %d\n", number);
976 printf("subject=");
977 pchars(eptr, 16, TRUE, md);
978 printf("\n");
979 #endif
980
981 if (offset < md->offset_max)
982 {
983 save_offset1 = md->offset_vector[offset];
984 save_offset2 = md->offset_vector[offset+1];
985 save_offset3 = md->offset_vector[md->offset_end - number];
986 save_capture_last = md->capture_last;
987 save_mark = md->mark;
988
989 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
990 md->offset_vector[md->offset_end - number] =
991 (int)(eptr - md->start_subject);
992
993 for (;;)
994 {
995 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
996 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
997 eptrb, RM1);
998 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
999
1000 /* If we backed up to a THEN, check whether it is within the current
1001 branch by comparing the address of the THEN that is passed back with
1002 the end of the branch. If it is within the current branch, and the
1003 branch is one of two or more alternatives (it either starts or ends
1004 with OP_ALT), we have reached the limit of THEN's action, so convert
1005 the return code to NOMATCH, which will cause normal backtracking to
1006 happen from now on. Otherwise, THEN is passed back to an outer
1007 alternative. This implements Perl's treatment of parenthesized groups,
1008 where a group not containing | does not affect the current alternative,
1009 that is, (X) is NOT the same as (X|(*F)). */
1010
1011 if (rrc == MATCH_THEN)
1012 {
1013 next = ecode + GET(ecode,1);
1014 if (md->start_match_ptr < next &&
1015 (*ecode == OP_ALT || *next == OP_ALT))
1016 rrc = MATCH_NOMATCH;
1017 }
1018
1019 /* Anything other than NOMATCH is passed back. */
1020
1021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1022 md->capture_last = save_capture_last;
1023 ecode += GET(ecode, 1);
1024 md->mark = save_mark;
1025 if (*ecode != OP_ALT) break;
1026 }
1027
1028 DPRINTF(("bracket %d failed\n", number));
1029 md->offset_vector[offset] = save_offset1;
1030 md->offset_vector[offset+1] = save_offset2;
1031 md->offset_vector[md->offset_end - number] = save_offset3;
1032
1033 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1034
1035 RRETURN(rrc);
1036 }
1037
1038 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1039 as a non-capturing bracket. */
1040
1041 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1042 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1043
1044 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1045
1046 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1047 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1048
1049 /* Non-capturing or atomic group, except for possessive with unlimited
1050 repeat and ONCE group with no captures. Loop for all the alternatives.
1051
1052 When we get to the final alternative within the brackets, we used to return
1053 the result of a recursive call to match() whatever happened so it was
1054 possible to reduce stack usage by turning this into a tail recursion,
1055 except in the case of a possibly empty group. However, now that there is
1056 the possibility of (*THEN) occurring in the final alternative, this
1057 optimization is no longer always possible.
1058
1059 We can optimize if we know there are no (*THEN)s in the pattern; at present
1060 this is the best that can be done.
1061
1062 MATCH_ONCE is returned when the end of an atomic group is successfully
1063 reached, but subsequent matching fails. It passes back up the tree (causing
1064 captured values to be reset) until the original atomic group level is
1065 reached. This is tested by comparing md->once_target with the start of the
1066 group. At this point, the return is converted into MATCH_NOMATCH so that
1067 previous backup points can be taken. */
1068
1069 case OP_ONCE:
1070 case OP_BRA:
1071 case OP_SBRA:
1072 DPRINTF(("start non-capturing bracket\n"));
1073
1074 for (;;)
1075 {
1076 if (op >= OP_SBRA || op == OP_ONCE)
1077 md->match_function_type = MATCH_CBEGROUP;
1078
1079 /* If this is not a possibly empty group, and there are no (*THEN)s in
1080 the pattern, and this is the final alternative, optimize as described
1081 above. */
1082
1083 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1084 {
1085 ecode += PRIV(OP_lengths)[*ecode];
1086 goto TAIL_RECURSE;
1087 }
1088
1089 /* In all other cases, we have to make another call to match(). */
1090
1091 save_mark = md->mark;
1092 save_capture_last = md->capture_last;
1093 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1094 RM2);
1095
1096 /* See comment in the code for capturing groups above about handling
1097 THEN. */
1098
1099 if (rrc == MATCH_THEN)
1100 {
1101 next = ecode + GET(ecode,1);
1102 if (md->start_match_ptr < next &&
1103 (*ecode == OP_ALT || *next == OP_ALT))
1104 rrc = MATCH_NOMATCH;
1105 }
1106
1107 if (rrc != MATCH_NOMATCH)
1108 {
1109 if (rrc == MATCH_ONCE)
1110 {
1111 const pcre_uchar *scode = ecode;
1112 if (*scode != OP_ONCE) /* If not at start, find it */
1113 {
1114 while (*scode == OP_ALT) scode += GET(scode, 1);
1115 scode -= GET(scode, 1);
1116 }
1117 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1118 }
1119 RRETURN(rrc);
1120 }
1121 ecode += GET(ecode, 1);
1122 md->mark = save_mark;
1123 if (*ecode != OP_ALT) break;
1124 md->capture_last = save_capture_last;
1125 }
1126
1127 RRETURN(MATCH_NOMATCH);
1128
1129 /* Handle possessive capturing brackets with an unlimited repeat. We come
1130 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1131 handled similarly to the normal case above. However, the matching is
1132 different. The end of these brackets will always be OP_KETRPOS, which
1133 returns MATCH_KETRPOS without going further in the pattern. By this means
1134 we can handle the group by iteration rather than recursion, thereby
1135 reducing the amount of stack needed. */
1136
1137 case OP_CBRAPOS:
1138 case OP_SCBRAPOS:
1139 allow_zero = FALSE;
1140
1141 POSSESSIVE_CAPTURE:
1142 number = GET2(ecode, 1+LINK_SIZE);
1143 offset = number << 1;
1144
1145 #ifdef PCRE_DEBUG
1146 printf("start possessive bracket %d\n", number);
1147 printf("subject=");
1148 pchars(eptr, 16, TRUE, md);
1149 printf("\n");
1150 #endif
1151
1152 if (offset < md->offset_max)
1153 {
1154 matched_once = FALSE;
1155 code_offset = (int)(ecode - md->start_code);
1156
1157 save_offset1 = md->offset_vector[offset];
1158 save_offset2 = md->offset_vector[offset+1];
1159 save_offset3 = md->offset_vector[md->offset_end - number];
1160 save_capture_last = md->capture_last;
1161
1162 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1163
1164 /* Each time round the loop, save the current subject position for use
1165 when the group matches. For MATCH_MATCH, the group has matched, so we
1166 restart it with a new subject starting position, remembering that we had
1167 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1168 usual. If we haven't matched any alternatives in any iteration, check to
1169 see if a previous iteration matched. If so, the group has matched;
1170 continue from afterwards. Otherwise it has failed; restore the previous
1171 capture values before returning NOMATCH. */
1172
1173 for (;;)
1174 {
1175 md->offset_vector[md->offset_end - number] =
1176 (int)(eptr - md->start_subject);
1177 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1178 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1179 eptrb, RM63);
1180 if (rrc == MATCH_KETRPOS)
1181 {
1182 offset_top = md->end_offset_top;
1183 eptr = md->end_match_ptr;
1184 ecode = md->start_code + code_offset;
1185 save_capture_last = md->capture_last;
1186 matched_once = TRUE;
1187 continue;
1188 }
1189
1190 /* See comment in the code for capturing groups above about handling
1191 THEN. */
1192
1193 if (rrc == MATCH_THEN)
1194 {
1195 next = ecode + GET(ecode,1);
1196 if (md->start_match_ptr < next &&
1197 (*ecode == OP_ALT || *next == OP_ALT))
1198 rrc = MATCH_NOMATCH;
1199 }
1200
1201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1202 md->capture_last = save_capture_last;
1203 ecode += GET(ecode, 1);
1204 if (*ecode != OP_ALT) break;
1205 }
1206
1207 if (!matched_once)
1208 {
1209 md->offset_vector[offset] = save_offset1;
1210 md->offset_vector[offset+1] = save_offset2;
1211 md->offset_vector[md->offset_end - number] = save_offset3;
1212 }
1213
1214 if (allow_zero || matched_once)
1215 {
1216 ecode += 1 + LINK_SIZE;
1217 break;
1218 }
1219
1220 RRETURN(MATCH_NOMATCH);
1221 }
1222
1223 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1224 as a non-capturing bracket. */
1225
1226 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1227 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1228
1229 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1230
1231 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1232 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1233
1234 /* Non-capturing possessive bracket with unlimited repeat. We come here
1235 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1236 without the capturing complication. It is written out separately for speed
1237 and cleanliness. */
1238
1239 case OP_BRAPOS:
1240 case OP_SBRAPOS:
1241 allow_zero = FALSE;
1242
1243 POSSESSIVE_NON_CAPTURE:
1244 matched_once = FALSE;
1245 code_offset = (int)(ecode - md->start_code);
1246 save_capture_last = md->capture_last;
1247
1248 for (;;)
1249 {
1250 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1251 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1252 eptrb, RM48);
1253 if (rrc == MATCH_KETRPOS)
1254 {
1255 offset_top = md->end_offset_top;
1256 eptr = md->end_match_ptr;
1257 ecode = md->start_code + code_offset;
1258 matched_once = TRUE;
1259 continue;
1260 }
1261
1262 /* See comment in the code for capturing groups above about handling
1263 THEN. */
1264
1265 if (rrc == MATCH_THEN)
1266 {
1267 next = ecode + GET(ecode,1);
1268 if (md->start_match_ptr < next &&
1269 (*ecode == OP_ALT || *next == OP_ALT))
1270 rrc = MATCH_NOMATCH;
1271 }
1272
1273 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1274 ecode += GET(ecode, 1);
1275 if (*ecode != OP_ALT) break;
1276 md->capture_last = save_capture_last;
1277 }
1278
1279 if (matched_once || allow_zero)
1280 {
1281 ecode += 1 + LINK_SIZE;
1282 break;
1283 }
1284 RRETURN(MATCH_NOMATCH);
1285
1286 /* Control never reaches here. */
1287
1288 /* Conditional group: compilation checked that there are no more than
1289 two branches. If the condition is false, skipping the first branch takes us
1290 past the end if there is only one branch, but that's OK because that is
1291 exactly what going to the ket would do. */
1292
1293 case OP_COND:
1294 case OP_SCOND:
1295 codelink = GET(ecode, 1);
1296
1297 /* Because of the way auto-callout works during compile, a callout item is
1298 inserted between OP_COND and an assertion condition. */
1299
1300 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1301 {
1302 if (PUBL(callout) != NULL)
1303 {
1304 PUBL(callout_block) cb;
1305 cb.version = 2; /* Version 1 of the callout block */
1306 cb.callout_number = ecode[LINK_SIZE+2];
1307 cb.offset_vector = md->offset_vector;
1308 #if defined COMPILE_PCRE8
1309 cb.subject = (PCRE_SPTR)md->start_subject;
1310 #elif defined COMPILE_PCRE16
1311 cb.subject = (PCRE_SPTR16)md->start_subject;
1312 #elif defined COMPILE_PCRE32
1313 cb.subject = (PCRE_SPTR32)md->start_subject;
1314 #endif
1315 cb.subject_length = (int)(md->end_subject - md->start_subject);
1316 cb.start_match = (int)(mstart - md->start_subject);
1317 cb.current_position = (int)(eptr - md->start_subject);
1318 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1319 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1320 cb.capture_top = offset_top/2;
1321 cb.capture_last = md->capture_last & CAPLMASK;
1322 /* Internal change requires this for API compatibility. */
1323 if (cb.capture_last == 0) cb.capture_last = -1;
1324 cb.callout_data = md->callout_data;
1325 cb.mark = md->nomatch_mark;
1326 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1327 if (rrc < 0) RRETURN(rrc);
1328 }
1329 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1330 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1331 }
1332
1333 condcode = ecode[LINK_SIZE+1];
1334
1335 /* Now see what the actual condition is */
1336
1337 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1338 {
1339 if (md->recursive == NULL) /* Not recursing => FALSE */
1340 {
1341 condition = FALSE;
1342 ecode += GET(ecode, 1);
1343 }
1344 else
1345 {
1346 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1347 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1348
1349 /* If the test is for recursion into a specific subpattern, and it is
1350 false, but the test was set up by name, scan the table to see if the
1351 name refers to any other numbers, and test them. The condition is true
1352 if any one is set. */
1353
1354 if (!condition && condcode == OP_NRREF)
1355 {
1356 pcre_uchar *slotA = md->name_table;
1357 for (i = 0; i < md->name_count; i++)
1358 {
1359 if (GET2(slotA, 0) == recno) break;
1360 slotA += md->name_entry_size;
1361 }
1362
1363 /* Found a name for the number - there can be only one; duplicate
1364 names for different numbers are allowed, but not vice versa. First
1365 scan down for duplicates. */
1366
1367 if (i < md->name_count)
1368 {
1369 pcre_uchar *slotB = slotA;
1370 while (slotB > md->name_table)
1371 {
1372 slotB -= md->name_entry_size;
1373 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1374 {
1375 condition = GET2(slotB, 0) == md->recursive->group_num;
1376 if (condition) break;
1377 }
1378 else break;
1379 }
1380
1381 /* Scan up for duplicates */
1382
1383 if (!condition)
1384 {
1385 slotB = slotA;
1386 for (i++; i < md->name_count; i++)
1387 {
1388 slotB += md->name_entry_size;
1389 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1390 {
1391 condition = GET2(slotB, 0) == md->recursive->group_num;
1392 if (condition) break;
1393 }
1394 else break;
1395 }
1396 }
1397 }
1398 }
1399
1400 /* Choose branch according to the condition */
1401
1402 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1403 }
1404 }
1405
1406 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1407 {
1408 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1409 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1410
1411 /* If the numbered capture is unset, but the reference was by name,
1412 scan the table to see if the name refers to any other numbers, and test
1413 them. The condition is true if any one is set. This is tediously similar
1414 to the code above, but not close enough to try to amalgamate. */
1415
1416 if (!condition && condcode == OP_NCREF)
1417 {
1418 unsigned int refno = offset >> 1;
1419 pcre_uchar *slotA = md->name_table;
1420
1421 for (i = 0; i < md->name_count; i++)
1422 {
1423 if (GET2(slotA, 0) == refno) break;
1424 slotA += md->name_entry_size;
1425 }
1426
1427 /* Found a name for the number - there can be only one; duplicate names
1428 for different numbers are allowed, but not vice versa. First scan down
1429 for duplicates. */
1430
1431 if (i < md->name_count)
1432 {
1433 pcre_uchar *slotB = slotA;
1434 while (slotB > md->name_table)
1435 {
1436 slotB -= md->name_entry_size;
1437 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1438 {
1439 offset = GET2(slotB, 0) << 1;
1440 condition = offset < offset_top &&
1441 md->offset_vector[offset] >= 0;
1442 if (condition) break;
1443 }
1444 else break;
1445 }
1446
1447 /* Scan up for duplicates */
1448
1449 if (!condition)
1450 {
1451 slotB = slotA;
1452 for (i++; i < md->name_count; i++)
1453 {
1454 slotB += md->name_entry_size;
1455 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1456 {
1457 offset = GET2(slotB, 0) << 1;
1458 condition = offset < offset_top &&
1459 md->offset_vector[offset] >= 0;
1460 if (condition) break;
1461 }
1462 else break;
1463 }
1464 }
1465 }
1466 }
1467
1468 /* Choose branch according to the condition */
1469
1470 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1471 }
1472
1473 else if (condcode == OP_DEF) /* DEFINE - always false */
1474 {
1475 condition = FALSE;
1476 ecode += GET(ecode, 1);
1477 }
1478
1479 /* The condition is an assertion. Call match() to evaluate it - setting
1480 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1481 an assertion. */
1482
1483 else
1484 {
1485 md->match_function_type = MATCH_CONDASSERT;
1486 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1487 if (rrc == MATCH_MATCH)
1488 {
1489 if (md->end_offset_top > offset_top)
1490 offset_top = md->end_offset_top; /* Captures may have happened */
1491 condition = TRUE;
1492 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1493 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1494 }
1495
1496 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1497 assertion; it is therefore treated as NOMATCH. */
1498
1499 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1500 {
1501 RRETURN(rrc); /* Need braces because of following else */
1502 }
1503 else
1504 {
1505 condition = FALSE;
1506 ecode += codelink;
1507 }
1508 }
1509
1510 /* We are now at the branch that is to be obeyed. As there is only one, can
1511 use tail recursion to avoid using another stack frame, except when there is
1512 unlimited repeat of a possibly empty group. In the latter case, a recursive
1513 call to match() is always required, unless the second alternative doesn't
1514 exist, in which case we can just plough on. Note that, for compatibility
1515 with Perl, the | in a conditional group is NOT treated as creating two
1516 alternatives. If a THEN is encountered in the branch, it propagates out to
1517 the enclosing alternative (unless nested in a deeper set of alternatives,
1518 of course). */
1519
1520 if (condition || *ecode == OP_ALT)
1521 {
1522 if (op != OP_SCOND)
1523 {
1524 ecode += 1 + LINK_SIZE;
1525 goto TAIL_RECURSE;
1526 }
1527
1528 md->match_function_type = MATCH_CBEGROUP;
1529 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1530 RRETURN(rrc);
1531 }
1532
1533 /* Condition false & no alternative; continue after the group. */
1534
1535 else
1536 {
1537 ecode += 1 + LINK_SIZE;
1538 }
1539 break;
1540
1541
1542 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1543 to close any currently open capturing brackets. */
1544
1545 case OP_CLOSE:
1546 number = GET2(ecode, 1); /* Must be less than 65536 */
1547 offset = number << 1;
1548
1549 #ifdef PCRE_DEBUG
1550 printf("end bracket %d at *ACCEPT", number);
1551 printf("\n");
1552 #endif
1553
1554 md->capture_last = (md->capture_last & OVFLMASK) | number;
1555 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1556 {
1557 md->offset_vector[offset] =
1558 md->offset_vector[md->offset_end - number];
1559 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1560 if (offset_top <= offset) offset_top = offset + 2;
1561 }
1562 ecode += 1 + IMM2_SIZE;
1563 break;
1564
1565
1566 /* End of the pattern, either real or forced. */
1567
1568 case OP_END:
1569 case OP_ACCEPT:
1570 case OP_ASSERT_ACCEPT:
1571
1572 /* If we have matched an empty string, fail if not in an assertion and not
1573 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1574 is set and we have matched at the start of the subject. In both cases,
1575 backtracking will then try other alternatives, if any. */
1576
1577 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1578 md->recursive == NULL &&
1579 (md->notempty ||
1580 (md->notempty_atstart &&
1581 mstart == md->start_subject + md->start_offset)))
1582 RRETURN(MATCH_NOMATCH);
1583
1584 /* Otherwise, we have a match. */
1585
1586 md->end_match_ptr = eptr; /* Record where we ended */
1587 md->end_offset_top = offset_top; /* and how many extracts were taken */
1588 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1589
1590 /* For some reason, the macros don't work properly if an expression is
1591 given as the argument to RRETURN when the heap is in use. */
1592
1593 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1594 RRETURN(rrc);
1595
1596 /* Assertion brackets. Check the alternative branches in turn - the
1597 matching won't pass the KET for an assertion. If any one branch matches,
1598 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1599 start of each branch to move the current point backwards, so the code at
1600 this level is identical to the lookahead case. When the assertion is part
1601 of a condition, we want to return immediately afterwards. The caller of
1602 this incarnation of the match() function will have set MATCH_CONDASSERT in
1603 md->match_function_type, and one of these opcodes will be the first opcode
1604 that is processed. We use a local variable that is preserved over calls to
1605 match() to remember this case. */
1606
1607 case OP_ASSERT:
1608 case OP_ASSERTBACK:
1609 save_mark = md->mark;
1610 if (md->match_function_type == MATCH_CONDASSERT)
1611 {
1612 condassert = TRUE;
1613 md->match_function_type = 0;
1614 }
1615 else condassert = FALSE;
1616
1617 do
1618 {
1619 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1620 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1621 {
1622 mstart = md->start_match_ptr; /* In case \K reset it */
1623 break;
1624 }
1625 md->mark = save_mark;
1626
1627 /* A COMMIT failure must fail the entire assertion, without trying any
1628 subsequent branches. */
1629
1630 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1631
1632 /* PCRE does not allow THEN to escape beyond an assertion; it
1633 is treated as NOMATCH. */
1634
1635 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1636 ecode += GET(ecode, 1);
1637 }
1638 while (*ecode == OP_ALT);
1639
1640 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1641
1642 /* If checking an assertion for a condition, return MATCH_MATCH. */
1643
1644 if (condassert) RRETURN(MATCH_MATCH);
1645
1646 /* Continue from after the assertion, updating the offsets high water
1647 mark, since extracts may have been taken during the assertion. */
1648
1649 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1650 ecode += 1 + LINK_SIZE;
1651 offset_top = md->end_offset_top;
1652 continue;
1653
1654 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1655 PRUNE, or COMMIT means we must assume failure without checking subsequent
1656 branches. */
1657
1658 case OP_ASSERT_NOT:
1659 case OP_ASSERTBACK_NOT:
1660 save_mark = md->mark;
1661 if (md->match_function_type == MATCH_CONDASSERT)
1662 {
1663 condassert = TRUE;
1664 md->match_function_type = 0;
1665 }
1666 else condassert = FALSE;
1667
1668 do
1669 {
1670 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1671 md->mark = save_mark;
1672 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1673 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1674 {
1675 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1676 break;
1677 }
1678
1679 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1680 as NOMATCH. */
1681
1682 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1683 ecode += GET(ecode,1);
1684 }
1685 while (*ecode == OP_ALT);
1686
1687 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1688
1689 ecode += 1 + LINK_SIZE;
1690 continue;
1691
1692 /* Move the subject pointer back. This occurs only at the start of
1693 each branch of a lookbehind assertion. If we are too close to the start to
1694 move back, this match function fails. When working with UTF-8 we move
1695 back a number of characters, not bytes. */
1696
1697 case OP_REVERSE:
1698 #ifdef SUPPORT_UTF
1699 if (utf)
1700 {
1701 i = GET(ecode, 1);
1702 while (i-- > 0)
1703 {
1704 eptr--;
1705 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1706 BACKCHAR(eptr);
1707 }
1708 }
1709 else
1710 #endif
1711
1712 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1713
1714 {
1715 eptr -= GET(ecode, 1);
1716 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1717 }
1718
1719 /* Save the earliest consulted character, then skip to next op code */
1720
1721 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1722 ecode += 1 + LINK_SIZE;
1723 break;
1724
1725 /* The callout item calls an external function, if one is provided, passing
1726 details of the match so far. This is mainly for debugging, though the
1727 function is able to force a failure. */
1728
1729 case OP_CALLOUT:
1730 if (PUBL(callout) != NULL)
1731 {
1732 PUBL(callout_block) cb;
1733 cb.version = 2; /* Version 1 of the callout block */
1734 cb.callout_number = ecode[1];
1735 cb.offset_vector = md->offset_vector;
1736 #if defined COMPILE_PCRE8
1737 cb.subject = (PCRE_SPTR)md->start_subject;
1738 #elif defined COMPILE_PCRE16
1739 cb.subject = (PCRE_SPTR16)md->start_subject;
1740 #elif defined COMPILE_PCRE32
1741 cb.subject = (PCRE_SPTR32)md->start_subject;
1742 #endif
1743 cb.subject_length = (int)(md->end_subject - md->start_subject);
1744 cb.start_match = (int)(mstart - md->start_subject);
1745 cb.current_position = (int)(eptr - md->start_subject);
1746 cb.pattern_position = GET(ecode, 2);
1747 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1748 cb.capture_top = offset_top/2;
1749 cb.capture_last = md->capture_last & CAPLMASK;
1750 /* Internal change requires this for API compatibility. */
1751 if (cb.capture_last == 0) cb.capture_last = -1;
1752 cb.callout_data = md->callout_data;
1753 cb.mark = md->nomatch_mark;
1754 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1755 if (rrc < 0) RRETURN(rrc);
1756 }
1757 ecode += 2 + 2*LINK_SIZE;
1758 break;
1759
1760 /* Recursion either matches the current regex, or some subexpression. The
1761 offset data is the offset to the starting bracket from the start of the
1762 whole pattern. (This is so that it works from duplicated subpatterns.)
1763
1764 The state of the capturing groups is preserved over recursion, and
1765 re-instated afterwards. We don't know how many are started and not yet
1766 finished (offset_top records the completed total) so we just have to save
1767 all the potential data. There may be up to 65535 such values, which is too
1768 large to put on the stack, but using malloc for small numbers seems
1769 expensive. As a compromise, the stack is used when there are no more than
1770 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1771
1772 There are also other values that have to be saved. We use a chained
1773 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1774 for the original version of this logic. It has, however, been hacked around
1775 a lot, so he is not to blame for the current way it works. */
1776
1777 case OP_RECURSE:
1778 {
1779 recursion_info *ri;
1780 unsigned int recno;
1781
1782 callpat = md->start_code + GET(ecode, 1);
1783 recno = (callpat == md->start_code)? 0 :
1784 GET2(callpat, 1 + LINK_SIZE);
1785
1786 /* Check for repeating a recursion without advancing the subject pointer.
1787 This should catch convoluted mutual recursions. (Some simple cases are
1788 caught at compile time.) */
1789
1790 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1791 if (recno == ri->group_num && eptr == ri->subject_position)
1792 RRETURN(PCRE_ERROR_RECURSELOOP);
1793
1794 /* Add to "recursing stack" */
1795
1796 new_recursive.group_num = recno;
1797 new_recursive.saved_capture_last = md->capture_last;
1798 new_recursive.subject_position = eptr;
1799 new_recursive.prevrec = md->recursive;
1800 md->recursive = &new_recursive;
1801
1802 /* Where to continue from afterwards */
1803
1804 ecode += 1 + LINK_SIZE;
1805
1806 /* Now save the offset data */
1807
1808 new_recursive.saved_max = md->offset_end;
1809 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1810 new_recursive.offset_save = stacksave;
1811 else
1812 {
1813 new_recursive.offset_save =
1814 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1815 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1816 }
1817 memcpy(new_recursive.offset_save, md->offset_vector,
1818 new_recursive.saved_max * sizeof(int));
1819
1820 /* OK, now we can do the recursion. After processing each alternative,
1821 restore the offset data and the last captured value. If there were nested
1822 recursions, md->recursive might be changed, so reset it before looping.
1823 */
1824
1825 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1826 cbegroup = (*callpat >= OP_SBRA);
1827 do
1828 {
1829 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1830 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1831 md, eptrb, RM6);
1832 memcpy(md->offset_vector, new_recursive.offset_save,
1833 new_recursive.saved_max * sizeof(int));
1834 md->capture_last = new_recursive.saved_capture_last;
1835 md->recursive = new_recursive.prevrec;
1836 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1837 {
1838 DPRINTF(("Recursion matched\n"));
1839 if (new_recursive.offset_save != stacksave)
1840 (PUBL(free))(new_recursive.offset_save);
1841
1842 /* Set where we got to in the subject, and reset the start in case
1843 it was changed by \K. This *is* propagated back out of a recursion,
1844 for Perl compatibility. */
1845
1846 eptr = md->end_match_ptr;
1847 mstart = md->start_match_ptr;
1848 goto RECURSION_MATCHED; /* Exit loop; end processing */
1849 }
1850
1851 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1852 recursion; they are treated as NOMATCH. These codes are defined in a
1853 range that can be tested for. Any other return code is an error. */
1854
1855 else if (rrc != MATCH_NOMATCH &&
1856 (rrc < MATCH_BACKTRACK_MIN || rrc > MATCH_BACKTRACK_MAX))
1857 {
1858 DPRINTF(("Recursion gave error %d\n", rrc));
1859 if (new_recursive.offset_save != stacksave)
1860 (PUBL(free))(new_recursive.offset_save);
1861 RRETURN(rrc);
1862 }
1863
1864 md->recursive = &new_recursive;
1865 callpat += GET(callpat, 1);
1866 }
1867 while (*callpat == OP_ALT);
1868
1869 DPRINTF(("Recursion didn't match\n"));
1870 md->recursive = new_recursive.prevrec;
1871 if (new_recursive.offset_save != stacksave)
1872 (PUBL(free))(new_recursive.offset_save);
1873 RRETURN(MATCH_NOMATCH);
1874 }
1875
1876 RECURSION_MATCHED:
1877 break;
1878
1879 /* An alternation is the end of a branch; scan along to find the end of the
1880 bracketed group and go to there. */
1881
1882 case OP_ALT:
1883 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1884 break;
1885
1886 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1887 indicating that it may occur zero times. It may repeat infinitely, or not
1888 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1889 with fixed upper repeat limits are compiled as a number of copies, with the
1890 optional ones preceded by BRAZERO or BRAMINZERO. */
1891
1892 case OP_BRAZERO:
1893 next = ecode + 1;
1894 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1896 do next += GET(next, 1); while (*next == OP_ALT);
1897 ecode = next + 1 + LINK_SIZE;
1898 break;
1899
1900 case OP_BRAMINZERO:
1901 next = ecode + 1;
1902 do next += GET(next, 1); while (*next == OP_ALT);
1903 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1905 ecode++;
1906 break;
1907
1908 case OP_SKIPZERO:
1909 next = ecode+1;
1910 do next += GET(next,1); while (*next == OP_ALT);
1911 ecode = next + 1 + LINK_SIZE;
1912 break;
1913
1914 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1915 here; just jump to the group, with allow_zero set TRUE. */
1916
1917 case OP_BRAPOSZERO:
1918 op = *(++ecode);
1919 allow_zero = TRUE;
1920 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1921 goto POSSESSIVE_NON_CAPTURE;
1922
1923 /* End of a group, repeated or non-repeating. */
1924
1925 case OP_KET:
1926 case OP_KETRMIN:
1927 case OP_KETRMAX:
1928 case OP_KETRPOS:
1929 prev = ecode - GET(ecode, 1);
1930
1931 /* If this was a group that remembered the subject start, in order to break
1932 infinite repeats of empty string matches, retrieve the subject start from
1933 the chain. Otherwise, set it NULL. */
1934
1935 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1936 {
1937 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1938 eptrb = eptrb->epb_prev; /* Backup to previous group */
1939 }
1940 else saved_eptr = NULL;
1941
1942 /* If we are at the end of an assertion group or a non-capturing atomic
1943 group, stop matching and return MATCH_MATCH, but record the current high
1944 water mark for use by positive assertions. We also need to record the match
1945 start in case it was changed by \K. */
1946
1947 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1948 *prev == OP_ONCE_NC)
1949 {
1950 md->end_match_ptr = eptr; /* For ONCE_NC */
1951 md->end_offset_top = offset_top;
1952 md->start_match_ptr = mstart;
1953 RRETURN(MATCH_MATCH); /* Sets md->mark */
1954 }
1955
1956 /* For capturing groups we have to check the group number back at the start
1957 and if necessary complete handling an extraction by setting the offsets and
1958 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1959 into group 0, so it won't be picked up here. Instead, we catch it when the
1960 OP_END is reached. Other recursion is handled here. We just have to record
1961 the current subject position and start match pointer and give a MATCH
1962 return. */
1963
1964 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1965 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1966 {
1967 number = GET2(prev, 1+LINK_SIZE);
1968 offset = number << 1;
1969
1970 #ifdef PCRE_DEBUG
1971 printf("end bracket %d", number);
1972 printf("\n");
1973 #endif
1974
1975 /* Handle a recursively called group. */
1976
1977 if (md->recursive != NULL && md->recursive->group_num == number)
1978 {
1979 md->end_match_ptr = eptr;
1980 md->start_match_ptr = mstart;
1981 RRETURN(MATCH_MATCH);
1982 }
1983
1984 /* Deal with capturing */
1985
1986 md->capture_last = (md->capture_last & OVFLMASK) | number;
1987 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1988 {
1989 /* If offset is greater than offset_top, it means that we are
1990 "skipping" a capturing group, and that group's offsets must be marked
1991 unset. In earlier versions of PCRE, all the offsets were unset at the
1992 start of matching, but this doesn't work because atomic groups and
1993 assertions can cause a value to be set that should later be unset.
1994 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1995 part of the atomic group, but this is not on the final matching path,
1996 so must be unset when 2 is set. (If there is no group 2, there is no
1997 problem, because offset_top will then be 2, indicating no capture.) */
1998
1999 if (offset > offset_top)
2000 {
2001 register int *iptr = md->offset_vector + offset_top;
2002 register int *iend = md->offset_vector + offset;
2003 while (iptr < iend) *iptr++ = -1;
2004 }
2005
2006 /* Now make the extraction */
2007
2008 md->offset_vector[offset] =
2009 md->offset_vector[md->offset_end - number];
2010 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2011 if (offset_top <= offset) offset_top = offset + 2;
2012 }
2013 }
2014
2015 /* For an ordinary non-repeating ket, just continue at this level. This
2016 also happens for a repeating ket if no characters were matched in the
2017 group. This is the forcible breaking of infinite loops as implemented in
2018 Perl 5.005. For a non-repeating atomic group that includes captures,
2019 establish a backup point by processing the rest of the pattern at a lower
2020 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2021 original OP_ONCE level, thereby bypassing intermediate backup points, but
2022 resetting any captures that happened along the way. */
2023
2024 if (*ecode == OP_KET || eptr == saved_eptr)
2025 {
2026 if (*prev == OP_ONCE)
2027 {
2028 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2030 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2031 RRETURN(MATCH_ONCE);
2032 }
2033 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2034 break;
2035 }
2036
2037 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2038 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2039 at a time from the outer level, thus saving stack. */
2040
2041 if (*ecode == OP_KETRPOS)
2042 {
2043 md->end_match_ptr = eptr;
2044 md->end_offset_top = offset_top;
2045 RRETURN(MATCH_KETRPOS);
2046 }
2047
2048 /* The normal repeating kets try the rest of the pattern or restart from
2049 the preceding bracket, in the appropriate order. In the second case, we can
2050 use tail recursion to avoid using another stack frame, unless we have an
2051 atomic group or an unlimited repeat of a group that can match an empty
2052 string. */
2053
2054 if (*ecode == OP_KETRMIN)
2055 {
2056 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2058 if (*prev == OP_ONCE)
2059 {
2060 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2062 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2063 RRETURN(MATCH_ONCE);
2064 }
2065 if (*prev >= OP_SBRA) /* Could match an empty string */
2066 {
2067 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2068 RRETURN(rrc);
2069 }
2070 ecode = prev;
2071 goto TAIL_RECURSE;
2072 }
2073 else /* OP_KETRMAX */
2074 {
2075 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2076 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2077 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2078 if (*prev == OP_ONCE)
2079 {
2080 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2081 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2082 md->once_target = prev;
2083 RRETURN(MATCH_ONCE);
2084 }
2085 ecode += 1 + LINK_SIZE;
2086 goto TAIL_RECURSE;
2087 }
2088 /* Control never gets here */
2089
2090 /* Not multiline mode: start of subject assertion, unless notbol. */
2091
2092 case OP_CIRC:
2093 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2094
2095 /* Start of subject assertion */
2096
2097 case OP_SOD:
2098 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2099 ecode++;
2100 break;
2101
2102 /* Multiline mode: start of subject unless notbol, or after any newline. */
2103
2104 case OP_CIRCM:
2105 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2106 if (eptr != md->start_subject &&
2107 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2108 RRETURN(MATCH_NOMATCH);
2109 ecode++;
2110 break;
2111
2112 /* Start of match assertion */
2113
2114 case OP_SOM:
2115 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2116 ecode++;
2117 break;
2118
2119 /* Reset the start of match point */
2120
2121 case OP_SET_SOM:
2122 mstart = eptr;
2123 ecode++;
2124 break;
2125
2126 /* Multiline mode: assert before any newline, or before end of subject
2127 unless noteol is set. */
2128
2129 case OP_DOLLM:
2130 if (eptr < md->end_subject)
2131 {
2132 if (!IS_NEWLINE(eptr))
2133 {
2134 if (md->partial != 0 &&
2135 eptr + 1 >= md->end_subject &&
2136 NLBLOCK->nltype == NLTYPE_FIXED &&
2137 NLBLOCK->nllen == 2 &&
2138 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2139 {
2140 md->hitend = TRUE;
2141 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2142 }
2143 RRETURN(MATCH_NOMATCH);
2144 }
2145 }
2146 else
2147 {
2148 if (md->noteol) RRETURN(MATCH_NOMATCH);
2149 SCHECK_PARTIAL();
2150 }
2151 ecode++;
2152 break;
2153
2154 /* Not multiline mode: assert before a terminating newline or before end of
2155 subject unless noteol is set. */
2156
2157 case OP_DOLL:
2158 if (md->noteol) RRETURN(MATCH_NOMATCH);
2159 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2160
2161 /* ... else fall through for endonly */
2162
2163 /* End of subject assertion (\z) */
2164
2165 case OP_EOD:
2166 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2167 SCHECK_PARTIAL();
2168 ecode++;
2169 break;
2170
2171 /* End of subject or ending \n assertion (\Z) */
2172
2173 case OP_EODN:
2174 ASSERT_NL_OR_EOS:
2175 if (eptr < md->end_subject &&
2176 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2177 {
2178 if (md->partial != 0 &&
2179 eptr + 1 >= md->end_subject &&
2180 NLBLOCK->nltype == NLTYPE_FIXED &&
2181 NLBLOCK->nllen == 2 &&
2182 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2183 {
2184 md->hitend = TRUE;
2185 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2186 }
2187 RRETURN(MATCH_NOMATCH);
2188 }
2189
2190 /* Either at end of string or \n before end. */
2191
2192 SCHECK_PARTIAL();
2193 ecode++;
2194 break;
2195
2196 /* Word boundary assertions */
2197
2198 case OP_NOT_WORD_BOUNDARY:
2199 case OP_WORD_BOUNDARY:
2200 {
2201
2202 /* Find out if the previous and current characters are "word" characters.
2203 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2204 be "non-word" characters. Remember the earliest consulted character for
2205 partial matching. */
2206
2207 #ifdef SUPPORT_UTF
2208 if (utf)
2209 {
2210 /* Get status of previous character */
2211
2212 if (eptr == md->start_subject) prev_is_word = FALSE; else
2213 {
2214 PCRE_PUCHAR lastptr = eptr - 1;
2215 BACKCHAR(lastptr);
2216 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2217 GETCHAR(c, lastptr);
2218 #ifdef SUPPORT_UCP
2219 if (md->use_ucp)
2220 {
2221 if (c == '_') prev_is_word = TRUE; else
2222 {
2223 int cat = UCD_CATEGORY(c);
2224 prev_is_word = (cat == ucp_L || cat == ucp_N);
2225 }
2226 }
2227 else
2228 #endif
2229 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2230 }
2231
2232 /* Get status of next character */
2233
2234 if (eptr >= md->end_subject)
2235 {
2236 SCHECK_PARTIAL();
2237 cur_is_word = FALSE;
2238 }
2239 else
2240 {
2241 GETCHAR(c, eptr);
2242 #ifdef SUPPORT_UCP
2243 if (md->use_ucp)
2244 {
2245 if (c == '_') cur_is_word = TRUE; else
2246 {
2247 int cat = UCD_CATEGORY(c);
2248 cur_is_word = (cat == ucp_L || cat == ucp_N);
2249 }
2250 }
2251 else
2252 #endif
2253 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2254 }
2255 }
2256 else
2257 #endif
2258
2259 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2260 consistency with the behaviour of \w we do use it in this case. */
2261
2262 {
2263 /* Get status of previous character */
2264
2265 if (eptr == md->start_subject) prev_is_word = FALSE; else
2266 {
2267 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2268 #ifdef SUPPORT_UCP
2269 if (md->use_ucp)
2270 {
2271 c = eptr[-1];
2272 if (c == '_') prev_is_word = TRUE; else
2273 {
2274 int cat = UCD_CATEGORY(c);
2275 prev_is_word = (cat == ucp_L || cat == ucp_N);
2276 }
2277 }
2278 else
2279 #endif
2280 prev_is_word = MAX_255(eptr[-1])
2281 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2282 }
2283
2284 /* Get status of next character */
2285
2286 if (eptr >= md->end_subject)
2287 {
2288 SCHECK_PARTIAL();
2289 cur_is_word = FALSE;
2290 }
2291 else
2292 #ifdef SUPPORT_UCP
2293 if (md->use_ucp)
2294 {
2295 c = *eptr;
2296 if (c == '_') cur_is_word = TRUE; else
2297 {
2298 int cat = UCD_CATEGORY(c);
2299 cur_is_word = (cat == ucp_L || cat == ucp_N);
2300 }
2301 }
2302 else
2303 #endif
2304 cur_is_word = MAX_255(*eptr)
2305 && ((md->ctypes[*eptr] & ctype_word) != 0);
2306 }
2307
2308 /* Now see if the situation is what we want */
2309
2310 if ((*ecode++ == OP_WORD_BOUNDARY)?
2311 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2312 RRETURN(MATCH_NOMATCH);
2313 }
2314 break;
2315
2316 /* Match any single character type except newline; have to take care with
2317 CRLF newlines and partial matching. */
2318
2319 case OP_ANY:
2320 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2321 if (md->partial != 0 &&
2322 eptr + 1 >= md->end_subject &&
2323 NLBLOCK->nltype == NLTYPE_FIXED &&
2324 NLBLOCK->nllen == 2 &&
2325 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2326 {
2327 md->hitend = TRUE;
2328 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2329 }
2330
2331 /* Fall through */
2332
2333 /* Match any single character whatsoever. */
2334
2335 case OP_ALLANY:
2336 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2337 { /* not be updated before SCHECK_PARTIAL. */
2338 SCHECK_PARTIAL();
2339 RRETURN(MATCH_NOMATCH);
2340 }
2341 eptr++;
2342 #ifdef SUPPORT_UTF
2343 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2344 #endif
2345 ecode++;
2346 break;
2347
2348 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2349 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2350
2351 case OP_ANYBYTE:
2352 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2353 { /* not be updated before SCHECK_PARTIAL. */
2354 SCHECK_PARTIAL();
2355 RRETURN(MATCH_NOMATCH);
2356 }
2357 eptr++;
2358 ecode++;
2359 break;
2360
2361 case OP_NOT_DIGIT:
2362 if (eptr >= md->end_subject)
2363 {
2364 SCHECK_PARTIAL();
2365 RRETURN(MATCH_NOMATCH);
2366 }
2367 GETCHARINCTEST(c, eptr);
2368 if (
2369 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2370 c < 256 &&
2371 #endif
2372 (md->ctypes[c] & ctype_digit) != 0
2373 )
2374 RRETURN(MATCH_NOMATCH);
2375 ecode++;
2376 break;
2377
2378 case OP_DIGIT:
2379 if (eptr >= md->end_subject)
2380 {
2381 SCHECK_PARTIAL();
2382 RRETURN(MATCH_NOMATCH);
2383 }
2384 GETCHARINCTEST(c, eptr);
2385 if (
2386 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2387 c > 255 ||
2388 #endif
2389 (md->ctypes[c] & ctype_digit) == 0
2390 )
2391 RRETURN(MATCH_NOMATCH);
2392 ecode++;
2393 break;
2394
2395 case OP_NOT_WHITESPACE:
2396 if (eptr >= md->end_subject)
2397 {
2398 SCHECK_PARTIAL();
2399 RRETURN(MATCH_NOMATCH);
2400 }
2401 GETCHARINCTEST(c, eptr);
2402 if (
2403 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2404 c < 256 &&
2405 #endif
2406 (md->ctypes[c] & ctype_space) != 0
2407 )
2408 RRETURN(MATCH_NOMATCH);
2409 ecode++;
2410 break;
2411
2412 case OP_WHITESPACE:
2413 if (eptr >= md->end_subject)
2414 {
2415 SCHECK_PARTIAL();
2416 RRETURN(MATCH_NOMATCH);
2417 }
2418 GETCHARINCTEST(c, eptr);
2419 if (
2420 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2421 c > 255 ||
2422 #endif
2423 (md->ctypes[c] & ctype_space) == 0
2424 )
2425 RRETURN(MATCH_NOMATCH);
2426 ecode++;
2427 break;
2428
2429 case OP_NOT_WORDCHAR:
2430 if (eptr >= md->end_subject)
2431 {
2432 SCHECK_PARTIAL();
2433 RRETURN(MATCH_NOMATCH);
2434 }
2435 GETCHARINCTEST(c, eptr);
2436 if (
2437 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2438 c < 256 &&
2439 #endif
2440 (md->ctypes[c] & ctype_word) != 0
2441 )
2442 RRETURN(MATCH_NOMATCH);
2443 ecode++;
2444 break;
2445
2446 case OP_WORDCHAR:
2447 if (eptr >= md->end_subject)
2448 {
2449 SCHECK_PARTIAL();
2450 RRETURN(MATCH_NOMATCH);
2451 }
2452 GETCHARINCTEST(c, eptr);
2453 if (
2454 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2455 c > 255 ||
2456 #endif
2457 (md->ctypes[c] & ctype_word) == 0
2458 )
2459 RRETURN(MATCH_NOMATCH);
2460 ecode++;
2461 break;
2462
2463 case OP_ANYNL:
2464 if (eptr >= md->end_subject)
2465 {
2466 SCHECK_PARTIAL();
2467 RRETURN(MATCH_NOMATCH);
2468 }
2469 GETCHARINCTEST(c, eptr);
2470 switch(c)
2471 {
2472 default: RRETURN(MATCH_NOMATCH);
2473
2474 case CHAR_CR:
2475 if (eptr >= md->end_subject)
2476 {
2477 SCHECK_PARTIAL();
2478 }
2479 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2480 break;
2481
2482 case CHAR_LF:
2483 break;
2484
2485 case CHAR_VT:
2486 case CHAR_FF:
2487 case CHAR_NEL:
2488 #ifndef EBCDIC
2489 case 0x2028:
2490 case 0x2029:
2491 #endif /* Not EBCDIC */
2492 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2493 break;
2494 }
2495 ecode++;
2496 break;
2497
2498 case OP_NOT_HSPACE:
2499 if (eptr >= md->end_subject)
2500 {
2501 SCHECK_PARTIAL();
2502 RRETURN(MATCH_NOMATCH);
2503 }
2504 GETCHARINCTEST(c, eptr);
2505 switch(c)
2506 {
2507 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2508 default: break;
2509 }
2510 ecode++;
2511 break;
2512
2513 case OP_HSPACE:
2514 if (eptr >= md->end_subject)
2515 {
2516 SCHECK_PARTIAL();
2517 RRETURN(MATCH_NOMATCH);
2518 }
2519 GETCHARINCTEST(c, eptr);
2520 switch(c)
2521 {
2522 HSPACE_CASES: break; /* Byte and multibyte cases */
2523 default: RRETURN(MATCH_NOMATCH);
2524 }
2525 ecode++;
2526 break;
2527
2528 case OP_NOT_VSPACE:
2529 if (eptr >= md->end_subject)
2530 {
2531 SCHECK_PARTIAL();
2532 RRETURN(MATCH_NOMATCH);
2533 }
2534 GETCHARINCTEST(c, eptr);
2535 switch(c)
2536 {
2537 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2538 default: break;
2539 }
2540 ecode++;
2541 break;
2542
2543 case OP_VSPACE:
2544 if (eptr >= md->end_subject)
2545 {
2546 SCHECK_PARTIAL();
2547 RRETURN(MATCH_NOMATCH);
2548 }
2549 GETCHARINCTEST(c, eptr);
2550 switch(c)
2551 {
2552 VSPACE_CASES: break;
2553 default: RRETURN(MATCH_NOMATCH);
2554 }
2555 ecode++;
2556 break;
2557
2558 #ifdef SUPPORT_UCP
2559 /* Check the next character by Unicode property. We will get here only
2560 if the support is in the binary; otherwise a compile-time error occurs. */
2561
2562 case OP_PROP:
2563 case OP_NOTPROP:
2564 if (eptr >= md->end_subject)
2565 {
2566 SCHECK_PARTIAL();
2567 RRETURN(MATCH_NOMATCH);
2568 }
2569 GETCHARINCTEST(c, eptr);
2570 {
2571 const pcre_uint32 *cp;
2572 const ucd_record *prop = GET_UCD(c);
2573
2574 switch(ecode[1])
2575 {
2576 case PT_ANY:
2577 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2578 break;
2579
2580 case PT_LAMP:
2581 if ((prop->chartype == ucp_Lu ||
2582 prop->chartype == ucp_Ll ||
2583 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2586
2587 case PT_GC:
2588 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2589 RRETURN(MATCH_NOMATCH);
2590 break;
2591
2592 case PT_PC:
2593 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2594 RRETURN(MATCH_NOMATCH);
2595 break;
2596
2597 case PT_SC:
2598 if ((ecode[2] != prop->script) == (op == OP_PROP))
2599 RRETURN(MATCH_NOMATCH);
2600 break;
2601
2602 /* These are specials */
2603
2604 case PT_ALNUM:
2605 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2606 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2607 RRETURN(MATCH_NOMATCH);
2608 break;
2609
2610 case PT_SPACE: /* Perl space */
2611 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2612 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2613 == (op == OP_NOTPROP))
2614 RRETURN(MATCH_NOMATCH);
2615 break;
2616
2617 case PT_PXSPACE: /* POSIX space */
2618 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2619 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2620 c == CHAR_FF || c == CHAR_CR)
2621 == (op == OP_NOTPROP))
2622 RRETURN(MATCH_NOMATCH);
2623 break;
2624
2625 case PT_WORD:
2626 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2627 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2628 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2629 RRETURN(MATCH_NOMATCH);
2630 break;
2631
2632 case PT_CLIST:
2633 cp = PRIV(ucd_caseless_sets) + ecode[2];
2634 for (;;)
2635 {
2636 if (c < *cp)
2637 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2638 if (c == *cp++)
2639 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2640 }
2641 break;
2642
2643 case PT_UCNC:
2644 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2645 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2646 c >= 0xe000) == (op == OP_NOTPROP))
2647 RRETURN(MATCH_NOMATCH);
2648 break;
2649
2650 /* This should never occur */
2651
2652 default:
2653 RRETURN(PCRE_ERROR_INTERNAL);
2654 }
2655
2656 ecode += 3;
2657 }
2658 break;
2659
2660 /* Match an extended Unicode sequence. We will get here only if the support
2661 is in the binary; otherwise a compile-time error occurs. */
2662
2663 case OP_EXTUNI:
2664 if (eptr >= md->end_subject)
2665 {
2666 SCHECK_PARTIAL();
2667 RRETURN(MATCH_NOMATCH);
2668 }
2669 else
2670 {
2671 int lgb, rgb;
2672 GETCHARINCTEST(c, eptr);
2673 lgb = UCD_GRAPHBREAK(c);
2674 while (eptr < md->end_subject)
2675 {
2676 int len = 1;
2677 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2678 rgb = UCD_GRAPHBREAK(c);
2679 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2680 lgb = rgb;
2681 eptr += len;
2682 }
2683 }
2684 CHECK_PARTIAL();
2685 ecode++;
2686 break;
2687 #endif /* SUPPORT_UCP */
2688
2689
2690 /* Match a back reference, possibly repeatedly. Look past the end of the
2691 item to see if there is repeat information following. The code is similar
2692 to that for character classes, but repeated for efficiency. Then obey
2693 similar code to character type repeats - written out again for speed.
2694 However, if the referenced string is the empty string, always treat
2695 it as matched, any number of times (otherwise there could be infinite
2696 loops). */
2697
2698 case OP_REF:
2699 case OP_REFI:
2700 caseless = op == OP_REFI;
2701 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2702 ecode += 1 + IMM2_SIZE;
2703
2704 /* If the reference is unset, there are two possibilities:
2705
2706 (a) In the default, Perl-compatible state, set the length negative;
2707 this ensures that every attempt at a match fails. We can't just fail
2708 here, because of the possibility of quantifiers with zero minima.
2709
2710 (b) If the JavaScript compatibility flag is set, set the length to zero
2711 so that the back reference matches an empty string.
2712
2713 Otherwise, set the length to the length of what was matched by the
2714 referenced subpattern. */
2715
2716 if (offset >= offset_top || md->offset_vector[offset] < 0)
2717 length = (md->jscript_compat)? 0 : -1;
2718 else
2719 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2720
2721 /* Set up for repetition, or handle the non-repeated case */
2722
2723 switch (*ecode)
2724 {
2725 case OP_CRSTAR:
2726 case OP_CRMINSTAR:
2727 case OP_CRPLUS:
2728 case OP_CRMINPLUS:
2729 case OP_CRQUERY:
2730 case OP_CRMINQUERY:
2731 c = *ecode++ - OP_CRSTAR;
2732 minimize = (c & 1) != 0;
2733 min = rep_min[c]; /* Pick up values from tables; */
2734 max = rep_max[c]; /* zero for max => infinity */
2735 if (max == 0) max = INT_MAX;
2736 break;
2737
2738 case OP_CRRANGE:
2739 case OP_CRMINRANGE:
2740 minimize = (*ecode == OP_CRMINRANGE);
2741 min = GET2(ecode, 1);
2742 max = GET2(ecode, 1 + IMM2_SIZE);
2743 if (max == 0) max = INT_MAX;
2744 ecode += 1 + 2 * IMM2_SIZE;
2745 break;
2746
2747 default: /* No repeat follows */
2748 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2749 {
2750 if (length == -2) eptr = md->end_subject; /* Partial match */
2751 CHECK_PARTIAL();
2752 RRETURN(MATCH_NOMATCH);
2753 }
2754 eptr += length;
2755 continue; /* With the main loop */
2756 }
2757
2758 /* Handle repeated back references. If the length of the reference is
2759 zero, just continue with the main loop. If the length is negative, it
2760 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2761 zero, we can continue at the same level without recursion. For any other
2762 minimum, carrying on will result in NOMATCH. */
2763
2764 if (length == 0) continue;
2765 if (length < 0 && min == 0) continue;
2766
2767 /* First, ensure the minimum number of matches are present. We get back
2768 the length of the reference string explicitly rather than passing the
2769 address of eptr, so that eptr can be a register variable. */
2770
2771 for (i = 1; i <= min; i++)
2772 {
2773 int slength;
2774 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2775 {
2776 if (slength == -2) eptr = md->end_subject; /* Partial match */
2777 CHECK_PARTIAL();
2778 RRETURN(MATCH_NOMATCH);
2779 }
2780 eptr += slength;
2781 }
2782
2783 /* If min = max, continue at the same level without recursion.
2784 They are not both allowed to be zero. */
2785
2786 if (min == max) continue;
2787
2788 /* If minimizing, keep trying and advancing the pointer */
2789
2790 if (minimize)
2791 {
2792 for (fi = min;; fi++)
2793 {
2794 int slength;
2795 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2797 if (fi >= max) RRETURN(MATCH_NOMATCH);
2798 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2799 {
2800 if (slength == -2) eptr = md->end_subject; /* Partial match */
2801 CHECK_PARTIAL();
2802 RRETURN(MATCH_NOMATCH);
2803 }
2804 eptr += slength;
2805 }
2806 /* Control never gets here */
2807 }
2808
2809 /* If maximizing, find the longest string and work backwards */
2810
2811 else
2812 {
2813 pp = eptr;
2814 for (i = min; i < max; i++)
2815 {
2816 int slength;
2817 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2818 {
2819 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2820 the soft partial matching case. */
2821
2822 if (slength == -2 && md->partial != 0 &&
2823 md->end_subject > md->start_used_ptr)
2824 {
2825 md->hitend = TRUE;
2826 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2827 }
2828 break;
2829 }
2830 eptr += slength;
2831 }
2832
2833 while (eptr >= pp)
2834 {
2835 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2837 eptr -= length;
2838 }
2839 RRETURN(MATCH_NOMATCH);
2840 }
2841 /* Control never gets here */
2842
2843 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2844 used when all the characters in the class have values in the range 0-255,
2845 and either the matching is caseful, or the characters are in the range
2846 0-127 when UTF-8 processing is enabled. The only difference between
2847 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2848 encountered.
2849
2850 First, look past the end of the item to see if there is repeat information
2851 following. Then obey similar code to character type repeats - written out
2852 again for speed. */
2853
2854 case OP_NCLASS:
2855 case OP_CLASS:
2856 {
2857 /* The data variable is saved across frames, so the byte map needs to
2858 be stored there. */
2859 #define BYTE_MAP ((pcre_uint8 *)data)
2860 data = ecode + 1; /* Save for matching */
2861 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2862
2863 switch (*ecode)
2864 {
2865 case OP_CRSTAR:
2866 case OP_CRMINSTAR:
2867 case OP_CRPLUS:
2868 case OP_CRMINPLUS:
2869 case OP_CRQUERY:
2870 case OP_CRMINQUERY:
2871 c = *ecode++ - OP_CRSTAR;
2872 minimize = (c & 1) != 0;
2873 min = rep_min[c]; /* Pick up values from tables; */
2874 max = rep_max[c]; /* zero for max => infinity */
2875 if (max == 0) max = INT_MAX;
2876 break;
2877
2878 case OP_CRRANGE:
2879 case OP_CRMINRANGE:
2880 minimize = (*ecode == OP_CRMINRANGE);
2881 min = GET2(ecode, 1);
2882 max = GET2(ecode, 1 + IMM2_SIZE);
2883 if (max == 0) max = INT_MAX;
2884 ecode += 1 + 2 * IMM2_SIZE;
2885 break;
2886
2887 default: /* No repeat follows */
2888 min = max = 1;
2889 break;
2890 }
2891
2892 /* First, ensure the minimum number of matches are present. */
2893
2894 #ifdef SUPPORT_UTF
2895 if (utf)
2896 {
2897 for (i = 1; i <= min; i++)
2898 {
2899 if (eptr >= md->end_subject)
2900 {
2901 SCHECK_PARTIAL();
2902 RRETURN(MATCH_NOMATCH);
2903 }
2904 GETCHARINC(c, eptr);
2905 if (c > 255)
2906 {
2907 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2908 }
2909 else
2910 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2911 }
2912 }
2913 else
2914 #endif
2915 /* Not UTF mode */
2916 {
2917 for (i = 1; i <= min; i++)
2918 {
2919 if (eptr >= md->end_subject)
2920 {
2921 SCHECK_PARTIAL();
2922 RRETURN(MATCH_NOMATCH);
2923 }
2924 c = *eptr++;
2925 #ifndef COMPILE_PCRE8
2926 if (c > 255)
2927 {
2928 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2929 }
2930 else
2931 #endif
2932 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2933 }
2934 }
2935
2936 /* If max == min we can continue with the main loop without the
2937 need to recurse. */
2938
2939 if (min == max) continue;
2940
2941 /* If minimizing, keep testing the rest of the expression and advancing
2942 the pointer while it matches the class. */
2943
2944 if (minimize)
2945 {
2946 #ifdef SUPPORT_UTF
2947 if (utf)
2948 {
2949 for (fi = min;; fi++)
2950 {
2951 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2953 if (fi >= max) RRETURN(MATCH_NOMATCH);
2954 if (eptr >= md->end_subject)
2955 {
2956 SCHECK_PARTIAL();
2957 RRETURN(MATCH_NOMATCH);
2958 }
2959 GETCHARINC(c, eptr);
2960 if (c > 255)
2961 {
2962 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2963 }
2964 else
2965 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2966 }
2967 }
2968 else
2969 #endif
2970 /* Not UTF mode */
2971 {
2972 for (fi = min;; fi++)
2973 {
2974 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2975 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2976 if (fi >= max) RRETURN(MATCH_NOMATCH);
2977 if (eptr >= md->end_subject)
2978 {
2979 SCHECK_PARTIAL();
2980 RRETURN(MATCH_NOMATCH);
2981 }
2982 c = *eptr++;
2983 #ifndef COMPILE_PCRE8
2984 if (c > 255)
2985 {
2986 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2987 }
2988 else
2989 #endif
2990 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2991 }
2992 }
2993 /* Control never gets here */
2994 }
2995
2996 /* If maximizing, find the longest possible run, then work backwards. */
2997
2998 else
2999 {
3000 pp = eptr;
3001
3002 #ifdef SUPPORT_UTF
3003 if (utf)
3004 {
3005 for (i = min; i < max; i++)
3006 {
3007 int len = 1;
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 break;
3012 }
3013 GETCHARLEN(c, eptr, len);
3014 if (c > 255)
3015 {
3016 if (op == OP_CLASS) break;
3017 }
3018 else
3019 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3020 eptr += len;
3021 }
3022 for (;;)
3023 {
3024 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3026 if (eptr-- == pp) break; /* Stop if tried at original pos */
3027 BACKCHAR(eptr);
3028 }
3029 }
3030 else
3031 #endif
3032 /* Not UTF mode */
3033 {
3034 for (i = min; i < max; i++)
3035 {
3036 if (eptr >= md->end_subject)
3037 {
3038 SCHECK_PARTIAL();
3039 break;
3040 }
3041 c = *eptr;
3042 #ifndef COMPILE_PCRE8
3043 if (c > 255)
3044 {
3045 if (op == OP_CLASS) break;
3046 }
3047 else
3048 #endif
3049 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3050 eptr++;
3051 }
3052 while (eptr >= pp)
3053 {
3054 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3055 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3056 eptr--;
3057 }
3058 }
3059
3060 RRETURN(MATCH_NOMATCH);
3061 }
3062 #undef BYTE_MAP
3063 }
3064 /* Control never gets here */
3065
3066
3067 /* Match an extended character class. This opcode is encountered only
3068 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3069 mode, because Unicode properties are supported in non-UTF-8 mode. */
3070
3071 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3072 case OP_XCLASS:
3073 {
3074 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3075 ecode += GET(ecode, 1); /* Advance past the item */
3076
3077 switch (*ecode)
3078 {
3079 case OP_CRSTAR:
3080 case OP_CRMINSTAR:
3081 case OP_CRPLUS:
3082 case OP_CRMINPLUS:
3083 case OP_CRQUERY:
3084 case OP_CRMINQUERY:
3085 c = *ecode++ - OP_CRSTAR;
3086 minimize = (c & 1) != 0;
3087 min = rep_min[c]; /* Pick up values from tables; */
3088 max = rep_max[c]; /* zero for max => infinity */
3089 if (max == 0) max = INT_MAX;
3090 break;
3091
3092 case OP_CRRANGE:
3093 case OP_CRMINRANGE:
3094 minimize = (*ecode == OP_CRMINRANGE);
3095 min = GET2(ecode, 1);
3096 max = GET2(ecode, 1 + IMM2_SIZE);
3097 if (max == 0) max = INT_MAX;
3098 ecode += 1 + 2 * IMM2_SIZE;
3099 break;
3100
3101 default: /* No repeat follows */
3102 min = max = 1;
3103 break;
3104 }
3105
3106 /* First, ensure the minimum number of matches are present. */
3107
3108 for (i = 1; i <= min; i++)
3109 {
3110 if (eptr >= md->end_subject)
3111 {
3112 SCHECK_PARTIAL();
3113 RRETURN(MATCH_NOMATCH);
3114 }
3115 GETCHARINCTEST(c, eptr);
3116 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3117 }
3118
3119 /* If max == min we can continue with the main loop without the
3120 need to recurse. */
3121
3122 if (min == max) continue;
3123
3124 /* If minimizing, keep testing the rest of the expression and advancing
3125 the pointer while it matches the class. */
3126
3127 if (minimize)
3128 {
3129 for (fi = min;; fi++)
3130 {
3131 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3132 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3133 if (fi >= max) RRETURN(MATCH_NOMATCH);
3134 if (eptr >= md->end_subject)
3135 {
3136 SCHECK_PARTIAL();
3137 RRETURN(MATCH_NOMATCH);
3138 }
3139 GETCHARINCTEST(c, eptr);
3140 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3141 }
3142 /* Control never gets here */
3143 }
3144
3145 /* If maximizing, find the longest possible run, then work backwards. */
3146
3147 else
3148 {
3149 pp = eptr;
3150 for (i = min; i < max; i++)
3151 {
3152 int len = 1;
3153 if (eptr >= md->end_subject)
3154 {
3155 SCHECK_PARTIAL();
3156 break;
3157 }
3158 #ifdef SUPPORT_UTF
3159 GETCHARLENTEST(c, eptr, len);
3160 #else
3161 c = *eptr;
3162 #endif
3163 if (!PRIV(xclass)(c, data, utf)) break;
3164 eptr += len;
3165 }
3166 for(;;)
3167 {
3168 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3169 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3170 if (eptr-- == pp) break; /* Stop if tried at original pos */
3171 #ifdef SUPPORT_UTF
3172 if (utf) BACKCHAR(eptr);
3173 #endif
3174 }
3175 RRETURN(MATCH_NOMATCH);
3176 }
3177
3178 /* Control never gets here */
3179 }
3180 #endif /* End of XCLASS */
3181
3182 /* Match a single character, casefully */
3183
3184 case OP_CHAR:
3185 #ifdef SUPPORT_UTF
3186 if (utf)
3187 {
3188 length = 1;
3189 ecode++;
3190 GETCHARLEN(fc, ecode, length);
3191 if (length > md->end_subject - eptr)
3192 {
3193 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3194 RRETURN(MATCH_NOMATCH);
3195 }
3196 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3197 }
3198 else
3199 #endif
3200 /* Not UTF mode */
3201 {
3202 if (md->end_subject - eptr < 1)
3203 {
3204 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3205 RRETURN(MATCH_NOMATCH);
3206 }
3207 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3208 ecode += 2;
3209 }
3210 break;
3211
3212 /* Match a single character, caselessly. If we are at the end of the
3213 subject, give up immediately. */
3214
3215 case OP_CHARI:
3216 if (eptr >= md->end_subject)
3217 {
3218 SCHECK_PARTIAL();
3219 RRETURN(MATCH_NOMATCH);
3220 }
3221
3222 #ifdef SUPPORT_UTF
3223 if (utf)
3224 {
3225 length = 1;
3226 ecode++;
3227 GETCHARLEN(fc, ecode, length);
3228
3229 /* If the pattern character's value is < 128, we have only one byte, and
3230 we know that its other case must also be one byte long, so we can use the
3231 fast lookup table. We know that there is at least one byte left in the
3232 subject. */
3233
3234 if (fc < 128)
3235 {
3236 pcre_uint32 cc = RAWUCHAR(eptr);
3237 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3238 ecode++;
3239 eptr++;
3240 }
3241
3242 /* Otherwise we must pick up the subject character. Note that we cannot
3243 use the value of "length" to check for sufficient bytes left, because the
3244 other case of the character may have more or fewer bytes. */
3245
3246 else
3247 {
3248 pcre_uint32 dc;
3249 GETCHARINC(dc, eptr);
3250 ecode += length;
3251
3252 /* If we have Unicode property support, we can use it to test the other
3253 case of the character, if there is one. */
3254
3255 if (fc != dc)
3256 {
3257 #ifdef SUPPORT_UCP
3258 if (dc != UCD_OTHERCASE(fc))
3259 #endif
3260 RRETURN(MATCH_NOMATCH);
3261 }
3262 }
3263 }
3264 else
3265 #endif /* SUPPORT_UTF */
3266
3267 /* Not UTF mode */
3268 {
3269 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3270 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3271 eptr++;
3272 ecode += 2;
3273 }
3274 break;
3275
3276 /* Match a single character repeatedly. */
3277
3278 case OP_EXACT:
3279 case OP_EXACTI:
3280 min = max = GET2(ecode, 1);
3281 ecode += 1 + IMM2_SIZE;
3282 goto REPEATCHAR;
3283
3284 case OP_POSUPTO:
3285 case OP_POSUPTOI:
3286 possessive = TRUE;
3287 /* Fall through */
3288
3289 case OP_UPTO:
3290 case OP_UPTOI:
3291 case OP_MINUPTO:
3292 case OP_MINUPTOI:
3293 min = 0;
3294 max = GET2(ecode, 1);
3295 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3296 ecode += 1 + IMM2_SIZE;
3297 goto REPEATCHAR;
3298
3299 case OP_POSSTAR:
3300 case OP_POSSTARI:
3301 possessive = TRUE;
3302 min = 0;
3303 max = INT_MAX;
3304 ecode++;
3305 goto REPEATCHAR;
3306
3307 case OP_POSPLUS:
3308 case OP_POSPLUSI:
3309 possessive = TRUE;
3310 min = 1;
3311 max = INT_MAX;
3312 ecode++;
3313 goto REPEATCHAR;
3314
3315 case OP_POSQUERY:
3316 case OP_POSQUERYI:
3317 possessive = TRUE;
3318 min = 0;
3319 max = 1;
3320 ecode++;
3321 goto REPEATCHAR;
3322
3323 case OP_STAR:
3324 case OP_STARI:
3325 case OP_MINSTAR:
3326 case OP_MINSTARI:
3327 case OP_PLUS:
3328 case OP_PLUSI:
3329 case OP_MINPLUS:
3330 case OP_MINPLUSI:
3331 case OP_QUERY:
3332 case OP_QUERYI:
3333 case OP_MINQUERY:
3334 case OP_MINQUERYI:
3335 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3336 minimize = (c & 1) != 0;
3337 min = rep_min[c]; /* Pick up values from tables; */
3338 max = rep_max[c]; /* zero for max => infinity */
3339 if (max == 0) max = INT_MAX;
3340
3341 /* Common code for all repeated single-character matches. */
3342
3343 REPEATCHAR:
3344 #ifdef SUPPORT_UTF
3345 if (utf)
3346 {
3347 length = 1;
3348 charptr = ecode;
3349 GETCHARLEN(fc, ecode, length);
3350 ecode += length;
3351
3352 /* Handle multibyte character matching specially here. There is
3353 support for caseless matching if UCP support is present. */
3354
3355 if (length > 1)
3356 {
3357 #ifdef SUPPORT_UCP
3358 pcre_uint32 othercase;
3359 if (op >= OP_STARI && /* Caseless */
3360 (othercase = UCD_OTHERCASE(fc)) != fc)
3361 oclength = PRIV(ord2utf)(othercase, occhars);
3362 else oclength = 0;
3363 #endif /* SUPPORT_UCP */
3364
3365 for (i = 1; i <= min; i++)
3366 {
3367 if (eptr <= md->end_subject - length &&
3368 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3369 #ifdef SUPPORT_UCP
3370 else if (oclength > 0 &&
3371 eptr <= md->end_subject - oclength &&
3372 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3373 #endif /* SUPPORT_UCP */
3374 else
3375 {
3376 CHECK_PARTIAL();
3377 RRETURN(MATCH_NOMATCH);
3378 }
3379 }
3380
3381 if (min == max) continue;
3382
3383 if (minimize)
3384 {
3385 for (fi = min;; fi++)
3386 {
3387 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3389 if (fi >= max) RRETURN(MATCH_NOMATCH);
3390 if (eptr <= md->end_subject - length &&
3391 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3392 #ifdef SUPPORT_UCP
3393 else if (oclength > 0 &&
3394 eptr <= md->end_subject - oclength &&
3395 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3396 #endif /* SUPPORT_UCP */
3397 else
3398 {
3399 CHECK_PARTIAL();
3400 RRETURN(MATCH_NOMATCH);
3401 }
3402 }
3403 /* Control never gets here */
3404 }
3405
3406 else /* Maximize */
3407 {
3408 pp = eptr;
3409 for (i = min; i < max; i++)
3410 {
3411 if (eptr <= md->end_subject - length &&
3412 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3413 #ifdef SUPPORT_UCP
3414 else if (oclength > 0 &&
3415 eptr <= md->end_subject - oclength &&
3416 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3417 #endif /* SUPPORT_UCP */
3418 else
3419 {
3420 CHECK_PARTIAL();
3421 break;
3422 }
3423 }
3424
3425 if (possessive) continue;
3426
3427 for(;;)
3428 {
3429 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3432 #ifdef SUPPORT_UCP
3433 eptr--;
3434 BACKCHAR(eptr);
3435 #else /* without SUPPORT_UCP */
3436 eptr -= length;
3437 #endif /* SUPPORT_UCP */
3438 }
3439 }
3440 /* Control never gets here */
3441 }
3442
3443 /* If the length of a UTF-8 character is 1, we fall through here, and
3444 obey the code as for non-UTF-8 characters below, though in this case the
3445 value of fc will always be < 128. */
3446 }
3447 else
3448 #endif /* SUPPORT_UTF */
3449 /* When not in UTF-8 mode, load a single-byte character. */
3450 fc = *ecode++;
3451
3452 /* The value of fc at this point is always one character, though we may
3453 or may not be in UTF mode. The code is duplicated for the caseless and
3454 caseful cases, for speed, since matching characters is likely to be quite
3455 common. First, ensure the minimum number of matches are present. If min =
3456 max, continue at the same level without recursing. Otherwise, if
3457 minimizing, keep trying the rest of the expression and advancing one
3458 matching character if failing, up to the maximum. Alternatively, if
3459 maximizing, find the maximum number of characters and work backwards. */
3460
3461 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3462 max, (char *)eptr));
3463
3464 if (op >= OP_STARI) /* Caseless */
3465 {
3466 #ifdef COMPILE_PCRE8
3467 /* fc must be < 128 if UTF is enabled. */
3468 foc = md->fcc[fc];
3469 #else
3470 #ifdef SUPPORT_UTF
3471 #ifdef SUPPORT_UCP
3472 if (utf && fc > 127)
3473 foc = UCD_OTHERCASE(fc);
3474 #else
3475 if (utf && fc > 127)
3476 foc = fc;
3477 #endif /* SUPPORT_UCP */
3478 else
3479 #endif /* SUPPORT_UTF */
3480 foc = TABLE_GET(fc, md->fcc, fc);
3481 #endif /* COMPILE_PCRE8 */
3482
3483 for (i = 1; i <= min; i++)
3484 {
3485 pcre_uint32 cc; /* Faster than pcre_uchar */
3486 if (eptr >= md->end_subject)
3487 {
3488 SCHECK_PARTIAL();
3489 RRETURN(MATCH_NOMATCH);
3490 }
3491 cc = RAWUCHARTEST(eptr);
3492 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3493 eptr++;
3494 }
3495 if (min == max) continue;
3496 if (minimize)
3497 {
3498 for (fi = min;; fi++)
3499 {
3500 pcre_uint32 cc; /* Faster than pcre_uchar */
3501 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3502 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3503 if (fi >= max) RRETURN(MATCH_NOMATCH);
3504 if (eptr >= md->end_subject)
3505 {
3506 SCHECK_PARTIAL();
3507 RRETURN(MATCH_NOMATCH);
3508 }
3509 cc = RAWUCHARTEST(eptr);
3510 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3511 eptr++;
3512 }
3513 /* Control never gets here */
3514 }
3515 else /* Maximize */
3516 {
3517 pp = eptr;
3518 for (i = min; i < max; i++)
3519 {
3520 pcre_uint32 cc; /* Faster than pcre_uchar */
3521 if (eptr >= md->end_subject)
3522 {
3523 SCHECK_PARTIAL();
3524 break;
3525 }
3526 cc = RAWUCHARTEST(eptr);
3527 if (fc != cc && foc != cc) break;
3528 eptr++;
3529 }
3530
3531 if (possessive) continue;
3532
3533 while (eptr >= pp)
3534 {
3535 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3536 eptr--;
3537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3538 }
3539 RRETURN(MATCH_NOMATCH);
3540 }
3541 /* Control never gets here */
3542 }
3543
3544 /* Caseful comparisons (includes all multi-byte characters) */
3545
3546 else
3547 {
3548 for (i = 1; i <= min; i++)
3549 {
3550 if (eptr >= md->end_subject)
3551 {
3552 SCHECK_PARTIAL();
3553 RRETURN(MATCH_NOMATCH);
3554 }
3555 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3556 }
3557
3558 if (min == max) continue;
3559
3560 if (minimize)
3561 {
3562 for (fi = min;; fi++)
3563 {
3564 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3565 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3566 if (fi >= max) RRETURN(MATCH_NOMATCH);
3567 if (eptr >= md->end_subject)
3568 {
3569 SCHECK_PARTIAL();
3570 RRETURN(MATCH_NOMATCH);
3571 }
3572 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3573 }
3574 /* Control never gets here */
3575 }
3576 else /* Maximize */
3577 {
3578 pp = eptr;
3579 for (i = min; i < max; i++)
3580 {
3581 if (eptr >= md->end_subject)
3582 {
3583 SCHECK_PARTIAL();
3584 break;
3585 }
3586 if (fc != RAWUCHARTEST(eptr)) break;
3587 eptr++;
3588 }
3589 if (possessive) continue;
3590
3591 while (eptr >= pp)
3592 {
3593 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3594 eptr--;
3595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3596 }
3597 RRETURN(MATCH_NOMATCH);
3598 }
3599 }
3600 /* Control never gets here */
3601
3602 /* Match a negated single one-byte character. The character we are
3603 checking can be multibyte. */
3604
3605 case OP_NOT:
3606 case OP_NOTI:
3607 if (eptr >= md->end_subject)
3608 {
3609 SCHECK_PARTIAL();
3610 RRETURN(MATCH_NOMATCH);
3611 }
3612 #ifdef SUPPORT_UTF
3613 if (utf)
3614 {
3615 register pcre_uint32 ch, och;
3616
3617 ecode++;
3618 GETCHARINC(ch, ecode);
3619 GETCHARINC(c, eptr);
3620
3621 if (op == OP_NOT)
3622 {
3623 if (ch == c) RRETURN(MATCH_NOMATCH);
3624 }
3625 else
3626 {
3627 #ifdef SUPPORT_UCP
3628 if (ch > 127)
3629 och = UCD_OTHERCASE(ch);
3630 #else
3631 if (ch > 127)
3632 och = ch;
3633 #endif /* SUPPORT_UCP */
3634 else
3635 och = TABLE_GET(ch, md->fcc, ch);
3636 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3637 }
3638 }
3639 else
3640 #endif
3641 {
3642 register pcre_uint32 ch = ecode[1];
3643 c = *eptr++;
3644 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3645 RRETURN(MATCH_NOMATCH);
3646 ecode += 2;
3647 }
3648 break;
3649
3650 /* Match a negated single one-byte character repeatedly. This is almost a
3651 repeat of the code for a repeated single character, but I haven't found a
3652 nice way of commoning these up that doesn't require a test of the
3653 positive/negative option for each character match. Maybe that wouldn't add
3654 very much to the time taken, but character matching *is* what this is all
3655 about... */
3656
3657 case OP_NOTEXACT:
3658 case OP_NOTEXACTI:
3659 min = max = GET2(ecode, 1);
3660 ecode += 1 + IMM2_SIZE;
3661 goto REPEATNOTCHAR;
3662
3663 case OP_NOTUPTO:
3664 case OP_NOTUPTOI:
3665 case OP_NOTMINUPTO:
3666 case OP_NOTMINUPTOI:
3667 min = 0;
3668 max = GET2(ecode, 1);
3669 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3670 ecode += 1 + IMM2_SIZE;
3671 goto REPEATNOTCHAR;
3672
3673 case OP_NOTPOSSTAR:
3674 case OP_NOTPOSSTARI:
3675 possessive = TRUE;
3676 min = 0;
3677 max = INT_MAX;
3678 ecode++;
3679 goto REPEATNOTCHAR;
3680
3681 case OP_NOTPOSPLUS:
3682 case OP_NOTPOSPLUSI:
3683 possessive = TRUE;
3684 min = 1;
3685 max = INT_MAX;
3686 ecode++;
3687 goto REPEATNOTCHAR;
3688
3689 case OP_NOTPOSQUERY:
3690 case OP_NOTPOSQUERYI:
3691 possessive = TRUE;
3692 min = 0;
3693 max = 1;
3694 ecode++;
3695 goto REPEATNOTCHAR;
3696
3697 case OP_NOTPOSUPTO:
3698 case OP_NOTPOSUPTOI:
3699 possessive = TRUE;
3700 min = 0;
3701 max = GET2(ecode, 1);
3702 ecode += 1 + IMM2_SIZE;
3703 goto REPEATNOTCHAR;
3704
3705 case OP_NOTSTAR:
3706 case OP_NOTSTARI:
3707 case OP_NOTMINSTAR:
3708 case OP_NOTMINSTARI:
3709 case OP_NOTPLUS:
3710 case OP_NOTPLUSI:
3711 case OP_NOTMINPLUS:
3712 case OP_NOTMINPLUSI:
3713 case OP_NOTQUERY:
3714 case OP_NOTQUERYI:
3715 case OP_NOTMINQUERY:
3716 case OP_NOTMINQUERYI:
3717 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3718 minimize = (c & 1) != 0;
3719 min = rep_min[c]; /* Pick up values from tables; */
3720 max = rep_max[c]; /* zero for max => infinity */
3721 if (max == 0) max = INT_MAX;
3722
3723 /* Common code for all repeated single-byte matches. */
3724
3725 REPEATNOTCHAR:
3726 GETCHARINCTEST(fc, ecode);
3727
3728 /* The code is duplicated for the caseless and caseful cases, for speed,
3729 since matching characters is likely to be quite common. First, ensure the
3730 minimum number of matches are present. If min = max, continue at the same
3731 level without recursing. Otherwise, if minimizing, keep trying the rest of
3732 the expression and advancing one matching character if failing, up to the
3733 maximum. Alternatively, if maximizing, find the maximum number of
3734 characters and work backwards. */
3735
3736 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3737 max, (char *)eptr));
3738
3739 if (op >= OP_NOTSTARI) /* Caseless */
3740 {
3741 #ifdef SUPPORT_UTF
3742 #ifdef SUPPORT_UCP
3743 if (utf && fc > 127)
3744 foc = UCD_OTHERCASE(fc);
3745 #else
3746 if (utf && fc > 127)
3747 foc = fc;
3748 #endif /* SUPPORT_UCP */
3749 else
3750 #endif /* SUPPORT_UTF */
3751 foc = TABLE_GET(fc, md->fcc, fc);
3752
3753 #ifdef SUPPORT_UTF
3754 if (utf)
3755 {
3756 register pcre_uint32 d;
3757 for (i = 1; i <= min; i++)
3758 {
3759 if (eptr >= md->end_subject)
3760 {
3761 SCHECK_PARTIAL();
3762 RRETURN(MATCH_NOMATCH);
3763 }
3764 GETCHARINC(d, eptr);
3765 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3766 }
3767 }
3768 else
3769 #endif
3770 /* Not UTF mode */
3771 {
3772 for (i = 1; i <= min; i++)
3773 {
3774 if (eptr >= md->end_subject)
3775 {
3776 SCHECK_PARTIAL();
3777 RRETURN(MATCH_NOMATCH);
3778 }
3779 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3780 eptr++;
3781 }
3782 }
3783
3784 if (min == max) continue;
3785
3786 if (minimize)
3787 {
3788 #ifdef SUPPORT_UTF
3789 if (utf)
3790 {
3791 register pcre_uint32 d;
3792 for (fi = min;; fi++)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (fi >= max) RRETURN(MATCH_NOMATCH);
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 RRETURN(MATCH_NOMATCH);
3801 }
3802 GETCHARINC(d, eptr);
3803 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3804 }
3805 }
3806 else
3807 #endif
3808 /* Not UTF mode */
3809 {
3810 for (fi = min;; fi++)
3811 {
3812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3814 if (fi >= max) RRETURN(MATCH_NOMATCH);
3815 if (eptr >= md->end_subject)
3816 {
3817 SCHECK_PARTIAL();
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3821 eptr++;
3822 }
3823 }
3824 /* Control never gets here */
3825 }
3826
3827 /* Maximize case */
3828
3829 else
3830 {
3831 pp = eptr;
3832
3833 #ifdef SUPPORT_UTF
3834 if (utf)
3835 {
3836 register pcre_uint32 d;
3837 for (i = min; i < max; i++)
3838 {
3839 int len = 1;
3840 if (eptr >= md->end_subject)
3841 {
3842 SCHECK_PARTIAL();
3843 break;
3844 }
3845 GETCHARLEN(d, eptr, len);
3846 if (fc == d || (unsigned int)foc == d) break;
3847 eptr += len;
3848 }
3849 if (possessive) continue;
3850 for(;;)
3851 {
3852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3854 if (eptr-- == pp) break; /* Stop if tried at original pos */
3855 BACKCHAR(eptr);
3856 }
3857 }
3858 else
3859 #endif
3860 /* Not UTF mode */
3861 {
3862 for (i = min; i < max; i++)
3863 {
3864 if (eptr >= md->end_subject)
3865 {
3866 SCHECK_PARTIAL();
3867 break;
3868 }
3869 if (fc == *eptr || foc == *eptr) break;
3870 eptr++;
3871 }
3872 if (possessive) continue;
3873 while (eptr >= pp)
3874 {
3875 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3877 eptr--;
3878 }
3879 }
3880
3881 RRETURN(MATCH_NOMATCH);
3882 }
3883 /* Control never gets here */
3884 }
3885
3886 /* Caseful comparisons */
3887
3888 else
3889 {
3890 #ifdef SUPPORT_UTF
3891 if (utf)
3892 {
3893 register pcre_uint32 d;
3894 for (i = 1; i <= min; i++)
3895 {
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 RRETURN(MATCH_NOMATCH);
3900 }
3901 GETCHARINC(d, eptr);
3902 if (fc == d) RRETURN(MATCH_NOMATCH);
3903 }
3904 }
3905 else
3906 #endif
3907 /* Not UTF mode */
3908 {
3909 for (i = 1; i <= min; i++)
3910 {
3911 if (eptr >= md->end_subject)
3912 {
3913 SCHECK_PARTIAL();
3914 RRETURN(MATCH_NOMATCH);
3915 }
3916 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3917 }
3918 }
3919
3920 if (min == max) continue;
3921
3922 if (minimize)
3923 {
3924 #ifdef SUPPORT_UTF
3925 if (utf)
3926 {
3927 register pcre_uint32 d;
3928 for (fi = min;; fi++)
3929 {
3930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3932 if (fi >= max) RRETURN(MATCH_NOMATCH);
3933 if (eptr >= md->end_subject)
3934 {
3935 SCHECK_PARTIAL();
3936 RRETURN(MATCH_NOMATCH);
3937 }
3938 GETCHARINC(d, eptr);
3939 if (fc == d) RRETURN(MATCH_NOMATCH);
3940 }
3941 }
3942 else
3943 #endif
3944 /* Not UTF mode */
3945 {
3946 for (fi = min;; fi++)
3947 {
3948 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3950 if (fi >= max) RRETURN(MATCH_NOMATCH);
3951 if (eptr >= md->end_subject)
3952 {
3953 SCHECK_PARTIAL();
3954 RRETURN(MATCH_NOMATCH);
3955 }
3956 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3957 }
3958 }
3959 /* Control never gets here */
3960 }
3961
3962 /* Maximize case */
3963
3964 else
3965 {
3966 pp = eptr;
3967
3968 #ifdef SUPPORT_UTF
3969 if (utf)
3970 {
3971 register pcre_uint32 d;
3972 for (i = min; i < max; i++)
3973 {
3974 int len = 1;
3975 if (eptr >= md->end_subject)
3976 {
3977 SCHECK_PARTIAL();
3978 break;
3979 }
3980 GETCHARLEN(d, eptr, len);
3981 if (fc == d) break;
3982 eptr += len;
3983 }
3984 if (possessive) continue;
3985 for(;;)
3986 {
3987 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3989 if (eptr-- == pp) break; /* Stop if tried at original pos */
3990 BACKCHAR(eptr);
3991 }
3992 }
3993 else
3994 #endif
3995 /* Not UTF mode */
3996 {
3997 for (i = min; i < max; i++)
3998 {
3999 if (eptr >= md->end_subject)
4000 {
4001 SCHECK_PARTIAL();
4002 break;
4003 }
4004 if (fc == *eptr) break;
4005 eptr++;
4006 }
4007 if (possessive) continue;
4008 while (eptr >= pp)
4009 {
4010 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4011 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4012 eptr--;
4013 }
4014 }
4015
4016 RRETURN(MATCH_NOMATCH);
4017 }
4018 }
4019 /* Control never gets here */
4020
4021 /* Match a single character type repeatedly; several different opcodes
4022 share code. This is very similar to the code for single characters, but we
4023 repeat it in the interests of efficiency. */
4024
4025 case OP_TYPEEXACT:
4026 min = max = GET2(ecode, 1);
4027 minimize = TRUE;
4028 ecode += 1 + IMM2_SIZE;
4029 goto REPEATTYPE;
4030
4031 case OP_TYPEUPTO:
4032 case OP_TYPEMINUPTO:
4033 min = 0;
4034 max = GET2(ecode, 1);
4035 minimize = *ecode == OP_TYPEMINUPTO;
4036 ecode += 1 + IMM2_SIZE;
4037 goto REPEATTYPE;
4038
4039 case OP_TYPEPOSSTAR:
4040 possessive = TRUE;
4041 min = 0;
4042 max = INT_MAX;
4043 ecode++;
4044 goto REPEATTYPE;
4045
4046 case OP_TYPEPOSPLUS:
4047 possessive = TRUE;
4048 min = 1;
4049 max = INT_MAX;
4050 ecode++;
4051 goto REPEATTYPE;
4052
4053 case OP_TYPEPOSQUERY:
4054 possessive = TRUE;
4055 min = 0;
4056 max = 1;
4057 ecode++;
4058 goto REPEATTYPE;
4059
4060 case OP_TYPEPOSUPTO:
4061 possessive = TRUE;
4062 min = 0;
4063 max = GET2(ecode, 1);
4064 ecode += 1 + IMM2_SIZE;
4065 goto REPEATTYPE;
4066
4067 case OP_TYPESTAR:
4068 case OP_TYPEMINSTAR:
4069 case OP_TYPEPLUS:
4070 case OP_TYPEMINPLUS:
4071 case OP_TYPEQUERY:
4072 case OP_TYPEMINQUERY:
4073 c = *ecode++ - OP_TYPESTAR;
4074 minimize = (c & 1) != 0;
4075 min = rep_min[c]; /* Pick up values from tables; */
4076 max = rep_max[c]; /* zero for max => infinity */
4077 if (max == 0) max = INT_MAX;
4078
4079 /* Common code for all repeated single character type matches. Note that
4080 in UTF-8 mode, '.' matches a character of any length, but for the other
4081 character types, the valid characters are all one-byte long. */
4082
4083 REPEATTYPE:
4084 ctype = *ecode++; /* Code for the character type */
4085
4086 #ifdef SUPPORT_UCP
4087 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4088 {
4089 prop_fail_result = ctype == OP_NOTPROP;
4090 prop_type = *ecode++;
4091 prop_value = *ecode++;
4092 }
4093 else prop_type = -1;
4094 #endif
4095
4096 /* First, ensure the minimum number of matches are present. Use inline
4097 code for maximizing the speed, and do the type test once at the start
4098 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4099 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4100 and single-bytes. */
4101
4102 if (min > 0)
4103 {
4104 #ifdef SUPPORT_UCP
4105 if (prop_type >= 0)
4106 {
4107 switch(prop_type)
4108 {
4109 case PT_ANY:
4110 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4111 for (i = 1; i <= min; i++)
4112 {
4113 if (eptr >= md->end_subject)
4114 {
4115 SCHECK_PARTIAL();
4116 RRETURN(MATCH_NOMATCH);
4117 }
4118 GETCHARINCTEST(c, eptr);
4119 }
4120 break;
4121
4122 case PT_LAMP:
4123 for (i = 1; i <= min; i++)
4124 {
4125 int chartype;
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 RRETURN(MATCH_NOMATCH);
4130 }
4131 GETCHARINCTEST(c, eptr);
4132 chartype = UCD_CHARTYPE(c);
4133 if ((chartype == ucp_Lu ||
4134 chartype == ucp_Ll ||
4135 chartype == ucp_Lt) == prop_fail_result)
4136 RRETURN(MATCH_NOMATCH);
4137 }
4138 break;
4139
4140 case PT_GC:
4141 for (i = 1; i <= min; i++)
4142 {
4143 if (eptr >= md->end_subject)
4144 {
4145 SCHECK_PARTIAL();
4146 RRETURN(MATCH_NOMATCH);
4147 }
4148 GETCHARINCTEST(c, eptr);
4149 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 break;
4153
4154 case PT_PC:
4155 for (i = 1; i <= min; i++)
4156 {
4157 if (eptr >= md->end_subject)
4158 {
4159 SCHECK_PARTIAL();
4160 RRETURN(MATCH_NOMATCH);
4161 }
4162 GETCHARINCTEST(c, eptr);
4163 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4164 RRETURN(MATCH_NOMATCH);
4165 }
4166 break;
4167
4168 case PT_SC:
4169 for (i = 1; i <= min; i++)
4170 {
4171 if (eptr >= md->end_subject)
4172 {
4173 SCHECK_PARTIAL();
4174 RRETURN(MATCH_NOMATCH);
4175 }
4176 GETCHARINCTEST(c, eptr);
4177 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 break;
4181
4182 case PT_ALNUM:
4183 for (i = 1; i <= min; i++)
4184 {
4185 int category;
4186 if (eptr >= md->end_subject)
4187 {
4188 SCHECK_PARTIAL();
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 GETCHARINCTEST(c, eptr);
4192 category = UCD_CATEGORY(c);
4193 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4194 RRETURN(MATCH_NOMATCH);
4195 }
4196 break;
4197
4198 case PT_SPACE: /* Perl space */
4199 for (i = 1; i <= min; i++)
4200 {
4201 if (eptr >= md->end_subject)
4202 {
4203 SCHECK_PARTIAL();
4204 RRETURN(MATCH_NOMATCH);
4205 }
4206 GETCHARINCTEST(c, eptr);
4207 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4208 c == CHAR_FF || c == CHAR_CR)
4209 == prop_fail_result)
4210 RRETURN(MATCH_NOMATCH);
4211 }
4212 break;
4213
4214 case PT_PXSPACE: /* POSIX space */
4215 for (i = 1; i <= min; i++)
4216 {
4217 if (eptr >= md->end_subject)
4218 {
4219 SCHECK_PARTIAL();
4220 RRETURN(MATCH_NOMATCH);
4221 }
4222 GETCHARINCTEST(c, eptr);
4223 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4224 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4225 == prop_fail_result)
4226 RRETURN(MATCH_NOMATCH);
4227 }
4228 break;
4229
4230 case PT_WORD:
4231 for (i = 1; i <= min; i++)
4232 {
4233 int category;
4234 if (eptr >= md->end_subject)
4235 {
4236 SCHECK_PARTIAL();
4237 RRETURN(MATCH_NOMATCH);
4238 }
4239 GETCHARINCTEST(c, eptr);
4240 category = UCD_CATEGORY(c);
4241 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4242 == prop_fail_result)
4243 RRETURN(MATCH_NOMATCH);
4244 }
4245 break;
4246
4247 case PT_CLIST:
4248 for (i = 1; i <= min; i++)
4249 {
4250 const pcre_uint32 *cp;
4251 if (eptr >= md->end_subject)
4252 {
4253 SCHECK_PARTIAL();
4254 RRETURN(MATCH_NOMATCH);
4255 }
4256 GETCHARINCTEST(c, eptr);
4257 cp = PRIV(ucd_caseless_sets) + prop_value;
4258 for (;;)
4259 {
4260 if (c < *cp)
4261 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4262 if (c == *cp++)
4263 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4264 }
4265 }
4266 break;
4267
4268 case PT_UCNC:
4269 for (i = 1; i <= min; i++)
4270 {
4271 if (eptr >= md->end_subject)
4272 {
4273 SCHECK_PARTIAL();
4274 RRETURN(MATCH_NOMATCH);
4275 }
4276 GETCHARINCTEST(c, eptr);
4277 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4278 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4279 c >= 0xe000) == prop_fail_result)
4280 RRETURN(MATCH_NOMATCH);
4281 }
4282 break;
4283
4284 /* This should not occur */
4285
4286 default:
4287 RRETURN(PCRE_ERROR_INTERNAL);
4288 }
4289 }
4290
4291 /* Match extended Unicode sequences. We will get here only if the
4292 support is in the binary; otherwise a compile-time error occurs. */
4293
4294 else if (ctype == OP_EXTUNI)
4295 {
4296 for (i = 1; i <= min; i++)
4297 {
4298 if (eptr >= md->end_subject)
4299 {
4300 SCHECK_PARTIAL();
4301 RRETURN(MATCH_NOMATCH);
4302 }
4303 else
4304 {
4305 int lgb, rgb;
4306 GETCHARINCTEST(c, eptr);
4307 lgb = UCD_GRAPHBREAK(c);
4308 while (eptr < md->end_subject)
4309 {
4310 int len = 1;
4311 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4312 rgb = UCD_GRAPHBREAK(c);
4313 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4314 lgb = rgb;
4315 eptr += len;
4316 }
4317 }
4318 CHECK_PARTIAL();
4319 }
4320 }
4321
4322 else
4323 #endif /* SUPPORT_UCP */
4324
4325 /* Handle all other cases when the coding is UTF-8 */
4326
4327 #ifdef SUPPORT_UTF
4328 if (utf) switch(ctype)
4329 {
4330 case OP_ANY:
4331 for (i = 1; i <= min; i++)
4332 {
4333 if (eptr >= md->end_subject)
4334 {
4335 SCHECK_PARTIAL();
4336 RRETURN(MATCH_NOMATCH);
4337 }
4338 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4339 if (md->partial != 0 &&
4340 eptr + 1 >= md->end_subject &&
4341 NLBLOCK->nltype == NLTYPE_FIXED &&
4342 NLBLOCK->nllen == 2 &&
4343 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4344 {
4345 md->hitend = TRUE;
4346 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4347 }
4348 eptr++;
4349 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4350 }
4351 break;
4352
4353 case OP_ALLANY:
4354 for (i = 1; i <= min; i++)
4355 {
4356 if (eptr >= md->end_subject)
4357 {
4358 SCHECK_PARTIAL();
4359 RRETURN(MATCH_NOMATCH);
4360 }
4361 eptr++;
4362 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4363 }
4364 break;
4365
4366 case OP_ANYBYTE:
4367 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4368 eptr += min;
4369 break;
4370
4371 case OP_ANYNL:
4372 for (i = 1; i <= min; i++)
4373 {
4374 if (eptr >= md->end_subject)
4375 {
4376 SCHECK_PARTIAL();
4377 RRETURN(MATCH_NOMATCH);
4378 }
4379 GETCHARINC(c, eptr);
4380 switch(c)
4381 {
4382 default: RRETURN(MATCH_NOMATCH);
4383
4384 case CHAR_CR:
4385 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4386 break;
4387
4388 case CHAR_LF:
4389 break;
4390
4391 case CHAR_VT:
4392 case CHAR_FF:
4393 case CHAR_NEL:
4394 #ifndef EBCDIC
4395 case 0x2028:
4396 case 0x2029:
4397 #endif /* Not EBCDIC */
4398 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4399 break;
4400 }
4401 }
4402 break;
4403
4404 case OP_NOT_HSPACE:
4405 for (i = 1; i <= min; i++)
4406 {
4407 if (eptr >= md->end_subject)
4408 {
4409 SCHECK_PARTIAL();
4410 RRETURN(MATCH_NOMATCH);
4411 }
4412 GETCHARINC(c, eptr);
4413 switch(c)
4414 {
4415 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4416 default: break;
4417 }
4418 }
4419 break;
4420
4421 case OP_HSPACE:
4422 for (i = 1; i <= min; i++)
4423 {
4424 if (eptr >= md->end_subject)
4425 {
4426 SCHECK_PARTIAL();
4427 RRETURN(MATCH_NOMATCH);
4428 }
4429 GETCHARINC(c, eptr);
4430 switch(c)
4431 {
4432 HSPACE_CASES: break; /* Byte and multibyte cases */
4433 default: RRETURN(MATCH_NOMATCH);
4434 }
4435 }
4436 break;
4437
4438 case OP_NOT_VSPACE:
4439 for (i = 1; i <= min; i++)
4440 {
4441 if (eptr >= md->end_subject)
4442 {
4443 SCHECK_PARTIAL();
4444 RRETURN(MATCH_NOMATCH);
4445 }
4446 GETCHARINC(c, eptr);
4447 switch(c)
4448 {
4449 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4450 default: break;
4451 }
4452 }
4453 break;
4454
4455 case OP_VSPACE:
4456 for (i = 1; i <= min; i++)
4457 {
4458 if (eptr >= md->end_subject)
4459 {
4460 SCHECK_PARTIAL();
4461 RRETURN(MATCH_NOMATCH);
4462 }
4463 GETCHARINC(c, eptr);
4464 switch(c)
4465 {
4466 VSPACE_CASES: break;
4467 default: RRETURN(MATCH_NOMATCH);
4468 }
4469 }
4470 break;
4471
4472 case OP_NOT_DIGIT:
4473 for (i = 1; i <= min; i++)
4474 {
4475 if (eptr >= md->end_subject)
4476 {
4477 SCHECK_PARTIAL();
4478 RRETURN(MATCH_NOMATCH);
4479 }
4480 GETCHARINC(c, eptr);
4481 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4482 RRETURN(MATCH_NOMATCH);
4483 }
4484 break;
4485
4486 case OP_DIGIT:
4487 for (i = 1; i <= min; i++)
4488 {
4489 pcre_uint32 cc;
4490 if (eptr >= md->end_subject)
4491 {
4492 SCHECK_PARTIAL();
4493 RRETURN(MATCH_NOMATCH);
4494 }
4495 cc = RAWUCHAR(eptr);
4496 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4497 RRETURN(MATCH_NOMATCH);
4498 eptr++;
4499 /* No need to skip more bytes - we know it's a 1-byte character */
4500 }
4501 break;
4502
4503 case OP_NOT_WHITESPACE:
4504 for (i = 1; i <= min; i++)
4505 {
4506 pcre_uint32 cc;
4507 if (eptr >= md->end_subject)
4508 {
4509 SCHECK_PARTIAL();
4510 RRETURN(MATCH_NOMATCH);
4511 }
4512 cc = RAWUCHAR(eptr);
4513 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4514 RRETURN(MATCH_NOMATCH);
4515 eptr++;
4516 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4517 }
4518 break;
4519
4520 case OP_WHITESPACE:
4521 for (i = 1; i <= min; i++)
4522 {
4523 pcre_uint32 cc;
4524 if (eptr >= md->end_subject)
4525 {
4526 SCHECK_PARTIAL();
4527 RRETURN(MATCH_NOMATCH);
4528 }
4529 cc = RAWUCHAR(eptr);
4530 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4531 RRETURN(MATCH_NOMATCH);
4532 eptr++;
4533 /* No need to skip more bytes - we know it's a 1-byte character */
4534 }
4535 break;
4536
4537 case OP_NOT_WORDCHAR:
4538 for (i = 1; i <= min; i++)
4539 {
4540 pcre_uint32 cc;
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 RRETURN(MATCH_NOMATCH);
4545 }
4546 cc = RAWUCHAR(eptr);
4547 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4548 RRETURN(MATCH_NOMATCH);
4549 eptr++;
4550 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4551 }
4552 break;
4553
4554 case OP_WORDCHAR:
4555 for (i = 1; i <= min; i++)
4556 {
4557 pcre_uint32 cc;
4558 if (eptr >= md->end_subject)
4559 {
4560 SCHECK_PARTIAL();
4561 RRETURN(MATCH_NOMATCH);
4562 }
4563 cc = RAWUCHAR(eptr);
4564 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4565 RRETURN(MATCH_NOMATCH);
4566 eptr++;
4567 /* No need to skip more bytes - we know it's a 1-byte character */
4568 }
4569 break;
4570
4571 default:
4572 RRETURN(PCRE_ERROR_INTERNAL);
4573 } /* End switch(ctype) */
4574
4575 else
4576 #endif /* SUPPORT_UTF */
4577
4578 /* Code for the non-UTF-8 case for minimum matching of operators other
4579 than OP_PROP and OP_NOTPROP. */
4580
4581 switch(ctype)
4582 {
4583 case OP_ANY:
4584 for (i = 1; i <= min; i++)
4585 {
4586 if (eptr >= md->end_subject)
4587 {
4588 SCHECK_PARTIAL();
4589 RRETURN(MATCH_NOMATCH);
4590 }
4591 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4592 if (md->partial != 0 &&
4593 eptr + 1 >= md->end_subject &&
4594 NLBLOCK->nltype == NLTYPE_FIXED &&
4595 NLBLOCK->nllen == 2 &&
4596 *eptr == NLBLOCK->nl[0])
4597 {
4598 md->hitend = TRUE;
4599 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4600 }
4601 eptr++;
4602 }
4603 break;
4604
4605 case OP_ALLANY:
4606 if (eptr > md->end_subject - min)
4607 {
4608 SCHECK_PARTIAL();
4609 RRETURN(MATCH_NOMATCH);
4610 }
4611 eptr += min;
4612 break;
4613
4614 case OP_ANYBYTE:
4615 if (eptr > md->end_subject - min)
4616 {
4617 SCHECK_PARTIAL();
4618 RRETURN(MATCH_NOMATCH);
4619 }
4620 eptr += min;
4621 break;
4622
4623 case OP_ANYNL:
4624 for (i = 1; i <= min; i++)
4625 {
4626 if (eptr >= md->end_subject)
4627 {
4628 SCHECK_PARTIAL();
4629 RRETURN(MATCH_NOMATCH);
4630 }
4631 switch(*eptr++)
4632 {
4633 default: RRETURN(MATCH_NOMATCH);
4634
4635 case CHAR_CR:
4636 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4637 break;
4638
4639 case CHAR_LF:
4640 break;
4641
4642 case CHAR_VT:
4643 case CHAR_FF:
4644 case CHAR_NEL:
4645 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4646 case 0x2028:
4647 case 0x2029:
4648 #endif
4649 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4650 break;
4651 }
4652 }
4653 break;
4654
4655 case OP_NOT_HSPACE:
4656 for (i = 1; i <= min; i++)
4657 {
4658 if (eptr >= md->end_subject)
4659 {
4660 SCHECK_PARTIAL();
4661 RRETURN(MATCH_NOMATCH);
4662 }
4663 switch(*eptr++)
4664 {
4665 default: break;
4666 HSPACE_BYTE_CASES:
4667 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4668 HSPACE_MULTIBYTE_CASES:
4669 #endif
4670 RRETURN(MATCH_NOMATCH);
4671 }
4672 }
4673 break;
4674
4675 case OP_HSPACE:
4676 for (i = 1; i <= min; i++)
4677 {
4678 if (eptr >= md->end_subject)
4679 {
4680 SCHECK_PARTIAL();
4681 RRETURN(MATCH_NOMATCH);
4682 }
4683 switch(*eptr++)
4684 {
4685 default: RRETURN(MATCH_NOMATCH);
4686 HSPACE_BYTE_CASES:
4687 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4688 HSPACE_MULTIBYTE_CASES:
4689 #endif
4690 break;
4691 }
4692 }
4693 break;
4694
4695 case OP_NOT_VSPACE:
4696 for (i = 1; i <= min; i++)
4697 {
4698 if (eptr >= md->end_subject)
4699 {
4700 SCHECK_PARTIAL();
4701 RRETURN(MATCH_NOMATCH);
4702 }
4703 switch(*eptr++)
4704 {
4705 VSPACE_BYTE_CASES:
4706 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4707 VSPACE_MULTIBYTE_CASES:
4708 #endif
4709 RRETURN(MATCH_NOMATCH);
4710 default: break;
4711 }
4712 }
4713 break;
4714
4715 case OP_VSPACE:
4716 for (i = 1; i <= min; i++)
4717 {
4718 if (eptr >= md->end_subject)
4719 {
4720 SCHECK_PARTIAL();
4721 RRETURN(MATCH_NOMATCH);
4722 }
4723 switch(*eptr++)
4724 {
4725 default: RRETURN(MATCH_NOMATCH);
4726 VSPACE_BYTE_CASES:
4727 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4728 VSPACE_MULTIBYTE_CASES:
4729 #endif
4730 break;
4731 }
4732 }
4733 break;
4734
4735 case OP_NOT_DIGIT:
4736 for (i = 1; i <= min; i++)
4737 {
4738 if (eptr >= md->end_subject)
4739 {
4740 SCHECK_PARTIAL();
4741 RRETURN(MATCH_NOMATCH);
4742 }
4743 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4744 RRETURN(MATCH_NOMATCH);
4745 eptr++;
4746 }
4747 break;
4748
4749 case OP_DIGIT:
4750 for (i = 1; i <= min; i++)
4751 {
4752 if (eptr >= md->end_subject)
4753 {
4754 SCHECK_PARTIAL();
4755 RRETURN(MATCH_NOMATCH);
4756 }
4757 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4758 RRETURN(MATCH_NOMATCH);
4759 eptr++;
4760 }
4761 break;
4762
4763 case OP_NOT_WHITESPACE:
4764 for (i = 1; i <= min; i++)
4765 {
4766 if (eptr >= md->end_subject)
4767 {
4768 SCHECK_PARTIAL();
4769 RRETURN(MATCH_NOMATCH);
4770 }
4771 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4772 RRETURN(MATCH_NOMATCH);
4773 eptr++;
4774 }
4775 break;
4776
4777 case OP_WHITESPACE:
4778 for (i = 1; i <= min; i++)
4779 {
4780 if (eptr >= md->end_subject)
4781 {
4782 SCHECK_PARTIAL();
4783 RRETURN(MATCH_NOMATCH);
4784 }
4785 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4786 RRETURN(MATCH_NOMATCH);
4787 eptr++;
4788 }
4789 break;
4790
4791 case OP_NOT_WORDCHAR:
4792 for (i = 1; i <= min; i++)
4793 {
4794 if (eptr >= md->end_subject)
4795 {
4796 SCHECK_PARTIAL();
4797 RRETURN(MATCH_NOMATCH);
4798 }
4799 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4800 RRETURN(MATCH_NOMATCH);
4801 eptr++;
4802 }
4803 break;
4804
4805 case OP_WORDCHAR:
4806 for (i = 1; i <= min; i++)
4807 {
4808 if (eptr >= md->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 RRETURN(MATCH_NOMATCH);
4812 }
4813 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4814 RRETURN(MATCH_NOMATCH);
4815 eptr++;
4816 }
4817 break;
4818
4819 default:
4820 RRETURN(PCRE_ERROR_INTERNAL);
4821 }
4822 }
4823
4824 /* If min = max, continue at the same level without recursing */
4825
4826 if (min == max) continue;
4827
4828 /* If minimizing, we have to test the rest of the pattern before each
4829 subsequent match. Again, separate the UTF-8 case for speed, and also
4830 separate the UCP cases. */
4831
4832 if (minimize)
4833 {
4834 #ifdef SUPPORT_UCP
4835 if (prop_type >= 0)
4836 {
4837 switch(prop_type)
4838 {
4839 case PT_ANY:
4840 for (fi = min;; fi++)
4841 {
4842 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4844 if (fi >= max) RRETURN(MATCH_NOMATCH);
4845 if (eptr >= md->end_subject)
4846 {
4847 SCHECK_PARTIAL();
4848 RRETURN(MATCH_NOMATCH);
4849 }
4850 GETCHARINCTEST(c, eptr);
4851 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4852 }
4853 /* Control never gets here */
4854
4855 case PT_LAMP:
4856 for (fi = min;; fi++)
4857 {
4858 int chartype;
4859 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4861 if (fi >= max) RRETURN(MATCH_NOMATCH);
4862 if (eptr >= md->end_subject)
4863 {
4864 SCHECK_PARTIAL();
4865 RRETURN(MATCH_NOMATCH);
4866 }
4867 GETCHARINCTEST(c, eptr);
4868 chartype = UCD_CHARTYPE(c);
4869 if ((chartype == ucp_Lu ||
4870 chartype == ucp_Ll ||
4871 chartype == ucp_Lt) == prop_fail_result)
4872 RRETURN(MATCH_NOMATCH);
4873 }
4874 /* Control never gets here */
4875
4876 case PT_GC:
4877 for (fi = min;; fi++)
4878 {
4879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4881 if (fi >= max) RRETURN(MATCH_NOMATCH);
4882 if (eptr >= md->end_subject)
4883 {
4884 SCHECK_PARTIAL();
4885 RRETURN(MATCH_NOMATCH);
4886 }
4887 GETCHARINCTEST(c, eptr);
4888 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4889 RRETURN(MATCH_NOMATCH);
4890 }
4891 /* Control never gets here */
4892
4893 case PT_PC:
4894 for (fi = min;; fi++)
4895 {
4896 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4898 if (fi >= max) RRETURN(MATCH_NOMATCH);
4899 if (eptr >= md->end_subject)
4900 {
4901 SCHECK_PARTIAL();
4902 RRETURN(MATCH_NOMATCH);
4903 }
4904 GETCHARINCTEST(c, eptr);
4905 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4906 RRETURN(MATCH_NOMATCH);
4907 }
4908 /* Control never gets here */
4909
4910 case PT_SC:
4911 for (fi = min;; fi++)
4912 {
4913 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4915 if (fi >= max) RRETURN(MATCH_NOMATCH);
4916 if (eptr >= md->end_subject)
4917 {
4918 SCHECK_PARTIAL();
4919 RRETURN(MATCH_NOMATCH);
4920 }
4921 GETCHARINCTEST(c, eptr);
4922 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4923 RRETURN(MATCH_NOMATCH);
4924 }
4925 /* Control never gets here */
4926
4927 case PT_ALNUM:
4928 for (fi = min;; fi++)
4929 {
4930 int category;
4931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4933 if (fi >= max) RRETURN(MATCH_NOMATCH);
4934 if (eptr >= md->end_subject)
4935 {
4936 SCHECK_PARTIAL();
4937 RRETURN(MATCH_NOMATCH);
4938 }
4939 GETCHARINCTEST(c, eptr);
4940 category = UCD_CATEGORY(c);
4941 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4942 RRETURN(MATCH_NOMATCH);
4943 }
4944 /* Control never gets here */
4945
4946 case PT_SPACE: /* Perl space */
4947 for (fi = min;; fi++)
4948 {
4949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4951 if (fi >= max) RRETURN(MATCH_NOMATCH);
4952 if (eptr >= md->end_subject)
4953 {
4954 SCHECK_PARTIAL();
4955 RRETURN(MATCH_NOMATCH);
4956 }
4957 GETCHARINCTEST(c, eptr);
4958 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4959 c == CHAR_FF || c == CHAR_CR)
4960 == prop_fail_result)
4961 RRETURN(MATCH_NOMATCH);
4962 }
4963 /* Control never gets here */
4964
4965 case PT_PXSPACE: /* POSIX space */
4966 for (fi = min;; fi++)
4967 {
4968 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4970 if (fi >= max) RRETURN(MATCH_NOMATCH);
4971 if (eptr >= md->end_subject)
4972 {
4973 SCHECK_PARTIAL();
4974 RRETURN(MATCH_NOMATCH);
4975 }
4976 GETCHARINCTEST(c, eptr);
4977 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4978 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4979 == prop_fail_result)
4980 RRETURN(MATCH_NOMATCH);
4981 }
4982 /* Control never gets here */
4983
4984 case PT_WORD:
4985 for (fi = min;; fi++)
4986 {
4987 int category;
4988 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4990 if (fi >= max) RRETURN(MATCH_NOMATCH);
4991 if (eptr >= md->end_subject)
4992 {
4993 SCHECK_PARTIAL();
4994 RRETURN(MATCH_NOMATCH);
4995 }
4996 GETCHARINCTEST(c, eptr);
4997 category = UCD_CATEGORY(c);
4998 if ((category == ucp_L ||
4999 category == ucp_N ||
5000 c == CHAR_UNDERSCORE)
5001 == prop_fail_result)
5002 RRETURN(MATCH_NOMATCH);
5003 }
5004 /* Control never gets here */
5005
5006 case PT_CLIST:
5007 for (fi = min;; fi++)
5008 {
5009 const pcre_uint32 *cp;
5010 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5011 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5012 if (fi >= max) RRETURN(MATCH_NOMATCH);
5013 if (eptr >= md->end_subject)
5014 {
5015 SCHECK_PARTIAL();
5016 RRETURN(MATCH_NOMATCH);
5017 }
5018 GETCHARINCTEST(c, eptr);
5019 cp = PRIV(ucd_caseless_sets) + prop_value;
5020 for (;;)
5021 {
5022 if (c < *cp)
5023 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5024 if (c == *cp++)
5025 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5026 }
5027 }
5028 /* Control never gets here */
5029
5030 case PT_UCNC:
5031 for (fi = min;; fi++)
5032 {
5033 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5035 if (fi >= max) RRETURN(MATCH_NOMATCH);
5036 if (eptr >= md->end_subject)
5037 {
5038 SCHECK_PARTIAL();
5039 RRETURN(MATCH_NOMATCH);
5040 }
5041 GETCHARINCTEST(c, eptr);
5042 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5043 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5044 c >= 0xe000) == prop_fail_result)
5045 RRETURN(MATCH_NOMATCH);
5046 }
5047 /* Control never gets here */
5048
5049 /* This should never occur */
5050 default:
5051 RRETURN(PCRE_ERROR_INTERNAL);
5052 }
5053 }
5054
5055 /* Match extended Unicode sequences. We will get here only if the
5056 support is in the binary; otherwise a compile-time error occurs. */
5057
5058 else if (ctype == OP_EXTUNI)
5059 {
5060 for (fi = min;; fi++)
5061 {
5062 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5063 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5064 if (fi >= max) RRETURN(MATCH_NOMATCH);
5065 if (eptr >= md->end_subject)
5066 {
5067 SCHECK_PARTIAL();
5068 RRETURN(MATCH_NOMATCH);
5069 }
5070 else
5071 {
5072 int lgb, rgb;
5073 GETCHARINCTEST(c, eptr);
5074 lgb = UCD_GRAPHBREAK(c);
5075 while (eptr < md->end_subject)
5076 {
5077 int len = 1;
5078 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5079 rgb = UCD_GRAPHBREAK(c);
5080 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5081 lgb = rgb;
5082 eptr += len;
5083 }
5084 }
5085 CHECK_PARTIAL();
5086 }
5087 }
5088 else
5089 #endif /* SUPPORT_UCP */
5090
5091 #ifdef SUPPORT_UTF
5092 if (utf)
5093 {
5094 for (fi = min;; fi++)
5095 {
5096 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5097 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5098 if (fi >= max) RRETURN(MATCH_NOMATCH);
5099 if (eptr >= md->end_subject)
5100 {
5101 SCHECK_PARTIAL();
5102 RRETURN(MATCH_NOMATCH);
5103 }
5104 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5105 RRETURN(MATCH_NOMATCH);
5106 GETCHARINC(c, eptr);
5107 switch(ctype)
5108 {
5109 case OP_ANY: /* This is the non-NL case */
5110 if (md->partial != 0 && /* Take care with CRLF partial */
5111 eptr >= md->end_subject &&
5112 NLBLOCK->nltype == NLTYPE_FIXED &&
5113 NLBLOCK->nllen == 2 &&
5114 c == NLBLOCK->nl[0])
5115 {
5116 md->hitend = TRUE;
5117 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5118 }
5119 break;
5120
5121 case OP_ALLANY:
5122 case OP_ANYBYTE:
5123 break;
5124
5125 case OP_ANYNL:
5126 switch(c)
5127 {
5128 default: RRETURN(MATCH_NOMATCH);
5129 case CHAR_CR:
5130 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5131 break;
5132
5133 case CHAR_LF:
5134 break;
5135
5136 case CHAR_VT:
5137 case CHAR_FF:
5138 case CHAR_NEL:
5139 #ifndef EBCDIC
5140 case 0x2028:
5141 case 0x2029:
5142 #endif /* Not EBCDIC */
5143 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5144 break;
5145 }
5146 break;
5147
5148 case OP_NOT_HSPACE:
5149 switch(c)
5150 {
5151 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5152 default: break;
5153 }
5154 break;
5155
5156 case OP_HSPACE:
5157 switch(c)
5158 {
5159 HSPACE_CASES: break;
5160 default: RRETURN(MATCH_NOMATCH);
5161 }
5162 break;
5163
5164 case OP_NOT_VSPACE:
5165 switch(c)
5166 {
5167 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5168 default: break;
5169 }
5170 break;
5171
5172 case OP_VSPACE:
5173 switch(c)
5174 {
5175 VSPACE_CASES: break;
5176 default: RRETURN(MATCH_NOMATCH);
5177 }
5178 break;
5179
5180 case OP_NOT_DIGIT:
5181 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5182 RRETURN(MATCH_NOMATCH);
5183 break;
5184
5185 case OP_DIGIT:
5186 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5187 RRETURN(MATCH_NOMATCH);
5188 break;
5189
5190 case OP_NOT_WHITESPACE:
5191 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5192 RRETURN(MATCH_NOMATCH);
5193 break;
5194
5195 case OP_WHITESPACE:
5196 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5197 RRETURN(MATCH_NOMATCH);
5198 break;
5199
5200 case OP_NOT_WORDCHAR:
5201 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5202 RRETURN(MATCH_NOMATCH);
5203 break;
5204
5205 case OP_WORDCHAR:
5206 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5207 RRETURN(MATCH_NOMATCH);
5208 break;
5209
5210 default:
5211 RRETURN(PCRE_ERROR_INTERNAL);
5212 }
5213 }
5214 }
5215 else
5216 #endif
5217 /* Not UTF mode */
5218 {
5219 for (fi = min;; fi++)
5220 {
5221 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5222 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5223 if (fi >= max) RRETURN(MATCH_NOMATCH);
5224 if (eptr >= md->end_subject)
5225 {
5226 SCHECK_PARTIAL();
5227 RRETURN(MATCH_NOMATCH);
5228 }
5229 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5230 RRETURN(MATCH_NOMATCH);
5231 c = *eptr++;
5232 switch(ctype)
5233 {
5234 case OP_ANY: /* This is the non-NL case */
5235 if (md->partial != 0 && /* Take care with CRLF partial */
5236 eptr >= md->end_subject &&
5237 NLBLOCK->nltype == NLTYPE_FIXED &&
5238 NLBLOCK->nllen == 2 &&
5239 c == NLBLOCK->nl[0])
5240 {
5241 md->hitend = TRUE;
5242 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5243 }
5244 break;
5245
5246 case OP_ALLANY:
5247 case OP_ANYBYTE:
5248 break;
5249
5250 case OP_ANYNL:
5251 switch(c)
5252 {
5253 default: RRETURN(MATCH_NOMATCH);
5254 case CHAR_CR:
5255 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5256 break;
5257
5258 case CHAR_LF:
5259 break;
5260
5261 case CHAR_VT:
5262 case CHAR_FF:
5263 case CHAR_NEL:
5264 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5265 case 0x2028:
5266 case 0x2029:
5267 #endif
5268 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5269 break;
5270 }
5271 break;
5272
5273 case OP_NOT_HSPACE:
5274 switch(c)
5275 {
5276 default: break;
5277 HSPACE_BYTE_CASES:
5278 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5279 HSPACE_MULTIBYTE_CASES:
5280 #endif
5281 RRETURN(MATCH_NOMATCH);
5282 }
5283 break;
5284
5285 case OP_HSPACE:
5286 switch(c)
5287 {
5288 default: RRETURN(MATCH_NOMATCH);
5289 HSPACE_BYTE_CASES:
5290 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5291 HSPACE_MULTIBYTE_CASES:
5292 #endif
5293 break;
5294 }
5295 break;
5296
5297 case OP_NOT_VSPACE:
5298 switch(c)
5299 {
5300 default: break;
5301 VSPACE_BYTE_CASES:
5302 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5303 VSPACE_MULTIBYTE_CASES:
5304 #endif
5305 RRETURN(MATCH_NOMATCH);
5306 }
5307 break;
5308
5309 case OP_VSPACE:
5310 switch(c)
5311 {
5312 default: RRETURN(MATCH_NOMATCH);
5313 VSPACE_BYTE_CASES:
5314 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5315 VSPACE_MULTIBYTE_CASES:
5316 #endif
5317 break;
5318 }
5319 break;
5320
5321 case OP_NOT_DIGIT:
5322 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5323 break;
5324
5325 case OP_DIGIT:
5326 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5327 break;
5328
5329 case OP_NOT_WHITESPACE:
5330 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5331 break;
5332
5333 case OP_WHITESPACE:
5334 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5335 break;
5336
5337 case OP_NOT_WORDCHAR:
5338 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5339 break;
5340
5341 case OP_WORDCHAR:
5342 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5343 break;
5344
5345 default:
5346 RRETURN(PCRE_ERROR_INTERNAL);
5347 }
5348 }
5349 }
5350 /* Control never gets here */
5351 }
5352
5353 /* If maximizing, it is worth using inline code for speed, doing the type
5354 test once at the start (i.e. keep it out of the loop). Again, keep the
5355 UTF-8 and UCP stuff separate. */
5356
5357 else
5358 {
5359 pp = eptr; /* Remember where we started */
5360
5361 #ifdef SUPPORT_UCP
5362 if (prop_type >= 0)
5363 {
5364 switch(prop_type)
5365 {
5366 case PT_ANY:
5367 for (i = min; i < max; i++)
5368 {
5369 int len = 1;
5370 if (eptr >= md->end_subject)
5371 {
5372 SCHECK_PARTIAL();
5373 break;
5374 }
5375 GETCHARLENTEST(c, eptr, len);
5376 if (prop_fail_result) break;
5377 eptr+= len;
5378 }
5379 break;
5380
5381 case PT_LAMP:
5382 for (i = min; i < max; i++)
5383 {
5384 int chartype;
5385 int len = 1;
5386 if (eptr >= md->end_subject)
5387 {
5388 SCHECK_PARTIAL();
5389 break;
5390 }
5391 GETCHARLENTEST(c, eptr, len);
5392 chartype = UCD_CHARTYPE(c);
5393 if ((chartype == ucp_Lu ||
5394 chartype == ucp_Ll ||
5395 chartype == ucp_Lt) == prop_fail_result)
5396 break;
5397 eptr+= len;
5398 }
5399 break;
5400
5401 case PT_GC:
5402 for (i = min; i < max; i++)
5403 {
5404 int len = 1;
5405 if (eptr >= md->end_subject)
5406 {
5407 SCHECK_PARTIAL();
5408 break;
5409 }
5410 GETCHARLENTEST(c, eptr, len);
5411 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5412 eptr+= len;
5413 }
5414 break;
5415
5416 case PT_PC:
5417 for (i = min; i < max; i++)
5418 {
5419 int len = 1;
5420 if (eptr >= md->end_subject)
5421 {
5422 SCHECK_PARTIAL();
5423 break;
5424 }
5425 GETCHARLENTEST(c, eptr, len);
5426 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5427 eptr+= len;
5428 }
5429 break;
5430
5431 case PT_SC:
5432 for (i = min; i < max; i++)
5433 {
5434 int len = 1;
5435 if (eptr >= md->end_subject)
5436 {
5437 SCHECK_PARTIAL();
5438 break;
5439 }
5440 GETCHARLENTEST(c, eptr, len);
5441 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5442 eptr+= len;
5443 }
5444 break;
5445
5446 case PT_ALNUM:
5447 for (i = min; i < max; i++)
5448 {
5449 int category;
5450 int len = 1;
5451 if (eptr >= md->end_subject)
5452 {
5453 SCHECK_PARTIAL();
5454 break;
5455 }
5456 GETCHARLENTEST(c, eptr, len);
5457 category = UCD_CATEGORY(c);
5458 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5459 break;
5460 eptr+= len;
5461 }
5462 break;
5463
5464 case PT_SPACE: /* Perl space */
5465 for (i = min; i < max; i++)
5466 {
5467 int len = 1;
5468 if (eptr >= md->end_subject)
5469 {
5470 SCHECK_PARTIAL();
5471 break;
5472 }
5473 GETCHARLENTEST(c, eptr, len);
5474 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5475 c == CHAR_FF || c == CHAR_CR)
5476 == prop_fail_result)
5477 break;
5478 eptr+= len;
5479 }
5480 break;
5481
5482 case PT_PXSPACE: /* POSIX space */
5483 for (i = min; i < max; i++)
5484 {
5485 int len = 1;
5486 if (eptr >= md->end_subject)
5487 {
5488 SCHECK_PARTIAL();
5489 break;
5490 }
5491 GETCHARLENTEST(c, eptr, len);
5492 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5493 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5494 == prop_fail_result)
5495 break;
5496 eptr+= len;
5497 }
5498 break;
5499
5500 case PT_WORD:
5501 for (i = min; i < max; i++)
5502 {
5503 int category;
5504 int len = 1;
5505 if (eptr >= md->end_subject)
5506 {
5507 SCHECK_PARTIAL();
5508 break;
5509 }
5510 GETCHARLENTEST(c, eptr, len);
5511 category = UCD_CATEGORY(c);
5512 if ((category == ucp_L || category == ucp_N ||
5513 c == CHAR_UNDERSCORE) == prop_fail_result)
5514 break;
5515 eptr+= len;
5516 }
5517 break;
5518
5519 case PT_CLIST:
5520 for (i = min; i < max; i++)
5521 {
5522 const pcre_uint32 *cp;
5523 int len = 1;
5524 if (eptr >= md->end_subject)
5525 {
5526 SCHECK_PARTIAL();
5527 break;
5528 }
5529 GETCHARLENTEST(c, eptr, len);
5530 cp = PRIV(ucd_caseless_sets) + prop_value;
5531 for (;;)
5532 {
5533 if (c < *cp)
5534 { if (prop_fail_result) break; else goto GOT_MAX; }
5535 if (c == *cp++)
5536 { if (prop_fail_result) goto GOT_MAX; else break; }
5537 }
5538 eptr += len;
5539 }
5540 GOT_MAX:
5541 break;
5542
5543 case PT_UCNC:
5544 for (i = min; i < max; i++)
5545 {
5546 int len = 1;
5547 if (eptr >= md->end_subject)
5548 {
5549 SCHECK_PARTIAL();
5550 break;
5551 }
5552 GETCHARLENTEST(c, eptr, len);
5553 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5554 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5555 c >= 0xe000) == prop_fail_result)
5556 break;
5557 eptr += len;
5558 }
5559 break;
5560
5561 default:
5562 RRETURN(PCRE_ERROR_INTERNAL);
5563 }
5564
5565 /* eptr is now past the end of the maximum run */
5566
5567 if (possessive) continue;
5568 for(;;)
5569 {
5570 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5571 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5572 if (eptr-- == pp) break; /* Stop if tried at original pos */
5573 if (utf) BACKCHAR(eptr);
5574 }
5575 }
5576
5577 /* Match extended Unicode sequences. We will get here only if the
5578 support is in the binary; otherwise a compile-time error occurs. */
5579
5580 else if (ctype == OP_EXTUNI)
5581 {
5582 for (i = min; i < max; i++)
5583 {
5584 if (eptr >= md->end_subject)
5585 {
5586 SCHECK_PARTIAL();
5587 break;
5588 }
5589 else
5590 {
5591 int lgb, rgb;
5592 GETCHARINCTEST(c, eptr);
5593 lgb = UCD_GRAPHBREAK(c);
5594 while (eptr < md->end_subject)
5595 {
5596 int len = 1;
5597 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5598 rgb = UCD_GRAPHBREAK(c);
5599 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5600 lgb = rgb;
5601 eptr += len;
5602 }
5603 }
5604 CHECK_PARTIAL();
5605 }
5606
5607 /* eptr is now past the end of the maximum run */
5608
5609 if (possessive) continue;
5610
5611 for(;;)
5612 {
5613 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5615 if (eptr-- == pp) break; /* Stop if tried at original pos */
5616 for (;;) /* Move back over one extended */
5617 {
5618 if (!utf) c = *eptr; else
5619 {
5620 BACKCHAR(eptr);
5621 GETCHAR(c, eptr);
5622 }
5623 if (UCD_CATEGORY(c) != ucp_M) break;
5624 eptr--;
5625 }
5626 }
5627 }
5628
5629 else
5630 #endif /* SUPPORT_UCP */
5631
5632 #ifdef SUPPORT_UTF
5633 if (utf)
5634 {
5635 switch(ctype)
5636 {
5637 case OP_ANY:
5638 if (max < INT_MAX)
5639 {
5640 for (i = min; i < max; i++)
5641 {
5642 if (eptr >= md->end_subject)
5643 {
5644 SCHECK_PARTIAL();
5645 break;
5646 }
5647 if (IS_NEWLINE(eptr)) break;
5648 if (md->partial != 0 && /* Take care with CRLF partial */
5649 eptr + 1 >= md->end_subject &&
5650 NLBLOCK->nltype == NLTYPE_FIXED &&
5651 NLBLOCK->nllen == 2 &&
5652 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5653 {
5654 md->hitend = TRUE;
5655 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5656 }
5657 eptr++;
5658 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5659 }
5660 }
5661
5662 /* Handle unlimited UTF-8 repeat */
5663
5664 else
5665 {
5666 for (i = min; i < max; i++)
5667 {
5668 if (eptr >= md->end_subject)
5669 {
5670 SCHECK_PARTIAL();
5671 break;
5672 }
5673 if (IS_NEWLINE(eptr)) break;
5674 if (md->partial != 0 && /* Take care with CRLF partial */
5675 eptr + 1 >= md->end_subject &&
5676 NLBLOCK->nltype == NLTYPE_FIXED &&
5677 NLBLOCK->nllen == 2 &&
5678 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5679 {
5680 md->hitend = TRUE;
5681 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5682 }
5683 eptr++;
5684 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5685 }
5686 }
5687 break;
5688
5689 case OP_ALLANY:
5690 if (max < INT_MAX)
5691 {
5692 for (i = min; i < max; i++)
5693 {
5694 if (eptr >= md->end_subject)
5695 {
5696 SCHECK_PARTIAL();
5697 break;
5698 }
5699 eptr++;
5700 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5701 }
5702 }
5703 else
5704 {
5705 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5706 SCHECK_PARTIAL();
5707 }
5708 break;
5709
5710 /* The byte case is the same as non-UTF8 */
5711
5712 case OP_ANYBYTE:
5713 c = max - min;
5714 if (c > (unsigned int)(md->end_subject - eptr))
5715 {
5716 eptr = md->end_subject;
5717 SCHECK_PARTIAL();
5718 }
5719 else eptr += c;
5720 break;
5721
5722 case OP_ANYNL:
5723 for (i = min; i < max; i++)
5724 {
5725 int len = 1;
5726 if (eptr >= md->end_subject)
5727 {
5728 SCHECK_PARTIAL();
5729 break;
5730 }
5731 GETCHARLEN(c, eptr, len);
5732 if (c == CHAR_CR)
5733 {
5734 if (++eptr >= md->end_subject) break;
5735 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5736 }
5737 else
5738 {
5739 if (c != CHAR_LF &&
5740 (md->bsr_anycrlf ||
5741 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5742 #ifndef EBCDIC
5743 && c != 0x2028 && c != 0x2029
5744 #endif /* Not EBCDIC */
5745 )))
5746 break;
5747 eptr += len;
5748 }
5749 }
5750 break;
5751
5752 case OP_NOT_HSPACE:
5753 case OP_HSPACE:
5754 for (i = min; i < max; i++)
5755 {
5756 BOOL gotspace;
5757 int len = 1;
5758 if (eptr >= md->end_subject)
5759 {
5760 SCHECK_PARTIAL();
5761 break;
5762 }
5763 GETCHARLEN(c, eptr, len);
5764 switch(c)
5765 {
5766 HSPACE_CASES: gotspace = TRUE; break;
5767 default: gotspace = FALSE; break;
5768 }
5769 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5770 eptr += len;
5771 }
5772 break;
5773
5774 case OP_NOT_VSPACE:
5775 case OP_VSPACE:
5776 for (i = min; i < max; i++)
5777 {
5778 BOOL gotspace;
5779 int len = 1;
5780 if (eptr >= md->end_subject)
5781 {
5782 SCHECK_PARTIAL();
5783 break;
5784 }
5785 GETCHARLEN(c, eptr, len);
5786 switch(c)
5787 {
5788 VSPACE_CASES: gotspace = TRUE; break;
5789 default: gotspace = FALSE; break;
5790 }
5791 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5792 eptr += len;
5793 }
5794 break;
5795
5796 case OP_NOT_DIGIT:
5797 for (i = min; i < max; i++)
5798 {
5799 int len = 1;
5800 if (eptr >= md->end_subject)
5801 {
5802 SCHECK_PARTIAL();
5803 break;
5804 }
5805 GETCHARLEN(c, eptr, len);
5806 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5807 eptr+= len;
5808 }
5809 break;
5810
5811 case OP_DIGIT:
5812 for (i = min; i < max; i++)
5813 {
5814 int len = 1;
5815 if (eptr >= md->end_subject)
5816 {
5817 SCHECK_PARTIAL();
5818 break;
5819 }
5820 GETCHARLEN(c, eptr, len);
5821 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5822 eptr+= len;
5823 }
5824 break;
5825
5826 case OP_NOT_WHITESPACE:
5827 for (i = min; i < max; i++)
5828 {
5829 int len = 1;
5830 if (eptr >= md->end_subject)
5831 {
5832 SCHECK_PARTIAL();
5833 break;
5834 }
5835 GETCHARLEN(c, eptr, len);
5836 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5837 eptr+= len;
5838 }
5839 break;
5840
5841 case OP_WHITESPACE:
5842 for (i = min; i < max; i++)
5843 {
5844 int len = 1;
5845 if (eptr >= md->end_subject)
5846 {
5847 SCHECK_PARTIAL();
5848 break;
5849 }
5850 GETCHARLEN(c, eptr, len);
5851 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5852 eptr+= len;
5853 }
5854 break;
5855
5856 case OP_NOT_WORDCHAR:
5857 for (i = min; i < max; i++)
5858 {
5859 int len = 1;
5860 if (eptr >= md->end_subject)
5861 {
5862 SCHECK_PARTIAL();
5863 break;
5864 }
5865 GETCHARLEN(c, eptr, len);
5866 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5867 eptr+= len;
5868 }
5869 break;
5870
5871 case OP_WORDCHAR:
5872 for (i = min; i < max; i++)
5873 {
5874 int len = 1;
5875 if (eptr >= md->end_subject)
5876 {
5877 SCHECK_PARTIAL();
5878 break;
5879 }
5880 GETCHARLEN(c, eptr, len);
5881 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5882 eptr+= len;
5883 }
5884 break;
5885
5886 default:
5887 RRETURN(PCRE_ERROR_INTERNAL);
5888 }
5889
5890 /* eptr is now past the end of the maximum run. If possessive, we are
5891 done (no backing up). Otherwise, match at this position; anything other
5892 than no match is immediately returned. For nomatch, back up one
5893 character, unless we are matching \R and the last thing matched was
5894 \r\n, in which case, back up two bytes. */
5895
5896 if (possessive) continue;
5897 for(;;)
5898 {
5899 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5901 if (eptr-- == pp) break; /* Stop if tried at original pos */
5902 BACKCHAR(eptr);
5903 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5904 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5905 }
5906 }
5907 else
5908 #endif /* SUPPORT_UTF */
5909 /* Not UTF mode */
5910 {
5911 switch(ctype)
5912 {
5913 case OP_ANY:
5914 for (i = min; i < max; i++)
5915 {
5916 if (eptr >= md->end_subject)
5917 {
5918 SCHECK_PARTIAL();
5919 break;
5920 }
5921 if (IS_NEWLINE(eptr)) break;
5922 if (md->partial != 0 && /* Take care with CRLF partial */
5923 eptr + 1 >= md->end_subject &&
5924 NLBLOCK->nltype == NLTYPE_FIXED &&
5925 NLBLOCK->nllen == 2 &&
5926 *eptr == NLBLOCK->nl[0])
5927 {
5928 md->hitend = TRUE;
5929 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5930 }
5931 eptr++;
5932 }
5933 break;
5934
5935 case OP_ALLANY:
5936 case OP_ANYBYTE:
5937 c = max - min;
5938 if (c > (unsigned int)(md->end_subject - eptr))
5939 {
5940 eptr = md->end_subject;
5941 SCHECK_PARTIAL();
5942 }
5943 else eptr += c;
5944 break;
5945
5946 case OP_ANYNL:
5947 for (i = min; i < max; i++)
5948 {
5949 if (eptr >= md->end_subject)
5950 {
5951 SCHECK_PARTIAL();
5952 break;
5953 }
5954 c = *eptr;
5955 if (c == CHAR_CR)
5956 {
5957 if (++eptr >= md->end_subject) break;
5958 if (*eptr == CHAR_LF) eptr++;
5959 }
5960 else
5961 {
5962 if (c != CHAR_LF && (md->bsr_anycrlf ||
5963 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5964 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5965 && c != 0x2028 && c != 0x2029
5966 #endif
5967 ))) break;
5968 eptr++;
5969 }
5970 }
5971 break;
5972
5973 case OP_NOT_HSPACE:
5974 for (i = min; i < max; i++)
5975 {
5976 if (eptr >= md->end_subject)
5977 {
5978 SCHECK_PARTIAL();
5979 break;
5980 }
5981 switch(*eptr)
5982 {
5983 default: eptr++; break;
5984 HSPACE_BYTE_CASES:
5985 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5986 HSPACE_MULTIBYTE_CASES:
5987 #endif
5988 goto ENDLOOP00;
5989 }
5990 }
5991 ENDLOOP00:
5992 break;
5993
5994 case OP_HSPACE:
5995 for (i = min; i < max; i++)
5996 {
5997 if (eptr >= md->end_subject)
5998 {
5999 SCHECK_PARTIAL();
6000 break;
6001 }
6002 switch(*eptr)
6003 {
6004 default: goto ENDLOOP01;
6005 HSPACE_BYTE_CASES:
6006 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6007 HSPACE_MULTIBYTE_CASES:
6008 #endif
6009 eptr++; break;
6010 }
6011 }
6012 ENDLOOP01:
6013 break;
6014
6015 case OP_NOT_VSPACE:
6016 for (i = min; i < max; i++)
6017 {
6018 if (eptr >= md->end_subject)
6019 {
6020 SCHECK_PARTIAL();
6021 break;
6022 }
6023 switch(*eptr)
6024 {
6025 default: eptr++; break;
6026 VSPACE_BYTE_CASES:
6027 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6028 VSPACE_MULTIBYTE_CASES:
6029 #endif
6030 goto ENDLOOP02;
6031 }
6032 }
6033 ENDLOOP02:
6034 break;
6035
6036 case OP_VSPACE:
6037 for (i = min; i < max; i++)
6038 {
6039 if (eptr >= md->end_subject)
6040 {
6041 SCHECK_PARTIAL();
6042 break;
6043 }
6044 switch(*eptr)
6045 {
6046 default: goto ENDLOOP03;
6047 VSPACE_BYTE_CASES:
6048 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6049 VSPACE_MULTIBYTE_CASES:
6050 #endif
6051 eptr++; break;
6052 }
6053 }
6054 ENDLOOP03:
6055 break;
6056
6057 case OP_NOT_DIGIT:
6058 for (i = min; i < max; i++)
6059 {
6060 if (eptr >= md->end_subject)
6061 {
6062 SCHECK_PARTIAL();
6063 break;
6064 }
6065 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6066 eptr++;
6067 }
6068 break;
6069
6070 case OP_DIGIT:
6071 for (i = min; i < max; i++)
6072 {
6073 if (eptr >= md->end_subject)
6074 {
6075 SCHECK_PARTIAL();
6076 break;
6077 }
6078 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6079 eptr++;
6080 }
6081 break;
6082
6083 case OP_NOT_WHITESPACE:
6084 for (i = min; i < max; i++)
6085 {
6086 if (eptr >= md->end_subject)
6087 {
6088 SCHECK_PARTIAL();
6089 break;
6090 }
6091 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6092 eptr++;
6093 }
6094 break;
6095
6096 case OP_WHITESPACE:
6097 for (i = min; i < max; i++)
6098 {
6099 if (eptr >= md->end_subject)
6100 {
6101 SCHECK_PARTIAL();
6102 break;
6103 }
6104 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6105 eptr++;
6106 }
6107 break;
6108
6109 case OP_NOT_WORDCHAR:
6110 for (i = min; i < max; i++)
6111 {
6112 if (eptr >= md->end_subject)
6113 {
6114 SCHECK_PARTIAL();
6115 break;
6116 }
6117 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6118 eptr++;
6119 }
6120 break;
6121
6122 case OP_WORDCHAR:
6123 for (i = min; i < max; i++)
6124 {
6125 if (eptr >= md->end_subject)
6126 {
6127 SCHECK_PARTIAL();
6128 break;
6129 }
6130 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6131 eptr++;
6132 }
6133 break;
6134
6135 default:
6136 RRETURN(PCRE_ERROR_INTERNAL);
6137 }
6138
6139 /* eptr is now past the end of the maximum run. If possessive, we are
6140 done (no backing up). Otherwise, match at this position; anything other
6141 than no match is immediately returned. For nomatch, back up one
6142 character (byte), unless we are matching \R and the last thing matched
6143 was \r\n, in which case, back up two bytes. */
6144
6145 if (possessive) continue;
6146 while (eptr >= pp)
6147 {
6148 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6149 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6150 eptr--;
6151 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6152 eptr[-1] == CHAR_CR) eptr--;
6153 }
6154 }
6155
6156 /* Get here if we can't make it match with any permitted repetitions */
6157
6158 RRETURN(MATCH_NOMATCH);
6159 }
6160 /* Control never gets here */
6161
6162 /* There's been some horrible disaster. Arrival here can only mean there is
6163 something seriously wrong in the code above or the OP_xxx definitions. */
6164
6165 default:
6166 DPRINTF(("Unknown opcode %d\n", *ecode));
6167 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6168 }
6169
6170 /* Do not stick any code in here without much thought; it is assumed
6171 that "continue" in the code above comes out to here to repeat the main
6172 loop. */
6173
6174 } /* End of main loop */
6175 /* Control never reaches here */
6176
6177
6178 /* When compiling to use the heap rather than the stack for recursive calls to
6179 match(), the RRETURN() macro jumps here. The number that is saved in
6180 frame->Xwhere indicates which label we actually want to return to. */
6181
6182 #ifdef NO_RECURSE
6183 #define LBL(val) case val: goto L_RM##val;
6184 HEAP_RETURN:
6185 switch (frame->Xwhere)
6186 {
6187 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6188 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6189 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6190 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6191 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6192 LBL(65) LBL(66)
6193 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6194 LBL(21)
6195 #endif
6196 #ifdef SUPPORT_UTF
6197 LBL(16) LBL(18) LBL(20)
6198 LBL(22) LBL(23) LBL(28) LBL(30)
6199 LBL(32) LBL(34) LBL(42) LBL(46)
6200 #ifdef SUPPORT_UCP
6201 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6202 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6203 #endif /* SUPPORT_UCP */
6204 #endif /* SUPPORT_UTF */
6205 default:
6206 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6207 return PCRE_ERROR_INTERNAL;
6208 }
6209 #undef LBL
6210 #endif /* NO_RECURSE */
6211 }
6212
6213
6214 /***************************************************************************
6215 ****************************************************************************
6216 RECURSION IN THE match() FUNCTION
6217
6218 Undefine all the macros that were defined above to handle this. */
6219
6220 #ifdef NO_RECURSE
6221 #undef eptr
6222 #undef ecode
6223 #undef mstart
6224 #undef offset_top
6225 #undef eptrb
6226 #undef flags
6227
6228 #undef callpat
6229 #undef charptr
6230 #undef data
6231 #undef next
6232 #undef pp
6233 #undef prev
6234 #undef saved_eptr
6235
6236 #undef new_recursive
6237
6238 #undef cur_is_word
6239 #undef condition
6240 #undef prev_is_word
6241
6242 #undef ctype
6243 #undef length
6244 #undef max
6245 #undef min
6246 #undef number
6247 #undef offset
6248 #undef op
6249 #undef save_capture_last
6250 #undef save_offset1
6251 #undef save_offset2
6252 #undef save_offset3
6253 #undef stacksave
6254
6255 #undef newptrb
6256
6257 #endif
6258
6259 /* These two are defined as macros in both cases */
6260
6261 #undef fc
6262 #undef fi
6263
6264 /***************************************************************************
6265 ***************************************************************************/
6266
6267
6268 #ifdef NO_RECURSE
6269 /*************************************************
6270 * Release allocated heap frames *
6271 *************************************************/
6272
6273 /* This function releases all the allocated frames. The base frame is on the
6274 machine stack, and so must not be freed.
6275
6276 Argument: the address of the base frame
6277 Returns: nothing
6278 */
6279
6280 static void
6281 release_match_heapframes (heapframe *frame_base)
6282 {
6283 heapframe *nextframe = frame_base->Xnextframe;
6284 while (nextframe != NULL)
6285 {
6286 heapframe *oldframe = nextframe;
6287 nextframe = nextframe->Xnextframe;
6288 (PUBL(stack_free))(oldframe);
6289 }
6290 }
6291 #endif
6292
6293
6294 /*************************************************
6295 * Execute a Regular Expression *
6296 *************************************************/
6297
6298 /* This function applies a compiled re to a subject string and picks out
6299 portions of the string if it matches. Two elements in the vector are set for
6300 each substring: the offsets to the start and end of the substring.
6301
6302 Arguments:
6303 argument_re points to the compiled expression
6304 extra_data points to extra data or is NULL
6305 subject points to the subject string
6306 length length of subject string (may contain binary zeros)
6307 start_offset where to start in the subject string
6308 options option bits
6309 offsets points to a vector of ints to be filled in with offsets
6310 offsetcount the number of elements in the vector
6311
6312 Returns: > 0 => success; value is the number of elements filled in
6313 = 0 => success, but offsets is not big enough
6314 -1 => failed to match
6315 < -1 => some kind of unexpected problem
6316 */
6317
6318 #if defined COMPILE_PCRE8
6319 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6320 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6321 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6322 int offsetcount)
6323 #elif defined COMPILE_PCRE16
6324 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6325 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6326 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6327 int offsetcount)
6328 #elif defined COMPILE_PCRE32
6329 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6330 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6331 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6332 int offsetcount)
6333 #endif
6334 {
6335 int rc, ocount, arg_offset_max;
6336 int newline;
6337 BOOL using_temporary_offsets = FALSE;
6338 BOOL anchored;
6339 BOOL startline;
6340 BOOL firstline;
6341 BOOL utf;
6342 BOOL has_first_char = FALSE;
6343 BOOL has_req_char = FALSE;
6344 pcre_uchar first_char = 0;
6345 pcre_uchar first_char2 = 0;
6346 pcre_uchar req_char = 0;
6347 pcre_uchar req_char2 = 0;
6348 match_data match_block;
6349 match_data *md = &match_block;
6350 const pcre_uint8 *tables;
6351 const pcre_uint8 *start_bits = NULL;
6352 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6353 PCRE_PUCHAR end_subject;
6354 PCRE_PUCHAR start_partial = NULL;
6355 PCRE_PUCHAR match_partial;
6356 PCRE_PUCHAR req_char_ptr = start_match - 1;
6357
6358 const pcre_study_data *study;
6359 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6360
6361 #ifdef NO_RECURSE
6362 heapframe frame_zero;
6363 frame_zero.Xprevframe = NULL; /* Marks the top level */
6364 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6365 md->match_frames_base = &frame_zero;
6366 #endif
6367
6368 /* Check for the special magic call that measures the size of the stack used
6369 per recursive call of match(). Without the funny casting for sizeof, a Windows
6370 compiler gave this error: "unary minus operator applied to unsigned type,
6371 result still unsigned". Hopefully the cast fixes that. */
6372
6373 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6374 start_offset == -999)
6375 #ifdef NO_RECURSE
6376 return -((int)sizeof(heapframe));
6377 #else
6378 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6379 #endif
6380
6381 /* Plausibility checks */
6382
6383 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6384 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6385 return PCRE_ERROR_NULL;
6386 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6387 if (length < 0) return PCRE_ERROR_BADLENGTH;
6388 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6389
6390 /* Check that the first field in the block is the magic number. If it is not,
6391 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6392 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6393 means that the pattern is likely compiled with different endianness. */
6394
6395 if (re->magic_number != MAGIC_NUMBER)
6396 return re->magic_number == REVERSED_MAGIC_NUMBER?
6397 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6398 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6399
6400 /* These two settings are used in the code for checking a UTF-8 string that
6401 follows immediately afterwards. Other values in the md block are used only
6402 during "normal" pcre_exec() processing, not when the JIT support is in use,
6403 so they are set up later. */
6404
6405 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6406 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6407 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6408 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6409
6410 /* Check a UTF-8 string if required. Pass back the character offset and error
6411 code for an invalid string if a results vector is available. */
6412
6413 #ifdef SUPPORT_UTF
6414 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6415 {
6416 int erroroffset;
6417 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6418 if (errorcode != 0)
6419 {
6420 if (offsetcount >= 2)
6421 {
6422 offsets[0] = erroroffset;
6423 offsets[1] = errorcode;
6424 }
6425 #if defined COMPILE_PCRE8
6426 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6427 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6428 #elif defined COMPILE_PCRE16
6429 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6430 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6431 #elif defined COMPILE_PCRE32
6432 return PCRE_ERROR_BADUTF32;
6433 #endif
6434 }
6435 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6436 /* Check that a start_offset points to the start of a UTF character. */
6437 if (start_offset > 0 && start_offset < length &&
6438 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6439 return PCRE_ERROR_BADUTF8_OFFSET;
6440 #endif
6441 }
6442 #endif
6443
6444 /* If the pattern was successfully studied with JIT support, run the JIT
6445 executable instead of the rest of this function. Most options must be set at
6446 compile time for the JIT code to be usable. Fall back to the normal code path
6447 if an unsupported flag is set. */
6448
6449 #ifdef SUPPORT_JIT
6450 if (extra_data != NULL
6451 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6452 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6453 && extra_data->executable_jit != NULL
6454 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6455 {
6456 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6457 start_offset, options, offsets, offsetcount);
6458
6459 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6460 mode is not compiled. In this case we simply fall back to the interpreter. */
6461
6462 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6463 }
6464 #endif
6465
6466 /* Carry on with non-JIT matching. This information is for finding all the
6467 numbers associated with a given name, for condition testing. */
6468
6469 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6470 md->name_count = re->name_count;
6471 md->name_entry_size = re->name_entry_size;
6472
6473 /* Fish out the optional data from the extra_data structure, first setting
6474 the default values. */
6475
6476 study = NULL;
6477 md->match_limit = MATCH_LIMIT;
6478 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6479 md->callout_data = NULL;
6480
6481 /* The table pointer is always in native byte order. */
6482
6483 tables = re->tables;
6484
6485 if (extra_data != NULL)
6486 {
6487 register unsigned int flags = extra_data->flags;
6488 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6489 study = (const pcre_study_data *)extra_data->study_data;
6490 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6491 md->match_limit = extra_data->match_limit;
6492 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6493 md->match_limit_recursion = extra_data->match_limit_recursion;
6494 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6495 md->callout_data = extra_data->callout_data;
6496 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6497 }
6498
6499 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6500 is a feature that makes it possible to save compiled regex and re-use them
6501 in other programs later. */
6502
6503 if (tables == NULL) tables = PRIV(default_tables);
6504
6505 /* Set up other data */
6506
6507 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6508 startline = (re->flags & PCRE_STARTLINE) != 0;
6509 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6510
6511 /* The code starts after the real_pcre block and the capture name table. */
6512
6513 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6514 re->name_count * re->name_entry_size;
6515
6516 md->start_subject = (PCRE_PUCHAR)subject;
6517 md->start_offset = start_offset;
6518 md->end_subject = md->start_subject + length;
6519 end_subject = md->end_subject;
6520
6521 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6522 md->use_ucp = (re->options & PCRE_UCP) != 0;
6523 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6524 md->ignore_skip_arg = 0;
6525
6526 /* Some options are unpacked into BOOL variables in the hope that testing
6527 them will be faster than individual option bits. */
6528
6529 md->notbol = (options & PCRE_NOTBOL) != 0;
6530 md->noteol = (options & PCRE_NOTEOL) != 0;
6531 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6532 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6533
6534 md->hitend = FALSE;
6535 md->mark = md->nomatch_mark = NULL; /* In case never set */
6536
6537 md->recursive = NULL; /* No recursion at top level */
6538 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6539
6540 md->lcc = tables + lcc_offset;
6541 md->fcc = tables + fcc_offset;
6542 md->ctypes = tables + ctypes_offset;
6543
6544 /* Handle different \R options. */
6545
6546 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6547 {
6548 case 0:
6549 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6550 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6551 else
6552 #ifdef BSR_ANYCRLF
6553 md->bsr_anycrlf = TRUE;
6554 #else
6555 md->bsr_anycrlf = FALSE;
6556 #endif
6557 break;
6558
6559 case PCRE_BSR_ANYCRLF:
6560 md->bsr_anycrlf = TRUE;
6561 break;
6562
6563 case PCRE_BSR_UNICODE:
6564 md->bsr_anycrlf = FALSE;
6565 break;
6566
6567 default: return PCRE_ERROR_BADNEWLINE;
6568 }
6569
6570 /* Handle different types of newline. The three bits give eight cases. If
6571 nothing is set at run time, whatever was used at compile time applies. */
6572
6573 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6574 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6575 {
6576 case 0: newline = NEWLINE; break; /* Compile-time default */
6577 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6578 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6579 case PCRE_NEWLINE_CR+
6580 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6581 case PCRE_NEWLINE_ANY: newline = -1; break;
6582 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6583 default: return PCRE_ERROR_BADNEWLINE;
6584 }
6585
6586 if (newline == -2)
6587 {
6588 md->nltype = NLTYPE_ANYCRLF;
6589 }
6590 else if (newline < 0)
6591 {
6592 md->nltype = NLTYPE_ANY;
6593 }
6594 else
6595 {
6596 md->nltype = NLTYPE_FIXED;
6597 if (newline > 255)
6598 {
6599 md->nllen = 2;
6600 md->nl[0] = (newline >> 8) & 255;
6601 md->nl[1] = newline & 255;
6602 }
6603 else
6604 {
6605 md->nllen = 1;
6606 md->nl[0] = newline;
6607 }
6608 }
6609
6610 /* Partial matching was originally supported only for a restricted set of
6611 regexes; from release 8.00 there are no restrictions, but the bits are still
6612 defined (though never set). So there's no harm in leaving this code. */
6613
6614 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6615 return PCRE_ERROR_BADPARTIAL;
6616
6617 /* If the expression has got more back references than the offsets supplied can
6618 hold, we get a temporary chunk of working store to use during the matching.
6619 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6620 of 3. */
6621
6622 ocount = offsetcount - (offsetcount % 3);
6623 arg_offset_max = (2*ocount)/3;
6624
6625 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6626 {
6627 ocount = re->top_backref * 3 + 3;
6628 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6629 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6630 using_temporary_offsets = TRUE;
6631 DPRINTF(("Got memory to hold back references\n"));
6632 }
6633 else md->offset_vector = offsets;
6634 md->offset_end = ocount;
6635 md->offset_max = (2*ocount)/3;
6636 md->capture_last = 0;
6637
6638 /* Reset the working variable associated with each extraction. These should
6639 never be used unless previously set, but they get saved and restored, and so we
6640 initialize them to avoid reading uninitialized locations. Also, unset the
6641 offsets for the matched string. This is really just for tidiness with callouts,
6642 in case they inspect these fields. */
6643
6644 if (md->offset_vector != NULL)
6645 {
6646 register int *iptr = md->offset_vector + ocount;
6647 register int *iend = iptr - re->top_bracket;
6648 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6649 while (--iptr >= iend) *iptr = -1;
6650 md->offset_vector[0] = md->offset_vector[1] = -1;
6651 }
6652
6653 /* Set up the first character to match, if available. The first_char value is
6654 never set for an anchored regular expression, but the anchoring may be forced
6655 at run time, so we have to test for anchoring. The first char may be unset for
6656 an unanchored pattern, of course. If there's no first char and the pattern was
6657 studied, there may be a bitmap of possible first characters. */
6658
6659 if (!anchored)
6660 {
6661 if ((re->flags & PCRE_FIRSTSET) != 0)
6662 {
6663 has_first_char = TRUE;
6664 first_char = first_char2 = (pcre_uchar)(re->first_char);
6665 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6666 {
6667 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6668 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6669 if (utf && first_char > 127)
6670 first_char2 = UCD_OTHERCASE(first_char);
6671 #endif
6672 }
6673 }
6674 else
6675 if (!startline && study != NULL &&
6676 (study->flags & PCRE_STUDY_MAPPED) != 0)
6677 start_bits = study->start_bits;
6678 }
6679
6680 /* For anchored or unanchored matches, there may be a "last known required
6681 character" set. */
6682
6683 if ((re->flags & PCRE_REQCHSET) != 0)
6684 {
6685 has_req_char = TRUE;
6686 req_char = req_char2 = (pcre_uchar)(re->req_char);
6687 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6688 {
6689 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6690 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6691 if (utf && req_char > 127)
6692 req_char2 = UCD_OTHERCASE(req_char);
6693 #endif
6694 }
6695 }
6696
6697
6698 /* ==========================================================================*/
6699
6700 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6701 the loop runs just once. */
6702
6703 for(;;)
6704 {
6705 PCRE_PUCHAR save_end_subject = end_subject;
6706 PCRE_PUCHAR new_start_match;
6707
6708 /* If firstline is TRUE, the start of the match is constrained to the first
6709 line of a multiline string. That is, the match must be before or at the first
6710 newline. Implement this by temporarily adjusting end_subject so that we stop
6711 scanning at a newline. If the match fails at the newline, later code breaks
6712 this loop. */
6713
6714 if (firstline)
6715 {
6716 PCRE_PUCHAR t = start_match;
6717 #ifdef SUPPORT_UTF
6718 if (utf)
6719 {
6720 while (t < md->end_subject && !IS_NEWLINE(t))
6721 {
6722 t++;
6723 ACROSSCHAR(t < end_subject, *t, t++);
6724 }
6725 }
6726 else
6727 #endif
6728 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6729 end_subject = t;
6730 }
6731
6732 /* There are some optimizations that avoid running the match if a known
6733 starting point is not found, or if a known later character is not present.
6734 However, there is an option that disables these, for testing and for ensuring
6735 that all callouts do actually occur. The option can be set in the regex by
6736 (*NO_START_OPT) or passed in match-time options. */
6737
6738 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6739 {
6740 /* Advance to a unique first char if there is one. */
6741
6742 if (has_first_char)
6743 {
6744 pcre_uchar smc;
6745
6746 if (first_char != first_char2)
6747 while (start_match < end_subject &&
6748 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6749 start_match++;
6750 else
6751 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6752 start_match++;
6753 }
6754
6755 /* Or to just after a linebreak for a multiline match */
6756