/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1404 - (show annotations)
Tue Nov 19 15:36:57 2013 UTC (5 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 218216 byte(s)
Source tidies for 8.34-RC1.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1 /* The pattern matched */
84 #define MATCH_NOMATCH 0 /* The pattern did not match */
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996) /* OP_COMMIT: the rest of the pattern failed */
95 #define MATCH_PRUNE (-995) /* OP_PRUNE or OP_PRUNE_ARG: ditto */
96 #define MATCH_SKIP (-994) /* OP_SKIP: md->start_match_ptr holds the subject position */
97 #define MATCH_SKIP_ARG (-993) /* OP_SKIP_ARG: md->start_match_ptr holds the skip name */
98 #define MATCH_THEN (-992) /* OP_THEN or OP_THEN_ARG: md->start_match_ptr holds the opcode address */
99 #define MATCH_BACKTRACK_MAX MATCH_THEN /* Upper end of the range of the 5 codes above */
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lower end of the range of the 5 codes above */
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30
107
/* Minimum and maximum repeat counts for the common repeat types. In rep_max a
value of zero stands for "no upper limit". Both tables have eleven entries and
are indexed by the same value; judging by the values the entries come in
greedy/minimizing pairs followed by three further variants, but the opcodes
that select them are not defined in this part of the file. */

static const char rep_min[] = {
  0, 0,      /* entries 0-1:  at least zero times */
  1, 1,      /* entries 2-3:  at least once */
  0, 0,      /* entries 4-5:  at least zero times */
  0, 0,      /* entries 6-7:  at least zero times */
  0, 1, 0,   /* entries 8-10: zero, one, zero */
};

static const char rep_max[] = {
  0, 0,      /* entries 0-1:  unlimited */
  0, 0,      /* entries 2-3:  unlimited */
  1, 1,      /* entries 4-5:  at most once */
  0, 0,      /* entries 6-7:  unlimited */
  0, 0, 1,   /* entries 8-10: unlimited, unlimited, at most once */
};
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions code have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case)
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start);
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever this list changes, the code at HEAP_RETURN further down must
be changed to match. */

enum {
  RM1 = 1,
         RM2,  RM3,  RM4,  RM5,
  RM6,   RM7,  RM8,  RM9,  RM10,
  RM11,  RM12, RM13, RM14, RM15,
  RM16,  RM17, RM18, RM19, RM20,
  RM21,  RM22, RM23, RM24, RM25,
  RM26,  RM27, RM28, RM29, RM30,
  RM31,  RM32, RM33, RM34, RM35,
  RM36,  RM37, RM38, RM39, RM40,
  RM41,  RM42, RM43, RM44, RM45,
  RM46,  RM47, RM48, RM49, RM50,
  RM51,  RM52, RM53, RM54, RM55,
  RM56,  RM57, RM58, RM59, RM60,
  RM61,  RM62, RM63, RM64, RM65,
  RM66,  RM67
};
316
317 /* These versions of the macros use the stack, as normal. There are debugging
318 versions and production versions. Note that the "rw" argument of RMATCH isn't
319 actually used in this definition. */
320
321 #ifndef NO_RECURSE
322 #define REGISTER register
323
324 #ifdef PCRE_DEBUG
/* Debugging versions. RMATCH reports the source line before and after a real
recursive call of match(); the call passes mstart from the enclosing scope,
raises the depth to rdepth+1, and leaves the result in rrc. RRETURN reports the
value being returned; note that its argument appears twice in the expansion
(once for printf and once for return). Both expand to a brace-enclosed block. */
325 #define RMATCH(ra,rb,rc,rd,re,rw) \
326 { \
327 printf("match() called in line %d\n", __LINE__); \
328 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
329 printf("to line %d\n", __LINE__); \
330 }
331 #define RRETURN(ra) \
332 { \
333 printf("match() returned %d from line %d\n", ra, __LINE__); \
334 return ra; \
335 }
336 #else
/* Production versions. RMATCH is a plain recursive call of match() whose
result is left in rrc; RRETURN is a plain return. Neither expansion supplies a
terminating semicolon. */
337 #define RMATCH(ra,rb,rc,rd,re,rw) \
338 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
339 #define RRETURN(ra) return ra
340 #endif
341
342 #else
343
344
345 /* These versions of the macros manage a private stack on the heap. Note that
346 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347 argument of match(), which never changes. */
348
/* REGISTER is empty here, whereas the recursive variant defines it as
"register". */
349 #define REGISTER
350
/* RMATCH simulates a recursive call of match():
  . it takes the frame linked from frame->Xnextframe if there is one; otherwise
    it gets a new heapframe from PUBL(stack_malloc), links it in, and gives
    PCRE_ERROR_NOMEMORY (via RRETURN) if the allocation fails;
  . it records rw in the Xwhere field of the current frame;
  . it copies ra, rb, the current mstart, rc and re into the new frame, and
    sets the new frame's depth to one more than the current depth;
  . it makes the new frame the current one and jumps to HEAP_RECURSE.
The label L_<rw> that closes the macro is the point at which execution
continues afterwards, which is why each use of RMATCH needs its own rw value. */
351 #define RMATCH(ra,rb,rc,rd,re,rw)\
352 {\
353 heapframe *newframe = frame->Xnextframe;\
354 if (newframe == NULL)\
355 {\
356 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
357 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
358 newframe->Xnextframe = NULL;\
359 frame->Xnextframe = newframe;\
360 }\
361 frame->Xwhere = rw;\
362 newframe->Xeptr = ra;\
363 newframe->Xecode = rb;\
364 newframe->Xmstart = mstart;\
365 newframe->Xoffset_top = rc;\
366 newframe->Xeptrb = re;\
367 newframe->Xrdepth = frame->Xrdepth + 1;\
368 newframe->Xprevframe = frame;\
369 frame = newframe;\
370 DPRINTF(("restarting from line %d\n", __LINE__));\
371 goto HEAP_RECURSE;\
372 L_##rw:\
373 DPRINTF(("jumped back to line %d\n", __LINE__));\
374 }
375
/* Heap-frame version of RRETURN. Step back to the frame that made the current
"call". If there is no such frame, this was the outermost level, so really
return from the function. Otherwise hand the result back in rrc and jump to
HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  frame = frame->Xprevframe;\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
387
388
389 /* Structure for remembering the local variables in a private frame */
390
/* One heapframe holds the state of one simulated invocation of match() when
NO_RECURSE is defined. Inside match() each field is reached through a macro
with the same name minus the leading X (for example, eptr is frame->Xeptr).
Several fields are also used under a second name; these are noted below. */
391 typedef struct heapframe {
392 struct heapframe *Xprevframe; /* Frame to go back to; set by RMATCH, followed by RRETURN */
393 struct heapframe *Xnextframe; /* Frame for the next level down; NULL until RMATCH creates one */
394
395 /* Function arguments that may change */
396
397 PCRE_PUCHAR Xeptr; /* Current position in the subject */
398 const pcre_uchar *Xecode; /* Current position in the compiled code */
399 PCRE_PUCHAR Xmstart; /* Current match start position */
400 int Xoffset_top; /* Current top pointer */
401 eptrblock *Xeptrb; /* Chain of saved eptr values at bracket starts */
402 unsigned int Xrdepth; /* Depth; RMATCH sets it to the caller's depth plus one */
403
404 /* Function local variables */
405
406 PCRE_PUCHAR Xcallpat;
407 #ifdef SUPPORT_UTF
408 PCRE_PUCHAR Xcharptr;
409 #endif
410 PCRE_PUCHAR Xdata; /* Also used as save_mark */
411 PCRE_PUCHAR Xnext;
412 PCRE_PUCHAR Xpp;
413 PCRE_PUCHAR Xprev;
414 PCRE_PUCHAR Xsaved_eptr;
415
416 recursion_info Xnew_recursive;
417
418 BOOL Xcur_is_word; /* Also used as allow_zero */
419 BOOL Xcondition; /* Also used as cbegroup and condassert */
420 BOOL Xprev_is_word; /* Also used as matched_once */
421
422 #ifdef SUPPORT_UCP
423 int Xprop_type;
424 unsigned int Xprop_value;
425 int Xprop_fail_result;
426 int Xoclength;
427 pcre_uchar Xocchars[6];
428 #endif
429
430 int Xcodelink; /* Also used as code_offset */
431 int Xctype;
432 unsigned int Xfc; /* Separate field here; with real recursion fc is the same variable as c */
433 int Xfi; /* Separate field here; with real recursion fi is the same variable as i */
434 int Xlength;
435 int Xmax;
436 int Xmin;
437 unsigned int Xnumber; /* Also used as foc */
438 int Xoffset;
439 unsigned int Xop;
440 pcre_int32 Xsave_capture_last;
441 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442 int Xstacksave[REC_STACK_SAVE_MAX]; /* Space for saving offsets over recursive calls */
443
444 eptrblock Xnewptrb; /* Block added to the eptrb chain for MATCH_CBEGROUP */
445
446 /* Where to jump back to */
447
448 int Xwhere; /* The rw value of the RMATCH that is in progress from this frame */
449
450 } heapframe;
451
452 #endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
468 /* These macros pack up tests that are used for partial matching, and which
469 appear several times in the code. We set the "hit end" flag if the pointer is
470 at the end of the subject and also past the start of the subject (i.e.
471 something has been matched). For hard partial matching, we then return
472 immediately. The second one is used when we already know we are past the end of
473 the subject. */
474
/* Usage notes. Both macros rely on eptr, md and RRETURN being available at the
point of use. They do nothing unless md->partial is non-zero and eptr is beyond
md->start_used_ptr; CHECK_PARTIAL requires in addition that eptr has reached
md->end_subject. When the test succeeds, md->hitend is set, and if md->partial
is greater than 1, PCRE_ERROR_PARTIAL is returned by means of RRETURN. Each
macro expands to a bare "if" statement rather than a do { } while (0) block, so
an invocation must not be followed directly by an "else". */
475 #define CHECK_PARTIAL()\
476 if (md->partial != 0 && eptr >= md->end_subject && \
477 eptr > md->start_used_ptr) \
478 { \
479 md->hitend = TRUE; \
480 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
481 }
482
483 #define SCHECK_PARTIAL()\
484 if (md->partial != 0 && eptr > md->start_used_ptr) \
485 { \
486 md->hitend = TRUE; \
487 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
488 }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 case OP_COMMIT:
785 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
786 eptrb, RM52);
787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
788 RRETURN(MATCH_COMMIT);
789
790 case OP_PRUNE:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
792 eptrb, RM51);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 RRETURN(MATCH_PRUNE);
795
796 case OP_PRUNE_ARG:
797 md->nomatch_mark = ecode + 2;
798 md->mark = NULL; /* In case previously set by assertion */
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM56);
801 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
802 md->mark == NULL) md->mark = ecode + 2;
803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
804 RRETURN(MATCH_PRUNE);
805
806 case OP_SKIP:
807 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
808 eptrb, RM53);
809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
810 md->start_match_ptr = eptr; /* Pass back current position */
811 RRETURN(MATCH_SKIP);
812
813 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
814 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
815 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
816 that failed and any that precede it (either they also failed, or were not
817 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
818 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
819 set to the count of the one that failed. */
820
821 case OP_SKIP_ARG:
822 md->skip_arg_count++;
823 if (md->skip_arg_count <= md->ignore_skip_arg)
824 {
825 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 break;
827 }
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
829 eptrb, RM57);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831
832 /* Pass back the current skip name by overloading md->start_match_ptr and
833 returning the special MATCH_SKIP_ARG return code. This will either be
834 caught by a matching MARK, or get to the top, where it causes a rematch
835 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
836
837 md->start_match_ptr = ecode + 2;
838 RRETURN(MATCH_SKIP_ARG);
839
840 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
841 the branch in which it occurs can be determined. Overload the start of
842 match pointer to do this. */
843
844 case OP_THEN:
845 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
846 eptrb, RM54);
847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848 md->start_match_ptr = ecode;
849 RRETURN(MATCH_THEN);
850
851 case OP_THEN_ARG:
852 md->nomatch_mark = ecode + 2;
853 md->mark = NULL; /* In case previously set by assertion */
854 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
855 md, eptrb, RM58);
856 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
857 md->mark == NULL) md->mark = ecode + 2;
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 md->start_match_ptr = ecode;
860 RRETURN(MATCH_THEN);
861
862 /* Handle an atomic group that does not contain any capturing parentheses.
863 This can be handled like an assertion. Prior to 8.13, all atomic groups
864 were handled this way. In 8.13, the code was changed as below for ONCE, so
865 that backups pass through the group and thereby reset captured values.
866 However, this uses a lot more stack, so in 8.20, atomic groups that do not
867 contain any captures generate OP_ONCE_NC, which can be handled in the old,
868 less stack intensive way.
869
870 Check the alternative branches in turn - the matching won't pass the KET
871 for this kind of subpattern. If any one branch matches, we carry on as at
872 the end of a normal bracket, leaving the subject pointer, but resetting
873 the start-of-match value in case it was changed by \K. */
874
875 case OP_ONCE_NC:
876 prev = ecode;
877 saved_eptr = eptr;
878 save_mark = md->mark;
879 do
880 {
881 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
882 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
883 {
884 mstart = md->start_match_ptr;
885 break;
886 }
887 if (rrc == MATCH_THEN)
888 {
889 next = ecode + GET(ecode,1);
890 if (md->start_match_ptr < next &&
891 (*ecode == OP_ALT || *next == OP_ALT))
892 rrc = MATCH_NOMATCH;
893 }
894
895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896 ecode += GET(ecode,1);
897 md->mark = save_mark;
898 }
899 while (*ecode == OP_ALT);
900
901 /* If we hit the end of the group (which could be repeated), fail */
902
903 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
904
905 /* Continue as from after the group, updating the offsets high water
906 mark, since extracts may have been taken. */
907
908 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
909
910 offset_top = md->end_offset_top;
911 eptr = md->end_match_ptr;
912
913 /* For a non-repeating ket, just continue at this level. This also
914 happens for a repeating ket if no characters were matched in the group.
915 This is the forcible breaking of infinite loops as implemented in Perl
916 5.005. */
917
918 if (*ecode == OP_KET || eptr == saved_eptr)
919 {
920 ecode += 1+LINK_SIZE;
921 break;
922 }
923
924 /* The repeating kets try the rest of the pattern or restart from the
925 preceding bracket, in the appropriate order. The second "call" of match()
926 uses tail recursion, to avoid using another stack frame. */
927
928 if (*ecode == OP_KETRMIN)
929 {
930 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
932 ecode = prev;
933 goto TAIL_RECURSE;
934 }
935 else /* OP_KETRMAX */
936 {
937 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939 ecode += 1 + LINK_SIZE;
940 goto TAIL_RECURSE;
941 }
942 /* Control never gets here */
943
944 /* Handle a capturing bracket, other than those that are possessive with an
945 unlimited repeat. If there is space in the offset vector, save the current
946 subject position in the working slot at the top of the vector. We mustn't
947 change the current values of the data slot, because they may be set from a
948 previous iteration of this group, and be referred to by a reference inside
949 the group. A failure to match might occur after the group has succeeded,
950 if something later on doesn't match. For this reason, we need to restore
951 the working value and also the values of the final offsets, in case they
952 were set by a previous iteration of the same bracket.
953
954 If there isn't enough space in the offset vector, treat this as if it were
955 a non-capturing bracket. Don't worry about setting the flag for the error
956 case here; that is handled in the code for KET. */
957
958 case OP_CBRA:
959 case OP_SCBRA:
960 number = GET2(ecode, 1+LINK_SIZE);
961 offset = number << 1;
962
963 #ifdef PCRE_DEBUG
964 printf("start bracket %d\n", number);
965 printf("subject=");
966 pchars(eptr, 16, TRUE, md);
967 printf("\n");
968 #endif
969
970 if (offset < md->offset_max)
971 {
972 save_offset1 = md->offset_vector[offset];
973 save_offset2 = md->offset_vector[offset+1];
974 save_offset3 = md->offset_vector[md->offset_end - number];
975 save_capture_last = md->capture_last;
976 save_mark = md->mark;
977
978 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
979 md->offset_vector[md->offset_end - number] =
980 (int)(eptr - md->start_subject);
981
982 for (;;)
983 {
984 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
986 eptrb, RM1);
987 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
988
989 /* If we backed up to a THEN, check whether it is within the current
990 branch by comparing the address of the THEN that is passed back with
991 the end of the branch. If it is within the current branch, and the
992 branch is one of two or more alternatives (it either starts or ends
993 with OP_ALT), we have reached the limit of THEN's action, so convert
994 the return code to NOMATCH, which will cause normal backtracking to
995 happen from now on. Otherwise, THEN is passed back to an outer
996 alternative. This implements Perl's treatment of parenthesized groups,
997 where a group not containing | does not affect the current alternative,
998 that is, (X) is NOT the same as (X|(*F)). */
999
1000 if (rrc == MATCH_THEN)
1001 {
1002 next = ecode + GET(ecode,1);
1003 if (md->start_match_ptr < next &&
1004 (*ecode == OP_ALT || *next == OP_ALT))
1005 rrc = MATCH_NOMATCH;
1006 }
1007
1008 /* Anything other than NOMATCH is passed back. */
1009
1010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011 md->capture_last = save_capture_last;
1012 ecode += GET(ecode, 1);
1013 md->mark = save_mark;
1014 if (*ecode != OP_ALT) break;
1015 }
1016
1017 DPRINTF(("bracket %d failed\n", number));
1018 md->offset_vector[offset] = save_offset1;
1019 md->offset_vector[offset+1] = save_offset2;
1020 md->offset_vector[md->offset_end - number] = save_offset3;
1021
1022 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1023
1024 RRETURN(rrc);
1025 }
1026
1027 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1028 as a non-capturing bracket. */
1029
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032
1033 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1034
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037
1038 /* Non-capturing or atomic group, except for possessive with unlimited
1039 repeat and ONCE group with no captures. Loop for all the alternatives.
1040
1041 When we get to the final alternative within the brackets, we used to return
1042 the result of a recursive call to match() whatever happened so it was
1043 possible to reduce stack usage by turning this into a tail recursion,
1044 except in the case of a possibly empty group. However, now that there is
1045 the possibility of (*THEN) occurring in the final alternative, this
1046 optimization is no longer always possible.
1047
1048 We can optimize if we know there are no (*THEN)s in the pattern; at present
1049 this is the best that can be done.
1050
1051 MATCH_ONCE is returned when the end of an atomic group is successfully
1052 reached, but subsequent matching fails. It passes back up the tree (causing
1053 captured values to be reset) until the original atomic group level is
1054 reached. This is tested by comparing md->once_target with the start of the
1055 group. At this point, the return is converted into MATCH_NOMATCH so that
1056 previous backup points can be taken. */
1057
1058 case OP_ONCE:
1059 case OP_BRA:
1060 case OP_SBRA:
1061 DPRINTF(("start non-capturing bracket\n"));
1062
1063 for (;;)
1064 {
1065 if (op >= OP_SBRA || op == OP_ONCE)
1066 md->match_function_type = MATCH_CBEGROUP;
1067
1068 /* If this is not a possibly empty group, and there are no (*THEN)s in
1069 the pattern, and this is the final alternative, optimize as described
1070 above. */
1071
1072 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1073 {
1074 ecode += PRIV(OP_lengths)[*ecode];
1075 goto TAIL_RECURSE;
1076 }
1077
1078 /* In all other cases, we have to make another call to match(). */
1079
1080 save_mark = md->mark;
1081 save_capture_last = md->capture_last;
1082 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 RM2);
1084
1085 /* See comment in the code for capturing groups above about handling
1086 THEN. */
1087
1088 if (rrc == MATCH_THEN)
1089 {
1090 next = ecode + GET(ecode,1);
1091 if (md->start_match_ptr < next &&
1092 (*ecode == OP_ALT || *next == OP_ALT))
1093 rrc = MATCH_NOMATCH;
1094 }
1095
1096 if (rrc != MATCH_NOMATCH)
1097 {
1098 if (rrc == MATCH_ONCE)
1099 {
1100 const pcre_uchar *scode = ecode;
1101 if (*scode != OP_ONCE) /* If not at start, find it */
1102 {
1103 while (*scode == OP_ALT) scode += GET(scode, 1);
1104 scode -= GET(scode, 1);
1105 }
1106 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1107 }
1108 RRETURN(rrc);
1109 }
1110 ecode += GET(ecode, 1);
1111 md->mark = save_mark;
1112 if (*ecode != OP_ALT) break;
1113 md->capture_last = save_capture_last;
1114 }
1115
1116 RRETURN(MATCH_NOMATCH);
1117
1118 /* Handle possessive capturing brackets with an unlimited repeat. We come
1119 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1120 handled similarly to the normal case above. However, the matching is
1121 different. The end of these brackets will always be OP_KETRPOS, which
1122 returns MATCH_KETRPOS without going further in the pattern. By this means
1123 we can handle the group by iteration rather than recursion, thereby
1124 reducing the amount of stack needed. */
1125
1126 case OP_CBRAPOS:
1127 case OP_SCBRAPOS:
1128 allow_zero = FALSE;
1129
1130 POSSESSIVE_CAPTURE:
1131 number = GET2(ecode, 1+LINK_SIZE);
1132 offset = number << 1;
1133
1134 #ifdef PCRE_DEBUG
1135 printf("start possessive bracket %d\n", number);
1136 printf("subject=");
1137 pchars(eptr, 16, TRUE, md);
1138 printf("\n");
1139 #endif
1140
1141 if (offset < md->offset_max)
1142 {
1143 matched_once = FALSE;
1144 code_offset = (int)(ecode - md->start_code);
1145
1146 save_offset1 = md->offset_vector[offset];
1147 save_offset2 = md->offset_vector[offset+1];
1148 save_offset3 = md->offset_vector[md->offset_end - number];
1149 save_capture_last = md->capture_last;
1150
1151 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1152
1153 /* Each time round the loop, save the current subject position for use
1154 when the group matches. For MATCH_MATCH, the group has matched, so we
1155 restart it with a new subject starting position, remembering that we had
1156 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1157 usual. If we haven't matched any alternatives in any iteration, check to
1158 see if a previous iteration matched. If so, the group has matched;
1159 continue from afterwards. Otherwise it has failed; restore the previous
1160 capture values before returning NOMATCH. */
1161
1162 for (;;)
1163 {
1164 md->offset_vector[md->offset_end - number] =
1165 (int)(eptr - md->start_subject);
1166 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1167 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1168 eptrb, RM63);
1169 if (rrc == MATCH_KETRPOS)
1170 {
1171 offset_top = md->end_offset_top;
1172 eptr = md->end_match_ptr;
1173 ecode = md->start_code + code_offset;
1174 save_capture_last = md->capture_last;
1175 matched_once = TRUE;
1176 mstart = md->start_match_ptr; /* In case \K changed it */
1177 continue;
1178 }
1179
1180 /* See comment in the code for capturing groups above about handling
1181 THEN. */
1182
1183 if (rrc == MATCH_THEN)
1184 {
1185 next = ecode + GET(ecode,1);
1186 if (md->start_match_ptr < next &&
1187 (*ecode == OP_ALT || *next == OP_ALT))
1188 rrc = MATCH_NOMATCH;
1189 }
1190
1191 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 md->capture_last = save_capture_last;
1193 ecode += GET(ecode, 1);
1194 if (*ecode != OP_ALT) break;
1195 }
1196
1197 if (!matched_once)
1198 {
1199 md->offset_vector[offset] = save_offset1;
1200 md->offset_vector[offset+1] = save_offset2;
1201 md->offset_vector[md->offset_end - number] = save_offset3;
1202 }
1203
1204 if (allow_zero || matched_once)
1205 {
1206 ecode += 1 + LINK_SIZE;
1207 break;
1208 }
1209
1210 RRETURN(MATCH_NOMATCH);
1211 }
1212
1213 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1214 as a non-capturing bracket. */
1215
1216 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1218
1219 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1220
1221 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1223
1224 /* Non-capturing possessive bracket with unlimited repeat. We come here
1225 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1226 without the capturing complication. It is written out separately for speed
1227 and cleanliness. */
1228
1229 case OP_BRAPOS:
1230 case OP_SBRAPOS:
1231 allow_zero = FALSE;
1232
1233 POSSESSIVE_NON_CAPTURE:
1234 matched_once = FALSE;
1235 code_offset = (int)(ecode - md->start_code);
1236 save_capture_last = md->capture_last;
1237
1238 for (;;)
1239 {
1240 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1241 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1242 eptrb, RM48);
1243 if (rrc == MATCH_KETRPOS)
1244 {
1245 offset_top = md->end_offset_top;
1246 eptr = md->end_match_ptr;
1247 ecode = md->start_code + code_offset;
1248 matched_once = TRUE;
1249 mstart = md->start_match_ptr; /* In case \K reset it */
1250 continue;
1251 }
1252
1253 /* See comment in the code for capturing groups above about handling
1254 THEN. */
1255
1256 if (rrc == MATCH_THEN)
1257 {
1258 next = ecode + GET(ecode,1);
1259 if (md->start_match_ptr < next &&
1260 (*ecode == OP_ALT || *next == OP_ALT))
1261 rrc = MATCH_NOMATCH;
1262 }
1263
1264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1265 ecode += GET(ecode, 1);
1266 if (*ecode != OP_ALT) break;
1267 md->capture_last = save_capture_last;
1268 }
1269
1270 if (matched_once || allow_zero)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275 RRETURN(MATCH_NOMATCH);
1276
1277 /* Control never reaches here. */
1278
1279 /* Conditional group: compilation checked that there are no more than two
1280 branches. If the condition is false, skipping the first branch takes us
1281 past the end of the item if there is only one branch, but that's exactly
1282 what we want. */
1283
1284 case OP_COND:
1285 case OP_SCOND:
1286
1287 /* The variable codelink will be added to ecode when the condition is
1288 false, to get to the second branch. Setting it to the offset to the ALT
1289 or KET, then incrementing ecode achieves this effect. We now have ecode
1290 pointing to the condition or callout. */
1291
1292 codelink = GET(ecode, 1); /* Offset to the second branch */
1293 ecode += 1 + LINK_SIZE; /* From this opcode */
1294
1295 /* Because of the way auto-callout works during compile, a callout item is
1296 inserted between OP_COND and an assertion condition. */
1297
1298 if (*ecode == OP_CALLOUT)
1299 {
1300 if (PUBL(callout) != NULL)
1301 {
1302 PUBL(callout_block) cb;
1303 cb.version = 2; /* Version 2 of the callout block */
1304 cb.callout_number = ecode[1];
1305 cb.offset_vector = md->offset_vector;
1306 #if defined COMPILE_PCRE8
1307 cb.subject = (PCRE_SPTR)md->start_subject;
1308 #elif defined COMPILE_PCRE16
1309 cb.subject = (PCRE_SPTR16)md->start_subject;
1310 #elif defined COMPILE_PCRE32
1311 cb.subject = (PCRE_SPTR32)md->start_subject;
1312 #endif
1313 cb.subject_length = (int)(md->end_subject - md->start_subject);
1314 cb.start_match = (int)(mstart - md->start_subject);
1315 cb.current_position = (int)(eptr - md->start_subject);
1316 cb.pattern_position = GET(ecode, 2);
1317 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1318 cb.capture_top = offset_top/2;
1319 cb.capture_last = md->capture_last & CAPLMASK;
1320 /* Internal change requires this for API compatibility. */
1321 if (cb.capture_last == 0) cb.capture_last = -1;
1322 cb.callout_data = md->callout_data;
1323 cb.mark = md->nomatch_mark;
1324 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1325 if (rrc < 0) RRETURN(rrc);
1326 }
1327
1328 /* Advance ecode past the callout, so it now points to the condition. We
1329 must adjust codelink so that the value of ecode+codelink is unchanged. */
1330
1331 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1332 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1333 }
1334
1335 /* Test the various possible conditions */
1336
1337 condition = FALSE;
1338 switch(condcode = *ecode)
1339 {
1340 case OP_RREF: /* Numbered group recursion test */
1341 if (md->recursive != NULL) /* Not recursing => FALSE */
1342 {
1343 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1344 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1345 }
1346 break;
1347
1348 case OP_DNRREF: /* Duplicate named group recursion test */
1349 if (md->recursive != NULL)
1350 {
1351 int count = GET2(ecode, 1 + IMM2_SIZE);
1352 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1353 while (count-- > 0)
1354 {
1355 unsigned int recno = GET2(slot, 0);
1356 condition = recno == md->recursive->group_num;
1357 if (condition) break;
1358 slot += md->name_entry_size;
1359 }
1360 }
1361 break;
1362
1363 case OP_CREF: /* Numbered group used test */
1364 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1365 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1366 break;
1367
1368 case OP_DNCREF: /* Duplicate named group used test */
1369 {
1370 int count = GET2(ecode, 1 + IMM2_SIZE);
1371 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1372 while (count-- > 0)
1373 {
1374 offset = GET2(slot, 0) << 1;
1375 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1376 if (condition) break;
1377 slot += md->name_entry_size;
1378 }
1379 }
1380 break;
1381
1382 case OP_DEF: /* DEFINE - always false */
1383 break;
1384
1385 /* The condition is an assertion. Call match() to evaluate it - setting
1386 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1387 of an assertion. */
1388
1389 default:
1390 md->match_function_type = MATCH_CONDASSERT;
1391 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1392 if (rrc == MATCH_MATCH)
1393 {
1394 if (md->end_offset_top > offset_top)
1395 offset_top = md->end_offset_top; /* Captures may have happened */
1396 condition = TRUE;
1397
1398 /* Advance ecode past the assertion to the start of the first branch,
1399 but adjust it so that the general choosing code below works. */
1400
1401 ecode += GET(ecode, 1);
1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1403 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1404 }
1405
1406 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1407 assertion; it is therefore treated as NOMATCH. Any other return is an
1408 error. */
1409
1410 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1411 {
1412 RRETURN(rrc); /* Need braces because of following else */
1413 }
1414 break;
1415 }
1416
1417 /* Choose branch according to the condition */
1418
1419 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1420
1421 /* We are now at the branch that is to be obeyed. As there is only one, we
1422 can use tail recursion to avoid using another stack frame, except when
1423 there is unlimited repeat of a possibly empty group. In the latter case, a
1424 recursive call to match() is always required, unless the second alternative
1425 doesn't exist, in which case we can just plough on. Note that, for
1426 compatibility with Perl, the | in a conditional group is NOT treated as
1427 creating two alternatives. If a THEN is encountered in the branch, it
1428 propagates out to the enclosing alternative (unless nested in a deeper set
1429 of alternatives, of course). */
1430
1431 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1432 {
1433 if (op != OP_SCOND)
1434 {
1435 goto TAIL_RECURSE;
1436 }
1437
1438 md->match_function_type = MATCH_CBEGROUP;
1439 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1440 RRETURN(rrc);
1441 }
1442
1443 /* Condition false & no alternative; continue after the group. */
1444
1445 else
1446 {
1447 }
1448 break;
1449
1450
1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452 to close any currently open capturing brackets. */
1453
1454 case OP_CLOSE:
1455 number = GET2(ecode, 1); /* Must be less than 65536 */
1456 offset = number << 1;
1457
1458 #ifdef PCRE_DEBUG
1459 printf("end bracket %d at *ACCEPT", number);
1460 printf("\n");
1461 #endif
1462
1463 md->capture_last = (md->capture_last & OVFLMASK) | number;
1464 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1465 {
1466 md->offset_vector[offset] =
1467 md->offset_vector[md->offset_end - number];
1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1469 if (offset_top <= offset) offset_top = offset + 2;
1470 }
1471 ecode += 1 + IMM2_SIZE;
1472 break;
1473
1474
1475 /* End of the pattern, either real or forced. */
1476
1477 case OP_END:
1478 case OP_ACCEPT:
1479 case OP_ASSERT_ACCEPT:
1480
1481 /* If we have matched an empty string, fail if not in an assertion and not
1482 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1483 is set and we have matched at the start of the subject. In both cases,
1484 backtracking will then try other alternatives, if any. */
1485
1486 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1487 md->recursive == NULL &&
1488 (md->notempty ||
1489 (md->notempty_atstart &&
1490 mstart == md->start_subject + md->start_offset)))
1491 RRETURN(MATCH_NOMATCH);
1492
1493 /* Otherwise, we have a match. */
1494
1495 md->end_match_ptr = eptr; /* Record where we ended */
1496 md->end_offset_top = offset_top; /* and how many extracts were taken */
1497 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1498
1499 /* For some reason, the macros don't work properly if an expression is
1500 given as the argument to RRETURN when the heap is in use. */
1501
1502 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1503 RRETURN(rrc);
1504
1505 /* Assertion brackets. Check the alternative branches in turn - the
1506 matching won't pass the KET for an assertion. If any one branch matches,
1507 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1508 start of each branch to move the current point backwards, so the code at
1509 this level is identical to the lookahead case. When the assertion is part
1510 of a condition, we want to return immediately afterwards. The caller of
1511 this incarnation of the match() function will have set MATCH_CONDASSERT in
1512 md->match_function_type, and one of these opcodes will be the first opcode
1513 that is processed. We use a local variable that is preserved over calls to
1514 match() to remember this case. */
1515
1516 case OP_ASSERT:
1517 case OP_ASSERTBACK:
1518 save_mark = md->mark;
1519 if (md->match_function_type == MATCH_CONDASSERT)
1520 {
1521 condassert = TRUE;
1522 md->match_function_type = 0;
1523 }
1524 else condassert = FALSE;
1525
1526 /* Loop for each branch */
1527
1528 do
1529 {
1530 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1531
1532 /* A match means that the assertion is true; break out of the loop
1533 that matches its alternatives. */
1534
1535 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1536 {
1537 mstart = md->start_match_ptr; /* In case \K reset it */
1538 break;
1539 }
1540
1541 /* If not matched, restore the previous mark setting. */
1542
1543 md->mark = save_mark;
1544
1545 /* See comment in the code for capturing groups above about handling
1546 THEN. */
1547
1548 if (rrc == MATCH_THEN)
1549 {
1550 next = ecode + GET(ecode,1);
1551 if (md->start_match_ptr < next &&
1552 (*ecode == OP_ALT || *next == OP_ALT))
1553 rrc = MATCH_NOMATCH;
1554 }
1555
1556 /* Anything other than NOMATCH causes the entire assertion to fail,
1557 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1558 uncaptured THEN, which means they take their normal effect. This
1559 consistent approach does not always have exactly the same effect as in
1560 Perl. */
1561
1562 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1563 ecode += GET(ecode, 1);
1564 }
1565 while (*ecode == OP_ALT); /* Continue for next alternative */
1566
1567 /* If we have tried all the alternative branches, the assertion has
1568 failed. If not, we broke out after a match. */
1569
1570 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1571
1572 /* If checking an assertion for a condition, return MATCH_MATCH. */
1573
1574 if (condassert) RRETURN(MATCH_MATCH);
1575
1576 /* Continue from after a successful assertion, updating the offsets high
1577 water mark, since extracts may have been taken during the assertion. */
1578
1579 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1580 ecode += 1 + LINK_SIZE;
1581 offset_top = md->end_offset_top;
1582 continue;
1583
1584 /* Negative assertion: all branches must fail to match for the assertion to
1585 succeed. */
1586
1587 case OP_ASSERT_NOT:
1588 case OP_ASSERTBACK_NOT:
1589 save_mark = md->mark;
1590 if (md->match_function_type == MATCH_CONDASSERT)
1591 {
1592 condassert = TRUE;
1593 md->match_function_type = 0;
1594 }
1595 else condassert = FALSE;
1596
1597 /* Loop for each alternative branch. */
1598
1599 do
1600 {
1601 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1602 md->mark = save_mark; /* Always restore the mark setting */
1603
1604 switch(rrc)
1605 {
1606 case MATCH_MATCH: /* A successful match means */
1607 case MATCH_ACCEPT: /* the assertion has failed. */
1608 RRETURN(MATCH_NOMATCH);
1609
1610 case MATCH_NOMATCH: /* Carry on with next branch */
1611 break;
1612
1613 /* See comment in the code for capturing groups above about handling
1614 THEN. */
1615
1616 case MATCH_THEN:
1617 next = ecode + GET(ecode,1);
1618 if (md->start_match_ptr < next &&
1619 (*ecode == OP_ALT || *next == OP_ALT))
1620 {
1621 rrc = MATCH_NOMATCH;
1622 break;
1623 }
1624 /* Otherwise fall through. */
1625
1626 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1627 assertion to fail to match, without considering any more alternatives.
1628 Failing to match means the assertion is true. This is a consistent
1629 approach, but does not always have the same effect as in Perl. */
1630
1631 case MATCH_COMMIT:
1632 case MATCH_SKIP:
1633 case MATCH_SKIP_ARG:
1634 case MATCH_PRUNE:
1635 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1636 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1637
1638 /* Anything else is an error */
1639
1640 default:
1641 RRETURN(rrc);
1642 }
1643
1644 /* Continue with next branch */
1645
1646 ecode += GET(ecode,1);
1647 }
1648 while (*ecode == OP_ALT);
1649
1650 /* All branches in the assertion failed to match. */
1651
1652 NEG_ASSERT_TRUE:
1653 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1654 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1655 continue;
1656
1657 /* Move the subject pointer back. This occurs only at the start of
1658 each branch of a lookbehind assertion. If we are too close to the start to
1659 move back, this match function fails. When working with UTF-8 we move
1660 back a number of characters, not bytes. */
1661
1662 case OP_REVERSE:
1663 #ifdef SUPPORT_UTF
1664 if (utf)
1665 {
1666 i = GET(ecode, 1);
1667 while (i-- > 0)
1668 {
1669 eptr--;
1670 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1671 BACKCHAR(eptr);
1672 }
1673 }
1674 else
1675 #endif
1676
1677 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1678
1679 {
1680 eptr -= GET(ecode, 1);
1681 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1682 }
1683
1684 /* Save the earliest consulted character, then skip to next op code */
1685
1686 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1687 ecode += 1 + LINK_SIZE;
1688 break;
1689
1690 /* The callout item calls an external function, if one is provided, passing
1691 details of the match so far. This is mainly for debugging, though the
1692 function is able to force a failure. */
1693
1694 case OP_CALLOUT:
1695 if (PUBL(callout) != NULL)
1696 {
1697 PUBL(callout_block) cb;
1698 cb.version = 2; /* Version 2 of the callout block */
1699 cb.callout_number = ecode[1];
1700 cb.offset_vector = md->offset_vector;
1701 #if defined COMPILE_PCRE8
1702 cb.subject = (PCRE_SPTR)md->start_subject;
1703 #elif defined COMPILE_PCRE16
1704 cb.subject = (PCRE_SPTR16)md->start_subject;
1705 #elif defined COMPILE_PCRE32
1706 cb.subject = (PCRE_SPTR32)md->start_subject;
1707 #endif
1708 cb.subject_length = (int)(md->end_subject - md->start_subject);
1709 cb.start_match = (int)(mstart - md->start_subject);
1710 cb.current_position = (int)(eptr - md->start_subject);
1711 cb.pattern_position = GET(ecode, 2);
1712 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1713 cb.capture_top = offset_top/2;
1714 cb.capture_last = md->capture_last & CAPLMASK;
1715 /* Internal change requires this for API compatibility. */
1716 if (cb.capture_last == 0) cb.capture_last = -1;
1717 cb.callout_data = md->callout_data;
1718 cb.mark = md->nomatch_mark;
1719 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1720 if (rrc < 0) RRETURN(rrc);
1721 }
1722 ecode += 2 + 2*LINK_SIZE;
1723 break;
1724
1725 /* Recursion either matches the current regex, or some subexpression. The
1726 offset data is the offset to the starting bracket from the start of the
1727 whole pattern. (This is so that it works from duplicated subpatterns.)
1728
1729 The state of the capturing groups is preserved over recursion, and
1730 re-instated afterwards. We don't know how many are started and not yet
1731 finished (offset_top records the completed total) so we just have to save
1732 all the potential data. There may be up to 65535 such values, which is too
1733 large to put on the stack, but using malloc for small numbers seems
1734 expensive. As a compromise, the stack is used when there are no more than
1735 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1736
1737 There are also other values that have to be saved. We use a chained
1738 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1739 for the original version of this logic. It has, however, been hacked around
1740 a lot, so he is not to blame for the current way it works. */
1741
1742 case OP_RECURSE:
1743 {
1744 recursion_info *ri;
1745 unsigned int recno;
1746
1747 callpat = md->start_code + GET(ecode, 1);
1748 recno = (callpat == md->start_code)? 0 :
1749 GET2(callpat, 1 + LINK_SIZE);
1750
1751 /* Check for repeating a recursion without advancing the subject pointer.
1752 This should catch convoluted mutual recursions. (Some simple cases are
1753 caught at compile time.) */
1754
1755 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1756 if (recno == ri->group_num && eptr == ri->subject_position)
1757 RRETURN(PCRE_ERROR_RECURSELOOP);
1758
1759 /* Add to "recursing stack" */
1760
1761 new_recursive.group_num = recno;
1762 new_recursive.saved_capture_last = md->capture_last;
1763 new_recursive.subject_position = eptr;
1764 new_recursive.prevrec = md->recursive;
1765 md->recursive = &new_recursive;
1766
1767 /* Where to continue from afterwards */
1768
1769 ecode += 1 + LINK_SIZE;
1770
1771 /* Now save the offset data */
1772
1773 new_recursive.saved_max = md->offset_end;
1774 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1775 new_recursive.offset_save = stacksave;
1776 else
1777 {
1778 new_recursive.offset_save =
1779 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1780 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1781 }
1782 memcpy(new_recursive.offset_save, md->offset_vector,
1783 new_recursive.saved_max * sizeof(int));
1784
1785 /* OK, now we can do the recursion. After processing each alternative,
1786 restore the offset data and the last captured value. If there were nested
1787 recursions, md->recursive might be changed, so reset it before looping.
1788 */
1789
1790 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1791 cbegroup = (*callpat >= OP_SBRA);
1792 do
1793 {
1794 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1795 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1796 md, eptrb, RM6);
1797 memcpy(md->offset_vector, new_recursive.offset_save,
1798 new_recursive.saved_max * sizeof(int));
1799 md->capture_last = new_recursive.saved_capture_last;
1800 md->recursive = new_recursive.prevrec;
1801 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1802 {
1803 DPRINTF(("Recursion matched\n"));
1804 if (new_recursive.offset_save != stacksave)
1805 (PUBL(free))(new_recursive.offset_save);
1806
1807 /* Set where we got to in the subject, and reset the start in case
1808 it was changed by \K. This *is* propagated back out of a recursion,
1809 for Perl compatibility. */
1810
1811 eptr = md->end_match_ptr;
1812 mstart = md->start_match_ptr;
1813 goto RECURSION_MATCHED; /* Exit loop; end processing */
1814 }
1815
1816 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1817 recursion; they cause a NOMATCH for the entire recursion. These codes
1818 are defined in a range that can be tested for. */
1819
1820 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1821 RRETURN(MATCH_NOMATCH);
1822
1823 /* Any return code other than NOMATCH is an error. */
1824
1825 if (rrc != MATCH_NOMATCH)
1826 {
1827 DPRINTF(("Recursion gave error %d\n", rrc));
1828 if (new_recursive.offset_save != stacksave)
1829 (PUBL(free))(new_recursive.offset_save);
1830 RRETURN(rrc);
1831 }
1832
1833 md->recursive = &new_recursive;
1834 callpat += GET(callpat, 1);
1835 }
1836 while (*callpat == OP_ALT);
1837
1838 DPRINTF(("Recursion didn't match\n"));
1839 md->recursive = new_recursive.prevrec;
1840 if (new_recursive.offset_save != stacksave)
1841 (PUBL(free))(new_recursive.offset_save);
1842 RRETURN(MATCH_NOMATCH);
1843 }
1844
1845 RECURSION_MATCHED:
1846 break;
1847
1848 /* An alternation is the end of a branch; scan along to find the end of the
1849 bracketed group and go to there. */
1850
1851 case OP_ALT:
1852 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1853 break;
1854
1855 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1856 indicating that it may occur zero times. It may repeat infinitely, or not
1857 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1858 with fixed upper repeat limits are compiled as a number of copies, with the
1859 optional ones preceded by BRAZERO or BRAMINZERO. */
1860
1861 case OP_BRAZERO:
1862 next = ecode + 1;
1863 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1865 do next += GET(next, 1); while (*next == OP_ALT);
1866 ecode = next + 1 + LINK_SIZE;
1867 break;
1868
1869 case OP_BRAMINZERO:
1870 next = ecode + 1;
1871 do next += GET(next, 1); while (*next == OP_ALT);
1872 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1874 ecode++;
1875 break;
1876
1877 case OP_SKIPZERO:
1878 next = ecode+1;
1879 do next += GET(next,1); while (*next == OP_ALT);
1880 ecode = next + 1 + LINK_SIZE;
1881 break;
1882
1883 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1884 here; just jump to the group, with allow_zero set TRUE. */
1885
1886 case OP_BRAPOSZERO:
1887 op = *(++ecode);
1888 allow_zero = TRUE;
1889 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1890 goto POSSESSIVE_NON_CAPTURE;
1891
1892 /* End of a group, repeated or non-repeating. */
1893
1894 case OP_KET:
1895 case OP_KETRMIN:
1896 case OP_KETRMAX:
1897 case OP_KETRPOS:
1898 prev = ecode - GET(ecode, 1);
1899
1900 /* If this was a group that remembered the subject start, in order to break
1901 infinite repeats of empty string matches, retrieve the subject start from
1902 the chain. Otherwise, set it NULL. */
1903
1904 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1905 {
1906 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1907 eptrb = eptrb->epb_prev; /* Backup to previous group */
1908 }
1909 else saved_eptr = NULL;
1910
1911 /* If we are at the end of an assertion group or a non-capturing atomic
1912 group, stop matching and return MATCH_MATCH, but record the current high
1913 water mark for use by positive assertions. We also need to record the match
1914 start in case it was changed by \K. */
1915
1916 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1917 *prev == OP_ONCE_NC)
1918 {
1919 md->end_match_ptr = eptr; /* For ONCE_NC */
1920 md->end_offset_top = offset_top;
1921 md->start_match_ptr = mstart;
1922 RRETURN(MATCH_MATCH); /* Sets md->mark */
1923 }
1924
1925 /* For capturing groups we have to check the group number back at the start
1926 and if necessary complete handling an extraction by setting the offsets and
1927 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1928 into group 0, so it won't be picked up here. Instead, we catch it when the
1929 OP_END is reached. Other recursion is handled here. We just have to record
1930 the current subject position and start match pointer and give a MATCH
1931 return. */
1932
1933 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1934 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1935 {
1936 number = GET2(prev, 1+LINK_SIZE);
1937 offset = number << 1;
1938
1939 #ifdef PCRE_DEBUG
1940 printf("end bracket %d", number);
1941 printf("\n");
1942 #endif
1943
1944 /* Handle a recursively called group. */
1945
1946 if (md->recursive != NULL && md->recursive->group_num == number)
1947 {
1948 md->end_match_ptr = eptr;
1949 md->start_match_ptr = mstart;
1950 RRETURN(MATCH_MATCH);
1951 }
1952
1953 /* Deal with capturing */
1954
1955 md->capture_last = (md->capture_last & OVFLMASK) | number;
1956 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1957 {
1958 /* If offset is greater than offset_top, it means that we are
1959 "skipping" a capturing group, and that group's offsets must be marked
1960 unset. In earlier versions of PCRE, all the offsets were unset at the
1961 start of matching, but this doesn't work because atomic groups and
1962 assertions can cause a value to be set that should later be unset.
1963 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1964 part of the atomic group, but this is not on the final matching path,
1965 so must be unset when 2 is set. (If there is no group 2, there is no
1966 problem, because offset_top will then be 2, indicating no capture.) */
1967
1968 if (offset > offset_top)
1969 {
1970 register int *iptr = md->offset_vector + offset_top;
1971 register int *iend = md->offset_vector + offset;
1972 while (iptr < iend) *iptr++ = -1;
1973 }
1974
1975 /* Now make the extraction */
1976
1977 md->offset_vector[offset] =
1978 md->offset_vector[md->offset_end - number];
1979 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1980 if (offset_top <= offset) offset_top = offset + 2;
1981 }
1982 }
1983
1984 /* For an ordinary non-repeating ket, just continue at this level. This
1985 also happens for a repeating ket if no characters were matched in the
1986 group. This is the forcible breaking of infinite loops as implemented in
1987 Perl 5.005. For a non-repeating atomic group that includes captures,
1988 establish a backup point by processing the rest of the pattern at a lower
1989 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1990 original OP_ONCE level, thereby bypassing intermediate backup points, but
1991 resetting any captures that happened along the way. */
1992
1993 if (*ecode == OP_KET || eptr == saved_eptr)
1994 {
1995 if (*prev == OP_ONCE)
1996 {
1997 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1999 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2000 RRETURN(MATCH_ONCE);
2001 }
2002 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2003 break;
2004 }
2005
2006 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2007 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2008 at a time from the outer level, thus saving stack. */
2009
2010 if (*ecode == OP_KETRPOS)
2011 {
2012 md->start_match_ptr = mstart; /* In case \K reset it */
2013 md->end_match_ptr = eptr;
2014 md->end_offset_top = offset_top;
2015 RRETURN(MATCH_KETRPOS);
2016 }
2017
2018 /* The normal repeating kets try the rest of the pattern or restart from
2019 the preceding bracket, in the appropriate order. In the second case, we can
2020 use tail recursion to avoid using another stack frame, unless we have an
2021 atomic group or an unlimited repeat of a group that can match an empty
2022 string. */
2023
2024 if (*ecode == OP_KETRMIN)
2025 {
2026 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2027 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2028 if (*prev == OP_ONCE)
2029 {
2030 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2031 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2032 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2033 RRETURN(MATCH_ONCE);
2034 }
2035 if (*prev >= OP_SBRA) /* Could match an empty string */
2036 {
2037 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2038 RRETURN(rrc);
2039 }
2040 ecode = prev;
2041 goto TAIL_RECURSE;
2042 }
2043 else /* OP_KETRMAX */
2044 {
2045 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2046 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 if (*prev == OP_ONCE)
2049 {
2050 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2052 md->once_target = prev;
2053 RRETURN(MATCH_ONCE);
2054 }
2055 ecode += 1 + LINK_SIZE;
2056 goto TAIL_RECURSE;
2057 }
2058 /* Control never gets here */
2059
2060 /* Not multiline mode: start of subject assertion, unless notbol. */
2061
2062 case OP_CIRC:
2063 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2064
2065 /* Start of subject assertion */
2066
2067 case OP_SOD:
2068 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2069 ecode++;
2070 break;
2071
2072 /* Multiline mode: start of subject unless notbol, or after any newline. */
2073
2074 case OP_CIRCM:
2075 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2076 if (eptr != md->start_subject &&
2077 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2078 RRETURN(MATCH_NOMATCH);
2079 ecode++;
2080 break;
2081
2082 /* Start of match assertion */
2083
2084 case OP_SOM:
2085 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2086 ecode++;
2087 break;
2088
2089 /* Reset the start of match point */
2090
2091 case OP_SET_SOM:
2092 mstart = eptr;
2093 ecode++;
2094 break;
2095
2096 /* Multiline mode: assert before any newline, or before end of subject
2097 unless noteol is set. */
2098
2099 case OP_DOLLM:
2100 if (eptr < md->end_subject)
2101 {
2102 if (!IS_NEWLINE(eptr))
2103 {
2104 if (md->partial != 0 &&
2105 eptr + 1 >= md->end_subject &&
2106 NLBLOCK->nltype == NLTYPE_FIXED &&
2107 NLBLOCK->nllen == 2 &&
2108 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2109 {
2110 md->hitend = TRUE;
2111 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2112 }
2113 RRETURN(MATCH_NOMATCH);
2114 }
2115 }
2116 else
2117 {
2118 if (md->noteol) RRETURN(MATCH_NOMATCH);
2119 SCHECK_PARTIAL();
2120 }
2121 ecode++;
2122 break;
2123
2124 /* Not multiline mode: assert before a terminating newline or before end of
2125 subject unless noteol is set. */
2126
2127 case OP_DOLL:
2128 if (md->noteol) RRETURN(MATCH_NOMATCH);
2129 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2130
2131 /* ... else fall through for endonly */
2132
2133 /* End of subject assertion (\z) */
2134
2135 case OP_EOD:
2136 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2137 SCHECK_PARTIAL();
2138 ecode++;
2139 break;
2140
2141 /* End of subject or ending \n assertion (\Z) */
2142
2143 case OP_EODN:
2144 ASSERT_NL_OR_EOS:
2145 if (eptr < md->end_subject &&
2146 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2147 {
2148 if (md->partial != 0 &&
2149 eptr + 1 >= md->end_subject &&
2150 NLBLOCK->nltype == NLTYPE_FIXED &&
2151 NLBLOCK->nllen == 2 &&
2152 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2153 {
2154 md->hitend = TRUE;
2155 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2156 }
2157 RRETURN(MATCH_NOMATCH);
2158 }
2159
2160 /* Either at end of string or \n before end. */
2161
2162 SCHECK_PARTIAL();
2163 ecode++;
2164 break;
2165
2166 /* Word boundary assertions */
2167
2168 case OP_NOT_WORD_BOUNDARY:
2169 case OP_WORD_BOUNDARY:
2170 {
2171
2172 /* Find out if the previous and current characters are "word" characters.
2173 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2174 be "non-word" characters. Remember the earliest consulted character for
2175 partial matching. */
2176
2177 #ifdef SUPPORT_UTF
2178 if (utf)
2179 {
2180 /* Get status of previous character */
2181
2182 if (eptr == md->start_subject) prev_is_word = FALSE; else
2183 {
2184 PCRE_PUCHAR lastptr = eptr - 1;
2185 BACKCHAR(lastptr);
2186 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2187 GETCHAR(c, lastptr);
2188 #ifdef SUPPORT_UCP
2189 if (md->use_ucp)
2190 {
2191 if (c == '_') prev_is_word = TRUE; else
2192 {
2193 int cat = UCD_CATEGORY(c);
2194 prev_is_word = (cat == ucp_L || cat == ucp_N);
2195 }
2196 }
2197 else
2198 #endif
2199 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2200 }
2201
2202 /* Get status of next character */
2203
2204 if (eptr >= md->end_subject)
2205 {
2206 SCHECK_PARTIAL();
2207 cur_is_word = FALSE;
2208 }
2209 else
2210 {
2211 GETCHAR(c, eptr);
2212 #ifdef SUPPORT_UCP
2213 if (md->use_ucp)
2214 {
2215 if (c == '_') cur_is_word = TRUE; else
2216 {
2217 int cat = UCD_CATEGORY(c);
2218 cur_is_word = (cat == ucp_L || cat == ucp_N);
2219 }
2220 }
2221 else
2222 #endif
2223 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2224 }
2225 }
2226 else
2227 #endif
2228
2229 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2230 consistency with the behaviour of \w we do use it in this case. */
2231
2232 {
2233 /* Get status of previous character */
2234
2235 if (eptr == md->start_subject) prev_is_word = FALSE; else
2236 {
2237 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2238 #ifdef SUPPORT_UCP
2239 if (md->use_ucp)
2240 {
2241 c = eptr[-1];
2242 if (c == '_') prev_is_word = TRUE; else
2243 {
2244 int cat = UCD_CATEGORY(c);
2245 prev_is_word = (cat == ucp_L || cat == ucp_N);
2246 }
2247 }
2248 else
2249 #endif
2250 prev_is_word = MAX_255(eptr[-1])
2251 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2252 }
2253
2254 /* Get status of next character */
2255
2256 if (eptr >= md->end_subject)
2257 {
2258 SCHECK_PARTIAL();
2259 cur_is_word = FALSE;
2260 }
2261 else
2262 #ifdef SUPPORT_UCP
2263 if (md->use_ucp)
2264 {
2265 c = *eptr;
2266 if (c == '_') cur_is_word = TRUE; else
2267 {
2268 int cat = UCD_CATEGORY(c);
2269 cur_is_word = (cat == ucp_L || cat == ucp_N);
2270 }
2271 }
2272 else
2273 #endif
2274 cur_is_word = MAX_255(*eptr)
2275 && ((md->ctypes[*eptr] & ctype_word) != 0);
2276 }
2277
2278 /* Now see if the situation is what we want */
2279
2280 if ((*ecode++ == OP_WORD_BOUNDARY)?
2281 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2282 RRETURN(MATCH_NOMATCH);
2283 }
2284 break;
2285
2286 /* Match any single character type except newline; have to take care with
2287 CRLF newlines and partial matching. */
2288
2289 case OP_ANY:
2290 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2291 if (md->partial != 0 &&
2292 eptr + 1 >= md->end_subject &&
2293 NLBLOCK->nltype == NLTYPE_FIXED &&
2294 NLBLOCK->nllen == 2 &&
2295 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2296 {
2297 md->hitend = TRUE;
2298 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2299 }
2300
2301 /* Fall through */
2302
2303 /* Match any single character whatsoever. */
2304
2305 case OP_ALLANY:
2306 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2307 { /* not be updated before SCHECK_PARTIAL. */
2308 SCHECK_PARTIAL();
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 eptr++;
2312 #ifdef SUPPORT_UTF
2313 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2314 #endif
2315 ecode++;
2316 break;
2317
2318 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2319 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2320
2321 case OP_ANYBYTE:
2322 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2323 { /* not be updated before SCHECK_PARTIAL. */
2324 SCHECK_PARTIAL();
2325 RRETURN(MATCH_NOMATCH);
2326 }
2327 eptr++;
2328 ecode++;
2329 break;
2330
2331 case OP_NOT_DIGIT:
2332 if (eptr >= md->end_subject)
2333 {
2334 SCHECK_PARTIAL();
2335 RRETURN(MATCH_NOMATCH);
2336 }
2337 GETCHARINCTEST(c, eptr);
2338 if (
2339 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2340 c < 256 &&
2341 #endif
2342 (md->ctypes[c] & ctype_digit) != 0
2343 )
2344 RRETURN(MATCH_NOMATCH);
2345 ecode++;
2346 break;
2347
2348 case OP_DIGIT:
2349 if (eptr >= md->end_subject)
2350 {
2351 SCHECK_PARTIAL();
2352 RRETURN(MATCH_NOMATCH);
2353 }
2354 GETCHARINCTEST(c, eptr);
2355 if (
2356 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2357 c > 255 ||
2358 #endif
2359 (md->ctypes[c] & ctype_digit) == 0
2360 )
2361 RRETURN(MATCH_NOMATCH);
2362 ecode++;
2363 break;
2364
2365 case OP_NOT_WHITESPACE:
2366 if (eptr >= md->end_subject)
2367 {
2368 SCHECK_PARTIAL();
2369 RRETURN(MATCH_NOMATCH);
2370 }
2371 GETCHARINCTEST(c, eptr);
2372 if (
2373 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2374 c < 256 &&
2375 #endif
2376 (md->ctypes[c] & ctype_space) != 0
2377 )
2378 RRETURN(MATCH_NOMATCH);
2379 ecode++;
2380 break;
2381
2382 case OP_WHITESPACE:
2383 if (eptr >= md->end_subject)
2384 {
2385 SCHECK_PARTIAL();
2386 RRETURN(MATCH_NOMATCH);
2387 }
2388 GETCHARINCTEST(c, eptr);
2389 if (
2390 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2391 c > 255 ||
2392 #endif
2393 (md->ctypes[c] & ctype_space) == 0
2394 )
2395 RRETURN(MATCH_NOMATCH);
2396 ecode++;
2397 break;
2398
2399 case OP_NOT_WORDCHAR:
2400 if (eptr >= md->end_subject)
2401 {
2402 SCHECK_PARTIAL();
2403 RRETURN(MATCH_NOMATCH);
2404 }
2405 GETCHARINCTEST(c, eptr);
2406 if (
2407 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2408 c < 256 &&
2409 #endif
2410 (md->ctypes[c] & ctype_word) != 0
2411 )
2412 RRETURN(MATCH_NOMATCH);
2413 ecode++;
2414 break;
2415
2416 case OP_WORDCHAR:
2417 if (eptr >= md->end_subject)
2418 {
2419 SCHECK_PARTIAL();
2420 RRETURN(MATCH_NOMATCH);
2421 }
2422 GETCHARINCTEST(c, eptr);
2423 if (
2424 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2425 c > 255 ||
2426 #endif
2427 (md->ctypes[c] & ctype_word) == 0
2428 )
2429 RRETURN(MATCH_NOMATCH);
2430 ecode++;
2431 break;
2432
2433 case OP_ANYNL:
2434 if (eptr >= md->end_subject)
2435 {
2436 SCHECK_PARTIAL();
2437 RRETURN(MATCH_NOMATCH);
2438 }
2439 GETCHARINCTEST(c, eptr);
2440 switch(c)
2441 {
2442 default: RRETURN(MATCH_NOMATCH);
2443
2444 case CHAR_CR:
2445 if (eptr >= md->end_subject)
2446 {
2447 SCHECK_PARTIAL();
2448 }
2449 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2450 break;
2451
2452 case CHAR_LF:
2453 break;
2454
2455 case CHAR_VT:
2456 case CHAR_FF:
2457 case CHAR_NEL:
2458 #ifndef EBCDIC
2459 case 0x2028:
2460 case 0x2029:
2461 #endif /* Not EBCDIC */
2462 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2463 break;
2464 }
2465 ecode++;
2466 break;
2467
2468 case OP_NOT_HSPACE:
2469 if (eptr >= md->end_subject)
2470 {
2471 SCHECK_PARTIAL();
2472 RRETURN(MATCH_NOMATCH);
2473 }
2474 GETCHARINCTEST(c, eptr);
2475 switch(c)
2476 {
2477 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2478 default: break;
2479 }
2480 ecode++;
2481 break;
2482
2483 case OP_HSPACE:
2484 if (eptr >= md->end_subject)
2485 {
2486 SCHECK_PARTIAL();
2487 RRETURN(MATCH_NOMATCH);
2488 }
2489 GETCHARINCTEST(c, eptr);
2490 switch(c)
2491 {
2492 HSPACE_CASES: break; /* Byte and multibyte cases */
2493 default: RRETURN(MATCH_NOMATCH);
2494 }
2495 ecode++;
2496 break;
2497
2498 case OP_NOT_VSPACE:
2499 if (eptr >= md->end_subject)
2500 {
2501 SCHECK_PARTIAL();
2502 RRETURN(MATCH_NOMATCH);
2503 }
2504 GETCHARINCTEST(c, eptr);
2505 switch(c)
2506 {
2507 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2508 default: break;
2509 }
2510 ecode++;
2511 break;
2512
2513 case OP_VSPACE:
2514 if (eptr >= md->end_subject)
2515 {
2516 SCHECK_PARTIAL();
2517 RRETURN(MATCH_NOMATCH);
2518 }
2519 GETCHARINCTEST(c, eptr);
2520 switch(c)
2521 {
2522 VSPACE_CASES: break;
2523 default: RRETURN(MATCH_NOMATCH);
2524 }
2525 ecode++;
2526 break;
2527
2528 #ifdef SUPPORT_UCP
2529 /* Check the next character by Unicode property. We will get here only
2530 if the support is in the binary; otherwise a compile-time error occurs. */
2531
2532 case OP_PROP:
2533 case OP_NOTPROP:
2534 if (eptr >= md->end_subject)
2535 {
2536 SCHECK_PARTIAL();
2537 RRETURN(MATCH_NOMATCH);
2538 }
2539 GETCHARINCTEST(c, eptr);
2540 {
2541 const pcre_uint32 *cp;
2542 const ucd_record *prop = GET_UCD(c);
2543
2544 switch(ecode[1])
2545 {
2546 case PT_ANY:
2547 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2548 break;
2549
2550 case PT_LAMP:
2551 if ((prop->chartype == ucp_Lu ||
2552 prop->chartype == ucp_Ll ||
2553 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2554 RRETURN(MATCH_NOMATCH);
2555 break;
2556
2557 case PT_GC:
2558 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2559 RRETURN(MATCH_NOMATCH);
2560 break;
2561
2562 case PT_PC:
2563 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2564 RRETURN(MATCH_NOMATCH);
2565 break;
2566
2567 case PT_SC:
2568 if ((ecode[2] != prop->script) == (op == OP_PROP))
2569 RRETURN(MATCH_NOMATCH);
2570 break;
2571
2572 /* These are specials */
2573
2574 case PT_ALNUM:
2575 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2576 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2577 RRETURN(MATCH_NOMATCH);
2578 break;
2579
2580 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2581 which means that Perl space and POSIX space are now identical. PCRE
2582 was changed at release 8.34. */
2583
2584 case PT_SPACE: /* Perl space */
2585 case PT_PXSPACE: /* POSIX space */
2586 switch(c)
2587 {
2588 HSPACE_CASES:
2589 VSPACE_CASES:
2590 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2591 break;
2592
2593 default:
2594 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2595 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2596 break;
2597 }
2598 break;
2599
2600 case PT_WORD:
2601 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2602 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2603 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2604 RRETURN(MATCH_NOMATCH);
2605 break;
2606
2607 case PT_CLIST:
2608 cp = PRIV(ucd_caseless_sets) + ecode[2];
2609 for (;;)
2610 {
2611 if (c < *cp)
2612 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2613 if (c == *cp++)
2614 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2615 }
2616 break;
2617
2618 case PT_UCNC:
2619 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2620 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2621 c >= 0xe000) == (op == OP_NOTPROP))
2622 RRETURN(MATCH_NOMATCH);
2623 break;
2624
2625 /* This should never occur */
2626
2627 default:
2628 RRETURN(PCRE_ERROR_INTERNAL);
2629 }
2630
2631 ecode += 3;
2632 }
2633 break;
2634
2635 /* Match an extended Unicode sequence. We will get here only if the support
2636 is in the binary; otherwise a compile-time error occurs. */
2637
2638 case OP_EXTUNI:
2639 if (eptr >= md->end_subject)
2640 {
2641 SCHECK_PARTIAL();
2642 RRETURN(MATCH_NOMATCH);
2643 }
2644 else
2645 {
2646 int lgb, rgb;
2647 GETCHARINCTEST(c, eptr);
2648 lgb = UCD_GRAPHBREAK(c);
2649 while (eptr < md->end_subject)
2650 {
2651 int len = 1;
2652 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2653 rgb = UCD_GRAPHBREAK(c);
2654 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2655 lgb = rgb;
2656 eptr += len;
2657 }
2658 }
2659 CHECK_PARTIAL();
2660 ecode++;
2661 break;
2662 #endif /* SUPPORT_UCP */
2663
2664
2665 /* Match a back reference, possibly repeatedly. Look past the end of the
2666 item to see if there is repeat information following. The code is similar
2667 to that for character classes, but repeated for efficiency. Then obey
2668 similar code to character type repeats - written out again for speed.
2669 However, if the referenced string is the empty string, always treat
2670 it as matched, any number of times (otherwise there could be infinite
2671 loops). If the reference is unset, there are two possibilities:
2672
2673 (a) In the default, Perl-compatible state, set the length negative;
2674 this ensures that every attempt at a match fails. We can't just fail
2675 here, because of the possibility of quantifiers with zero minima.
2676
2677 (b) If the JavaScript compatibility flag is set, set the length to zero
2678 so that the back reference matches an empty string.
2679
2680 Otherwise, set the length to the length of what was matched by the
2681 referenced subpattern.
2682
2683 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2684 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2685 and OP_DNREFI are used. In this case we must scan the list of groups to
2686 which the name refers, and use the first one that is set. */
2687
2688 case OP_DNREF:
2689 case OP_DNREFI:
2690 caseless = op == OP_DNREFI;
2691 {
2692 int count = GET2(ecode, 1+IMM2_SIZE);
2693 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2694 ecode += 1 + 2*IMM2_SIZE;
2695
2696 while (count-- > 0)
2697 {
2698 offset = GET2(slot, 0) << 1;
2699 if (offset < offset_top && md->offset_vector[offset] >= 0) break;
2700 slot += md->name_entry_size;
2701 }
2702 if (count < 0)
2703 length = (md->jscript_compat)? 0 : -1;
2704 else
2705 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2706 }
2707 goto REF_REPEAT;
2708
2709 case OP_REF:
2710 case OP_REFI:
2711 caseless = op == OP_REFI;
2712 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2713 ecode += 1 + IMM2_SIZE;
2714 if (offset >= offset_top || md->offset_vector[offset] < 0)
2715 length = (md->jscript_compat)? 0 : -1;
2716 else
2717 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2718
2719 /* Set up for repetition, or handle the non-repeated case */
2720
2721 REF_REPEAT:
2722 switch (*ecode)
2723 {
2724 case OP_CRSTAR:
2725 case OP_CRMINSTAR:
2726 case OP_CRPLUS:
2727 case OP_CRMINPLUS:
2728 case OP_CRQUERY:
2729 case OP_CRMINQUERY:
2730 c = *ecode++ - OP_CRSTAR;
2731 minimize = (c & 1) != 0;
2732 min = rep_min[c]; /* Pick up values from tables; */
2733 max = rep_max[c]; /* zero for max => infinity */
2734 if (max == 0) max = INT_MAX;
2735 break;
2736
2737 case OP_CRRANGE:
2738 case OP_CRMINRANGE:
2739 minimize = (*ecode == OP_CRMINRANGE);
2740 min = GET2(ecode, 1);
2741 max = GET2(ecode, 1 + IMM2_SIZE);
2742 if (max == 0) max = INT_MAX;
2743 ecode += 1 + 2 * IMM2_SIZE;
2744 break;
2745
2746 default: /* No repeat follows */
2747 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2748 {
2749 if (length == -2) eptr = md->end_subject; /* Partial match */
2750 CHECK_PARTIAL();
2751 RRETURN(MATCH_NOMATCH);
2752 }
2753 eptr += length;
2754 continue; /* With the main loop */
2755 }
2756
2757 /* Handle repeated back references. If the length of the reference is
2758 zero, just continue with the main loop. If the length is negative, it
2759 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2760 zero, we can continue at the same level without recursion. For any other
2761 minimum, carrying on will result in NOMATCH. */
2762
2763 if (length == 0) continue;
2764 if (length < 0 && min == 0) continue;
2765
2766 /* First, ensure the minimum number of matches are present. We get back
2767 the length of the reference string explicitly rather than passing the
2768 address of eptr, so that eptr can be a register variable. */
2769
2770 for (i = 1; i <= min; i++)
2771 {
2772 int slength;
2773 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2774 {
2775 if (slength == -2) eptr = md->end_subject; /* Partial match */
2776 CHECK_PARTIAL();
2777 RRETURN(MATCH_NOMATCH);
2778 }
2779 eptr += slength;
2780 }
2781
2782 /* If min = max, continue at the same level without recursion.
2783 They are not both allowed to be zero. */
2784
2785 if (min == max) continue;
2786
2787 /* If minimizing, keep trying and advancing the pointer */
2788
2789 if (minimize)
2790 {
2791 for (fi = min;; fi++)
2792 {
2793 int slength;
2794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2796 if (fi >= max) RRETURN(MATCH_NOMATCH);
2797 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2798 {
2799 if (slength == -2) eptr = md->end_subject; /* Partial match */
2800 CHECK_PARTIAL();
2801 RRETURN(MATCH_NOMATCH);
2802 }
2803 eptr += slength;
2804 }
2805 /* Control never gets here */
2806 }
2807
2808 /* If maximizing, find the longest string and work backwards */
2809
2810 else
2811 {
2812 pp = eptr;
2813 for (i = min; i < max; i++)
2814 {
2815 int slength;
2816 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2817 {
2818 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2819 the soft partial matching case. */
2820
2821 if (slength == -2 && md->partial != 0 &&
2822 md->end_subject > md->start_used_ptr)
2823 {
2824 md->hitend = TRUE;
2825 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2826 }
2827 break;
2828 }
2829 eptr += slength;
2830 }
2831
2832 while (eptr >= pp)
2833 {
2834 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2836 eptr -= length;
2837 }
2838 RRETURN(MATCH_NOMATCH);
2839 }
2840 /* Control never gets here */
2841
2842 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2843 used when all the characters in the class have values in the range 0-255,
2844 and either the matching is caseful, or the characters are in the range
2845 0-127 when UTF-8 processing is enabled. The only difference between
2846 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2847 encountered.
2848
2849 First, look past the end of the item to see if there is repeat information
2850 following. Then obey similar code to character type repeats - written out
2851 again for speed. */
2852
2853 case OP_NCLASS:
2854 case OP_CLASS:
2855 {
2856 /* The data variable is saved across frames, so the byte map needs to
2857 be stored there. */
2858 #define BYTE_MAP ((pcre_uint8 *)data)
2859 data = ecode + 1; /* Save for matching */
2860 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2861
2862 switch (*ecode)
2863 {
2864 case OP_CRSTAR:
2865 case OP_CRMINSTAR:
2866 case OP_CRPLUS:
2867 case OP_CRMINPLUS:
2868 case OP_CRQUERY:
2869 case OP_CRMINQUERY:
2870 case OP_CRPOSSTAR:
2871 case OP_CRPOSPLUS:
2872 case OP_CRPOSQUERY:
2873 c = *ecode++ - OP_CRSTAR;
2874 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2875 else possessive = TRUE;
2876 min = rep_min[c]; /* Pick up values from tables; */
2877 max = rep_max[c]; /* zero for max => infinity */
2878 if (max == 0) max = INT_MAX;
2879 break;
2880
2881 case OP_CRRANGE:
2882 case OP_CRMINRANGE:
2883 case OP_CRPOSRANGE:
2884 minimize = (*ecode == OP_CRMINRANGE);
2885 possessive = (*ecode == OP_CRPOSRANGE);
2886 min = GET2(ecode, 1);
2887 max = GET2(ecode, 1 + IMM2_SIZE);
2888 if (max == 0) max = INT_MAX;
2889 ecode += 1 + 2 * IMM2_SIZE;
2890 break;
2891
2892 default: /* No repeat follows */
2893 min = max = 1;
2894 break;
2895 }
2896
2897 /* First, ensure the minimum number of matches are present. */
2898
2899 #ifdef SUPPORT_UTF
2900 if (utf)
2901 {
2902 for (i = 1; i <= min; i++)
2903 {
2904 if (eptr >= md->end_subject)
2905 {
2906 SCHECK_PARTIAL();
2907 RRETURN(MATCH_NOMATCH);
2908 }
2909 GETCHARINC(c, eptr);
2910 if (c > 255)
2911 {
2912 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2913 }
2914 else
2915 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2916 }
2917 }
2918 else
2919 #endif
2920 /* Not UTF mode */
2921 {
2922 for (i = 1; i <= min; i++)
2923 {
2924 if (eptr >= md->end_subject)
2925 {
2926 SCHECK_PARTIAL();
2927 RRETURN(MATCH_NOMATCH);
2928 }
2929 c = *eptr++;
2930 #ifndef COMPILE_PCRE8
2931 if (c > 255)
2932 {
2933 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2934 }
2935 else
2936 #endif
2937 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2938 }
2939 }
2940
2941 /* If max == min we can continue with the main loop without the
2942 need to recurse. */
2943
2944 if (min == max) continue;
2945
2946 /* If minimizing, keep testing the rest of the expression and advancing
2947 the pointer while it matches the class. */
2948
2949 if (minimize)
2950 {
2951 #ifdef SUPPORT_UTF
2952 if (utf)
2953 {
2954 for (fi = min;; fi++)
2955 {
2956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2958 if (fi >= max) RRETURN(MATCH_NOMATCH);
2959 if (eptr >= md->end_subject)
2960 {
2961 SCHECK_PARTIAL();
2962 RRETURN(MATCH_NOMATCH);
2963 }
2964 GETCHARINC(c, eptr);
2965 if (c > 255)
2966 {
2967 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2968 }
2969 else
2970 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2971 }
2972 }
2973 else
2974 #endif
2975 /* Not UTF mode */
2976 {
2977 for (fi = min;; fi++)
2978 {
2979 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2981 if (fi >= max) RRETURN(MATCH_NOMATCH);
2982 if (eptr >= md->end_subject)
2983 {
2984 SCHECK_PARTIAL();
2985 RRETURN(MATCH_NOMATCH);
2986 }
2987 c = *eptr++;
2988 #ifndef COMPILE_PCRE8
2989 if (c > 255)
2990 {
2991 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2992 }
2993 else
2994 #endif
2995 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2996 }
2997 }
2998 /* Control never gets here */
2999 }
3000
3001 /* If maximizing, find the longest possible run, then work backwards. */
3002
3003 else
3004 {
3005 pp = eptr;
3006
3007 #ifdef SUPPORT_UTF
3008 if (utf)
3009 {
3010 for (i = min; i < max; i++)
3011 {
3012 int len = 1;
3013 if (eptr >= md->end_subject)
3014 {
3015 SCHECK_PARTIAL();
3016 break;
3017 }
3018 GETCHARLEN(c, eptr, len);
3019 if (c > 255)
3020 {
3021 if (op == OP_CLASS) break;
3022 }
3023 else
3024 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3025 eptr += len;
3026 }
3027
3028 if (possessive) continue; /* No backtracking */
3029
3030 for (;;)
3031 {
3032 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3034 if (eptr-- == pp) break; /* Stop if tried at original pos */
3035 BACKCHAR(eptr);
3036 }
3037 }
3038 else
3039 #endif
3040 /* Not UTF mode */
3041 {
3042 for (i = min; i < max; i++)
3043 {
3044 if (eptr >= md->end_subject)
3045 {
3046 SCHECK_PARTIAL();
3047 break;
3048 }
3049 c = *eptr;
3050 #ifndef COMPILE_PCRE8
3051 if (c > 255)
3052 {
3053 if (op == OP_CLASS) break;
3054 }
3055 else
3056 #endif
3057 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3058 eptr++;
3059 }
3060
3061 if (possessive) continue; /* No backtracking */
3062
3063 while (eptr >= pp)
3064 {
3065 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3067 eptr--;
3068 }
3069 }
3070
3071 RRETURN(MATCH_NOMATCH);
3072 }
3073 #undef BYTE_MAP
3074 }
3075 /* Control never gets here */
3076
3077
3078 /* Match an extended character class. This opcode is encountered only
3079 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3080 mode, because Unicode properties are supported in non-UTF-8 mode. */
3081
3082 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3083 case OP_XCLASS:
3084 {
3085 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3086 ecode += GET(ecode, 1); /* Advance past the item */
3087
3088 switch (*ecode)
3089 {
3090 case OP_CRSTAR:
3091 case OP_CRMINSTAR:
3092 case OP_CRPLUS:
3093 case OP_CRMINPLUS:
3094 case OP_CRQUERY:
3095 case OP_CRMINQUERY:
3096 case OP_CRPOSSTAR:
3097 case OP_CRPOSPLUS:
3098 case OP_CRPOSQUERY:
3099 c = *ecode++ - OP_CRSTAR;
3100 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3101 else possessive = TRUE;
3102 min = rep_min[c]; /* Pick up values from tables; */
3103 max = rep_max[c]; /* zero for max => infinity */
3104 if (max == 0) max = INT_MAX;
3105 break;
3106
3107 case OP_CRRANGE:
3108 case OP_CRMINRANGE:
3109 case OP_CRPOSRANGE:
3110 minimize = (*ecode == OP_CRMINRANGE);
3111 possessive = (*ecode == OP_CRPOSRANGE);
3112 min = GET2(ecode, 1);
3113 max = GET2(ecode, 1 + IMM2_SIZE);
3114 if (max == 0) max = INT_MAX;
3115 ecode += 1 + 2 * IMM2_SIZE;
3116 break;
3117
3118 default: /* No repeat follows */
3119 min = max = 1;
3120 break;
3121 }
3122
3123 /* First, ensure the minimum number of matches are present. */
3124
3125 for (i = 1; i <= min; i++)
3126 {
3127 if (eptr >= md->end_subject)
3128 {
3129 SCHECK_PARTIAL();
3130 RRETURN(MATCH_NOMATCH);
3131 }
3132 GETCHARINCTEST(c, eptr);
3133 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3134 }
3135
3136 /* If max == min we can continue with the main loop without the
3137 need to recurse. */
3138
3139 if (min == max) continue;
3140
3141 /* If minimizing, keep testing the rest of the expression and advancing
3142 the pointer while it matches the class. */
3143
3144 if (minimize)
3145 {
3146 for (fi = min;; fi++)
3147 {
3148 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3149 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3150 if (fi >= max) RRETURN(MATCH_NOMATCH);
3151 if (eptr >= md->end_subject)
3152 {
3153 SCHECK_PARTIAL();
3154 RRETURN(MATCH_NOMATCH);
3155 }
3156 GETCHARINCTEST(c, eptr);
3157 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3158 }
3159 /* Control never gets here */
3160 }
3161
3162 /* If maximizing, find the longest possible run, then work backwards. */
3163
3164 else
3165 {
3166 pp = eptr;
3167 for (i = min; i < max; i++)
3168 {
3169 int len = 1;
3170 if (eptr >= md->end_subject)
3171 {
3172 SCHECK_PARTIAL();
3173 break;
3174 }
3175 #ifdef SUPPORT_UTF
3176 GETCHARLENTEST(c, eptr, len);
3177 #else
3178 c = *eptr;
3179 #endif
3180 if (!PRIV(xclass)(c, data, utf)) break;
3181 eptr += len;
3182 }
3183
3184 if (possessive) continue; /* No backtracking */
3185
3186 for(;;)
3187 {
3188 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3190 if (eptr-- == pp) break; /* Stop if tried at original pos */
3191 #ifdef SUPPORT_UTF
3192 if (utf) BACKCHAR(eptr);
3193 #endif
3194 }
3195 RRETURN(MATCH_NOMATCH);
3196 }
3197
3198 /* Control never gets here */
3199 }
3200 #endif /* End of XCLASS */
3201
3202 /* Match a single character, casefully */
3203
3204 case OP_CHAR:
3205 #ifdef SUPPORT_UTF
3206 if (utf)
3207 {
3208 length = 1;
3209 ecode++;
3210 GETCHARLEN(fc, ecode, length);
3211 if (length > md->end_subject - eptr)
3212 {
3213 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3214 RRETURN(MATCH_NOMATCH);
3215 }
3216 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3217 }
3218 else
3219 #endif
3220 /* Not UTF mode */
3221 {
3222 if (md->end_subject - eptr < 1)
3223 {
3224 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3225 RRETURN(MATCH_NOMATCH);
3226 }
3227 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3228 ecode += 2;
3229 }
3230 break;
3231
3232 /* Match a single character, caselessly. If we are at the end of the
3233 subject, give up immediately. */
3234
3235 case OP_CHARI:
3236 if (eptr >= md->end_subject)
3237 {
3238 SCHECK_PARTIAL();
3239 RRETURN(MATCH_NOMATCH);
3240 }
3241
3242 #ifdef SUPPORT_UTF
3243 if (utf)
3244 {
3245 length = 1;
3246 ecode++;
3247 GETCHARLEN(fc, ecode, length);
3248
3249 /* If the pattern character's value is < 128, we have only one byte, and
3250 we know that its other case must also be one byte long, so we can use the
3251 fast lookup table. We know that there is at least one byte left in the
3252 subject. */
3253
3254 if (fc < 128)
3255 {
3256 pcre_uint32 cc = RAWUCHAR(eptr);
3257 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3258 ecode++;
3259 eptr++;
3260 }
3261
3262 /* Otherwise we must pick up the subject character. Note that we cannot
3263 use the value of "length" to check for sufficient bytes left, because the
3264 other case of the character may have more or fewer bytes. */
3265
3266 else
3267 {
3268 pcre_uint32 dc;
3269 GETCHARINC(dc, eptr);
3270 ecode += length;
3271
3272 /* If we have Unicode property support, we can use it to test the other
3273 case of the character, if there is one. */
3274
3275 if (fc != dc)
3276 {
3277 #ifdef SUPPORT_UCP
3278 if (dc != UCD_OTHERCASE(fc))
3279 #endif
3280 RRETURN(MATCH_NOMATCH);
3281 }
3282 }
3283 }
3284 else
3285 #endif /* SUPPORT_UTF */
3286
3287 /* Not UTF mode */
3288 {
3289 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3290 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3291 eptr++;
3292 ecode += 2;
3293 }
3294 break;
3295
3296 /* Match a single character repeatedly. */
3297
3298 case OP_EXACT:
3299 case OP_EXACTI:
3300 min = max = GET2(ecode, 1);
3301 ecode += 1 + IMM2_SIZE;
3302 goto REPEATCHAR;
3303
3304 case OP_POSUPTO:
3305 case OP_POSUPTOI:
3306 possessive = TRUE;
3307 /* Fall through */
3308
3309 case OP_UPTO:
3310 case OP_UPTOI:
3311 case OP_MINUPTO:
3312 case OP_MINUPTOI:
3313 min = 0;
3314 max = GET2(ecode, 1);
3315 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3316 ecode += 1 + IMM2_SIZE;
3317 goto REPEATCHAR;
3318
3319 case OP_POSSTAR:
3320 case OP_POSSTARI:
3321 possessive = TRUE;
3322 min = 0;
3323 max = INT_MAX;
3324 ecode++;
3325 goto REPEATCHAR;
3326
3327 case OP_POSPLUS:
3328 case OP_POSPLUSI:
3329 possessive = TRUE;
3330 min = 1;
3331 max = INT_MAX;
3332 ecode++;
3333 goto REPEATCHAR;
3334
3335 case OP_POSQUERY:
3336 case OP_POSQUERYI:
3337 possessive = TRUE;
3338 min = 0;
3339 max = 1;
3340 ecode++;
3341 goto REPEATCHAR;
3342
3343 case OP_STAR:
3344 case OP_STARI:
3345 case OP_MINSTAR:
3346 case OP_MINSTARI:
3347 case OP_PLUS:
3348 case OP_PLUSI:
3349 case OP_MINPLUS:
3350 case OP_MINPLUSI:
3351 case OP_QUERY:
3352 case OP_QUERYI:
3353 case OP_MINQUERY:
3354 case OP_MINQUERYI:
3355 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3356 minimize = (c & 1) != 0;
3357 min = rep_min[c]; /* Pick up values from tables; */
3358 max = rep_max[c]; /* zero for max => infinity */
3359 if (max == 0) max = INT_MAX;
3360
3361 /* Common code for all repeated single-character matches. We first check
3362 for the minimum number of characters. If the minimum equals the maximum, we
3363 are done. Otherwise, if minimizing, check the rest of the pattern for a
3364 match; if there isn't one, advance up to the maximum, one character at a
3365 time.
3366
3367 If maximizing, advance up to the maximum number of matching characters,
3368 until eptr is past the end of the maximum run. If possessive, we are
3369 then done (no backing up). Otherwise, match at this position; anything
3370 other than no match is immediately returned. For nomatch, back up one
3371 character, unless we are matching \R and the last thing matched was
3372 \r\n, in which case, back up two bytes. When we reach the first optional
3373 character position, we can save stack by doing a tail recurse.
3374
3375 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3376 for speed. */
3377
3378 REPEATCHAR:
3379 #ifdef SUPPORT_UTF
3380 if (utf)
3381 {
3382 length = 1;
3383 charptr = ecode;
3384 GETCHARLEN(fc, ecode, length);
3385 ecode += length;
3386
3387 /* Handle multibyte character matching specially here. There is
3388 support for caseless matching if UCP support is present. */
3389
3390 if (length > 1)
3391 {
3392 #ifdef SUPPORT_UCP
3393 pcre_uint32 othercase;
3394 if (op >= OP_STARI && /* Caseless */
3395 (othercase = UCD_OTHERCASE(fc)) != fc)
3396 oclength = PRIV(ord2utf)(othercase, occhars);
3397 else oclength = 0;
3398 #endif /* SUPPORT_UCP */
3399
3400 for (i = 1; i <= min; i++)
3401 {
3402 if (eptr <= md->end_subject - length &&
3403 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3404 #ifdef SUPPORT_UCP
3405 else if (oclength > 0 &&
3406 eptr <= md->end_subject - oclength &&
3407 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3408 #endif /* SUPPORT_UCP */
3409 else
3410 {
3411 CHECK_PARTIAL();
3412 RRETURN(MATCH_NOMATCH);
3413 }
3414 }
3415
3416 if (min == max) continue;
3417
3418 if (minimize)
3419 {
3420 for (fi = min;; fi++)
3421 {
3422 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3423 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3424 if (fi >= max) RRETURN(MATCH_NOMATCH);
3425 if (eptr <= md->end_subject - length &&
3426 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3427 #ifdef SUPPORT_UCP
3428 else if (oclength > 0 &&
3429 eptr <= md->end_subject - oclength &&
3430 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3431 #endif /* SUPPORT_UCP */
3432 else
3433 {
3434 CHECK_PARTIAL();
3435 RRETURN(MATCH_NOMATCH);
3436 }
3437 }
3438 /* Control never gets here */
3439 }
3440
3441 else /* Maximize */
3442 {
3443 pp = eptr;
3444 for (i = min; i < max; i++)
3445 {
3446 if (eptr <= md->end_subject - length &&
3447 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3448 #ifdef SUPPORT_UCP
3449 else if (oclength > 0 &&
3450 eptr <= md->end_subject - oclength &&
3451 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3452 #endif /* SUPPORT_UCP */
3453 else
3454 {
3455 CHECK_PARTIAL();
3456 break;
3457 }
3458 }
3459
3460 if (possessive) continue; /* No backtracking */
3461 for(;;)
3462 {
3463 if (eptr == pp) goto TAIL_RECURSE;
3464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3466 #ifdef SUPPORT_UCP
3467 eptr--;
3468 BACKCHAR(eptr);
3469 #else /* without SUPPORT_UCP */
3470 eptr -= length;
3471 #endif /* SUPPORT_UCP */
3472 }
3473 }
3474 /* Control never gets here */
3475 }
3476
3477 /* If the length of a UTF-8 character is 1, we fall through here, and
3478 obey the code as for non-UTF-8 characters below, though in this case the
3479 value of fc will always be < 128. */
3480 }
3481 else
3482 #endif /* SUPPORT_UTF */
3483 /* When not in UTF-8 mode, load a single-byte character. */
3484 fc = *ecode++;
3485
3486 /* The value of fc at this point is always one character, though we may
3487 or may not be in UTF mode. The code is duplicated for the caseless and
3488 caseful cases, for speed, since matching characters is likely to be quite
3489 common. First, ensure the minimum number of matches are present. If min =
3490 max, continue at the same level without recursing. Otherwise, if
3491 minimizing, keep trying the rest of the expression and advancing one
3492 matching character if failing, up to the maximum. Alternatively, if
3493 maximizing, find the maximum number of characters and work backwards. */
3494
3495 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3496 max, (char *)eptr));
3497
3498 if (op >= OP_STARI) /* Caseless */
3499 {
3500 #ifdef COMPILE_PCRE8
3501 /* fc must be < 128 if UTF is enabled. */
3502 foc = md->fcc[fc];
3503 #else
3504 #ifdef SUPPORT_UTF
3505 #ifdef SUPPORT_UCP
3506 if (utf && fc > 127)
3507 foc = UCD_OTHERCASE(fc);
3508 #else
3509 if (utf && fc > 127)
3510 foc = fc;
3511 #endif /* SUPPORT_UCP */
3512 else
3513 #endif /* SUPPORT_UTF */
3514 foc = TABLE_GET(fc, md->fcc, fc);
3515 #endif /* COMPILE_PCRE8 */
3516
3517 for (i = 1; i <= min; i++)
3518 {
3519 pcre_uint32 cc; /* Faster than pcre_uchar */
3520 if (eptr >= md->end_subject)
3521 {
3522 SCHECK_PARTIAL();
3523 RRETURN(MATCH_NOMATCH);
3524 }
3525 cc = RAWUCHARTEST(eptr);
3526 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3527 eptr++;
3528 }
3529 if (min == max) continue;
3530 if (minimize)
3531 {
3532 for (fi = min;; fi++)
3533 {
3534 pcre_uint32 cc; /* Faster than pcre_uchar */
3535 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3536 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3537 if (fi >= max) RRETURN(MATCH_NOMATCH);
3538 if (eptr >= md->end_subject)
3539 {
3540 SCHECK_PARTIAL();
3541 RRETURN(MATCH_NOMATCH);
3542 }
3543 cc = RAWUCHARTEST(eptr);
3544 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3545 eptr++;
3546 }
3547 /* Control never gets here */
3548 }
3549 else /* Maximize */
3550 {
3551 pp = eptr;
3552 for (i = min; i < max; i++)
3553 {
3554 pcre_uint32 cc; /* Faster than pcre_uchar */
3555 if (eptr >= md->end_subject)
3556 {
3557 SCHECK_PARTIAL();
3558 break;
3559 }
3560 cc = RAWUCHARTEST(eptr);
3561 if (fc != cc && foc != cc) break;
3562 eptr++;
3563 }
3564 if (possessive) continue; /* No backtracking */
3565 for (;;)
3566 {
3567 if (eptr == pp) goto TAIL_RECURSE;
3568 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3569 eptr--;
3570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3571 }
3572 /* Control never gets here */
3573 }
3574 }
3575
3576 /* Caseful comparisons (includes all multi-byte characters) */
3577
3578 else
3579 {
3580 for (i = 1; i <= min; i++)
3581 {
3582 if (eptr >= md->end_subject)
3583 {
3584 SCHECK_PARTIAL();
3585 RRETURN(MATCH_NOMATCH);
3586 }
3587 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3588 }
3589
3590 if (min == max) continue;
3591
3592 if (minimize)
3593 {
3594 for (fi = min;; fi++)
3595 {
3596 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3598 if (fi >= max) RRETURN(MATCH_NOMATCH);
3599 if (eptr >= md->end_subject)
3600 {
3601 SCHECK_PARTIAL();
3602 RRETURN(MATCH_NOMATCH);
3603 }
3604 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3605 }
3606 /* Control never gets here */
3607 }
3608 else /* Maximize */
3609 {
3610 pp = eptr;
3611 for (i = min; i < max; i++)
3612 {
3613 if (eptr >= md->end_subject)
3614 {
3615 SCHECK_PARTIAL();
3616 break;
3617 }
3618 if (fc != RAWUCHARTEST(eptr)) break;
3619 eptr++;
3620 }
3621 if (possessive) continue; /* No backtracking */
3622 for (;;)
3623 {
3624 if (eptr == pp) goto TAIL_RECURSE;
3625 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3626 eptr--;
3627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3628 }
3629 /* Control never gets here */
3630 }
3631 }
3632 /* Control never gets here */
3633
3634 /* Match a negated single one-byte character. The character we are
3635 checking can be multibyte. */
3636
3637 case OP_NOT:
3638 case OP_NOTI:
3639 if (eptr >= md->end_subject)
3640 {
3641 SCHECK_PARTIAL();
3642 RRETURN(MATCH_NOMATCH);
3643 }
3644 #ifdef SUPPORT_UTF
3645 if (utf)
3646 {
3647 register pcre_uint32 ch, och;
3648
3649 ecode++;
3650 GETCHARINC(ch, ecode);
3651 GETCHARINC(c, eptr);
3652
3653 if (op == OP_NOT)
3654 {
3655 if (ch == c) RRETURN(MATCH_NOMATCH);
3656 }
3657 else
3658 {
3659 #ifdef SUPPORT_UCP
3660 if (ch > 127)
3661 och = UCD_OTHERCASE(ch);
3662 #else
3663 if (ch > 127)
3664 och = ch;
3665 #endif /* SUPPORT_UCP */
3666 else
3667 och = TABLE_GET(ch, md->fcc, ch);
3668 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3669 }
3670 }
3671 else
3672 #endif
3673 {
3674 register pcre_uint32 ch = ecode[1];
3675 c = *eptr++;
3676 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3677 RRETURN(MATCH_NOMATCH);
3678 ecode += 2;
3679 }
3680 break;
3681
3682 /* Match a negated single one-byte character repeatedly. This is almost a
3683 repeat of the code for a repeated single character, but I haven't found a
3684 nice way of commoning these up that doesn't require a test of the
3685 positive/negative option for each character match. Maybe that wouldn't add
3686 very much to the time taken, but character matching *is* what this is all
3687 about... */
3688
3689 case OP_NOTEXACT:
3690 case OP_NOTEXACTI:
3691 min = max = GET2(ecode, 1);
3692 ecode += 1 + IMM2_SIZE;
3693 goto REPEATNOTCHAR;
3694
3695 case OP_NOTUPTO:
3696 case OP_NOTUPTOI:
3697 case OP_NOTMINUPTO:
3698 case OP_NOTMINUPTOI:
3699 min = 0;
3700 max = GET2(ecode, 1);
3701 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3702 ecode += 1 + IMM2_SIZE;
3703 goto REPEATNOTCHAR;
3704
3705 case OP_NOTPOSSTAR:
3706 case OP_NOTPOSSTARI:
3707 possessive = TRUE;
3708 min = 0;
3709 max = INT_MAX;
3710 ecode++;
3711 goto REPEATNOTCHAR;
3712
3713 case OP_NOTPOSPLUS:
3714 case OP_NOTPOSPLUSI:
3715 possessive = TRUE;
3716 min = 1;
3717 max = INT_MAX;
3718 ecode++;
3719 goto REPEATNOTCHAR;
3720
3721 case OP_NOTPOSQUERY:
3722 case OP_NOTPOSQUERYI:
3723 possessive = TRUE;
3724 min = 0;
3725 max = 1;
3726 ecode++;
3727 goto REPEATNOTCHAR;
3728
3729 case OP_NOTPOSUPTO:
3730 case OP_NOTPOSUPTOI:
3731 possessive = TRUE;
3732 min = 0;
3733 max = GET2(ecode, 1);
3734 ecode += 1 + IMM2_SIZE;
3735 goto REPEATNOTCHAR;
3736
3737 case OP_NOTSTAR:
3738 case OP_NOTSTARI:
3739 case OP_NOTMINSTAR:
3740 case OP_NOTMINSTARI:
3741 case OP_NOTPLUS:
3742 case OP_NOTPLUSI:
3743 case OP_NOTMINPLUS:
3744 case OP_NOTMINPLUSI:
3745 case OP_NOTQUERY:
3746 case OP_NOTQUERYI:
3747 case OP_NOTMINQUERY:
3748 case OP_NOTMINQUERYI:
3749 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3750 minimize = (c & 1) != 0;
3751 min = rep_min[c]; /* Pick up values from tables; */
3752 max = rep_max[c]; /* zero for max => infinity */
3753 if (max == 0) max = INT_MAX;
3754
3755 /* Common code for all repeated single-byte matches. */
3756
3757 REPEATNOTCHAR:
3758 GETCHARINCTEST(fc, ecode);
3759
3760 /* The code is duplicated for the caseless and caseful cases, for speed,
3761 since matching characters is likely to be quite common. First, ensure the
3762 minimum number of matches are present. If min = max, continue at the same
3763 level without recursing. Otherwise, if minimizing, keep trying the rest of
3764 the expression and advancing one matching character if failing, up to the
3765 maximum. Alternatively, if maximizing, find the maximum number of
3766 characters and work backwards. */
3767
3768 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3769 max, (char *)eptr));
3770
3771 if (op >= OP_NOTSTARI) /* Caseless */
3772 {
3773 #ifdef SUPPORT_UTF
3774 #ifdef SUPPORT_UCP
3775 if (utf && fc > 127)
3776 foc = UCD_OTHERCASE(fc);
3777 #else
3778 if (utf && fc > 127)
3779 foc = fc;
3780 #endif /* SUPPORT_UCP */
3781 else
3782 #endif /* SUPPORT_UTF */
3783 foc = TABLE_GET(fc, md->fcc, fc);
3784
3785 #ifdef SUPPORT_UTF
3786 if (utf)
3787 {
3788 register pcre_uint32 d;
3789 for (i = 1; i <= min; i++)
3790 {
3791 if (eptr >= md->end_subject)
3792 {
3793 SCHECK_PARTIAL();
3794 RRETURN(MATCH_NOMATCH);
3795 }
3796 GETCHARINC(d, eptr);
3797 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3798 }
3799 }
3800 else
3801 #endif /* SUPPORT_UTF */
3802 /* Not UTF mode */
3803 {
3804 for (i = 1; i <= min; i++)
3805 {
3806 if (eptr >= md->end_subject)
3807 {
3808 SCHECK_PARTIAL();
3809 RRETURN(MATCH_NOMATCH);
3810 }
3811 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3812 eptr++;
3813 }
3814 }
3815
3816 if (min == max) continue;
3817
3818 if (minimize)
3819 {
3820 #ifdef SUPPORT_UTF
3821 if (utf)
3822 {
3823 register pcre_uint32 d;
3824 for (fi = min;; fi++)
3825 {
3826 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3828 if (fi >= max) RRETURN(MATCH_NOMATCH);
3829 if (eptr >= md->end_subject)
3830 {
3831 SCHECK_PARTIAL();
3832 RRETURN(MATCH_NOMATCH);
3833 }
3834 GETCHARINC(d, eptr);
3835 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3836 }
3837 }
3838 else
3839 #endif /*SUPPORT_UTF */
3840 /* Not UTF mode */
3841 {
3842 for (fi = min;; fi++)
3843 {
3844 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3846 if (fi >= max) RRETURN(MATCH_NOMATCH);
3847 if (eptr >= md->end_subject)
3848 {
3849 SCHECK_PARTIAL();
3850 RRETURN(MATCH_NOMATCH);
3851 }
3852 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3853 eptr++;
3854 }
3855 }
3856 /* Control never gets here */
3857 }
3858
3859 /* Maximize case */
3860
3861 else
3862 {
3863 pp = eptr;
3864
3865 #ifdef SUPPORT_UTF
3866 if (utf)
3867 {
3868 register pcre_uint32 d;
3869 for (i = min; i < max; i++)
3870 {
3871 int len = 1;
3872 if (eptr >= md->end_subject)
3873 {
3874 SCHECK_PARTIAL();
3875 break;
3876 }
3877 GETCHARLEN(d, eptr, len);
3878 if (fc == d || (unsigned int)foc == d) break;
3879 eptr += len;
3880 }
3881 if (possessive) continue; /* No backtracking */
3882 for(;;)
3883 {
3884 if (eptr == pp) goto TAIL_RECURSE;
3885 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3887 eptr--;
3888 BACKCHAR(eptr);
3889 }
3890 }
3891 else
3892 #endif /* SUPPORT_UTF */
3893 /* Not UTF mode */
3894 {
3895 for (i = min; i < max; i++)
3896 {
3897 if (eptr >= md->end_subject)
3898 {
3899 SCHECK_PARTIAL();
3900 break;
3901 }
3902 if (fc == *eptr || foc == *eptr) break;
3903 eptr++;
3904 }
3905 if (possessive) continue; /* No backtracking */
3906 for (;;)
3907 {
3908 if (eptr == pp) goto TAIL_RECURSE;
3909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3911 eptr--;
3912 }
3913 }
3914 /* Control never gets here */
3915 }
3916 }
3917
3918 /* Caseful comparisons */
3919
3920 else
3921 {
3922 #ifdef SUPPORT_UTF
3923 if (utf)
3924 {
3925 register pcre_uint32 d;
3926 for (i = 1; i <= min; i++)
3927 {
3928 if (eptr >= md->end_subject)
3929 {
3930 SCHECK_PARTIAL();
3931 RRETURN(MATCH_NOMATCH);
3932 }
3933 GETCHARINC(d, eptr);
3934 if (fc == d) RRETURN(MATCH_NOMATCH);
3935 }
3936 }
3937 else
3938 #endif
3939 /* Not UTF mode */
3940 {
3941 for (i = 1; i <= min; i++)
3942 {
3943 if (eptr >= md->end_subject)
3944 {
3945 SCHECK_PARTIAL();
3946 RRETURN(MATCH_NOMATCH);
3947 }
3948 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3949 }
3950 }
3951
3952 if (min == max) continue;
3953
3954 if (minimize)
3955 {
3956 #ifdef SUPPORT_UTF
3957 if (utf)
3958 {
3959 register pcre_uint32 d;
3960 for (fi = min;; fi++)
3961 {
3962 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3964 if (fi >= max) RRETURN(MATCH_NOMATCH);
3965 if (eptr >= md->end_subject)
3966 {
3967 SCHECK_PARTIAL();
3968 RRETURN(MATCH_NOMATCH);
3969 }
3970 GETCHARINC(d, eptr);
3971 if (fc == d) RRETURN(MATCH_NOMATCH);
3972 }
3973 }
3974 else
3975 #endif
3976 /* Not UTF mode */
3977 {
3978 for (fi = min;; fi++)
3979 {
3980 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3982 if (fi >= max) RRETURN(MATCH_NOMATCH);
3983 if (eptr >= md->end_subject)
3984 {
3985 SCHECK_PARTIAL();
3986 RRETURN(MATCH_NOMATCH);
3987 }
3988 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3989 }
3990 }
3991 /* Control never gets here */
3992 }
3993
3994 /* Maximize case */
3995
3996 else
3997 {
3998 pp = eptr;
3999
4000 #ifdef SUPPORT_UTF
4001 if (utf)
4002 {
4003 register pcre_uint32 d;
4004 for (i = min; i < max; i++)
4005 {
4006 int len = 1;
4007 if (eptr >= md->end_subject)
4008 {
4009 SCHECK_PARTIAL();
4010 break;
4011 }
4012 GETCHARLEN(d, eptr, len);
4013 if (fc == d) break;
4014 eptr += len;
4015 }
4016 if (possessive) continue; /* No backtracking */
4017 for(;;)
4018 {
4019 if (eptr == pp) goto TAIL_RECURSE;
4020 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4022 eptr--;
4023 BACKCHAR(eptr);
4024 }
4025 }
4026 else
4027 #endif
4028 /* Not UTF mode */
4029 {
4030 for (i = min; i < max; i++)
4031 {
4032 if (eptr >= md->end_subject)
4033 {
4034 SCHECK_PARTIAL();
4035 break;
4036 }
4037 if (fc == *eptr) break;
4038 eptr++;
4039 }
4040 if (possessive) continue; /* No backtracking */
4041 for (;;)
4042 {
4043 if (eptr == pp) goto TAIL_RECURSE;
4044 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4046 eptr--;
4047 }
4048 }
4049 /* Control never gets here */
4050 }
4051 }
4052 /* Control never gets here */
4053
4054 /* Match a single character type repeatedly; several different opcodes
4055 share code. This is very similar to the code for single characters, but we
4056 repeat it in the interests of efficiency. */
4057
4058 case OP_TYPEEXACT:
4059 min = max = GET2(ecode, 1);
4060 minimize = TRUE;
4061 ecode += 1 + IMM2_SIZE;
4062 goto REPEATTYPE;
4063
4064 case OP_TYPEUPTO:
4065 case OP_TYPEMINUPTO:
4066 min = 0;
4067 max = GET2(ecode, 1);
4068 minimize = *ecode == OP_TYPEMINUPTO;
4069 ecode += 1 + IMM2_SIZE;
4070 goto REPEATTYPE;
4071
4072 case OP_TYPEPOSSTAR:
4073 possessive = TRUE;
4074 min = 0;
4075 max = INT_MAX;
4076 ecode++;
4077 goto REPEATTYPE;
4078
4079 case OP_TYPEPOSPLUS:
4080 possessive = TRUE;
4081 min = 1;
4082 max = INT_MAX;
4083 ecode++;
4084 goto REPEATTYPE;
4085
4086 case OP_TYPEPOSQUERY:
4087 possessive = TRUE;
4088 min = 0;
4089 max = 1;
4090 ecode++;
4091 goto REPEATTYPE;
4092
4093 case OP_TYPEPOSUPTO:
4094 possessive = TRUE;
4095 min = 0;
4096 max = GET2(ecode, 1);
4097 ecode += 1 + IMM2_SIZE;
4098 goto REPEATTYPE;
4099
4100 case OP_TYPESTAR:
4101 case OP_TYPEMINSTAR:
4102 case OP_TYPEPLUS:
4103 case OP_TYPEMINPLUS:
4104 case OP_TYPEQUERY:
4105 case OP_TYPEMINQUERY:
4106 c = *ecode++ - OP_TYPESTAR;
4107 minimize = (c & 1) != 0;
4108 min = rep_min[c]; /* Pick up values from tables; */
4109 max = rep_max[c]; /* zero for max => infinity */
4110 if (max == 0) max = INT_MAX;
4111
4112 /* Common code for all repeated single character type matches. Note that
4113 in UTF-8 mode, '.' matches a character of any length, but for the other
4114 character types, the valid characters are all one-byte long. */
4115
4116 REPEATTYPE:
4117 ctype = *ecode++; /* Code for the character type */
4118
4119 #ifdef SUPPORT_UCP
4120 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4121 {
4122 prop_fail_result = ctype == OP_NOTPROP;
4123 prop_type = *ecode++;
4124 prop_value = *ecode++;
4125 }
4126 else prop_type = -1;
4127 #endif
4128
4129 /* First, ensure the minimum number of matches are present. Use inline
4130 code for maximizing the speed, and do the type test once at the start
4131 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4132 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4133 and single-bytes. */
4134
4135 if (min > 0)
4136 {
4137 #ifdef SUPPORT_UCP
4138 if (prop_type >= 0)
4139 {
4140 switch(prop_type)
4141 {
4142 case PT_ANY:
4143 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4144 for (i = 1; i <= min; i++)
4145 {
4146 if (eptr >= md->end_subject)
4147 {
4148 SCHECK_PARTIAL();
4149 RRETURN(MATCH_NOMATCH);
4150 }
4151 GETCHARINCTEST(c, eptr);
4152 }
4153 break;
4154
4155 case PT_LAMP:
4156 for (i = 1; i <= min; i++)
4157 {
4158 int chartype;
4159 if (eptr >= md->end_subject)
4160 {
4161 SCHECK_PARTIAL();
4162 RRETURN(MATCH_NOMATCH);
4163 }
4164 GETCHARINCTEST(c, eptr);
4165 chartype = UCD_CHARTYPE(c);
4166 if ((chartype == ucp_Lu ||
4167 chartype == ucp_Ll ||
4168 chartype == ucp_Lt) == prop_fail_result)
4169 RRETURN(MATCH_NOMATCH);
4170 }
4171 break;
4172
4173 case PT_GC:
4174 for (i = 1; i <= min; i++)
4175 {
4176 if (eptr >= md->end_subject)
4177 {
4178 SCHECK_PARTIAL();
4179 RRETURN(MATCH_NOMATCH);
4180 }
4181 GETCHARINCTEST(c, eptr);
4182 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 break;
4186
4187 case PT_PC:
4188 for (i = 1; i <= min; i++)
4189 {
4190 if (eptr >= md->end_subject)
4191 {
4192 SCHECK_PARTIAL();
4193 RRETURN(MATCH_NOMATCH);
4194 }
4195 GETCHARINCTEST(c, eptr);
4196 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4197 RRETURN(MATCH_NOMATCH);
4198 }
4199 break;
4200
4201 case PT_SC:
4202 for (i = 1; i <= min; i++)
4203 {
4204 if (eptr >= md->end_subject)
4205 {
4206 SCHECK_PARTIAL();
4207 RRETURN(MATCH_NOMATCH);
4208 }
4209 GETCHARINCTEST(c, eptr);
4210 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4211 RRETURN(MATCH_NOMATCH);
4212 }
4213 break;
4214
4215 case PT_ALNUM:
4216 for (i = 1; i <= min; i++)
4217 {
4218 int category;
4219 if (eptr >= md->end_subject)
4220 {
4221 SCHECK_PARTIAL();
4222 RRETURN(MATCH_NOMATCH);
4223 }
4224 GETCHARINCTEST(c, eptr);
4225 category = UCD_CATEGORY(c);
4226 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4227 RRETURN(MATCH_NOMATCH);
4228 }
4229 break;
4230
4231 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4232 which means that Perl space and POSIX space are now identical. PCRE
4233 was changed at release 8.34. */
4234
4235 case PT_SPACE: /* Perl space */
4236 case PT_PXSPACE: /* POSIX space */
4237 for (i = 1; i <= min; i++)
4238 {
4239 if (eptr >= md->end_subject)
4240 {
4241 SCHECK_PARTIAL();
4242 RRETURN(MATCH_NOMATCH);
4243 }
4244 GETCHARINCTEST(c, eptr);
4245 switch(c)
4246 {
4247 HSPACE_CASES:
4248 VSPACE_CASES:
4249 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4250 break;
4251
4252 default:
4253 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4254 RRETURN(MATCH_NOMATCH);
4255 break;
4256 }
4257 }
4258 break;
4259
4260 case PT_WORD:
4261 for (i = 1; i <= min; i++)
4262 {
4263 int category;
4264 if (eptr >= md->end_subject)
4265 {
4266 SCHECK_PARTIAL();
4267 RRETURN(MATCH_NOMATCH);
4268 }
4269 GETCHARINCTEST(c, eptr);
4270 category = UCD_CATEGORY(c);
4271 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4272 == prop_fail_result)
4273 RRETURN(MATCH_NOMATCH);
4274 }
4275 break;
4276
4277 case PT_CLIST:
4278 for (i = 1; i <= min; i++)
4279 {
4280 const pcre_uint32 *cp;
4281 if (eptr >= md->end_subject)
4282 {
4283 SCHECK_PARTIAL();
4284 RRETURN(MATCH_NOMATCH);
4285 }
4286 GETCHARINCTEST(c, eptr);
4287 cp = PRIV(ucd_caseless_sets) + prop_value;
4288 for (;;)
4289 {
4290 if (c < *cp)
4291 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4292 if (c == *cp++)
4293 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4294 }
4295 }
4296 break;
4297
4298 case PT_UCNC:
4299 for (i = 1; i <= min; i++)
4300 {
4301 if (eptr >= md->end_subject)
4302 {
4303 SCHECK_PARTIAL();
4304 RRETURN(MATCH_NOMATCH);
4305 }
4306 GETCHARINCTEST(c, eptr);
4307 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4308 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4309 c >= 0xe000) == prop_fail_result)
4310 RRETURN(MATCH_NOMATCH);
4311 }
4312 break;
4313
4314 /* This should not occur */
4315
4316 default:
4317 RRETURN(PCRE_ERROR_INTERNAL);
4318 }
4319 }
4320
4321 /* Match extended Unicode sequences. We will get here only if the
4322 support is in the binary; otherwise a compile-time error occurs. */
4323
4324 else if (ctype == OP_EXTUNI)
4325 {
4326 for (i = 1; i <= min; i++)
4327 {
4328 if (eptr >= md->end_subject)
4329 {
4330 SCHECK_PARTIAL();
4331 RRETURN(MATCH_NOMATCH);
4332 }
4333 else
4334 {
4335 int lgb, rgb;
4336 GETCHARINCTEST(c, eptr);
4337 lgb = UCD_GRAPHBREAK(c);
4338 while (eptr < md->end_subject)
4339 {
4340 int len = 1;
4341 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4342 rgb = UCD_GRAPHBREAK(c);
4343 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4344 lgb = rgb;
4345 eptr += len;
4346 }
4347 }
4348 CHECK_PARTIAL();
4349 }
4350 }
4351
4352 else
4353 #endif /* SUPPORT_UCP */
4354
4355 /* Handle all other cases when the coding is UTF-8/16/32 */
4356
4357 #ifdef SUPPORT_UTF
4358 if (utf) switch(ctype)
4359 {
4360 case OP_ANY:
4361 for (i = 1; i <= min; i++)
4362 {
4363 if (eptr >= md->end_subject)
4364 {
4365 SCHECK_PARTIAL();
4366 RRETURN(MATCH_NOMATCH);
4367 }
4368 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4369 if (md->partial != 0 &&
4370 eptr + 1 >= md->end_subject &&
4371 NLBLOCK->nltype == NLTYPE_FIXED &&
4372 NLBLOCK->nllen == 2 &&
4373 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4374 {
4375 md->hitend = TRUE;
4376 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4377 }
4378 eptr++;
4379 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4380 }
4381 break;
4382
4383 case OP_ALLANY:
4384 for (i = 1; i <= min; i++)
4385 {
4386 if (eptr >= md->end_subject)
4387 {
4388 SCHECK_PARTIAL();
4389 RRETURN(MATCH_NOMATCH);
4390 }
4391 eptr++;
4392 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4393 }
4394 break;
4395
4396 case OP_ANYBYTE:
4397 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4398 eptr += min;
4399 break;
4400
4401 case OP_ANYNL:
4402 for (i = 1; i <= min; i++)
4403 {
4404 if (eptr >= md->end_subject)
4405 {
4406 SCHECK_PARTIAL();
4407 RRETURN(MATCH_NOMATCH);
4408 }
4409 GETCHARINC(c, eptr);
4410 switch(c)
4411 {
4412 default: RRETURN(MATCH_NOMATCH);
4413
4414 case CHAR_CR:
4415 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4416 break;
4417
4418 case CHAR_LF:
4419 break;
4420
4421 case CHAR_VT:
4422 case CHAR_FF:
4423 case CHAR_NEL:
4424 #ifndef EBCDIC
4425 case 0x2028:
4426 case 0x2029:
4427 #endif /* Not EBCDIC */
4428 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4429 break;
4430 }
4431 }
4432 break;
4433
4434 case OP_NOT_HSPACE:
4435 for (i = 1; i <= min; i++)
4436 {
4437 if (eptr >= md->end_subject)
4438 {
4439 SCHECK_PARTIAL();
4440 RRETURN(MATCH_NOMATCH);
4441 }
4442 GETCHARINC(c, eptr);
4443 switch(c)
4444 {
4445 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4446 default: break;
4447 }
4448 }
4449 break;
4450
4451 case OP_HSPACE:
4452 for (i = 1; i <= min; i++)
4453 {
4454 if (eptr >= md->end_subject)
4455 {
4456 SCHECK_PARTIAL();
4457 RRETURN(MATCH_NOMATCH);
4458 }
4459 GETCHARINC(c, eptr);
4460 switch(c)
4461 {
4462 HSPACE_CASES: break; /* Byte and multibyte cases */
4463 default: RRETURN(MATCH_NOMATCH);
4464 }
4465 }
4466 break;
4467
4468 case OP_NOT_VSPACE:
4469 for (i = 1; i <= min; i++)
4470 {
4471 if (eptr >= md->end_subject)
4472 {
4473 SCHECK_PARTIAL();
4474 RRETURN(MATCH_NOMATCH);
4475 }
4476 GETCHARINC(c, eptr);
4477 switch(c)
4478 {
4479 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4480 default: break;
4481 }
4482 }
4483 break;
4484
4485 case OP_VSPACE:
4486 for (i = 1; i <= min; i++)
4487 {
4488 if (eptr >= md->end_subject)
4489 {
4490 SCHECK_PARTIAL();
4491 RRETURN(MATCH_NOMATCH);
4492 }
4493 GETCHARINC(c, eptr);
4494 switch(c)
4495 {
4496 VSPACE_CASES: break;
4497 default: RRETURN(MATCH_NOMATCH);
4498 }
4499 }
4500 break;
4501
4502 case OP_NOT_DIGIT:
4503 for (i = 1; i <= min; i++)
4504 {
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 RRETURN(MATCH_NOMATCH);
4509 }
4510 GETCHARINC(c, eptr);
4511 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4512 RRETURN(MATCH_NOMATCH);
4513 }
4514 break;
4515
4516 case OP_DIGIT:
4517 for (i = 1; i <= min; i++)
4518 {
4519 pcre_uint32 cc;
4520 if (eptr >= md->end_subject)
4521 {
4522 SCHECK_PARTIAL();
4523 RRETURN(MATCH_NOMATCH);
4524 }
4525 cc = RAWUCHAR(eptr);
4526 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4527 RRETURN(MATCH_NOMATCH);
4528 eptr++;
4529 /* No need to skip more bytes - we know it's a 1-byte character */
4530 }
4531 break;
4532
4533 case OP_NOT_WHITESPACE:
4534 for (i = 1; i <= min; i++)
4535 {
4536 pcre_uint32 cc;
4537 if (eptr >= md->end_subject)
4538 {
4539 SCHECK_PARTIAL();
4540 RRETURN(MATCH_NOMATCH);
4541 }
4542 cc = RAWUCHAR(eptr);
4543 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4544 RRETURN(MATCH_NOMATCH);
4545 eptr++;
4546 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4547 }
4548 break;
4549
4550 case OP_WHITESPACE:
4551 for (i = 1; i <= min; i++)
4552 {
4553 pcre_uint32 cc;
4554 if (eptr >= md->end_subject)
4555 {
4556 SCHECK_PARTIAL();
4557 RRETURN(MATCH_NOMATCH);
4558 }
4559 cc = RAWUCHAR(eptr);
4560 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4561 RRETURN(MATCH_NOMATCH);
4562 eptr++;
4563 /* No need to skip more bytes - we know it's a 1-byte character */
4564 }
4565 break;
4566
4567 case OP_NOT_WORDCHAR:
4568 for (i = 1; i <= min; i++)
4569 {
4570 pcre_uint32 cc;
4571 if (eptr >= md->end_subject)
4572 {
4573 SCHECK_PARTIAL();
4574 RRETURN(MATCH_NOMATCH);
4575 }
4576 cc = RAWUCHAR(eptr);
4577 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4578 RRETURN(MATCH_NOMATCH);
4579 eptr++;
4580 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4581 }
4582 break;
4583
4584 case OP_WORDCHAR:
4585 for (i = 1; i <= min; i++)
4586 {
4587 pcre_uint32 cc;
4588 if (eptr >= md->end_subject)
4589 {
4590 SCHECK_PARTIAL();
4591 RRETURN(MATCH_NOMATCH);
4592 }
4593 cc = RAWUCHAR(eptr);
4594 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4595 RRETURN(MATCH_NOMATCH);
4596 eptr++;
4597 /* No need to skip more bytes - we know it's a 1-byte character */
4598 }
4599 break;
4600
4601 default:
4602 RRETURN(PCRE_ERROR_INTERNAL);
4603 } /* End switch(ctype) */
4604
4605 else
4606 #endif /* SUPPORT_UTF */
4607
4608 /* Code for the non-UTF case for minimum matching of operators other
4609 than OP_PROP and OP_NOTPROP. */
4610
4611 switch(ctype)
4612 {
4613 case OP_ANY:
4614 for (i = 1; i <= min; i++)
4615 {
4616 if (eptr >= md->end_subject)
4617 {
4618 SCHECK_PARTIAL();
4619 RRETURN(MATCH_NOMATCH);
4620 }
4621 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4622 if (md->partial != 0 &&
4623 eptr + 1 >= md->end_subject &&
4624 NLBLOCK->nltype == NLTYPE_FIXED &&
4625 NLBLOCK->nllen == 2 &&
4626 *eptr == NLBLOCK->nl[0])
4627 {
4628 md->hitend = TRUE;
4629 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4630 }
4631 eptr++;
4632 }
4633 break;
4634
4635 case OP_ALLANY:
4636 if (eptr > md->end_subject - min)
4637 {
4638 SCHECK_PARTIAL();
4639 RRETURN(MATCH_NOMATCH);
4640 }
4641 eptr += min;
4642 break;
4643
4644 case OP_ANYBYTE:
4645 if (eptr > md->end_subject - min)
4646 {
4647 SCHECK_PARTIAL();
4648 RRETURN(MATCH_NOMATCH);
4649 }
4650 eptr += min;
4651 break;
4652
4653 case OP_ANYNL:
4654 for (i = 1; i <= min; i++)
4655 {
4656 if (eptr >= md->end_subject)
4657 {
4658 SCHECK_PARTIAL();
4659 RRETURN(MATCH_NOMATCH);
4660 }
4661 switch(*eptr++)
4662 {
4663 default: RRETURN(MATCH_NOMATCH);
4664
4665 case CHAR_CR:
4666 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4667 break;
4668
4669 case CHAR_LF:
4670 break;
4671
4672 case CHAR_VT:
4673 case CHAR_FF:
4674 case CHAR_NEL:
4675 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4676 case 0x2028:
4677 case 0x2029:
4678 #endif
4679 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4680 break;
4681 }
4682 }
4683 break;
4684
4685 case OP_NOT_HSPACE:
4686 for (i = 1; i <= min; i++)
4687 {
4688 if (eptr >= md->end_subject)
4689 {
4690 SCHECK_PARTIAL();
4691 RRETURN(MATCH_NOMATCH);
4692 }
4693 switch(*eptr++)
4694 {
4695 default: break;
4696 HSPACE_BYTE_CASES:
4697 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4698 HSPACE_MULTIBYTE_CASES:
4699 #endif
4700 RRETURN(MATCH_NOMATCH);
4701 }
4702 }
4703 break;
4704
4705 case OP_HSPACE:
4706 for (i = 1; i <= min; i++)
4707 {
4708 if (eptr >= md->end_subject)
4709 {
4710 SCHECK_PARTIAL();
4711 RRETURN(MATCH_NOMATCH);
4712 }
4713 switch(*eptr++)
4714 {
4715 default: RRETURN(MATCH_NOMATCH);
4716 HSPACE_BYTE_CASES:
4717 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4718 HSPACE_MULTIBYTE_CASES:
4719 #endif
4720 break;
4721 }
4722 }
4723 break;
4724
4725 case OP_NOT_VSPACE:
4726 for (i = 1; i <= min; i++)
4727 {
4728 if (eptr >= md->end_subject)
4729 {
4730 SCHECK_PARTIAL();
4731 RRETURN(MATCH_NOMATCH);
4732 }
4733 switch(*eptr++)
4734 {
4735 VSPACE_BYTE_CASES:
4736 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4737 VSPACE_MULTIBYTE_CASES:
4738 #endif
4739 RRETURN(MATCH_NOMATCH);
4740 default: break;
4741 }
4742 }
4743 break;
4744
4745 case OP_VSPACE:
4746 for (i = 1; i <= min; i++)
4747 {
4748 if (eptr >= md->end_subject)
4749 {
4750 SCHECK_PARTIAL();
4751 RRETURN(MATCH_NOMATCH);
4752 }
4753 switch(*eptr++)
4754 {
4755 default: RRETURN(MATCH_NOMATCH);
4756 VSPACE_BYTE_CASES:
4757 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4758 VSPACE_MULTIBYTE_CASES:
4759 #endif
4760 break;
4761 }
4762 }
4763 break;
4764
4765 case OP_NOT_DIGIT:
4766 for (i = 1; i <= min; i++)
4767 {
4768 if (eptr >= md->end_subject)
4769 {
4770 SCHECK_PARTIAL();
4771 RRETURN(MATCH_NOMATCH);
4772 }
4773 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4774 RRETURN(MATCH_NOMATCH);
4775 eptr++;
4776 }
4777 break;
4778
4779 case OP_DIGIT:
4780 for (i = 1; i <= min; i++)
4781 {
4782 if (eptr >= md->end_subject)
4783 {
4784 SCHECK_PARTIAL();
4785 RRETURN(MATCH_NOMATCH);
4786 }
4787 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4788 RRETURN(MATCH_NOMATCH);
4789 eptr++;
4790 }
4791 break;
4792
4793 case OP_NOT_WHITESPACE:
4794 for (i = 1; i <= min; i++)
4795 {
4796 if (eptr >= md->end_subject)
4797 {
4798 SCHECK_PARTIAL();
4799 RRETURN(MATCH_NOMATCH);
4800 }
4801 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4802 RRETURN(MATCH_NOMATCH);
4803 eptr++;
4804 }
4805 break;
4806
4807 case OP_WHITESPACE:
4808 for (i = 1; i <= min; i++)
4809 {
4810 if (eptr >= md->end_subject)
4811 {
4812 SCHECK_PARTIAL();
4813 RRETURN(MATCH_NOMATCH);
4814 }
4815 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4816 RRETURN(MATCH_NOMATCH);
4817 eptr++;
4818 }
4819 break;
4820
4821 case OP_NOT_WORDCHAR:
4822 for (i = 1; i <= min; i++)
4823 {
4824 if (eptr >= md->end_subject)
4825 {
4826 SCHECK_PARTIAL();
4827 RRETURN(MATCH_NOMATCH);
4828 }
4829 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4830 RRETURN(MATCH_NOMATCH);
4831 eptr++;
4832 }
4833 break;
4834
4835 case OP_WORDCHAR:
4836 for (i = 1; i <= min; i++)
4837 {
4838 if (eptr >= md->end_subject)
4839 {
4840 SCHECK_PARTIAL();
4841 RRETURN(MATCH_NOMATCH);
4842 }
4843 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4844 RRETURN(MATCH_NOMATCH);
4845 eptr++;
4846 }
4847 break;
4848
4849 default:
4850 RRETURN(PCRE_ERROR_INTERNAL);
4851 }
4852 }
4853
4854 /* If min = max, continue at the same level without recursing */
4855
4856 if (min == max) continue;
4857
4858 /* If minimizing, we have to test the rest of the pattern before each
4859 subsequent match. Again, separate the UTF-8 case for speed, and also
4860 separate the UCP cases. */
4861
4862 if (minimize)
4863 {
4864 #ifdef SUPPORT_UCP
4865 if (prop_type >= 0)
4866 {
4867 switch(prop_type)
4868 {
4869 case PT_ANY:
4870 for (fi = min;; fi++)
4871 {
4872 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4874 if (fi >= max) RRETURN(MATCH_NOMATCH);
4875 if (eptr >= md->end_subject)
4876 {
4877 SCHECK_PARTIAL();
4878 RRETURN(MATCH_NOMATCH);
4879 }
4880 GETCHARINCTEST(c, eptr);
4881 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4882 }
4883 /* Control never gets here */
4884
4885 case PT_LAMP:
4886 for (fi = min;; fi++)
4887 {
4888 int chartype;
4889 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4890 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4891 if (fi >= max) RRETURN(MATCH_NOMATCH);
4892 if (eptr >= md->end_subject)
4893 {
4894 SCHECK_PARTIAL();
4895 RRETURN(MATCH_NOMATCH);
4896 }
4897 GETCHARINCTEST(c, eptr);
4898 chartype = UCD_CHARTYPE(c);
4899 if ((chartype == ucp_Lu ||
4900 chartype == ucp_Ll ||
4901 chartype == ucp_Lt) == prop_fail_result)
4902 RRETURN(MATCH_NOMATCH);
4903 }
4904 /* Control never gets here */
4905
4906 case PT_GC:
4907 for (fi = min;; fi++)
4908 {
4909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4911 if (fi >= max) RRETURN(MATCH_NOMATCH);
4912 if (eptr >= md->end_subject)
4913 {
4914 SCHECK_PARTIAL();
4915 RRETURN(MATCH_NOMATCH);
4916 }
4917 GETCHARINCTEST(c, eptr);
4918 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4919 RRETURN(MATCH_NOMATCH);
4920 }
4921 /* Control never gets here */
4922
4923 case PT_PC:
4924 for (fi = min;; fi++)
4925 {
4926 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928 if (fi >= max) RRETURN(MATCH_NOMATCH);
4929 if (eptr >= md->end_subject)
4930 {
4931 SCHECK_PARTIAL();
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 GETCHARINCTEST(c, eptr);
4935 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4936 RRETURN(MATCH_NOMATCH);
4937 }
4938 /* Control never gets here */
4939
4940 case PT_SC:
4941 for (fi = min;; fi++)
4942 {
4943 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4945 if (fi >= max) RRETURN(MATCH_NOMATCH);
4946 if (eptr >= md->end_subject)
4947 {
4948 SCHECK_PARTIAL();
4949 RRETURN(MATCH_NOMATCH);
4950 }
4951 GETCHARINCTEST(c, eptr);
4952 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4953 RRETURN(MATCH_NOMATCH);
4954 }
4955 /* Control never gets here */
4956
4957 case PT_ALNUM:
4958 for (fi = min;; fi++)
4959 {
4960 int category;
4961 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4962 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4963 if (fi >= max) RRETURN(MATCH_NOMATCH);
4964 if (eptr >= md->end_subject)
4965 {
4966 SCHECK_PARTIAL();
4967 RRETURN(MATCH_NOMATCH);
4968 }
4969 GETCHARINCTEST(c, eptr);
4970 category = UCD_CATEGORY(c);
4971 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4972 RRETURN(MATCH_NOMATCH);
4973 }
4974 /* Control never gets here */
4975
4976 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4977 which means that Perl space and POSIX space are now identical. PCRE
4978 was changed at release 8.34. */
4979
4980 case PT_SPACE: /* Perl space */
4981 case PT_PXSPACE: /* POSIX space */
4982 for (fi = min;; fi++)
4983 {
4984 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4985 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4986 if (fi >= max) RRETURN(MATCH_NOMATCH);
4987 if (eptr >= md->end_subject)
4988 {
4989 SCHECK_PARTIAL();
4990 RRETURN(MATCH_NOMATCH);
4991 }
4992 GETCHARINCTEST(c, eptr);
4993 switch(c)
4994 {
4995 HSPACE_CASES:
4996 VSPACE_CASES:
4997 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4998 break;
4999
5000 default:
5001 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5002 RRETURN(MATCH_NOMATCH);
5003 break;
5004 }
5005 }
5006 /* Control never gets here */
5007
5008 case PT_WORD:
5009 for (fi = min;; fi++)
5010 {
5011 int category;
5012 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5013 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5014 if (fi >= max) RRETURN(MATCH_NOMATCH);
5015 if (eptr >= md->end_subject)
5016 {
5017 SCHECK_PARTIAL();
5018 RRETURN(MATCH_NOMATCH);
5019 }
5020 GETCHARINCTEST(c, eptr);
5021 category = UCD_CATEGORY(c);
5022 if ((category == ucp_L ||
5023 category == ucp_N ||
5024 c == CHAR_UNDERSCORE)
5025 == prop_fail_result)
5026 RRETURN(MATCH_NOMATCH);
5027 }
5028 /* Control never gets here */
5029
5030 case PT_CLIST:
5031 for (fi = min;; fi++)
5032 {
5033 const pcre_uint32 *cp;
5034 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5036 if (fi >= max) RRETURN(MATCH_NOMATCH);
5037 if (eptr >= md->end_subject)
5038 {
5039 SCHECK_PARTIAL();
5040 RRETURN(MATCH_NOMATCH);
5041 }
5042 GETCHARINCTEST(c, eptr);
5043 cp = PRIV(ucd_caseless_sets) + prop_value;
5044 for (;;)
5045 {
5046 if (c < *cp)
5047 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5048 if (c == *cp++)
5049 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5050 }
5051 }
5052 /* Control never gets here */
5053
5054 case PT_UCNC:
5055 for (fi = min;; fi++)
5056 {
5057 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5059 if (fi >= max) RRETURN(MATCH_NOMATCH);
5060 if (eptr >= md->end_subject)
5061 {
5062 SCHECK_PARTIAL();
5063 RRETURN(MATCH_NOMATCH);
5064 }
5065 GETCHARINCTEST(c, eptr);
5066 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5067 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5068 c >= 0xe000) == prop_fail_result)
5069 RRETURN(MATCH_NOMATCH);
5070 }
5071 /* Control never gets here */
5072
5073 /* This should never occur */
5074 default:
5075 RRETURN(PCRE_ERROR_INTERNAL);
5076 }
5077 }
5078
5079 /* Match extended Unicode sequences. We will get here only if the
5080 support is in the binary; otherwise a compile-time error occurs. */
5081
5082 else if (ctype == OP_EXTUNI)
5083 {
5084 for (fi = min;; fi++)
5085 {
5086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5088 if (fi >= max) RRETURN(MATCH_NOMATCH);
5089 if (eptr >= md->end_subject)
5090 {
5091 SCHECK_PARTIAL();
5092 RRETURN(MATCH_NOMATCH);
5093 }
5094 else
5095 {
5096 int lgb, rgb;
5097 GETCHARINCTEST(c, eptr);
5098 lgb = UCD_GRAPHBREAK(c);
5099 while (eptr < md->end_subject)
5100 {
5101 int len = 1;
5102 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5103 rgb = UCD_GRAPHBREAK(c);
5104 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5105 lgb = rgb;
5106 eptr += len;
5107 }
5108 }
5109 CHECK_PARTIAL();
5110 }
5111 }
5112 else
5113 #endif /* SUPPORT_UCP */
5114
5115 #ifdef SUPPORT_UTF
5116 if (utf)
5117 {
5118 for (fi = min;; fi++)
5119 {
5120 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5122 if (fi >= max) RRETURN(MATCH_NOMATCH);
5123 if (eptr >= md->end_subject)
5124 {
5125 SCHECK_PARTIAL();
5126 RRETURN(MATCH_NOMATCH);
5127 }
5128 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5129 RRETURN(MATCH_NOMATCH);
5130 GETCHARINC(c, eptr);
5131 switch(ctype)
5132 {
5133 case OP_ANY: /* This is the non-NL case */
5134 if (md->partial != 0 && /* Take care with CRLF partial */
5135 eptr >= md->end_subject &&
5136 NLBLOCK->nltype == NLTYPE_FIXED &&
5137 NLBLOCK->nllen == 2 &&
5138 c == NLBLOCK->nl[0])
5139 {
5140 md->hitend = TRUE;
5141 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5142 }
5143 break;
5144
5145 case OP_ALLANY:
5146 case OP_ANYBYTE:
5147 break;
5148
5149 case OP_ANYNL:
5150 switch(c)
5151 {
5152 default: RRETURN(MATCH_NOMATCH);
5153 case CHAR_CR:
5154 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5155 break;
5156
5157 case CHAR_LF:
5158 break;
5159
5160 case CHAR_VT:
5161 case CHAR_FF:
5162 case CHAR_NEL:
5163 #ifndef EBCDIC
5164 case 0x2028:
5165 case 0x2029:
5166 #endif /* Not EBCDIC */
5167 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5168 break;
5169 }
5170 break;
5171
5172 case OP_NOT_HSPACE:
5173 switch(c)
5174 {
5175 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5176 default: break;
5177 }
5178 break;
5179
5180 case OP_HSPACE:
5181 switch(c)
5182 {
5183 HSPACE_CASES: break;
5184 default: RRETURN(MATCH_NOMATCH);
5185 }
5186 break;
5187
5188 case OP_NOT_VSPACE:
5189 switch(c)
5190 {
5191 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5192 default: break;
5193 }
5194 break;
5195
5196 case OP_VSPACE:
5197 switch(c)
5198 {
5199 VSPACE_CASES: break;
5200 default: RRETURN(MATCH_NOMATCH);
5201 }
5202 break;
5203
5204 case OP_NOT_DIGIT:
5205 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5206 RRETURN(MATCH_NOMATCH);
5207 break;
5208
5209 case OP_DIGIT:
5210 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5211 RRETURN(MATCH_NOMATCH);
5212 break;
5213
5214 case OP_NOT_WHITESPACE:
5215 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5216 RRETURN(MATCH_NOMATCH);
5217 break;
5218
5219 case OP_WHITESPACE:
5220 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5221 RRETURN(MATCH_NOMATCH);
5222 break;
5223
5224 case OP_NOT_WORDCHAR:
5225 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5226 RRETURN(MATCH_NOMATCH);
5227 break;
5228
5229 case OP_WORDCHAR:
5230 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5231 RRETURN(MATCH_NOMATCH);
5232 break;
5233
5234 default:
5235 RRETURN(PCRE_ERROR_INTERNAL);
5236 }
5237 }
5238 }
5239 else
5240 #endif
5241 /* Not UTF mode */
5242 {
5243 for (fi = min;; fi++)
5244 {
5245 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5246 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5247 if (fi >= max) RRETURN(MATCH_NOMATCH);
5248 if (eptr >= md->end_subject)
5249 {
5250 SCHECK_PARTIAL();
5251 RRETURN(MATCH_NOMATCH);
5252 }
5253 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5254 RRETURN(MATCH_NOMATCH);
5255 c = *eptr++;
5256 switch(ctype)
5257 {
5258 case OP_ANY: /* This is the non-NL case */
5259 if (md->partial != 0 && /* Take care with CRLF partial */
5260 eptr >= md->end_subject &&
5261 NLBLOCK->nltype == NLTYPE_FIXED &&
5262 NLBLOCK->nllen == 2 &&
5263 c == NLBLOCK->nl[0])
5264 {
5265 md->hitend = TRUE;
5266 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5267 }
5268 break;
5269
5270 case OP_ALLANY:
5271 case OP_ANYBYTE:
5272 break;
5273
5274 case OP_ANYNL:
5275 switch(c)
5276 {
5277 default: RRETURN(MATCH_NOMATCH);
5278 case CHAR_CR:
5279 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5280 break;
5281
5282 case CHAR_LF:
5283 break;
5284
5285 case CHAR_VT:
5286 case CHAR_FF:
5287 case CHAR_NEL:
5288 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5289 case 0x2028:
5290 case 0x2029:
5291 #endif
5292 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5293 break;
5294 }
5295 break;
5296
5297 case OP_NOT_HSPACE:
5298 switch(c)
5299 {
5300 default: break;
5301 HSPACE_BYTE_CASES:
5302 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5303 HSPACE_MULTIBYTE_CASES:
5304 #endif
5305 RRETURN(MATCH_NOMATCH);
5306 }
5307 break;
5308
5309 case OP_HSPACE:
5310 switch(c)
5311 {
5312 default: RRETURN(MATCH_NOMATCH);
5313 HSPACE_BYTE_CASES:
5314 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5315 HSPACE_MULTIBYTE_CASES:
5316 #endif
5317 break;
5318 }
5319 break;
5320
5321 case OP_NOT_VSPACE:
5322 switch(c)
5323 {
5324 default: break;
5325 VSPACE_BYTE_CASES:
5326 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5327 VSPACE_MULTIBYTE_CASES:
5328 #endif
5329 RRETURN(MATCH_NOMATCH);
5330 }
5331 break;
5332
5333 case OP_VSPACE:
5334 switch(c)
5335 {
5336 default: RRETURN(MATCH_NOMATCH);
5337 VSPACE_BYTE_CASES:
5338 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5339 VSPACE_MULTIBYTE_CASES:
5340 #endif
5341 break;
5342 }
5343 break;
5344
5345 case OP_NOT_DIGIT:
5346 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5347 break;
5348
5349 case OP_DIGIT:
5350 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5351 break;
5352
5353 case OP_NOT_WHITESPACE:
5354 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5355 break;
5356
5357 case OP_WHITESPACE:
5358 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5359 break;
5360
5361 case OP_NOT_WORDCHAR:
5362 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5363 break;
5364
5365 case OP_WORDCHAR:
5366 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5367 break;
5368
5369 default:
5370 RRETURN(PCRE_ERROR_INTERNAL);
5371 }
5372 }
5373 }
5374 /* Control never gets here */
5375 }
5376
5377 /* If maximizing, it is worth using inline code for speed, doing the type
5378 test once at the start (i.e. keep it out of the loop). Again, keep the
5379 UTF-8 and UCP stuff separate. */
5380
5381 else
5382 {
5383 pp = eptr; /* Remember where we started */
5384
5385 #ifdef SUPPORT_UCP
5386 if (prop_type >= 0)
5387 {
5388 switch(prop_type)
5389 {
5390 case PT_ANY:
5391 for (i = min; i < max; i++)
5392 {
5393 int len = 1;
5394 if (eptr >= md->end_subject)
5395 {
5396 SCHECK_PARTIAL();
5397 break;
5398 }
5399 GETCHARLENTEST(c, eptr, len);
5400 if (prop_fail_result) break;
5401 eptr+= len;
5402 }
5403 break;
5404
5405 case PT_LAMP:
5406 for (i = min; i < max; i++)
5407 {
5408 int chartype;
5409 int len = 1;
5410 if (eptr >= md->end_subject)
5411 {
5412 SCHECK_PARTIAL();
5413 break;
5414 }
5415 GETCHARLENTEST(c, eptr, len);
5416 chartype = UCD_CHARTYPE(c);
5417 if ((chartype == ucp_Lu ||
5418 chartype == ucp_Ll ||
5419 chartype == ucp_Lt) == prop_fail_result)
5420 break;
5421 eptr+= len;
5422 }
5423 break;
5424
5425 case PT_GC:
5426 for (i = min; i < max; i++)
5427 {
5428 int len = 1;
5429 if (eptr >= md->end_subject)
5430 {
5431 SCHECK_PARTIAL();
5432 break;
5433 }
5434 GETCHARLENTEST(c, eptr, len);
5435 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5436 eptr+= len;
5437 }
5438 break;
5439
5440 case PT_PC:
5441 for (i = min; i < max; i++)
5442 {
5443 int len = 1;
5444 if (eptr >= md->end_subject)
5445 {
5446 SCHECK_PARTIAL();
5447 break;
5448 }
5449 GETCHARLENTEST(c, eptr, len);
5450 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5451 eptr+= len;
5452 }
5453 break;
5454
5455 case PT_SC:
5456 for (i = min; i < max; i++)
5457 {
5458 int len = 1;
5459 if (eptr >= md->end_subject)
5460 {
5461 SCHECK_PARTIAL();
5462 break;
5463 }
5464 GETCHARLENTEST(c, eptr, len);
5465 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5466 eptr+= len;
5467 }
5468 break;
5469
5470 case PT_ALNUM:
5471 for (i = min; i < max; i++)
5472 {
5473 int category;
5474 int len = 1;
5475 if (eptr >= md->end_subject)
5476 {
5477 SCHECK_PARTIAL();
5478 break;
5479 }
5480 GETCHARLENTEST(c, eptr, len);
5481 category = UCD_CATEGORY(c);
5482 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5483 break;
5484 eptr+= len;
5485 }
5486 break;
5487
5488 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5489 which means that Perl space and POSIX space are now identical. PCRE
5490 was changed at release 8.34. */
5491
5492 case PT_SPACE: /* Perl space */
5493 case PT_PXSPACE: /* POSIX space */
5494 for (i = min; i < max; i++)
5495 {
5496 int len = 1;
5497 if (eptr >= md->end_subject)
5498 {
5499 SCHECK_PARTIAL();
5500 break;
5501 }
5502 GETCHARLENTEST(c, eptr, len);
5503 switch(c)
5504 {
5505 HSPACE_CASES:
5506 VSPACE_CASES:
5507 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5508 break;
5509
5510 default:
5511 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5512 goto ENDLOOP99; /* Break the loop */
5513 break;
5514 }
5515 eptr+= len;
5516 }
5517 ENDLOOP99:
5518 break;
5519
5520 case PT_WORD:
5521 for (i = min; i < max; i++)
5522 {
5523 int category;
5524 int len = 1;
5525 if (eptr >= md->end_subject)
5526 {
5527 SCHECK_PARTIAL();
5528 break;
5529 }
5530 GETCHARLENTEST(c, eptr, len);
5531 category = UCD_CATEGORY(c);
5532 if ((category == ucp_L || category == ucp_N ||
5533 c == CHAR_UNDERSCORE) == prop_fail_result)
5534 break;
5535 eptr+= len;
5536 }
5537 break;
5538
5539 case PT_CLIST:
5540 for (i = min; i < max; i++)
5541 {
5542 const pcre_uint32 *cp;
5543 int len = 1;
5544 if (eptr >= md->end_subject)
5545 {
5546 SCHECK_PARTIAL();
5547 break;
5548 }
5549 GETCHARLENTEST(c, eptr, len);
5550 cp = PRIV(ucd_caseless_sets) + prop_value;
5551 for (;;)
5552 {
5553 if (c < *cp)
5554 { if (prop_fail_result) break; else goto GOT_MAX; }
5555 if (c == *cp++)
5556 { if (prop_fail_result) goto GOT_MAX; else break; }
5557 }
5558 eptr += len;
5559 }
5560 GOT_MAX:
5561 break;
5562
5563 case PT_UCNC:
5564 for (i = min; i < max; i++)
5565 {
5566 int len = 1;
5567 if (eptr >= md->end_subject)
5568 {
5569 SCHECK_PARTIAL();
5570 break;
5571 }
5572 GETCHARLENTEST(c, eptr, len);
5573 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5574 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5575 c >= 0xe000) == prop_fail_result)
5576 break;
5577 eptr += len;
5578 }
5579 break;
5580
5581 default:
5582 RRETURN(PCRE_ERROR_INTERNAL);
5583 }
5584
5585 /* eptr is now past the end of the maximum run */
5586
5587 if (possessive) continue; /* No backtracking */
5588 for(;;)
5589 {
5590 if (eptr == pp) goto TAIL_RECURSE;
5591 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5592 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5593 eptr--;
5594 if (utf) BACKCHAR(eptr);
5595 }
5596 }
5597
5598 /* Match extended Unicode grapheme clusters. We will get here only if the
5599 support is in the binary; otherwise a compile-time error occurs. */
5600
5601 else if (ctype == OP_EXTUNI)
5602 {
5603 for (i = min; i < max; i++)
5604 {
5605 if (eptr >= md->end_subject)
5606 {
5607 SCHECK_PARTIAL();
5608 break;
5609 }
5610 else
5611 {
5612 int lgb, rgb;
5613 GETCHARINCTEST(c, eptr);
5614 lgb = UCD_GRAPHBREAK(c);
5615 while (eptr < md->end_subject)
5616 {
5617 int len = 1;
5618 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5619 rgb = UCD_GRAPHBREAK(c);
5620 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5621 lgb = rgb;
5622 eptr += len;
5623 }
5624 }
5625 CHECK_PARTIAL();
5626 }
5627
5628 /* eptr is now past the end of the maximum run */
5629
5630 if (possessive) continue; /* No backtracking */
5631
5632 for(;;)
5633 {
5634 int lgb, rgb;
5635 PCRE_PUCHAR fptr;
5636
5637 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5638 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5640
5641 /* Backtracking over an extended grapheme cluster involves inspecting
5642 the previous two characters (if present) to see if a break is
5643 permitted between them. */
5644
5645 eptr--;
5646 if (!utf) c = *eptr; else
5647 {
5648 BACKCHAR(eptr);
5649 GETCHAR(c, eptr);
5650 }
5651 rgb = UCD_GRAPHBREAK(c);
5652
5653 for (;;)
5654 {
5655 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5656 fptr = eptr - 1;
5657 if (!utf) c = *fptr; else
5658 {
5659 BACKCHAR(fptr);
5660 GETCHAR(c, fptr);
5661 }
5662 lgb = UCD_GRAPHBREAK(c);
5663 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5664 eptr = fptr;
5665 rgb = lgb;
5666 }
5667 }
5668 }
5669
5670 else
5671 #endif /* SUPPORT_UCP */
5672
5673 #ifdef SUPPORT_UTF
5674 if (utf)
5675 {
5676 switch(ctype)
5677 {
5678 case OP_ANY:
5679 if (max < INT_MAX)
5680 {
5681 for (i = min; i < max; i++)
5682 {
5683 if (eptr >= md->end_subject)
5684 {
5685 SCHECK_PARTIAL();
5686 break;
5687 }
5688 if (IS_NEWLINE(eptr)) break;
5689 if (md->partial != 0 && /* Take care with CRLF partial */
5690 eptr + 1 >= md->end_subject &&
5691 NLBLOCK->nltype == NLTYPE_FIXED &&
5692 NLBLOCK->nllen == 2 &&
5693 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5694 {
5695 md->hitend = TRUE;
5696 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5697 }
5698 eptr++;
5699 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5700 }
5701 }
5702
5703 /* Handle unlimited UTF-8 repeat */
5704
5705 else
5706 {
5707 for (i = min; i < max; i++)
5708 {
5709 if (eptr >= md->end_subject)
5710 {
5711 SCHECK_PARTIAL();
5712 break;
5713 }
5714 if (IS_NEWLINE(eptr)) break;
5715 if (md->partial != 0 && /* Take care with CRLF partial */
5716 eptr + 1 >= md->end_subject &&
5717 NLBLOCK->nltype == NLTYPE_FIXED &&
5718 NLBLOCK->nllen == 2 &&
5719 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5720 {
5721 md->hitend = TRUE;
5722 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5723 }
5724 eptr++;
5725 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5726 }
5727 }
5728 break;
5729
5730 case OP_ALLANY:
5731 if (max < INT_MAX)
5732 {
5733 for (i = min; i < max; i++)
5734 {
5735 if (eptr >= md->end_subject)
5736 {
5737 SCHECK_PARTIAL();
5738 break;
5739 }
5740 eptr++;
5741 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5742 }
5743 }
5744 else
5745 {
5746 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5747 SCHECK_PARTIAL();
5748 }
5749 break;
5750
5751 /* The byte case is the same as non-UTF8 */
5752
5753 case OP_ANYBYTE:
5754 c = max - min;
5755 if (c > (unsigned int)(md->end_subject - eptr))
5756 {
5757 eptr = md->end_subject;
5758 SCHECK_PARTIAL();
5759 }
5760 else eptr += c;
5761 break;
5762
5763 case OP_ANYNL:
5764 for (i = min; i < max; i++)
5765 {
5766 int len = 1;
5767 if (eptr >= md->end_subject)
5768 {
5769 SCHECK_PARTIAL();
5770 break;
5771 }
5772 GETCHARLEN(c, eptr, len);
5773 if (c == CHAR_CR)
5774 {
5775 if (++eptr >= md->end_subject) break;
5776 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5777 }
5778 else
5779 {
5780 if (c != CHAR_LF &&
5781 (md->bsr_anycrlf ||
5782 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5783 #ifndef EBCDIC
5784 && c != 0x2028 && c != 0x2029
5785 #endif /* Not EBCDIC */
5786 )))
5787 break;
5788 eptr += len;
5789 }
5790 }
5791 break;
5792
5793 case OP_NOT_HSPACE:
5794 case OP_HSPACE:
5795 for (i = min; i < max; i++)
5796 {
5797 BOOL gotspace;
5798 int len = 1;
5799 if (eptr >= md->end_subject)
5800 {
5801 SCHECK_PARTIAL();
5802 break;
5803 }
5804 GETCHARLEN(c, eptr, len);
5805 switch(c)
5806 {
5807 HSPACE_CASES: gotspace = TRUE; break;
5808 default: gotspace = FALSE; break;
5809 }
5810 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5811 eptr += len;
5812 }
5813 break;
5814
5815 case OP_NOT_VSPACE:
5816 case OP_VSPACE:
5817 for (i = min; i < max; i++)
5818 {
5819 BOOL gotspace;
5820 int len = 1;
5821 if (eptr >= md->end_subject)
5822 {
5823 SCHECK_PARTIAL();
5824 break;
5825 }
5826 GETCHARLEN(c, eptr, len);
5827 switch(c)
5828 {
5829 VSPACE_CASES: gotspace = TRUE; break;
5830 default: gotspace = FALSE; break;
5831 }
5832 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5833 eptr += len;
5834 }
5835 break;
5836
5837 case OP_NOT_DIGIT:
5838 for (i = min; i < max; i++)
5839 {
5840 int len = 1;
5841 if (eptr >= md->end_subject)
5842 {
5843 SCHECK_PARTIAL();
5844 break;
5845 }
5846 GETCHARLEN(c, eptr, len);
5847 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5848 eptr+= len;
5849 }
5850 break;
5851
5852 case OP_DIGIT:
5853 for (i = min; i < max; i++)
5854 {
5855 int len = 1;
5856 if (eptr >= md->end_subject)
5857 {
5858 SCHECK_PARTIAL();
5859 break;
5860 }
5861 GETCHARLEN(c, eptr, len);
5862 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5863 eptr+= len;
5864 }
5865 break;
5866
5867 case OP_NOT_WHITESPACE:
5868 for (i = min; i < max; i++)
5869 {
5870 int len = 1;
5871 if (eptr >= md->end_subject)
5872 {
5873 SCHECK_PARTIAL();
5874 break;
5875 }
5876 GETCHARLEN(c, eptr, len);
5877 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5878 eptr+= len;
5879 }
5880 break;
5881
5882 case OP_WHITESPACE:
5883 for (i = min; i < max; i++)
5884 {
5885 int len = 1;
5886 if (eptr >= md->end_subject)
5887 {
5888 SCHECK_PARTIAL();
5889 break;
5890 }
5891 GETCHARLEN(c, eptr, len);
5892 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5893 eptr+= len;
5894 }
5895 break;
5896
5897 case OP_NOT_WORDCHAR:
5898 for (i = min; i < max; i++)
5899 {
5900 int len = 1;
5901 if (eptr >= md->end_subject)
5902 {
5903 SCHECK_PARTIAL();
5904 break;
5905 }
5906 GETCHARLEN(c, eptr, len);
5907 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5908 eptr+= len;
5909 }
5910 break;
5911
5912 case OP_WORDCHAR:
5913 for (i = min; i < max; i++)
5914 {
5915 int len = 1;
5916 if (eptr >= md->end_subject)
5917 {
5918 SCHECK_PARTIAL();
5919 break;
5920 }
5921 GETCHARLEN(c, eptr, len);
5922 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5923 eptr+= len;
5924 }
5925 break;
5926
5927 default:
5928 RRETURN(PCRE_ERROR_INTERNAL);
5929 }
5930
5931 if (possessive) continue; /* No backtracking */
5932 for(;;)
5933 {
5934 if (eptr == pp) goto TAIL_RECURSE;
5935 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5937 eptr--;
5938 BACKCHAR(eptr);
5939 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5940 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5941 }
5942 }
5943 else
5944 #endif /* SUPPORT_UTF */
5945 /* Not UTF mode */
5946 {
5947 switch(ctype)
5948 {
5949 case OP_ANY:
5950 for (i = min; i < max; i++)
5951 {
5952 if (eptr >= md->end_subject)
5953 {
5954 SCHECK_PARTIAL();
5955 break;
5956 }
5957 if (IS_NEWLINE(eptr)) break;
5958 if (md->partial != 0 && /* Take care with CRLF partial */
5959 eptr + 1 >= md->end_subject &&
5960 NLBLOCK->nltype == NLTYPE_FIXED &&
5961 NLBLOCK->nllen == 2 &&
5962 *eptr == NLBLOCK->nl[0])
5963 {
5964 md->hitend = TRUE;
5965 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5966 }
5967 eptr++;
5968 }
5969 break;
5970
5971 case OP_ALLANY:
5972 case OP_ANYBYTE:
5973 c = max - min;
5974 if (c > (unsigned int)(md->end_subject - eptr))
5975 {
5976 eptr = md->end_subject;
5977 SCHECK_PARTIAL();
5978 }
5979 else eptr += c;
5980 break;
5981
5982 case OP_ANYNL:
5983 for (i = min; i < max; i++)
5984 {
5985 if (eptr >= md->end_subject)
5986 {
5987 SCHECK_PARTIAL();
5988 break;
5989 }
5990 c = *eptr;
5991 if (c == CHAR_CR)
5992 {
5993 if (++eptr >= md->end_subject) break;
5994 if (*eptr == CHAR_LF) eptr++;
5995 }
5996 else
5997 {
5998 if (c != CHAR_LF && (md->bsr_anycrlf ||
5999 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6000 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6001 && c != 0x2028 && c != 0x2029
6002 #endif
6003 ))) break;
6004 eptr++;
6005 }
6006 }
6007 break;
6008
6009 case OP_NOT_HSPACE:
6010 for (i = min; i < max; i++)
6011 {
6012 if (eptr >= md->end_subject)
6013 {
6014 SCHECK_PARTIAL();
6015 break;
6016 }
6017 switch(*eptr)
6018 {
6019 default: eptr++; break;
6020 HSPACE_BYTE_CASES:
6021 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6022 HSPACE_MULTIBYTE_CASES:
6023 #endif
6024 goto ENDLOOP00;
6025 }
6026 }
6027 ENDLOOP00:
6028 break;
6029
6030 case OP_HSPACE:
6031 for (i = min; i < max; i++)
6032 {
6033 if (eptr >= md->end_subject)
6034 {
6035 SCHECK_PARTIAL();
6036 break;
6037 }
6038 switch(*eptr)
6039 {
6040 default: goto ENDLOOP01;
6041 HSPACE_BYTE_CASES:
6042 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6043 HSPACE_MULTIBYTE_CASES:
6044 #endif
6045 eptr++; break;
6046 }
6047 }
6048 ENDLOOP01:
6049 break;
6050
6051 case OP_NOT_VSPACE:
6052 for (i = min; i < max; i++)
6053 {
6054 if (eptr >= md->end_subject)
6055 {
6056 SCHECK_PARTIAL();
6057 break;
6058 }
6059 switch(*eptr)
6060 {
6061 default: eptr++; break;
6062 VSPACE_BYTE_CASES:
6063 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6064 VSPACE_MULTIBYTE_CASES:
6065 #endif
6066 goto ENDLOOP02;
6067 }
6068 }
6069 ENDLOOP02:
6070 break;
6071
6072 case OP_VSPACE:
6073 for (i = min; i < max; i++)
6074 {
6075 if (eptr >= md->end_subject)
6076 {
6077 SCHECK_PARTIAL();
6078 break;
6079 }
6080 switch(*eptr)
6081 {
6082 default: goto ENDLOOP03;
6083 VSPACE_BYTE_CASES:
6084 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6085 VSPACE_MULTIBYTE_CASES:
6086 #endif
6087 eptr++; break;
6088 }
6089 }
6090 ENDLOOP03:
6091 break;
6092
6093 case OP_NOT_DIGIT:
6094 for (i = min; i < max; i++)
6095 {
6096 if (eptr >= md->end_subject)
6097 {
6098 SCHECK_PARTIAL();
6099 break;
6100 }
6101 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6102 eptr++;
6103 }
6104 break;
6105
6106 case OP_DIGIT:
6107 for (i = min; i < max; i++)
6108 {
6109 if (eptr >= md->end_subject)
6110 {
6111 SCHECK_PARTIAL();
6112 break;
6113 }
6114 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6115 eptr++;
6116 }
6117 break;
6118
6119 case OP_NOT_WHITESPACE:
6120 for (i = min; i < max; i++)
6121 {
6122 if (eptr >= md->end_subject)
6123 {
6124 SCHECK_PARTIAL();
6125 break;
6126 }
6127 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6128 eptr++;
6129 }
6130 break;
6131
6132 case OP_WHITESPACE:
6133 for (i = min; i < max; i++)
6134 {
6135 if (eptr >= md->end_subject)
6136 {
6137 SCHECK_PARTIAL();
6138 break;
6139 }
6140 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6141 eptr++;
6142 }
6143 break;
6144
6145 case OP_NOT_WORDCHAR:
6146 for (i = min; i < max; i++)
6147 {
6148 if (eptr >= md->end_subject)
6149 {
6150 SCHECK_PARTIAL();
6151 break;
6152 }
6153 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6154 eptr++;
6155 }
6156 break;
6157
6158 case OP_WORDCHAR:
6159 for (i = min; i < max; i++)
6160 {
6161 if (eptr >= md->end_subject)
6162 {
6163 SCHECK_PARTIAL();
6164 break;
6165 }
6166 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6167 eptr++;
6168 }
6169 break;
6170
6171 default:
6172 RRETURN(PCRE_ERROR_INTERNAL);
6173 }
6174
6175 if (possessive) continue; /* No backtracking */
6176 for (;;)
6177 {
6178 if (eptr == pp) goto TAIL_RECURSE;
6179 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6180 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6181 eptr--;
6182 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6183 eptr[-1] == CHAR_CR) eptr--;
6184 }
6185 }
6186
6187 /* Control never gets here */
6188 }
6189
6190 /* There's been some horrible disaster. Arrival here can only mean there is
6191 something seriously wrong in the code above or the OP_xxx definitions. */
6192
6193 default:
6194 DPRINTF(("Unknown opcode %d\n", *ecode));
6195 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6196 }
6197
6198 /* Do not stick any code in here without much thought; it is assumed
6199 that "continue" in the code above comes out to here to repeat the main
6200 loop. */
6201
6202 } /* End of main loop */
6203 /* Control never reaches here */
6204
6205
6206 /* When compiling to use the heap rather than the stack for recursive calls to
6207 match(), the RRETURN() macro jumps here. The number that is saved in
6208 frame->Xwhere indicates which label we actually want to return to. */
6209
6210 #ifdef NO_RECURSE
6211 #define LBL(val) case val: goto L_RM##val;
6212 HEAP_RETURN:
6213 switch (frame->Xwhere)
6214 {
6215 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6216 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6217 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6218 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6219 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6220 LBL(65) LBL(66)
6221 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6222 LBL(21)
6223 #endif
6224 #ifdef SUPPORT_UTF
6225 LBL(16) LBL(18) LBL(20)
6226 LBL(22) LBL(23) LBL(28) LBL(30)
6227 LBL(32) LBL(34) LBL(42) LBL(46)
6228 #ifdef SUPPORT_UCP
6229 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6230 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6231 #endif /* SUPPORT_UCP */
6232 #endif /* SUPPORT_UTF */
6233 default:
6234 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6235 return PCRE_ERROR_INTERNAL;
6236 }
6237 #undef LBL
6238 #endif /* NO_RECURSE */
6239 }
6240
6241
6242 /***************************************************************************
6243 ****************************************************************************
6244 RECURSION IN THE match() FUNCTION
6245
6246 Undefine all the macros that were defined above to handle this. */
6247
6248 #ifdef NO_RECURSE
6249 #undef eptr
6250 #undef ecode
6251 #undef mstart
6252 #undef offset_top
6253 #undef eptrb
6254 #undef flags
6255
6256 #undef callpat
6257 #undef charptr
6258 #undef data
6259 #undef next
6260 #undef pp
6261 #undef prev
6262 #undef saved_eptr
6263
6264 #undef new_recursive
6265
6266 #undef cur_is_word
6267 #undef condition
6268 #undef prev_is_word
6269
6270 #undef ctype
6271 #undef length
6272 #undef max
6273 #undef min
6274 #undef number
6275 #undef offset
6276 #undef op
6277 #undef save_capture_last
6278 #undef save_offset1
6279 #undef save_offset2
6280 #undef save_offset3
6281 #undef stacksave
6282
6283 #undef newptrb
6284
6285 #endif
6286
6287 /* These two are defined as macros in both cases */
6288
6289 #undef fc
6290 #undef fi
6291
6292 /***************************************************************************
6293 ***************************************************************************/
6294
6295
6296 #ifdef NO_RECURSE
6297 /*************************************************
6298 * Release allocated heap frames *
6299 *************************************************/
6300
6301 /* This function releases all the allocated frames. The base frame is on the
6302 machine stack, and so must not be freed.
6303
6304 Argument: the address of the base frame
6305 Returns: nothing
6306 */
6307
6308 static void
6309 release_match_heapframes (heapframe *frame_base)
6310 {
6311 heapframe *nextframe = frame_base->Xnextframe;
6312 while (nextframe != NULL)
6313 {
6314 heapframe *oldframe = nextframe;
6315 nextframe = nextframe->Xnextframe;
6316 (PUBL(stack_free))(oldframe);
6317 }
6318 }
6319 #endif
6320
6321
6322 /*************************************************
6323 * Execute a Regular Expression *
6324 *************************************************/
6325
6326 /* This function applies a compiled re to a subject string and picks out
6327 portions of the string if it matches. Two elements in the vector are set for
6328 each substring: the offsets to the start and end of the substring.
6329
6330 Arguments:
6331 argument_re points to the compiled expression
6332 extra_data points to extra data or is NULL
6333 subject points to the subject string
6334 length length of subject string (may contain binary zeros)
6335 start_offset where to start in the subject string
6336 options option bits
6337 offsets points to a vector of ints to be filled in with offsets
6338 offsetcount the number of elements in the vector
6339
6340 Returns: > 0 => success; value is the number of elements filled in
6341 = 0 => success, but offsets is not big enough
6342 -1 => failed to match
6343 < -1 => some kind of unexpected problem
6344 */
6345
6346 #if defined COMPILE_PCRE8
6347 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6348 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6349 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6350 int offsetcount)
6351 #elif defined COMPILE_PCRE16
6352 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6353 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6354 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6355 int offsetcount)
6356 #elif defined COMPILE_PCRE32
6357 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6358 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6359 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6360 int offsetcount)
6361 #endif
6362 {
6363 int rc, ocount, arg_offset_max;
6364 int newline;
6365 BOOL using_temporary_offsets = FALSE;
6366 BOOL anchored;
6367 BOOL startline;
6368 BOOL firstline;
6369 BOOL utf;
6370 BOOL has_first_char = FALSE;
6371 BOOL has_req_char = FALSE;
6372 pcre_uchar first_char = 0;
6373 pcre_uchar first_char2 = 0;
6374 pcre_uchar req_char = 0;
6375 pcre_uchar req_char2 = 0;
6376 match_data match_block;
6377 match_data *md = &match_block;
6378 const pcre_uint8 *tables;
6379 const pcre_uint8 *start_bits = NULL;
6380 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6381 PCRE_PUCHAR end_subject;
6382 PCRE_PUCHAR start_partial = NULL;
6383 PCRE_PUCHAR match_partial = NULL;
6384 PCRE_PUCHAR req_char_ptr = start_match - 1;
6385
6386 const pcre_study_data *study;
6387 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6388
6389 #ifdef NO_RECURSE
6390 heapframe frame_zero;
6391 frame_zero.Xprevframe = NULL; /* Marks the top level */
6392 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6393 md->match_frames_base = &frame_zero;
6394 #endif
6395
6396 /* Check for the special magic call that measures the size of the stack used
6397 per recursive call of match(). Without the funny casting for sizeof, a Windows
6398 compiler gave this error: "unary minus operator applied to unsigned type,
6399 result still unsigned". Hopefully the cast fixes that. */
6400
6401 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6402 start_offset == -999)
6403 #ifdef NO_RECURSE
6404 return -((int)sizeof(heapframe));
6405 #else
6406 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6407 #endif
6408
6409 /* Plausibility checks */
6410
6411 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6412 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6413 return PCRE_ERROR_NULL;
6414 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6415 if (length < 0) return PCRE_ERROR_BADLENGTH;
6416 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6417
6418 /* Check that the first field in the block is the magic number. If it is not,
6419 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6420 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6421 means that the pattern is likely compiled with different endianness. */
6422
6423 if (re->magic_number != MAGIC_NUMBER)
6424 return re->magic_number == REVERSED_MAGIC_NUMBER?
6425 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6426 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6427
6428 /* These two settings are used in the code for checking a UTF-8 string that
6429 follows immediately afterwards. Other values in the md block are used only
6430 during "normal" pcre_exec() processing, not when the JIT support is in use,
6431 so they are set up later. */
6432
6433 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6434 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6435 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6436 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6437
6438 /* Check a UTF-8 string if required. Pass back the character offset and error
6439 code for an invalid string if a results vector is available. */
6440
6441 #ifdef SUPPORT_UTF
6442 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6443 {
6444 int erroroffset;
6445 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6446 if (errorcode != 0)
6447 {
6448 if (offsetcount >= 2)
6449 {
6450 offsets[0] = erroroffset;
6451 offsets[1] = errorcode;
6452 }
6453 #if defined COMPILE_PCRE8
6454 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6455 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6456 #elif defined COMPILE_PCRE16
6457 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6458 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6459 #elif defined COMPILE_PCRE32
6460 return PCRE_ERROR_BADUTF32;
6461 #endif
6462 }
6463 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6464 /* Check that a start_offset points to the start of a UTF character. */
6465 if (start_offset > 0 && start_offset < length &&
6466 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6467 return PCRE_ERROR_BADUTF8_OFFSET;
6468 #endif
6469 }
6470 #endif
6471
6472 /* If the pattern was successfully studied with JIT support, run the JIT
6473 executable instead of the rest of this function. Most options must be set at
6474 compile time for the JIT code to be usable. Fallback to the normal code path if
6475 an unsupported flag is set. */
6476
6477 #ifdef SUPPORT_JIT
6478 if (extra_data != NULL
6479 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6480 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6481 && extra_data->executable_jit != NULL
6482 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6483 {
6484 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6485 start_offset, options, offsets, offsetcount);
6486
6487 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6488 mode is not compiled. In this case we simply fall back to the interpreter. */
6489
6490 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6491 }
6492 #endif
6493
6494 /* Carry on with non-JIT matching. This information is for finding all the
6495 numbers associated with a given name, for condition testing. */
6496
6497 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6498 md->name_count = re->name_count;
6499 md->name_entry_size = re->name_entry_size;
6500
6501 /* Fish out the optional data from the extra_data structure, first setting
6502 the default values. */
6503
6504 study = NULL;
6505 md->match_limit = MATCH_LIMIT;
6506 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6507 md->callout_data = NULL;
6508
6509 /* The table pointer is always in native byte order. */
6510
6511 tables = re->tables;
6512
6513 /* The two limit values override the defaults, whatever their value. */
6514
6515 if (extra_data != NULL)
6516 {
6517 register unsigned int flags = extra_data->flags;
6518 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6519 study = (const pcre_study_data *)extra_data->study_data;
6520 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6521 md->match_limit = extra_data->match_limit;
6522 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6523 md->match_limit_recursion = extra_data->match_limit_recursion;
6524 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6525 md->callout_data = extra_data->callout_data;
6526 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6527 }
6528
6529 /* Limits in the regex override only if they are smaller. */
6530
6531 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6532 md->match_limit = re->limit_match;
6533
6534 if ((re->flags & PCRE_RLSET) != 0 &&
6535 re->limit_recursion < md->match_limit_recursion)
6536 md->match_limit_recursion = re->limit_recursion;
6537
6538 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6539 is a feature that makes it possible to save compiled regex and re-use them
6540 in other programs later. */
6541
6542 if (tables == NULL) tables = PRIV(default_tables);
6543
6544 /* Set up other data */
6545
6546 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6547 startline = (re->flags & PCRE_STARTLINE) != 0;
6548 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6549
6550 /* The code starts after the real_pcre block and the capture name table. */
6551
6552 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6553 re->name_count * re->name_entry_size;
6554
6555 md->start_subject = (PCRE_PUCHAR)subject;
6556 md->start_offset = start_offset;
6557 md->end_subject = md->start_subject + length;
6558 end_subject = md->end_subject;
6559
6560 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6561 md->use_ucp = (re->options & PCRE_UCP) != 0;
6562 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6563 md->ignore_skip_arg = 0;
6564
6565 /* Some options are unpacked into BOOL variables in the hope that testing
6566 them will be faster than individual option bits. */
6567
6568 md->notbol = (options & PCRE_NOTBOL) != 0;
6569 md->noteol = (options & PCRE_NOTEOL) != 0;
6570 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6571 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6572
6573 md->hitend = FALSE;
6574 md->mark = md->nomatch_mark = NULL; /* In case never set */
6575
6576 md->recursive = NULL; /* No recursion at top level */
6577 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6578
6579 md->lcc = tables + lcc_offset;
6580 md->fcc = tables + fcc_offset;
6581 md->ctypes = tables + ctypes_offset;
6582
6583 /* Handle different \R options. */
6584
6585 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6586 {
6587 case 0:
6588 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6589 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6590 else
6591 #ifdef BSR_ANYCRLF
6592 md->bsr_anycrlf = TRUE;
6593 #else
6594 md->bsr_anycrlf = FALSE;
6595 #endif
6596 break;
6597
6598 case PCRE_BSR_ANYCRLF:
6599 md->bsr_anycrlf = TRUE;
6600 break;
6601
6602 case PCRE_BSR_UNICODE:
6603 md->bsr_anycrlf = FALSE;
6604 break;
6605
6606 default: return PCRE_ERROR_BADNEWLINE;
6607 }
6608
6609 /* Handle different types of newline. The three bits give eight cases. If
6610 nothing is set at run time, whatever was used at compile time applies. */
6611
6612 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6613 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6614 {
6615 case 0: newline = NEWLINE; break; /* Compile-time default */
6616 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6617 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6618 case PCRE_NEWLINE_CR+
6619 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6620 case PCRE_NEWLINE_ANY: newline = -1; break;
6621 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6622 default: return PCRE_ERROR_BADNEWLINE;
6623 }
6624
6625 if (newline == -2)
6626 {
6627 md->nltype = NLTYPE_ANYCRLF;
6628 }
6629 else if (newline < 0)
6630 {
6631 md->nltype = NLTYPE_ANY;
6632 }
6633 else
6634 {
6635 md->nltype = NLTYPE_FIXED;
6636 if (newline > 255)
6637 {
6638 md->nllen = 2;
6639 md->nl[0] = (newline >> 8) & 255;
6640 md->nl[1] = newline & 255;
6641 }
6642 else
6643 {
6644 md->nllen = 1;
6645 md->nl[0] = newline;
6646 }
6647 }
6648
6649 /* Partial matching was originally supported only for a restricted set of
6650 regexes; from release 8.00 there are no restrictions, but the bits are still
6651 defined (though never set). So there's no harm in leaving this code. */
6652
6653 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6654 return PCRE_ERROR_BADPARTIAL;
6655
6656 /* If the expression has got more back references than the offsets supplied can
6657 hold, we get a temporary chunk of working store to use during the matching.
6658 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6659 of 3. */
6660
6661 ocount = offsetcount - (offsetcount % 3);
6662 arg_offset_max = (2*ocount)/3;
6663
6664 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6665 {
6666 ocount = re->top_backref * 3 + 3;
6667 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6668 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6669 using_temporary_offsets = TRUE;
6670 DPRINTF(("Got memory to hold back references\n"));
6671 }
6672 else md->offset_vector = offsets;
6673 md->offset_end = ocount;
6674 md->offset_max = (2*ocount)/3;
6675 md->capture_last = 0;
6676
6677 /* Reset the working variable associated with each extraction. These should
6678 never be used unless previously set, but they get saved and restored, and so we
6679 initialize them to avoid reading uninitialized locations. Also, unset the
6680 offsets for the matched string. This is really just for tidiness with callouts,
6681 in case they inspect these fields. */
6682
6683 if (md->offset_vector != NULL)
6684 {
6685 register int *iptr = md->offset_vector + ocount;
6686 register int *iend = iptr - re->top_bracket;
6687 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6688 while (--iptr >= iend) *iptr = -1;
6689 md->offset_vector[0] = md->offset_vector[1] = -1;
6690 }
6691
6692 /* Set up the first character to match, if available. The first_char value is
6693 never set for an anchored regular expression, but the anchoring may be forced
6694 at run time, so we have to test for anchoring. The first char may be unset for
6695 an unanchored pattern, of course. If there's no first char and the pattern was
6696 studied, there may be a bitmap of possible first characters. */
6697
6698 if (!anchored)
6699 {
6700 if ((re->flags & PCRE_FIRSTSET) != 0)
6701 {
6702 has_first_char = TRUE;
6703 first_char = first_char2 = (pcre_uchar)(re->first_char);
6704 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6705 {
6706 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6707 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6708 if (utf && first_char > 127)
6709 first_char2 = UCD_OTHERCASE(first_char);
6710 #endif
6711 }
6712 }
6713 else
6714 if (!startline && study != NULL &&
6715 (study->flags & PCRE_STUDY_MAPPED) != 0)
6716 start_bits = study->start_bits;
6717 }
6718
6719 /* For anchored or unanchored matches, there may be a "last known required
6720 character" set. */
6721
6722 if ((re->flags & PCRE_REQCHSET) != 0)
6723 {
6724 has_req_char = TRUE;
6725 req_char = req_char2 = (pcre_uchar)(re->req_char);
6726 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6727 {
6728 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6729 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6730 if (utf && req_char > 127)
6731 req_char2 = UCD_OTHERCASE(req_char);
6732 #endif
6733 }
6734 }
6735
6736
6737 /* ==========================================================================*/
6738
6739 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6740 the loop runs just once. */
6741
6742 for(;;)
6743 {
6744 PCRE_PUCHAR save_end_subject = end_subject;
6745 PCRE_PUCHAR new_start_match;
6746
6747 /* If firstline is TRUE, the start of the match is constrained to the first
6748 line of a multiline string. That is, the match must be before or at the first
6749 newline. Implement this by temporarily adjusting end_subject so that we stop
6750 scanning at a newline. If the match fails at the newline, later code breaks
6751 this loop. */
6752
6753 if (firstline)
6754 {
6755