/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1414 - (show annotations)
Sun Dec 22 16:27:35 2013 UTC (5 years, 10 months ago) by zherczeg
File MIME type: text/plain
File size: 217949 byte(s)
A new flag is set, when property checks are present in an XCLASS.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow (the lowest bit of OVFLMASK) */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996)
95 #define MATCH_PRUNE (-995)
96 #define MATCH_SKIP (-994)
97 #define MATCH_SKIP_ARG (-993)
98 #define MATCH_THEN (-992)
99 #define MATCH_BACKTRACK_MAX MATCH_THEN /* Upper end of that range (-992) */
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT /* Lower end of that range (-996) */
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30 /* 10 triples; also the dimension of stacksave[] in match() */
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
109 /* NOTE(review): the indexing scheme for these two tables is not visible in this part of the file - presumably a repeat opcode offset; confirm at the point of use. Both tables have 11 entries. */
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
112
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of characters in printable format, stopping at the end of
the subject if that is requested. Characters that isprint() accepts are written
as themselves; everything else is written as \x{hh..}.

Arguments:
  p           points to characters
  length      number to print; nothing is printed if this is <= 0
  is_subject  TRUE if printing from within md->start_subject, in which case
                the output is clipped at md->end_subject
  md          pointer to matching data block; md->utf is read unconditionally,
                so this must be valid even when is_subject is FALSE

Returns:     nothing
*/

static void
pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
{
pcre_uint32 c;
BOOL utf = md->utf;   /* Not used directly here; presumably referenced by
                         RAWUCHARINCTEST in some builds - confirm */

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = RAWUCHARINCTEST(p);

  /* isprint() is defined only for values representable as unsigned char (and
  EOF). A code unit wider than 8 bits must therefore never be handed to it; such
  values always take the hexadecimal route. */

  if (c <= 255 && isprint((int)c)) printf("%c", (char)c);
    else printf("\\x{%02x}", c);
  }
}
#endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
199 if (utf)
200 {
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions code have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
208 wrong). */
209
210 PCRE_PUCHAR endptr = p + length;
211 while (p < endptr)
212 {
213 pcre_uint32 c, d;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
216 GETCHARINC(c, eptr);
217 GETCHARINC(d, p);
218 ur = GET_UCD(d);
219 if (c != d && c != d + ur->other_case)
220 {
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
222 for (;;)
223 {
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
226 }
227 }
228 }
229 }
230 else
231 #endif
232
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
235 {
236 while (length-- > 0)
237 {
238 pcre_uint32 cc, cp;
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = RAWUCHARTEST(eptr);
241 cp = RAWUCHARTEST(p);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
243 p++;
244 eptr++;
245 }
246 }
247 }
248
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
251
252 else
253 {
254 while (length-- > 0)
255 {
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
258 }
259 }
260
261 return (int)(eptr - eptr_start);
262 }
263
264
265
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
269
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
275 fine.
276
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
281
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
286 always used to.
287
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
294
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
303
304 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
305 below must be updated in sync. */
306 /* The values run from RM1 (1) to RM67 (67). In the heap-based RMATCH the name is stored in Xwhere and also pasted into the resume label L_<name>. */
307 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
308 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
309 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
310 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
311 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
312 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
313 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
314
315 /* These versions of the macros use the stack, as normal. There are debugging
316 versions and production versions. Note that the "rw" argument of RMATCH isn't
317 actually used in this definition. */
318 /* Both variants of RMATCH leave the result of the nested match() call in rrc and pass rdepth+1 as the new depth. mstart is not a macro argument: it is picked up from the scope in which the macro is expanded. */
319 #ifndef NO_RECURSE
320 #define REGISTER register
321 /* Debugging variants: every call and return is traced with printf(). */
322 #ifdef PCRE_DEBUG
323 #define RMATCH(ra,rb,rc,rd,re,rw) \
324 { \
325 printf("match() called in line %d\n", __LINE__); \
326 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
327 printf("to line %d\n", __LINE__); \
328 }
329 #define RRETURN(ra) \
330 { \
331 printf("match() returned %d from line %d\n", ra, __LINE__); \
332 return ra; \
333 }
334 #else
335 #define RMATCH(ra,rb,rc,rd,re,rw) \
336 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
337 #define RRETURN(ra) return ra
338 #endif
339
340 #else
341
342
343 /* These versions of the macros manage a private stack on the heap. Note that
344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345 argument of match(), which never changes. */
346
347 #define REGISTER
348 /* RMATCH: take frame->Xnextframe if one is already chained, otherwise obtain a new frame with stack_malloc (returning PCRE_ERROR_NOMEMORY through RRETURN on failure) and chain it. The resume point rw is saved in the current frame's Xwhere, the new frame is loaded with the arguments, and control jumps to HEAP_RECURSE. L_<rw> is the label at which execution resumes - presumably reached from the HEAP_RETURN code, which is outside this view. */
349 #define RMATCH(ra,rb,rc,rd,re,rw)\
350 {\
351 heapframe *newframe = frame->Xnextframe;\
352 if (newframe == NULL)\
353 {\
354 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356 newframe->Xnextframe = NULL;\
357 frame->Xnextframe = newframe;\
358 }\
359 frame->Xwhere = rw;\
360 newframe->Xeptr = ra;\
361 newframe->Xecode = rb;\
362 newframe->Xmstart = mstart;\
363 newframe->Xoffset_top = rc;\
364 newframe->Xeptrb = re;\
365 newframe->Xrdepth = frame->Xrdepth + 1;\
366 newframe->Xprevframe = frame;\
367 frame = newframe;\
368 DPRINTF(("restarting from line %d\n", __LINE__));\
369 goto HEAP_RECURSE;\
370 L_##rw:\
371 DPRINTF(("jumped back to line %d\n", __LINE__));\
372 }
373 /* RRETURN: step back to the previous frame. If there is one, the result is placed in rrc and control jumps to HEAP_RETURN; if there is none, this is the outermost level and match() really returns. The frame being left is not freed here; it stays chained via Xnextframe. */
374 #define RRETURN(ra)\
375 {\
376 heapframe *oldframe = frame;\
377 frame = oldframe->Xprevframe;\
378 if (frame != NULL)\
379 {\
380 rrc = ra;\
381 goto HEAP_RETURN;\
382 }\
383 return ra;\
384 }
385
386
387 /* Structure for remembering the local variables in a private frame. In match() each X-prefixed field is reached through a macro with the same name minus the X. */
388
389 typedef struct heapframe {
390 struct heapframe *Xprevframe; /* Frame to go back to; RRETURN treats NULL as the outermost level */
391 struct heapframe *Xnextframe; /* Next frame in the chain, kept for re-use by RMATCH; NULL if none yet */
392
393 /* Function arguments that may change */
394
395 PCRE_PUCHAR Xeptr;
396 const pcre_uchar *Xecode;
397 PCRE_PUCHAR Xmstart;
398 int Xoffset_top;
399 eptrblock *Xeptrb;
400 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's value + 1 */
401
402 /* Function local variables */
403
404 PCRE_PUCHAR Xcallpat;
405 #ifdef SUPPORT_UTF
406 PCRE_PUCHAR Xcharptr;
407 #endif
408 PCRE_PUCHAR Xdata; /* Doubles as save_mark */
409 PCRE_PUCHAR Xnext;
410 PCRE_PUCHAR Xpp;
411 PCRE_PUCHAR Xprev;
412 PCRE_PUCHAR Xsaved_eptr;
413
414 recursion_info Xnew_recursive;
415
416 BOOL Xcur_is_word; /* Doubles as allow_zero */
417 BOOL Xcondition; /* Doubles as cbegroup and condassert */
418 BOOL Xprev_is_word; /* Doubles as matched_once */
419
420 #ifdef SUPPORT_UCP
421 int Xprop_type;
422 unsigned int Xprop_value;
423 int Xprop_fail_result;
424 int Xoclength;
425 pcre_uchar Xocchars[6];
426 #endif
427
428 int Xcodelink; /* Doubles as code_offset */
429 int Xctype;
430 unsigned int Xfc;
431 int Xfi;
432 int Xlength;
433 int Xmax;
434 int Xmin;
435 unsigned int Xnumber; /* Doubles as foc */
436 int Xoffset;
437 unsigned int Xop;
438 pcre_int32 Xsave_capture_last;
439 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440 int Xstacksave[REC_STACK_SAVE_MAX];
441
442 eptrblock Xnewptrb;
443
444 /* Where to jump back to */
445
446 int Xwhere; /* The RMn value that RMATCH stored before "recursing" */
447
448 } heapframe;
449
450 #endif
451
452
453 /***************************************************************************
454 ***************************************************************************/
455
456
457
458 /*************************************************
459 * Match from current position *
460 *************************************************/
461
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
464 same response. */
465
466 /* These macros pack up tests that are used for partial matching, and which
467 appear several times in the code. We set the "hit end" flag if the pointer is
468 at the end of the subject and also past the start of the subject (i.e.
469 something has been matched). For hard partial matching, we then return
470 immediately. The second one is used when we already know we are past the end of
471 the subject. */
472 /* CHECK_PARTIAL: the "past the start" test is made against md->start_used_ptr. md->partial > 1 is the hard case, which returns PCRE_ERROR_PARTIAL through RRETURN. */
473 #define CHECK_PARTIAL()\
474 if (md->partial != 0 && eptr >= md->end_subject && \
475 eptr > md->start_used_ptr) \
476 { \
477 md->hitend = TRUE; \
478 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
479 }
480 /* SCHECK_PARTIAL: as above but without the eptr >= md->end_subject test. NOTE(review): neither macro is wrapped in do { } while (0); each expands to a bare if statement, so do not place one directly before an else. */
481 #define SCHECK_PARTIAL()\
482 if (md->partial != 0 && eptr > md->start_used_ptr) \
483 { \
484 md->hitend = TRUE; \
485 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
486 }
487
488
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
493
494 Arguments:
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
498 by encountering \K)
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
504
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
510 */
511
512 static int
513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515 unsigned int rdepth)
516 {
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
520
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
525
526 BOOL minimize, possessive; /* Quantifier options */
527 BOOL caseless;
528 int condcode;
529
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
536
537 #ifdef NO_RECURSE
538 heapframe *frame = (heapframe *)md->match_frames_base;
539
540 /* Copy in the original argument variables */
541
542 frame->Xeptr = eptr;
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
548
549 /* This is where control jumps back to in order to effect "recursion" */
550
551 HEAP_RECURSE:
552
553 /* Macros make the argument variables come from the current frame */
554
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
561
562 /* Ditto for the local variables */
563
564 #ifdef SUPPORT_UTF
565 #define charptr frame->Xcharptr
566 #endif
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
574
575 #define new_recursive frame->Xnew_recursive
576
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
580
581 #ifdef SUPPORT_UCP
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
587 #endif
588
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
603
604 #define newptrb frame->Xnewptrb
605
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
609
610 #else /* NO_RECURSE not defined */
611 #define fi i
612 #define fc c
613
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
620 to RMATCH(). */
621
622 #ifdef SUPPORT_UTF
623 const pcre_uchar *charptr;
624 #endif
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
628 PCRE_PUCHAR pp;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
631
632 recursion_info new_recursive;
633
634 BOOL cur_is_word;
635 BOOL condition;
636 BOOL prev_is_word;
637
638 #ifdef SUPPORT_UCP
639 int prop_type;
640 unsigned int prop_value;
641 int prop_fail_result;
642 int oclength;
643 pcre_uchar occhars[6];
644 #endif
645
646 int codelink;
647 int ctype;
648 int length;
649 int max;
650 int min;
651 unsigned int number;
652 int offset;
653 unsigned int op;
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
657
658 eptrblock newptrb;
659
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
664 size. */
665
666 if (ecode == NULL)
667 {
668 if (rdepth == 0)
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670 else
671 {
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
674 }
675 }
676 #endif /* NO_RECURSE */
677
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
682
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
688 #define foc number
689 #define save_mark data
690
691 /* These statements are here to stop the compiler complaining about uninitialized
692 variables. */
693
694 #ifdef SUPPORT_UCP
695 prop_value = 0;
696 prop_fail_result = 0;
697 #endif
698
699
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
703 original patch. */
704
705 TAIL_RECURSE:
706
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
714
715 #ifdef SUPPORT_UTF
716 utf = md->utf; /* Local copy of the flag */
717 #else
718 utf = FALSE;
719 #endif
720
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
723
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
726
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
731
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
738
739 if (md->match_function_type == MATCH_CBEGROUP)
740 {
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
743 eptrb = &newptrb;
744 md->match_function_type = 0;
745 }
746
747 /* Now start processing the opcodes. */
748
749 for (;;)
750 {
751 minimize = possessive = FALSE;
752 op = *ecode;
753
754 switch(op)
755 {
756 case OP_MARK:
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
760 eptrb, RM55);
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
763
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
769 unaltered. */
770
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
773 {
774 md->start_match_ptr = eptr;
775 RRETURN(MATCH_SKIP);
776 }
777 RRETURN(rrc);
778
779 case OP_FAIL:
780 RRETURN(MATCH_NOMATCH);
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
787
788 case OP_PRUNE:
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 eptrb, RM51);
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
793
794 case OP_PRUNE_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
798 eptrb, RM56);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
803
804 case OP_SKIP:
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
806 eptrb, RM53);
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
809 RRETURN(MATCH_SKIP);
810
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
818
819 case OP_SKIP_ARG:
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
822 {
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
824 break;
825 }
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
827 eptrb, RM57);
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
829
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
834
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
837
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
841
842 case OP_THEN:
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
844 eptrb, RM54);
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
847 RRETURN(MATCH_THEN);
848
849 case OP_THEN_ARG:
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
853 md, eptrb, RM58);
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
858 RRETURN(MATCH_THEN);
859
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
867
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
872
873 case OP_ONCE_NC:
874 prev = ecode;
875 saved_eptr = eptr;
876 save_mark = md->mark;
877 do
878 {
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
881 {
882 mstart = md->start_match_ptr;
883 break;
884 }
885 if (rrc == MATCH_THEN)
886 {
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
890 rrc = MATCH_NOMATCH;
891 }
892
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
896 }
897 while (*ecode == OP_ALT);
898
899 /* If we hit the end of the group (which could be repeated), fail */
900
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
902
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
905
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
907
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
910
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
914 5.005. */
915
916 if (*ecode == OP_KET || eptr == saved_eptr)
917 {
918 ecode += 1+LINK_SIZE;
919 break;
920 }
921
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
925
926 if (*ecode == OP_KETRMIN)
927 {
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930 ecode = prev;
931 goto TAIL_RECURSE;
932 }
933 else /* OP_KETRMAX */
934 {
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
938 goto TAIL_RECURSE;
939 }
940 /* Control never gets here */
941
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
951
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
955
956 case OP_CBRA:
957 case OP_SCBRA:
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
960
961 #ifdef PCRE_DEBUG
962 printf("start bracket %d\n", number);
963 printf("subject=");
964 pchars(eptr, 16, TRUE, md);
965 printf("\n");
966 #endif
967
968 if (offset < md->offset_max)
969 {
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
975
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
979
980 for (;;)
981 {
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
984 eptrb, RM1);
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
986
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
997
998 if (rrc == MATCH_THEN)
999 {
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1004 }
1005
1006 /* Anything other than NOMATCH is passed back. */
1007
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1013 }
1014
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1019
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1021
1022 RRETURN(rrc);
1023 }
1024
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1027
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1032
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1038
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1045
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1048
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1055
1056 case OP_ONCE:
1057 case OP_BRA:
1058 case OP_SBRA:
1059 DPRINTF(("start non-capturing bracket\n"));
1060
1061 for (;;)
1062 {
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1065
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1068 above. */
1069
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1071 {
1072 ecode += PRIV(OP_lengths)[*ecode];
1073 goto TAIL_RECURSE;
1074 }
1075
1076 /* In all other cases, we have to make another call to match(). */
1077
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1081 RM2);
1082
1083 /* See comment in the code for capturing groups above about handling
1084 THEN. */
1085
1086 if (rrc == MATCH_THEN)
1087 {
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1092 }
1093
1094 if (rrc != MATCH_NOMATCH)
1095 {
1096 if (rrc == MATCH_ONCE)
1097 {
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1100 {
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1103 }
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1105 }
1106 RRETURN(rrc);
1107 }
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1112 }
1113
1114 RRETURN(MATCH_NOMATCH);
1115
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1123
1124 case OP_CBRAPOS:
1125 case OP_SCBRAPOS:
1126 allow_zero = FALSE;
1127
1128 POSSESSIVE_CAPTURE:
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1131
1132 #ifdef PCRE_DEBUG
1133 printf("start possessive bracket %d\n", number);
1134 printf("subject=");
1135 pchars(eptr, 16, TRUE, md);
1136 printf("\n");
1137 #endif
1138
1139 if (offset < md->offset_max)
1140 {
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1143
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1148
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1150
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1159
1160 for (;;)
1161 {
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1166 eptrb, RM63);
1167 if (rrc == MATCH_KETRPOS)
1168 {
1169 offset_top = md->end_offset_top;
1170 eptr = md->end_match_ptr;
1171 ecode = md->start_code + code_offset;
1172 save_capture_last = md->capture_last;
1173 matched_once = TRUE;
1174 mstart = md->start_match_ptr; /* In case \K changed it */
1175 continue;
1176 }
1177
1178 /* See comment in the code for capturing groups above about handling
1179 THEN. */
1180
1181 if (rrc == MATCH_THEN)
1182 {
1183 next = ecode + GET(ecode,1);
1184 if (md->start_match_ptr < next &&
1185 (*ecode == OP_ALT || *next == OP_ALT))
1186 rrc = MATCH_NOMATCH;
1187 }
1188
1189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1190 md->capture_last = save_capture_last;
1191 ecode += GET(ecode, 1);
1192 if (*ecode != OP_ALT) break;
1193 }
1194
1195 if (!matched_once)
1196 {
1197 md->offset_vector[offset] = save_offset1;
1198 md->offset_vector[offset+1] = save_offset2;
1199 md->offset_vector[md->offset_end - number] = save_offset3;
1200 }
1201
1202 if (allow_zero || matched_once)
1203 {
1204 ecode += 1 + LINK_SIZE;
1205 break;
1206 }
1207
1208 RRETURN(MATCH_NOMATCH);
1209 }
1210
1211 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1212 as a non-capturing bracket. */
1213
1214 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216
1217 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1218
1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221
1222 /* Non-capturing possessive bracket with unlimited repeat. We come here
1223 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1224 without the capturing complication. It is written out separately for speed
1225 and cleanliness. */
1226
1227 case OP_BRAPOS:
1228 case OP_SBRAPOS:
1229 allow_zero = FALSE;
1230
1231 POSSESSIVE_NON_CAPTURE:
1232 matched_once = FALSE;
1233 code_offset = (int)(ecode - md->start_code);
1234 save_capture_last = md->capture_last;
1235
1236 for (;;)
1237 {
1238 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1239 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1240 eptrb, RM48);
1241 if (rrc == MATCH_KETRPOS)
1242 {
1243 offset_top = md->end_offset_top;
1244 eptr = md->end_match_ptr;
1245 ecode = md->start_code + code_offset;
1246 matched_once = TRUE;
1247 mstart = md->start_match_ptr; /* In case \K reset it */
1248 continue;
1249 }
1250
1251 /* See comment in the code for capturing groups above about handling
1252 THEN. */
1253
1254 if (rrc == MATCH_THEN)
1255 {
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1260 }
1261
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1266 }
1267
1268 if (matched_once || allow_zero)
1269 {
1270 ecode += 1 + LINK_SIZE;
1271 break;
1272 }
1273 RRETURN(MATCH_NOMATCH);
1274
1275 /* Control never reaches here. */
1276
1277 /* Conditional group: compilation checked that there are no more than two
1278 branches. If the condition is false, skipping the first branch takes us
1279 past the end of the item if there is only one branch, but that's exactly
1280 what we want. */
1281
1282 case OP_COND:
1283 case OP_SCOND:
1284
1285 /* The variable codelink will be added to ecode when the condition is
1286 false, to get to the second branch. Setting it to the offset to the ALT
1287 or KET, then incrementing ecode achieves this effect. We now have ecode
1288 pointing to the condition or callout. */
1289
1290 codelink = GET(ecode, 1); /* Offset to the second branch */
1291 ecode += 1 + LINK_SIZE; /* From this opcode */
1292
1293 /* Because of the way auto-callout works during compile, a callout item is
1294 inserted between OP_COND and an assertion condition. */
1295
1296 if (*ecode == OP_CALLOUT)
1297 {
1298 if (PUBL(callout) != NULL)
1299 {
1300 PUBL(callout_block) cb;
1301 cb.version = 2; /* Version 2 of the callout block */
1302 cb.callout_number = ecode[1];
1303 cb.offset_vector = md->offset_vector;
1304 #if defined COMPILE_PCRE8
1305 cb.subject = (PCRE_SPTR)md->start_subject;
1306 #elif defined COMPILE_PCRE16
1307 cb.subject = (PCRE_SPTR16)md->start_subject;
1308 #elif defined COMPILE_PCRE32
1309 cb.subject = (PCRE_SPTR32)md->start_subject;
1310 #endif
1311 cb.subject_length = (int)(md->end_subject - md->start_subject);
1312 cb.start_match = (int)(mstart - md->start_subject);
1313 cb.current_position = (int)(eptr - md->start_subject);
1314 cb.pattern_position = GET(ecode, 2);
1315 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1316 cb.capture_top = offset_top/2;
1317 cb.capture_last = md->capture_last & CAPLMASK;
1318 /* Internal change requires this for API compatibility. */
1319 if (cb.capture_last == 0) cb.capture_last = -1;
1320 cb.callout_data = md->callout_data;
1321 cb.mark = md->nomatch_mark;
1322 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1323 if (rrc < 0) RRETURN(rrc);
1324 }
1325
1326 /* Advance ecode past the callout, so it now points to the condition. We
1327 must adjust codelink so that the value of ecode+codelink is unchanged. */
1328
1329 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1330 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1331 }
1332
1333 /* Test the various possible conditions */
1334
1335 condition = FALSE;
1336 switch(condcode = *ecode)
1337 {
1338 case OP_RREF: /* Numbered group recursion test */
1339 if (md->recursive != NULL) /* Not recursing => FALSE */
1340 {
1341 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1342 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1343 }
1344 break;
1345
1346 case OP_DNRREF: /* Duplicate named group recursion test */
1347 if (md->recursive != NULL)
1348 {
1349 int count = GET2(ecode, 1 + IMM2_SIZE);
1350 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1351 while (count-- > 0)
1352 {
1353 unsigned int recno = GET2(slot, 0);
1354 condition = recno == md->recursive->group_num;
1355 if (condition) break;
1356 slot += md->name_entry_size;
1357 }
1358 }
1359 break;
1360
1361 case OP_CREF: /* Numbered group used test */
1362 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1363 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1364 break;
1365
1366 case OP_DNCREF: /* Duplicate named group used test */
1367 {
1368 int count = GET2(ecode, 1 + IMM2_SIZE);
1369 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1370 while (count-- > 0)
1371 {
1372 offset = GET2(slot, 0) << 1;
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 if (condition) break;
1375 slot += md->name_entry_size;
1376 }
1377 }
1378 break;
1379
1380 case OP_DEF: /* DEFINE - always false */
1381 break;
1382
1383 /* The condition is an assertion. Call match() to evaluate it - setting
1384 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1385 of an assertion. */
1386
1387 default:
1388 md->match_function_type = MATCH_CONDASSERT;
1389 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1390 if (rrc == MATCH_MATCH)
1391 {
1392 if (md->end_offset_top > offset_top)
1393 offset_top = md->end_offset_top; /* Captures may have happened */
1394 condition = TRUE;
1395
1396 /* Advance ecode past the assertion to the start of the first branch,
1397 but adjust it so that the general choosing code below works. */
1398
1399 ecode += GET(ecode, 1);
1400 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1402 }
1403
1404 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405 assertion; it is therefore treated as NOMATCH. Any other return is an
1406 error. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 break;
1413 }
1414
1415 /* Choose branch according to the condition */
1416
1417 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, we
1420 can use tail recursion to avoid using another stack frame, except when
1421 there is unlimited repeat of a possibly empty group. In the latter case, a
1422 recursive call to match() is always required, unless the second alternative
1423 doesn't exist, in which case we can just plough on. Note that, for
1424 compatibility with Perl, the | in a conditional group is NOT treated as
1425 creating two alternatives. If a THEN is encountered in the branch, it
1426 propagates out to the enclosing alternative (unless nested in a deeper set
1427 of alternatives, of course). */
1428
1429 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 goto TAIL_RECURSE;
1434 }
1435
1436 md->match_function_type = MATCH_CBEGROUP;
1437 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1438 RRETURN(rrc);
1439 }
1440
1441 /* Condition false & no alternative; continue after the group. */
1442
1443 else
1444 {
1445 }
1446 break;
1447
1448
1449 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450 to close any currently open capturing brackets. */
1451
1452 case OP_CLOSE:
1453 number = GET2(ecode, 1); /* Must be less than 65536 */
1454 offset = number << 1;
1455
1456 #ifdef PCRE_DEBUG
1457 printf("end bracket %d at *ACCEPT", number);
1458 printf("\n");
1459 #endif
1460
1461 md->capture_last = (md->capture_last & OVFLMASK) | number;
1462 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1463 {
1464 md->offset_vector[offset] =
1465 md->offset_vector[md->offset_end - number];
1466 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467 if (offset_top <= offset) offset_top = offset + 2;
1468 }
1469 ecode += 1 + IMM2_SIZE;
1470 break;
1471
1472
1473 /* End of the pattern, either real or forced. */
1474
1475 case OP_END:
1476 case OP_ACCEPT:
1477 case OP_ASSERT_ACCEPT:
1478
1479 /* If we have matched an empty string, fail if not in an assertion and not
1480 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481 is set and we have matched at the start of the subject. In both cases,
1482 backtracking will then try other alternatives, if any. */
1483
1484 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485 md->recursive == NULL &&
1486 (md->notempty ||
1487 (md->notempty_atstart &&
1488 mstart == md->start_subject + md->start_offset)))
1489 RRETURN(MATCH_NOMATCH);
1490
1491 /* Otherwise, we have a match. */
1492
1493 md->end_match_ptr = eptr; /* Record where we ended */
1494 md->end_offset_top = offset_top; /* and how many extracts were taken */
1495 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1496
1497 /* For some reason, the macros don't work properly if an expression is
1498 given as the argument to RRETURN when the heap is in use. */
1499
1500 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501 RRETURN(rrc);
1502
1503 /* Assertion brackets. Check the alternative branches in turn - the
1504 matching won't pass the KET for an assertion. If any one branch matches,
1505 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506 start of each branch to move the current point backwards, so the code at
1507 this level is identical to the lookahead case. When the assertion is part
1508 of a condition, we want to return immediately afterwards. The caller of
1509 this incarnation of the match() function will have set MATCH_CONDASSERT in
1510 md->match_function_type, and one of these opcodes will be the first opcode
1511 that is processed. We use a local variable that is preserved over calls to
1512 match() to remember this case. */
1513
1514 case OP_ASSERT:
1515 case OP_ASSERTBACK:
1516 save_mark = md->mark;
1517 if (md->match_function_type == MATCH_CONDASSERT)
1518 {
1519 condassert = TRUE;
1520 md->match_function_type = 0;
1521 }
1522 else condassert = FALSE;
1523
1524 /* Loop for each branch */
1525
1526 do
1527 {
1528 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1529
1530 /* A match means that the assertion is true; break out of the loop
1531 that matches its alternatives. */
1532
1533 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1534 {
1535 mstart = md->start_match_ptr; /* In case \K reset it */
1536 break;
1537 }
1538
1539 /* If not matched, restore the previous mark setting. */
1540
1541 md->mark = save_mark;
1542
1543 /* See comment in the code for capturing groups above about handling
1544 THEN. */
1545
1546 if (rrc == MATCH_THEN)
1547 {
1548 next = ecode + GET(ecode,1);
1549 if (md->start_match_ptr < next &&
1550 (*ecode == OP_ALT || *next == OP_ALT))
1551 rrc = MATCH_NOMATCH;
1552 }
1553
1554 /* Anything other than NOMATCH causes the entire assertion to fail,
1555 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1556 uncaptured THEN, which means they take their normal effect. This
1557 consistent approach does not always have exactly the same effect as in
1558 Perl. */
1559
1560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1561 ecode += GET(ecode, 1);
1562 }
1563 while (*ecode == OP_ALT); /* Continue for next alternative */
1564
1565 /* If we have tried all the alternative branches, the assertion has
1566 failed. If not, we broke out after a match. */
1567
1568 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1569
1570 /* If checking an assertion for a condition, return MATCH_MATCH. */
1571
1572 if (condassert) RRETURN(MATCH_MATCH);
1573
1574 /* Continue from after a successful assertion, updating the offsets high
1575 water mark, since extracts may have been taken during the assertion. */
1576
1577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1578 ecode += 1 + LINK_SIZE;
1579 offset_top = md->end_offset_top;
1580 continue;
1581
1582 /* Negative assertion: all branches must fail to match for the assertion to
1583 succeed. */
1584
1585 case OP_ASSERT_NOT:
1586 case OP_ASSERTBACK_NOT:
1587 save_mark = md->mark;
1588 if (md->match_function_type == MATCH_CONDASSERT)
1589 {
1590 condassert = TRUE;
1591 md->match_function_type = 0;
1592 }
1593 else condassert = FALSE;
1594
1595 /* Loop for each alternative branch. */
1596
1597 do
1598 {
1599 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1600 md->mark = save_mark; /* Always restore the mark setting */
1601
1602 switch(rrc)
1603 {
1604 case MATCH_MATCH: /* A successful match means */
1605 case MATCH_ACCEPT: /* the assertion has failed. */
1606 RRETURN(MATCH_NOMATCH);
1607
1608 case MATCH_NOMATCH: /* Carry on with next branch */
1609 break;
1610
1611 /* See comment in the code for capturing groups above about handling
1612 THEN. */
1613
1614 case MATCH_THEN:
1615 next = ecode + GET(ecode,1);
1616 if (md->start_match_ptr < next &&
1617 (*ecode == OP_ALT || *next == OP_ALT))
1618 {
1619 rrc = MATCH_NOMATCH;
1620 break;
1621 }
1622 /* Otherwise fall through. */
1623
1624 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1625 assertion to fail to match, without considering any more alternatives.
1626 Failing to match means the assertion is true. This is a consistent
1627 approach, but does not always have the same effect as in Perl. */
1628
1629 case MATCH_COMMIT:
1630 case MATCH_SKIP:
1631 case MATCH_SKIP_ARG:
1632 case MATCH_PRUNE:
1633 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1634 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1635
1636 /* Anything else is an error */
1637
1638 default:
1639 RRETURN(rrc);
1640 }
1641
1642 /* Continue with next branch */
1643
1644 ecode += GET(ecode,1);
1645 }
1646 while (*ecode == OP_ALT);
1647
1648 /* All branches in the assertion failed to match. */
1649
1650 NEG_ASSERT_TRUE:
1651 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1652 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1653 continue;
1654
1655 /* Move the subject pointer back. This occurs only at the start of
1656 each branch of a lookbehind assertion. If we are too close to the start to
1657 move back, this match function fails. When working with UTF-8 we move
1658 back a number of characters, not bytes. */
1659
1660 case OP_REVERSE:
1661 #ifdef SUPPORT_UTF
1662 if (utf)
1663 {
1664 i = GET(ecode, 1);
1665 while (i-- > 0)
1666 {
1667 eptr--;
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 BACKCHAR(eptr);
1670 }
1671 }
1672 else
1673 #endif
1674
1675 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1676
1677 {
1678 eptr -= GET(ecode, 1);
1679 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1680 }
1681
1682 /* Save the earliest consulted character, then skip to next op code */
1683
1684 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1685 ecode += 1 + LINK_SIZE;
1686 break;
1687
1688 /* The callout item calls an external function, if one is provided, passing
1689 details of the match so far. This is mainly for debugging, though the
1690 function is able to force a failure. */
1691
1692 case OP_CALLOUT:
1693 if (PUBL(callout) != NULL)
1694 {
1695 PUBL(callout_block) cb;
1696 cb.version = 2; /* Version 2 of the callout block */
1697 cb.callout_number = ecode[1];
1698 cb.offset_vector = md->offset_vector;
1699 #if defined COMPILE_PCRE8
1700 cb.subject = (PCRE_SPTR)md->start_subject;
1701 #elif defined COMPILE_PCRE16
1702 cb.subject = (PCRE_SPTR16)md->start_subject;
1703 #elif defined COMPILE_PCRE32
1704 cb.subject = (PCRE_SPTR32)md->start_subject;
1705 #endif
1706 cb.subject_length = (int)(md->end_subject - md->start_subject);
1707 cb.start_match = (int)(mstart - md->start_subject);
1708 cb.current_position = (int)(eptr - md->start_subject);
1709 cb.pattern_position = GET(ecode, 2);
1710 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1711 cb.capture_top = offset_top/2;
1712 cb.capture_last = md->capture_last & CAPLMASK;
1713 /* Internal change requires this for API compatibility. */
1714 if (cb.capture_last == 0) cb.capture_last = -1;
1715 cb.callout_data = md->callout_data;
1716 cb.mark = md->nomatch_mark;
1717 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1718 if (rrc < 0) RRETURN(rrc);
1719 }
1720 ecode += 2 + 2*LINK_SIZE;
1721 break;
1722
1723 /* Recursion either matches the current regex, or some subexpression. The
1724 offset data is the offset to the starting bracket from the start of the
1725 whole pattern. (This is so that it works from duplicated subpatterns.)
1726
1727 The state of the capturing groups is preserved over recursion, and
1728 re-instated afterwards. We don't know how many are started and not yet
1729 finished (offset_top records the completed total) so we just have to save
1730 all the potential data. There may be up to 65535 such values, which is too
1731 large to put on the stack, but using malloc for small numbers seems
1732 expensive. As a compromise, the stack is used when there are no more than
1733 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1734
1735 There are also other values that have to be saved. We use a chained
1736 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1737 for the original version of this logic. It has, however, been hacked around
1738 a lot, so he is not to blame for the current way it works. */
1739
1740 case OP_RECURSE:
1741 {
1742 recursion_info *ri;
1743 unsigned int recno;
1744
1745 callpat = md->start_code + GET(ecode, 1);
1746 recno = (callpat == md->start_code)? 0 :
1747 GET2(callpat, 1 + LINK_SIZE);
1748
1749 /* Check for repeating a recursion without advancing the subject pointer.
1750 This should catch convoluted mutual recursions. (Some simple cases are
1751 caught at compile time.) */
1752
1753 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1754 if (recno == ri->group_num && eptr == ri->subject_position)
1755 RRETURN(PCRE_ERROR_RECURSELOOP);
1756
1757 /* Add to "recursing stack" */
1758
1759 new_recursive.group_num = recno;
1760 new_recursive.saved_capture_last = md->capture_last;
1761 new_recursive.subject_position = eptr;
1762 new_recursive.prevrec = md->recursive;
1763 md->recursive = &new_recursive;
1764
1765 /* Where to continue from afterwards */
1766
1767 ecode += 1 + LINK_SIZE;
1768
1769 /* Now save the offset data */
1770
1771 new_recursive.saved_max = md->offset_end;
1772 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1773 new_recursive.offset_save = stacksave;
1774 else
1775 {
1776 new_recursive.offset_save =
1777 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1778 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1779 }
1780 memcpy(new_recursive.offset_save, md->offset_vector,
1781 new_recursive.saved_max * sizeof(int));
1782
1783 /* OK, now we can do the recursion. After processing each alternative,
1784 restore the offset data and the last captured value. If there were nested
1785 recursions, md->recursive might be changed, so reset it before looping.
1786 */
1787
1788 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1789 cbegroup = (*callpat >= OP_SBRA);
1790 do
1791 {
1792 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1793 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1794 md, eptrb, RM6);
1795 memcpy(md->offset_vector, new_recursive.offset_save,
1796 new_recursive.saved_max * sizeof(int));
1797 md->capture_last = new_recursive.saved_capture_last;
1798 md->recursive = new_recursive.prevrec;
1799 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1800 {
1801 DPRINTF(("Recursion matched\n"));
1802 if (new_recursive.offset_save != stacksave)
1803 (PUBL(free))(new_recursive.offset_save);
1804
1805 /* Set where we got to in the subject, and reset the start in case
1806 it was changed by \K. This *is* propagated back out of a recursion,
1807 for Perl compatibility. */
1808
1809 eptr = md->end_match_ptr;
1810 mstart = md->start_match_ptr;
1811 goto RECURSION_MATCHED; /* Exit loop; end processing */
1812 }
1813
1814 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1815 recursion; they cause a NOMATCH for the entire recursion. These codes
1816 are defined in a range that can be tested for. */
1817
1818 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1819 RRETURN(MATCH_NOMATCH);
1820
1821 /* Any return code other than NOMATCH is an error. */
1822
1823 if (rrc != MATCH_NOMATCH)
1824 {
1825 DPRINTF(("Recursion gave error %d\n", rrc));
1826 if (new_recursive.offset_save != stacksave)
1827 (PUBL(free))(new_recursive.offset_save);
1828 RRETURN(rrc);
1829 }
1830
1831 md->recursive = &new_recursive;
1832 callpat += GET(callpat, 1);
1833 }
1834 while (*callpat == OP_ALT);
1835
1836 DPRINTF(("Recursion didn't match\n"));
1837 md->recursive = new_recursive.prevrec;
1838 if (new_recursive.offset_save != stacksave)
1839 (PUBL(free))(new_recursive.offset_save);
1840 RRETURN(MATCH_NOMATCH);
1841 }
1842
1843 RECURSION_MATCHED:
1844 break;
1845
1846 /* An alternation is the end of a branch; scan along to find the end of the
1847 bracketed group and go to there. */
1848
1849 case OP_ALT:
1850 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1851 break;
1852
1853 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1854 indicating that it may occur zero times. It may repeat infinitely, or not
1855 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1856 with fixed upper repeat limits are compiled as a number of copies, with the
1857 optional ones preceded by BRAZERO or BRAMINZERO. */
1858
1859 case OP_BRAZERO:
1860 next = ecode + 1;
1861 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1863 do next += GET(next, 1); while (*next == OP_ALT);
1864 ecode = next + 1 + LINK_SIZE;
1865 break;
1866
1867 case OP_BRAMINZERO:
1868 next = ecode + 1;
1869 do next += GET(next, 1); while (*next == OP_ALT);
1870 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1872 ecode++;
1873 break;
1874
1875 case OP_SKIPZERO:
1876 next = ecode+1;
1877 do next += GET(next,1); while (*next == OP_ALT);
1878 ecode = next + 1 + LINK_SIZE;
1879 break;
1880
1881 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1882 here; just jump to the group, with allow_zero set TRUE. */
1883
1884 case OP_BRAPOSZERO:
1885 op = *(++ecode);
1886 allow_zero = TRUE;
1887 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1888 goto POSSESSIVE_NON_CAPTURE;
1889
1890 /* End of a group, repeated or non-repeating. */
1891
1892 case OP_KET:
1893 case OP_KETRMIN:
1894 case OP_KETRMAX:
1895 case OP_KETRPOS:
1896 prev = ecode - GET(ecode, 1);
1897
1898 /* If this was a group that remembered the subject start, in order to break
1899 infinite repeats of empty string matches, retrieve the subject start from
1900 the chain. Otherwise, set it NULL. */
1901
1902 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1903 {
1904 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1905 eptrb = eptrb->epb_prev; /* Backup to previous group */
1906 }
1907 else saved_eptr = NULL;
1908
1909 /* If we are at the end of an assertion group or a non-capturing atomic
1910 group, stop matching and return MATCH_MATCH, but record the current high
1911 water mark for use by positive assertions. We also need to record the match
1912 start in case it was changed by \K. */
1913
1914 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1915 *prev == OP_ONCE_NC)
1916 {
1917 md->end_match_ptr = eptr; /* For ONCE_NC */
1918 md->end_offset_top = offset_top;
1919 md->start_match_ptr = mstart;
1920 RRETURN(MATCH_MATCH); /* Sets md->mark */
1921 }
1922
1923 /* For capturing groups we have to check the group number back at the start
1924 and if necessary complete handling an extraction by setting the offsets and
1925 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1926 into group 0, so it won't be picked up here. Instead, we catch it when the
1927 OP_END is reached. Other recursion is handled here. We just have to record
1928 the current subject position and start match pointer and give a MATCH
1929 return. */
1930
1931 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1932 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1933 {
1934 number = GET2(prev, 1+LINK_SIZE);
1935 offset = number << 1;
1936
1937 #ifdef PCRE_DEBUG
1938 printf("end bracket %d", number);
1939 printf("\n");
1940 #endif
1941
1942 /* Handle a recursively called group. */
1943
1944 if (md->recursive != NULL && md->recursive->group_num == number)
1945 {
1946 md->end_match_ptr = eptr;
1947 md->start_match_ptr = mstart;
1948 RRETURN(MATCH_MATCH);
1949 }
1950
1951 /* Deal with capturing */
1952
1953 md->capture_last = (md->capture_last & OVFLMASK) | number;
1954 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1955 {
1956 /* If offset is greater than offset_top, it means that we are
1957 "skipping" a capturing group, and that group's offsets must be marked
1958 unset. In earlier versions of PCRE, all the offsets were unset at the
1959 start of matching, but this doesn't work because atomic groups and
1960 assertions can cause a value to be set that should later be unset.
1961 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1962 part of the atomic group, but this is not on the final matching path,
1963 so must be unset when 2 is set. (If there is no group 2, there is no
1964 problem, because offset_top will then be 2, indicating no capture.) */
1965
1966 if (offset > offset_top)
1967 {
1968 register int *iptr = md->offset_vector + offset_top;
1969 register int *iend = md->offset_vector + offset;
1970 while (iptr < iend) *iptr++ = -1;
1971 }
1972
1973 /* Now make the extraction */
1974
1975 md->offset_vector[offset] =
1976 md->offset_vector[md->offset_end - number];
1977 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1978 if (offset_top <= offset) offset_top = offset + 2;
1979 }
1980 }
1981
1982 /* For an ordinary non-repeating ket, just continue at this level. This
1983 also happens for a repeating ket if no characters were matched in the
1984 group. This is the forcible breaking of infinite loops as implemented in
1985 Perl 5.005. For a non-repeating atomic group that includes captures,
1986 establish a backup point by processing the rest of the pattern at a lower
1987 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1988 original OP_ONCE level, thereby bypassing intermediate backup points, but
1989 resetting any captures that happened along the way. */
1990
1991 if (*ecode == OP_KET || eptr == saved_eptr)
1992 {
1993 if (*prev == OP_ONCE)
1994 {
1995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1998 RRETURN(MATCH_ONCE);
1999 }
2000 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2001 break;
2002 }
2003
2004 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2005 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2006 at a time from the outer level, thus saving stack. */
2007
2008 if (*ecode == OP_KETRPOS)
2009 {
2010 md->start_match_ptr = mstart; /* In case \K reset it */
2011 md->end_match_ptr = eptr;
2012 md->end_offset_top = offset_top;
2013 RRETURN(MATCH_KETRPOS);
2014 }
2015
2016 /* The normal repeating kets try the rest of the pattern or restart from
2017 the preceding bracket, in the appropriate order. In the second case, we can
2018 use tail recursion to avoid using another stack frame, unless we have an
2019 atomic group or an unlimited repeat of a group that can match an empty
2020 string. */
2021
2022 if (*ecode == OP_KETRMIN)
2023 {
2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 if (*prev == OP_ONCE)
2027 {
2028 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2030 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2031 RRETURN(MATCH_ONCE);
2032 }
2033 if (*prev >= OP_SBRA) /* Could match an empty string */
2034 {
2035 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2036 RRETURN(rrc);
2037 }
2038 ecode = prev;
2039 goto TAIL_RECURSE;
2040 }
2041 else /* OP_KETRMAX */
2042 {
2043 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2044 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046 if (*prev == OP_ONCE)
2047 {
2048 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2050 md->once_target = prev;
2051 RRETURN(MATCH_ONCE);
2052 }
2053 ecode += 1 + LINK_SIZE;
2054 goto TAIL_RECURSE;
2055 }
2056 /* Control never gets here */
2057
2058 /* Not multiline mode: start of subject assertion, unless notbol. */
2059
2060 case OP_CIRC:
2061 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2062
2063 /* Start of subject assertion */
2064
2065 case OP_SOD:
2066 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2067 ecode++;
2068 break;
2069
2070 /* Multiline mode: start of subject unless notbol, or after any newline. */
2071
2072 case OP_CIRCM:
2073 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2074 if (eptr != md->start_subject &&
2075 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2076 RRETURN(MATCH_NOMATCH);
2077 ecode++;
2078 break;
2079
2080 /* Start of match assertion */
2081
2082 case OP_SOM:
2083 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2084 ecode++;
2085 break;
2086
2087 /* Reset the start of match point */
2088
2089 case OP_SET_SOM:
2090 mstart = eptr;
2091 ecode++;
2092 break;
2093
2094 /* Multiline mode: assert before any newline, or before end of subject
2095 unless noteol is set. */
2096
2097 case OP_DOLLM:
2098 if (eptr < md->end_subject)
2099 {
2100 if (!IS_NEWLINE(eptr))
2101 {
2102 if (md->partial != 0 &&
2103 eptr + 1 >= md->end_subject &&
2104 NLBLOCK->nltype == NLTYPE_FIXED &&
2105 NLBLOCK->nllen == 2 &&
2106 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2107 {
2108 md->hitend = TRUE;
2109 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2110 }
2111 RRETURN(MATCH_NOMATCH);
2112 }
2113 }
2114 else
2115 {
2116 if (md->noteol) RRETURN(MATCH_NOMATCH);
2117 SCHECK_PARTIAL();
2118 }
2119 ecode++;
2120 break;
2121
2122 /* Not multiline mode: assert before a terminating newline or before end of
2123 subject unless noteol is set. */
2124
2125 case OP_DOLL:
2126 if (md->noteol) RRETURN(MATCH_NOMATCH);
2127 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2128
2129 /* ... else fall through for endonly */
2130
2131 /* End of subject assertion (\z) */
2132
2133 case OP_EOD:
2134 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2135 SCHECK_PARTIAL();
2136 ecode++;
2137 break;
2138
2139 /* End of subject or ending \n assertion (\Z) */
2140
2141 case OP_EODN:
2142 ASSERT_NL_OR_EOS:
2143 if (eptr < md->end_subject &&
2144 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2145 {
2146 if (md->partial != 0 &&
2147 eptr + 1 >= md->end_subject &&
2148 NLBLOCK->nltype == NLTYPE_FIXED &&
2149 NLBLOCK->nllen == 2 &&
2150 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2151 {
2152 md->hitend = TRUE;
2153 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2154 }
2155 RRETURN(MATCH_NOMATCH);
2156 }
2157
2158 /* Either at end of string or \n before end. */
2159
2160 SCHECK_PARTIAL();
2161 ecode++;
2162 break;
2163
2164 /* Word boundary assertions */
2165
2166 case OP_NOT_WORD_BOUNDARY:
2167 case OP_WORD_BOUNDARY:
2168 {
2169
2170 /* Find out if the previous and current characters are "word" characters.
2171 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2172 be "non-word" characters. Remember the earliest consulted character for
2173 partial matching. */
2174
2175 #ifdef SUPPORT_UTF
2176 if (utf)
2177 {
2178 /* Get status of previous character */
2179
2180 if (eptr == md->start_subject) prev_is_word = FALSE; else
2181 {
2182 PCRE_PUCHAR lastptr = eptr - 1;
2183 BACKCHAR(lastptr);
2184 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2185 GETCHAR(c, lastptr);
2186 #ifdef SUPPORT_UCP
2187 if (md->use_ucp)
2188 {
2189 if (c == '_') prev_is_word = TRUE; else
2190 {
2191 int cat = UCD_CATEGORY(c);
2192 prev_is_word = (cat == ucp_L || cat == ucp_N);
2193 }
2194 }
2195 else
2196 #endif
2197 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2198 }
2199
2200 /* Get status of next character */
2201
2202 if (eptr >= md->end_subject)
2203 {
2204 SCHECK_PARTIAL();
2205 cur_is_word = FALSE;
2206 }
2207 else
2208 {
2209 GETCHAR(c, eptr);
2210 #ifdef SUPPORT_UCP
2211 if (md->use_ucp)
2212 {
2213 if (c == '_') cur_is_word = TRUE; else
2214 {
2215 int cat = UCD_CATEGORY(c);
2216 cur_is_word = (cat == ucp_L || cat == ucp_N);
2217 }
2218 }
2219 else
2220 #endif
2221 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2222 }
2223 }
2224 else
2225 #endif
2226
2227 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2228 consistency with the behaviour of \w we do use it in this case. */
2229
2230 {
2231 /* Get status of previous character */
2232
2233 if (eptr == md->start_subject) prev_is_word = FALSE; else
2234 {
2235 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2236 #ifdef SUPPORT_UCP
2237 if (md->use_ucp)
2238 {
2239 c = eptr[-1];
2240 if (c == '_') prev_is_word = TRUE; else
2241 {
2242 int cat = UCD_CATEGORY(c);
2243 prev_is_word = (cat == ucp_L || cat == ucp_N);
2244 }
2245 }
2246 else
2247 #endif
2248 prev_is_word = MAX_255(eptr[-1])
2249 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2250 }
2251
2252 /* Get status of next character */
2253
2254 if (eptr >= md->end_subject)
2255 {
2256 SCHECK_PARTIAL();
2257 cur_is_word = FALSE;
2258 }
2259 else
2260 #ifdef SUPPORT_UCP
2261 if (md->use_ucp)
2262 {
2263 c = *eptr;
2264 if (c == '_') cur_is_word = TRUE; else
2265 {
2266 int cat = UCD_CATEGORY(c);
2267 cur_is_word = (cat == ucp_L || cat == ucp_N);
2268 }
2269 }
2270 else
2271 #endif
2272 cur_is_word = MAX_255(*eptr)
2273 && ((md->ctypes[*eptr] & ctype_word) != 0);
2274 }
2275
2276 /* Now see if the situation is what we want */
2277
2278 if ((*ecode++ == OP_WORD_BOUNDARY)?
2279 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2280 RRETURN(MATCH_NOMATCH);
2281 }
2282 break;
2283
2284 /* Match any single character type except newline; have to take care with
2285 CRLF newlines and partial matching. */
2286
2287 case OP_ANY:
2288 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2289 if (md->partial != 0 &&
2290 eptr + 1 >= md->end_subject &&
2291 NLBLOCK->nltype == NLTYPE_FIXED &&
2292 NLBLOCK->nllen == 2 &&
2293 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2294 {
2295 md->hitend = TRUE;
2296 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2297 }
2298
2299 /* Fall through */
2300
2301 /* Match any single character whatsoever. */
2302
2303 case OP_ALLANY:
2304 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2305 { /* not be updated before SCHECK_PARTIAL. */
2306 SCHECK_PARTIAL();
2307 RRETURN(MATCH_NOMATCH);
2308 }
2309 eptr++;
2310 #ifdef SUPPORT_UTF
2311 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2312 #endif
2313 ecode++;
2314 break;
2315
2316 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2317 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2318
2319 case OP_ANYBYTE:
2320 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2321 { /* not be updated before SCHECK_PARTIAL. */
2322 SCHECK_PARTIAL();
2323 RRETURN(MATCH_NOMATCH);
2324 }
2325 eptr++;
2326 ecode++;
2327 break;
2328
2329 case OP_NOT_DIGIT:
2330 if (eptr >= md->end_subject)
2331 {
2332 SCHECK_PARTIAL();
2333 RRETURN(MATCH_NOMATCH);
2334 }
2335 GETCHARINCTEST(c, eptr);
2336 if (
2337 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2338 c < 256 &&
2339 #endif
2340 (md->ctypes[c] & ctype_digit) != 0
2341 )
2342 RRETURN(MATCH_NOMATCH);
2343 ecode++;
2344 break;
2345
2346 case OP_DIGIT:
2347 if (eptr >= md->end_subject)
2348 {
2349 SCHECK_PARTIAL();
2350 RRETURN(MATCH_NOMATCH);
2351 }
2352 GETCHARINCTEST(c, eptr);
2353 if (
2354 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2355 c > 255 ||
2356 #endif
2357 (md->ctypes[c] & ctype_digit) == 0
2358 )
2359 RRETURN(MATCH_NOMATCH);
2360 ecode++;
2361 break;
2362
2363 case OP_NOT_WHITESPACE:
2364 if (eptr >= md->end_subject)
2365 {
2366 SCHECK_PARTIAL();
2367 RRETURN(MATCH_NOMATCH);
2368 }
2369 GETCHARINCTEST(c, eptr);
2370 if (
2371 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2372 c < 256 &&
2373 #endif
2374 (md->ctypes[c] & ctype_space) != 0
2375 )
2376 RRETURN(MATCH_NOMATCH);
2377 ecode++;
2378 break;
2379
2380 case OP_WHITESPACE:
2381 if (eptr >= md->end_subject)
2382 {
2383 SCHECK_PARTIAL();
2384 RRETURN(MATCH_NOMATCH);
2385 }
2386 GETCHARINCTEST(c, eptr);
2387 if (
2388 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2389 c > 255 ||
2390 #endif
2391 (md->ctypes[c] & ctype_space) == 0
2392 )
2393 RRETURN(MATCH_NOMATCH);
2394 ecode++;
2395 break;
2396
2397 case OP_NOT_WORDCHAR:
2398 if (eptr >= md->end_subject)
2399 {
2400 SCHECK_PARTIAL();
2401 RRETURN(MATCH_NOMATCH);
2402 }
2403 GETCHARINCTEST(c, eptr);
2404 if (
2405 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2406 c < 256 &&
2407 #endif
2408 (md->ctypes[c] & ctype_word) != 0
2409 )
2410 RRETURN(MATCH_NOMATCH);
2411 ecode++;
2412 break;
2413
2414 case OP_WORDCHAR:
2415 if (eptr >= md->end_subject)
2416 {
2417 SCHECK_PARTIAL();
2418 RRETURN(MATCH_NOMATCH);
2419 }
2420 GETCHARINCTEST(c, eptr);
2421 if (
2422 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2423 c > 255 ||
2424 #endif
2425 (md->ctypes[c] & ctype_word) == 0
2426 )
2427 RRETURN(MATCH_NOMATCH);
2428 ecode++;
2429 break;
2430
2431 case OP_ANYNL:
2432 if (eptr >= md->end_subject)
2433 {
2434 SCHECK_PARTIAL();
2435 RRETURN(MATCH_NOMATCH);
2436 }
2437 GETCHARINCTEST(c, eptr);
2438 switch(c)
2439 {
2440 default: RRETURN(MATCH_NOMATCH);
2441
2442 case CHAR_CR:
2443 if (eptr >= md->end_subject)
2444 {
2445 SCHECK_PARTIAL();
2446 }
2447 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2448 break;
2449
2450 case CHAR_LF:
2451 break;
2452
2453 case CHAR_VT:
2454 case CHAR_FF:
2455 case CHAR_NEL:
2456 #ifndef EBCDIC
2457 case 0x2028:
2458 case 0x2029:
2459 #endif /* Not EBCDIC */
2460 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2461 break;
2462 }
2463 ecode++;
2464 break;
2465
2466 case OP_NOT_HSPACE:
2467 if (eptr >= md->end_subject)
2468 {
2469 SCHECK_PARTIAL();
2470 RRETURN(MATCH_NOMATCH);
2471 }
2472 GETCHARINCTEST(c, eptr);
2473 switch(c)
2474 {
2475 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2476 default: break;
2477 }
2478 ecode++;
2479 break;
2480
2481 case OP_HSPACE:
2482 if (eptr >= md->end_subject)
2483 {
2484 SCHECK_PARTIAL();
2485 RRETURN(MATCH_NOMATCH);
2486 }
2487 GETCHARINCTEST(c, eptr);
2488 switch(c)
2489 {
2490 HSPACE_CASES: break; /* Byte and multibyte cases */
2491 default: RRETURN(MATCH_NOMATCH);
2492 }
2493 ecode++;
2494 break;
2495
2496 case OP_NOT_VSPACE:
2497 if (eptr >= md->end_subject)
2498 {
2499 SCHECK_PARTIAL();
2500 RRETURN(MATCH_NOMATCH);
2501 }
2502 GETCHARINCTEST(c, eptr);
2503 switch(c)
2504 {
2505 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2506 default: break;
2507 }
2508 ecode++;
2509 break;
2510
2511 case OP_VSPACE:
2512 if (eptr >= md->end_subject)
2513 {
2514 SCHECK_PARTIAL();
2515 RRETURN(MATCH_NOMATCH);
2516 }
2517 GETCHARINCTEST(c, eptr);
2518 switch(c)
2519 {
2520 VSPACE_CASES: break;
2521 default: RRETURN(MATCH_NOMATCH);
2522 }
2523 ecode++;
2524 break;
2525
2526 #ifdef SUPPORT_UCP
2527 /* Check the next character by Unicode property. We will get here only
2528 if the support is in the binary; otherwise a compile-time error occurs. */
2529
2530 case OP_PROP:
2531 case OP_NOTPROP:
2532 if (eptr >= md->end_subject)
2533 {
2534 SCHECK_PARTIAL();
2535 RRETURN(MATCH_NOMATCH);
2536 }
2537 GETCHARINCTEST(c, eptr);
2538 {
2539 const pcre_uint32 *cp;
2540 const ucd_record *prop = GET_UCD(c);
2541
2542 switch(ecode[1])
2543 {
2544 case PT_ANY:
2545 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2546 break;
2547
2548 case PT_LAMP:
2549 if ((prop->chartype == ucp_Lu ||
2550 prop->chartype == ucp_Ll ||
2551 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2552 RRETURN(MATCH_NOMATCH);
2553 break;
2554
2555 case PT_GC:
2556 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2557 RRETURN(MATCH_NOMATCH);
2558 break;
2559
2560 case PT_PC:
2561 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2562 RRETURN(MATCH_NOMATCH);
2563 break;
2564
2565 case PT_SC:
2566 if ((ecode[2] != prop->script) == (op == OP_PROP))
2567 RRETURN(MATCH_NOMATCH);
2568 break;
2569
2570 /* These are specials */
2571
2572 case PT_ALNUM:
2573 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2574 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2575 RRETURN(MATCH_NOMATCH);
2576 break;
2577
2578 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2579 which means that Perl space and POSIX space are now identical. PCRE
2580 was changed at release 8.34. */
2581
2582 case PT_SPACE: /* Perl space */
2583 case PT_PXSPACE: /* POSIX space */
2584 switch(c)
2585 {
2586 HSPACE_CASES:
2587 VSPACE_CASES:
2588 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2589 break;
2590
2591 default:
2592 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2593 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2594 break;
2595 }
2596 break;
2597
2598 case PT_WORD:
2599 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2600 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2601 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2604
2605 case PT_CLIST:
2606 cp = PRIV(ucd_caseless_sets) + ecode[2];
2607 for (;;)
2608 {
2609 if (c < *cp)
2610 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2611 if (c == *cp++)
2612 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2613 }
2614 break;
2615
2616 case PT_UCNC:
2617 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2618 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2619 c >= 0xe000) == (op == OP_NOTPROP))
2620 RRETURN(MATCH_NOMATCH);
2621 break;
2622
2623 /* This should never occur */
2624
2625 default:
2626 RRETURN(PCRE_ERROR_INTERNAL);
2627 }
2628
2629 ecode += 3;
2630 }
2631 break;
2632
2633 /* Match an extended Unicode sequence. We will get here only if the support
2634 is in the binary; otherwise a compile-time error occurs. */
2635
2636 case OP_EXTUNI:
2637 if (eptr >= md->end_subject)
2638 {
2639 SCHECK_PARTIAL();
2640 RRETURN(MATCH_NOMATCH);
2641 }
2642 else
2643 {
2644 int lgb, rgb;
2645 GETCHARINCTEST(c, eptr);
2646 lgb = UCD_GRAPHBREAK(c);
2647 while (eptr < md->end_subject)
2648 {
2649 int len = 1;
2650 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2651 rgb = UCD_GRAPHBREAK(c);
2652 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2653 lgb = rgb;
2654 eptr += len;
2655 }
2656 }
2657 CHECK_PARTIAL();
2658 ecode++;
2659 break;
2660 #endif /* SUPPORT_UCP */
2661
2662
2663 /* Match a back reference, possibly repeatedly. Look past the end of the
2664 item to see if there is repeat information following. The code is similar
2665 to that for character classes, but repeated for efficiency. Then obey
2666 similar code to character type repeats - written out again for speed.
2667 However, if the referenced string is the empty string, always treat
2668 it as matched, any number of times (otherwise there could be infinite
2669 loops). If the reference is unset, there are two possibilities:
2670
2671 (a) In the default, Perl-compatible state, set the length negative;
2672 this ensures that every attempt at a match fails. We can't just fail
2673 here, because of the possibility of quantifiers with zero minima.
2674
2675 (b) If the JavaScript compatibility flag is set, set the length to zero
2676 so that the back reference matches an empty string.
2677
2678 Otherwise, set the length to the length of what was matched by the
2679 referenced subpattern.
2680
2681 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2682 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2683 and OP_DNREFI are used. In this case we must scan the list of groups to
2684 which the name refers, and use the first one that is set. */
2685
2686 case OP_DNREF:
2687 case OP_DNREFI:
2688 caseless = op == OP_DNREFI;
2689 {
2690 int count = GET2(ecode, 1+IMM2_SIZE);
2691 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2692 ecode += 1 + 2*IMM2_SIZE;
2693
2694 while (count-- > 0)
2695 {
2696 offset = GET2(slot, 0) << 1;
2697 if (offset < offset_top && md->offset_vector[offset] >= 0) break;
2698 slot += md->name_entry_size;
2699 }
2700 if (count < 0)
2701 length = (md->jscript_compat)? 0 : -1;
2702 else
2703 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2704 }
2705 goto REF_REPEAT;
2706
2707 case OP_REF:
2708 case OP_REFI:
2709 caseless = op == OP_REFI;
2710 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2711 ecode += 1 + IMM2_SIZE;
2712 if (offset >= offset_top || md->offset_vector[offset] < 0)
2713 length = (md->jscript_compat)? 0 : -1;
2714 else
2715 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2716
2717 /* Set up for repetition, or handle the non-repeated case */
2718
2719 REF_REPEAT:
2720 switch (*ecode)
2721 {
2722 case OP_CRSTAR:
2723 case OP_CRMINSTAR:
2724 case OP_CRPLUS:
2725 case OP_CRMINPLUS:
2726 case OP_CRQUERY:
2727 case OP_CRMINQUERY:
2728 c = *ecode++ - OP_CRSTAR;
2729 minimize = (c & 1) != 0;
2730 min = rep_min[c]; /* Pick up values from tables; */
2731 max = rep_max[c]; /* zero for max => infinity */
2732 if (max == 0) max = INT_MAX;
2733 break;
2734
2735 case OP_CRRANGE:
2736 case OP_CRMINRANGE:
2737 minimize = (*ecode == OP_CRMINRANGE);
2738 min = GET2(ecode, 1);
2739 max = GET2(ecode, 1 + IMM2_SIZE);
2740 if (max == 0) max = INT_MAX;
2741 ecode += 1 + 2 * IMM2_SIZE;
2742 break;
2743
2744 default: /* No repeat follows */
2745 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2746 {
2747 if (length == -2) eptr = md->end_subject; /* Partial match */
2748 CHECK_PARTIAL();
2749 RRETURN(MATCH_NOMATCH);
2750 }
2751 eptr += length;
2752 continue; /* With the main loop */
2753 }
2754
2755 /* Handle repeated back references. If the length of the reference is
2756 zero, just continue with the main loop. If the length is negative, it
2757 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2758 zero, we can continue at the same level without recursion. For any other
2759 minimum, carrying on will result in NOMATCH. */
2760
2761 if (length == 0) continue;
2762 if (length < 0 && min == 0) continue;
2763
2764 /* First, ensure the minimum number of matches are present. We get back
2765 the length of the reference string explicitly rather than passing the
2766 address of eptr, so that eptr can be a register variable. */
2767
2768 for (i = 1; i <= min; i++)
2769 {
2770 int slength;
2771 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2772 {
2773 if (slength == -2) eptr = md->end_subject; /* Partial match */
2774 CHECK_PARTIAL();
2775 RRETURN(MATCH_NOMATCH);
2776 }
2777 eptr += slength;
2778 }
2779
2780 /* If min = max, continue at the same level without recursion.
2781 They are not both allowed to be zero. */
2782
2783 if (min == max) continue;
2784
2785 /* If minimizing, keep trying and advancing the pointer */
2786
2787 if (minimize)
2788 {
2789 for (fi = min;; fi++)
2790 {
2791 int slength;
2792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2794 if (fi >= max) RRETURN(MATCH_NOMATCH);
2795 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2796 {
2797 if (slength == -2) eptr = md->end_subject; /* Partial match */
2798 CHECK_PARTIAL();
2799 RRETURN(MATCH_NOMATCH);
2800 }
2801 eptr += slength;
2802 }
2803 /* Control never gets here */
2804 }
2805
2806 /* If maximizing, find the longest string and work backwards */
2807
2808 else
2809 {
2810 pp = eptr;
2811 for (i = min; i < max; i++)
2812 {
2813 int slength;
2814 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2815 {
2816 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2817 the soft partial matching case. */
2818
2819 if (slength == -2 && md->partial != 0 &&
2820 md->end_subject > md->start_used_ptr)
2821 {
2822 md->hitend = TRUE;
2823 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2824 }
2825 break;
2826 }
2827 eptr += slength;
2828 }
2829
2830 while (eptr >= pp)
2831 {
2832 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2834 eptr -= length;
2835 }
2836 RRETURN(MATCH_NOMATCH);
2837 }
2838 /* Control never gets here */
2839
2840 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2841 used when all the characters in the class have values in the range 0-255,
2842 and either the matching is caseful, or the characters are in the range
2843 0-127 when UTF-8 processing is enabled. The only difference between
2844 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2845 encountered.
2846
2847 First, look past the end of the item to see if there is repeat information
2848 following. Then obey similar code to character type repeats - written out
2849 again for speed. */
2850
2851 case OP_NCLASS:
2852 case OP_CLASS:
2853 {
2854 /* The data variable is saved across frames, so the byte map needs to
2855 be stored there. */
2856 #define BYTE_MAP ((pcre_uint8 *)data)
2857 data = ecode + 1; /* Save for matching */
2858 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2859
2860 switch (*ecode)
2861 {
2862 case OP_CRSTAR:
2863 case OP_CRMINSTAR:
2864 case OP_CRPLUS:
2865 case OP_CRMINPLUS:
2866 case OP_CRQUERY:
2867 case OP_CRMINQUERY:
2868 case OP_CRPOSSTAR:
2869 case OP_CRPOSPLUS:
2870 case OP_CRPOSQUERY:
2871 c = *ecode++ - OP_CRSTAR;
2872 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2873 else possessive = TRUE;
2874 min = rep_min[c]; /* Pick up values from tables; */
2875 max = rep_max[c]; /* zero for max => infinity */
2876 if (max == 0) max = INT_MAX;
2877 break;
2878
2879 case OP_CRRANGE:
2880 case OP_CRMINRANGE:
2881 case OP_CRPOSRANGE:
2882 minimize = (*ecode == OP_CRMINRANGE);
2883 possessive = (*ecode == OP_CRPOSRANGE);
2884 min = GET2(ecode, 1);
2885 max = GET2(ecode, 1 + IMM2_SIZE);
2886 if (max == 0) max = INT_MAX;
2887 ecode += 1 + 2 * IMM2_SIZE;
2888 break;
2889
2890 default: /* No repeat follows */
2891 min = max = 1;
2892 break;
2893 }
2894
2895 /* First, ensure the minimum number of matches are present. */
2896
2897 #ifdef SUPPORT_UTF
2898 if (utf)
2899 {
2900 for (i = 1; i <= min; i++)
2901 {
2902 if (eptr >= md->end_subject)
2903 {
2904 SCHECK_PARTIAL();
2905 RRETURN(MATCH_NOMATCH);
2906 }
2907 GETCHARINC(c, eptr);
2908 if (c > 255)
2909 {
2910 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2911 }
2912 else
2913 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2914 }
2915 }
2916 else
2917 #endif
2918 /* Not UTF mode */
2919 {
2920 for (i = 1; i <= min; i++)
2921 {
2922 if (eptr >= md->end_subject)
2923 {
2924 SCHECK_PARTIAL();
2925 RRETURN(MATCH_NOMATCH);
2926 }
2927 c = *eptr++;
2928 #ifndef COMPILE_PCRE8
2929 if (c > 255)
2930 {
2931 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2932 }
2933 else
2934 #endif
2935 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2936 }
2937 }
2938
2939 /* If max == min we can continue with the main loop without the
2940 need to recurse. */
2941
2942 if (min == max) continue;
2943
2944 /* If minimizing, keep testing the rest of the expression and advancing
2945 the pointer while it matches the class. */
2946
2947 if (minimize)
2948 {
2949 #ifdef SUPPORT_UTF
2950 if (utf)
2951 {
2952 for (fi = min;; fi++)
2953 {
2954 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2955 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2956 if (fi >= max) RRETURN(MATCH_NOMATCH);
2957 if (eptr >= md->end_subject)
2958 {
2959 SCHECK_PARTIAL();
2960 RRETURN(MATCH_NOMATCH);
2961 }
2962 GETCHARINC(c, eptr);
2963 if (c > 255)
2964 {
2965 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2966 }
2967 else
2968 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2969 }
2970 }
2971 else
2972 #endif
2973 /* Not UTF mode */
2974 {
2975 for (fi = min;; fi++)
2976 {
2977 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2979 if (fi >= max) RRETURN(MATCH_NOMATCH);
2980 if (eptr >= md->end_subject)
2981 {
2982 SCHECK_PARTIAL();
2983 RRETURN(MATCH_NOMATCH);
2984 }
2985 c = *eptr++;
2986 #ifndef COMPILE_PCRE8
2987 if (c > 255)
2988 {
2989 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2990 }
2991 else
2992 #endif
2993 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2994 }
2995 }
2996 /* Control never gets here */
2997 }
2998
2999 /* If maximizing, find the longest possible run, then work backwards. */
3000
3001 else
3002 {
3003 pp = eptr;
3004
3005 #ifdef SUPPORT_UTF
3006 if (utf)
3007 {
3008 for (i = min; i < max; i++)
3009 {
3010 int len = 1;
3011 if (eptr >= md->end_subject)
3012 {
3013 SCHECK_PARTIAL();
3014 break;
3015 }
3016 GETCHARLEN(c, eptr, len);
3017 if (c > 255)
3018 {
3019 if (op == OP_CLASS) break;
3020 }
3021 else
3022 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3023 eptr += len;
3024 }
3025
3026 if (possessive) continue; /* No backtracking */
3027
3028 for (;;)
3029 {
3030 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3031 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3032 if (eptr-- == pp) break; /* Stop if tried at original pos */
3033 BACKCHAR(eptr);
3034 }
3035 }
3036 else
3037 #endif
3038 /* Not UTF mode */
3039 {
3040 for (i = min; i < max; i++)
3041 {
3042 if (eptr >= md->end_subject)
3043 {
3044 SCHECK_PARTIAL();
3045 break;
3046 }
3047 c = *eptr;
3048 #ifndef COMPILE_PCRE8
3049 if (c > 255)
3050 {
3051 if (op == OP_CLASS) break;
3052 }
3053 else
3054 #endif
3055 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3056 eptr++;
3057 }
3058
3059 if (possessive) continue; /* No backtracking */
3060
3061 while (eptr >= pp)
3062 {
3063 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3064 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3065 eptr--;
3066 }
3067 }
3068
3069 RRETURN(MATCH_NOMATCH);
3070 }
3071 #undef BYTE_MAP
3072 }
3073 /* Control never gets here */
3074
3075
3076 /* Match an extended character class. In the 8-bit library, this opcode is
3077 encountered only when UTF-8 mode is supported. In the 16-bit and
3078 32-bit libraries, codepoints greater than 255 may be encountered even when
3079 UTF is not supported. */
3080
3081 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3082 case OP_XCLASS:
3083 {
3084 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3085 ecode += GET(ecode, 1); /* Advance past the item */
3086
3087 switch (*ecode)
3088 {
3089 case OP_CRSTAR:
3090 case OP_CRMINSTAR:
3091 case OP_CRPLUS:
3092 case OP_CRMINPLUS:
3093 case OP_CRQUERY:
3094 case OP_CRMINQUERY:
3095 case OP_CRPOSSTAR:
3096 case OP_CRPOSPLUS:
3097 case OP_CRPOSQUERY:
3098 c = *ecode++ - OP_CRSTAR;
3099 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3100 else possessive = TRUE;
3101 min = rep_min[c]; /* Pick up values from tables; */
3102 max = rep_max[c]; /* zero for max => infinity */
3103 if (max == 0) max = INT_MAX;
3104 break;
3105
3106 case OP_CRRANGE:
3107 case OP_CRMINRANGE:
3108 case OP_CRPOSRANGE:
3109 minimize = (*ecode == OP_CRMINRANGE);
3110 possessive = (*ecode == OP_CRPOSRANGE);
3111 min = GET2(ecode, 1);
3112 max = GET2(ecode, 1 + IMM2_SIZE);
3113 if (max == 0) max = INT_MAX;
3114 ecode += 1 + 2 * IMM2_SIZE;
3115 break;
3116
3117 default: /* No repeat follows */
3118 min = max = 1;
3119 break;
3120 }
3121
3122 /* First, ensure the minimum number of matches are present. */
3123
3124 for (i = 1; i <= min; i++)
3125 {
3126 if (eptr >= md->end_subject)
3127 {
3128 SCHECK_PARTIAL();
3129 RRETURN(MATCH_NOMATCH);
3130 }
3131 GETCHARINCTEST(c, eptr);
3132 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3133 }
3134
3135 /* If max == min we can continue with the main loop without the
3136 need to recurse. */
3137
3138 if (min == max) continue;
3139
3140 /* If minimizing, keep testing the rest of the expression and advancing
3141 the pointer while it matches the class. */
3142
3143 if (minimize)
3144 {
3145 for (fi = min;; fi++)
3146 {
3147 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3148 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3149 if (fi >= max) RRETURN(MATCH_NOMATCH);
3150 if (eptr >= md->end_subject)
3151 {
3152 SCHECK_PARTIAL();
3153 RRETURN(MATCH_NOMATCH);
3154 }
3155 GETCHARINCTEST(c, eptr);
3156 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3157 }
3158 /* Control never gets here */
3159 }
3160
3161 /* If maximizing, find the longest possible run, then work backwards. */
3162
3163 else
3164 {
3165 pp = eptr;
3166 for (i = min; i < max; i++)
3167 {
3168 int len = 1;
3169 if (eptr >= md->end_subject)
3170 {
3171 SCHECK_PARTIAL();
3172 break;
3173 }
3174 #ifdef SUPPORT_UTF
3175 GETCHARLENTEST(c, eptr, len);
3176 #else
3177 c = *eptr;
3178 #endif
3179 if (!PRIV(xclass)(c, data, utf)) break;
3180 eptr += len;
3181 }
3182
3183 if (possessive) continue; /* No backtracking */
3184
3185 for(;;)
3186 {
3187 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3188 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3189 if (eptr-- == pp) break; /* Stop if tried at original pos */
3190 #ifdef SUPPORT_UTF
3191 if (utf) BACKCHAR(eptr);
3192 #endif
3193 }
3194 RRETURN(MATCH_NOMATCH);
3195 }
3196
3197 /* Control never gets here */
3198 }
3199 #endif /* End of XCLASS */
3200
3201 /* Match a single character, casefully */
3202
3203 case OP_CHAR:
3204 #ifdef SUPPORT_UTF
3205 if (utf)
3206 {
3207 length = 1;
3208 ecode++;
3209 GETCHARLEN(fc, ecode, length);
3210 if (length > md->end_subject - eptr)
3211 {
3212 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3213 RRETURN(MATCH_NOMATCH);
3214 }
3215 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3216 }
3217 else
3218 #endif
3219 /* Not UTF mode */
3220 {
3221 if (md->end_subject - eptr < 1)
3222 {
3223 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3224 RRETURN(MATCH_NOMATCH);
3225 }
3226 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3227 ecode += 2;
3228 }
3229 break;
3230
3231 /* Match a single character, caselessly. If we are at the end of the
3232 subject, give up immediately. */
3233
3234 case OP_CHARI:
3235 if (eptr >= md->end_subject)
3236 {
3237 SCHECK_PARTIAL();
3238 RRETURN(MATCH_NOMATCH);
3239 }
3240
3241 #ifdef SUPPORT_UTF
3242 if (utf)
3243 {
3244 length = 1;
3245 ecode++;
3246 GETCHARLEN(fc, ecode, length);
3247
3248 /* If the pattern character's value is < 128, we have only one byte, and
3249 we know that its other case must also be one byte long, so we can use the
3250 fast lookup table. We know that there is at least one byte left in the
3251 subject. */
3252
3253 if (fc < 128)
3254 {
3255 pcre_uint32 cc = RAWUCHAR(eptr);
3256 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3257 ecode++;
3258 eptr++;
3259 }
3260
3261 /* Otherwise we must pick up the subject character. Note that we cannot
3262 use the value of "length" to check for sufficient bytes left, because the
3263 other case of the character may have more or fewer bytes. */
3264
3265 else
3266 {
3267 pcre_uint32 dc;
3268 GETCHARINC(dc, eptr);
3269 ecode += length;
3270
3271 /* If we have Unicode property support, we can use it to test the other
3272 case of the character, if there is one. */
3273
3274 if (fc != dc)
3275 {
3276 #ifdef SUPPORT_UCP
3277 if (dc != UCD_OTHERCASE(fc))
3278 #endif
3279 RRETURN(MATCH_NOMATCH);
3280 }
3281 }
3282 }
3283 else
3284 #endif /* SUPPORT_UTF */
3285
3286 /* Not UTF mode */
3287 {
3288 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3289 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3290 eptr++;
3291 ecode += 2;
3292 }
3293 break;
3294
3295 /* Match a single character repeatedly. */
3296
3297 case OP_EXACT:
3298 case OP_EXACTI:
3299 min = max = GET2(ecode, 1);
3300 ecode += 1 + IMM2_SIZE;
3301 goto REPEATCHAR;
3302
3303 case OP_POSUPTO:
3304 case OP_POSUPTOI:
3305 possessive = TRUE;
3306 /* Fall through */
3307
3308 case OP_UPTO:
3309 case OP_UPTOI:
3310 case OP_MINUPTO:
3311 case OP_MINUPTOI:
3312 min = 0;
3313 max = GET2(ecode, 1);
3314 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3315 ecode += 1 + IMM2_SIZE;
3316 goto REPEATCHAR;
3317
3318 case OP_POSSTAR:
3319 case OP_POSSTARI:
3320 possessive = TRUE;
3321 min = 0;
3322 max = INT_MAX;
3323 ecode++;
3324 goto REPEATCHAR;
3325
3326 case OP_POSPLUS:
3327 case OP_POSPLUSI:
3328 possessive = TRUE;
3329 min = 1;
3330 max = INT_MAX;
3331 ecode++;
3332 goto REPEATCHAR;
3333
3334 case OP_POSQUERY:
3335 case OP_POSQUERYI:
3336 possessive = TRUE;
3337 min = 0;
3338 max = 1;
3339 ecode++;
3340 goto REPEATCHAR;
3341
3342 case OP_STAR:
3343 case OP_STARI:
3344 case OP_MINSTAR:
3345 case OP_MINSTARI:
3346 case OP_PLUS:
3347 case OP_PLUSI:
3348 case OP_MINPLUS:
3349 case OP_MINPLUSI:
3350 case OP_QUERY:
3351 case OP_QUERYI:
3352 case OP_MINQUERY:
3353 case OP_MINQUERYI:
3354 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3355 minimize = (c & 1) != 0;
3356 min = rep_min[c]; /* Pick up values from tables; */
3357 max = rep_max[c]; /* zero for max => infinity */
3358 if (max == 0) max = INT_MAX;
3359
3360 /* Common code for all repeated single-character matches. We first check
3361 for the minimum number of characters. If the minimum equals the maximum, we
3362 are done. Otherwise, if minimizing, check the rest of the pattern for a
3363 match; if there isn't one, advance up to the maximum, one character at a
3364 time.
3365
3366 If maximizing, advance up to the maximum number of matching characters,
3367 until eptr is past the end of the maximum run. If possessive, we are
3368 then done (no backing up). Otherwise, match at this position; anything
3369 other than no match is immediately returned. For nomatch, back up one
3370 character, unless we are matching \R and the last thing matched was
3371 \r\n, in which case, back up two bytes. When we reach the first optional
3372 character position, we can save stack by doing a tail recurse.
3373
3374 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3375 for speed. */
3376
3377 REPEATCHAR:
3378 #ifdef SUPPORT_UTF
3379 if (utf)
3380 {
3381 length = 1;
3382 charptr = ecode;
3383 GETCHARLEN(fc, ecode, length);
3384 ecode += length;
3385
3386 /* Handle multibyte character matching specially here. There is
3387 support for caseless matching if UCP support is present. */
3388
3389 if (length > 1)
3390 {
3391 #ifdef SUPPORT_UCP
3392 pcre_uint32 othercase;
3393 if (op >= OP_STARI && /* Caseless */
3394 (othercase = UCD_OTHERCASE(fc)) != fc)
3395 oclength = PRIV(ord2utf)(othercase, occhars);
3396 else oclength = 0;
3397 #endif /* SUPPORT_UCP */
3398
3399 for (i = 1; i <= min; i++)
3400 {
3401 if (eptr <= md->end_subject - length &&
3402 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3403 #ifdef SUPPORT_UCP
3404 else if (oclength > 0 &&
3405 eptr <= md->end_subject - oclength &&
3406 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3407 #endif /* SUPPORT_UCP */
3408 else
3409 {
3410 CHECK_PARTIAL();
3411 RRETURN(MATCH_NOMATCH);
3412 }
3413 }
3414
3415 if (min == max) continue;
3416
3417 if (minimize)
3418 {
3419 for (fi = min;; fi++)
3420 {
3421 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3423 if (fi >= max) RRETURN(MATCH_NOMATCH);
3424 if (eptr <= md->end_subject - length &&
3425 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3426 #ifdef SUPPORT_UCP
3427 else if (oclength > 0 &&
3428 eptr <= md->end_subject - oclength &&
3429 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3430 #endif /* SUPPORT_UCP */
3431 else
3432 {
3433 CHECK_PARTIAL();
3434 RRETURN(MATCH_NOMATCH);
3435 }
3436 }
3437 /* Control never gets here */
3438 }
3439
3440 else /* Maximize */
3441 {
3442 pp = eptr;
3443 for (i = min; i < max; i++)
3444 {
3445 if (eptr <= md->end_subject - length &&
3446 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3447 #ifdef SUPPORT_UCP
3448 else if (oclength > 0 &&
3449 eptr <= md->end_subject - oclength &&
3450 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3451 #endif /* SUPPORT_UCP */
3452 else
3453 {
3454 CHECK_PARTIAL();
3455 break;
3456 }
3457 }
3458
3459 if (possessive) continue; /* No backtracking */
3460 for(;;)
3461 {
3462 if (eptr == pp) goto TAIL_RECURSE;
3463 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3464 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3465 #ifdef SUPPORT_UCP
3466 eptr--;
3467 BACKCHAR(eptr);
3468 #else /* without SUPPORT_UCP */
3469 eptr -= length;
3470 #endif /* SUPPORT_UCP */
3471 }
3472 }
3473 /* Control never gets here */
3474 }
3475
3476 /* If the length of a UTF-8 character is 1, we fall through here, and
3477 obey the code as for non-UTF-8 characters below, though in this case the
3478 value of fc will always be < 128. */
3479 }
3480 else
3481 #endif /* SUPPORT_UTF */
3482 /* When not in UTF-8 mode, load a single-byte character. */
3483 fc = *ecode++;
3484
3485 /* The value of fc at this point is always one character, though we may
3486 or may not be in UTF mode. The code is duplicated for the caseless and
3487 caseful cases, for speed, since matching characters is likely to be quite
3488 common. First, ensure the minimum number of matches are present. If min =
3489 max, continue at the same level without recursing. Otherwise, if
3490 minimizing, keep trying the rest of the expression and advancing one
3491 matching character if failing, up to the maximum. Alternatively, if
3492 maximizing, find the maximum number of characters and work backwards. */
3493
3494 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3495 max, (char *)eptr));
3496
3497 if (op >= OP_STARI) /* Caseless */
3498 {
3499 #ifdef COMPILE_PCRE8
3500 /* fc must be < 128 if UTF is enabled. */
3501 foc = md->fcc[fc];
3502 #else
3503 #ifdef SUPPORT_UTF
3504 #ifdef SUPPORT_UCP
3505 if (utf && fc > 127)
3506 foc = UCD_OTHERCASE(fc);
3507 #else
3508 if (utf && fc > 127)
3509 foc = fc;
3510 #endif /* SUPPORT_UCP */
3511 else
3512 #endif /* SUPPORT_UTF */
3513 foc = TABLE_GET(fc, md->fcc, fc);
3514 #endif /* COMPILE_PCRE8 */
3515
3516 for (i = 1; i <= min; i++)
3517 {
3518 pcre_uint32 cc; /* Faster than pcre_uchar */
3519 if (eptr >= md->end_subject)
3520 {
3521 SCHECK_PARTIAL();
3522 RRETURN(MATCH_NOMATCH);
3523 }
3524 cc = RAWUCHARTEST(eptr);
3525 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3526 eptr++;
3527 }
3528 if (min == max) continue;
3529 if (minimize)
3530 {
3531 for (fi = min;; fi++)
3532 {
3533 pcre_uint32 cc; /* Faster than pcre_uchar */
3534 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3535 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3536 if (fi >= max) RRETURN(MATCH_NOMATCH);
3537 if (eptr >= md->end_subject)
3538 {
3539 SCHECK_PARTIAL();
3540 RRETURN(MATCH_NOMATCH);
3541 }
3542 cc = RAWUCHARTEST(eptr);
3543 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3544 eptr++;
3545 }
3546 /* Control never gets here */
3547 }
3548 else /* Maximize */
3549 {
3550 pp = eptr;
3551 for (i = min; i < max; i++)
3552 {
3553 pcre_uint32 cc; /* Faster than pcre_uchar */
3554 if (eptr >= md->end_subject)
3555 {
3556 SCHECK_PARTIAL();
3557 break;
3558 }
3559 cc = RAWUCHARTEST(eptr);
3560 if (fc != cc && foc != cc) break;
3561 eptr++;
3562 }
3563 if (possessive) continue; /* No backtracking */
3564 for (;;)
3565 {
3566 if (eptr == pp) goto TAIL_RECURSE;
3567 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3568 eptr--;
3569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3570 }
3571 /* Control never gets here */
3572 }
3573 }
3574
3575 /* Caseful comparisons (includes all multi-byte characters) */
3576
3577 else
3578 {
3579 for (i = 1; i <= min; i++)
3580 {
3581 if (eptr >= md->end_subject)
3582 {
3583 SCHECK_PARTIAL();
3584 RRETURN(MATCH_NOMATCH);
3585 }
3586 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3587 }
3588
3589 if (min == max) continue;
3590
3591 if (minimize)
3592 {
3593 for (fi = min;; fi++)
3594 {
3595 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3596 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3597 if (fi >= max) RRETURN(MATCH_NOMATCH);
3598 if (eptr >= md->end_subject)
3599 {
3600 SCHECK_PARTIAL();
3601 RRETURN(MATCH_NOMATCH);
3602 }
3603 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3604 }
3605 /* Control never gets here */
3606 }
3607 else /* Maximize */
3608 {
3609 pp = eptr;
3610 for (i = min; i < max; i++)
3611 {
3612 if (eptr >= md->end_subject)
3613 {
3614 SCHECK_PARTIAL();
3615 break;
3616 }
3617 if (fc != RAWUCHARTEST(eptr)) break;
3618 eptr++;
3619 }
3620 if (possessive) continue; /* No backtracking */
3621 for (;;)
3622 {
3623 if (eptr == pp) goto TAIL_RECURSE;
3624 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3625 eptr--;
3626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3627 }
3628 /* Control never gets here */
3629 }
3630 }
3631 /* Control never gets here */
3632
3633 /* Match a negated single character. In UTF mode both the pattern
3634 character and the subject character we are checking can be multibyte. */
3635
3636 case OP_NOT:
3637 case OP_NOTI:
3638 if (eptr >= md->end_subject)
3639 {
3640 SCHECK_PARTIAL();
3641 RRETURN(MATCH_NOMATCH);
3642 }
3643 #ifdef SUPPORT_UTF
3644 if (utf)
3645 {
3646 register pcre_uint32 ch, och;
3647
3648 ecode++;
3649 GETCHARINC(ch, ecode);
3650 GETCHARINC(c, eptr);
3651
3652 if (op == OP_NOT)
3653 {
3654 if (ch == c) RRETURN(MATCH_NOMATCH);
3655 }
3656 else
3657 {
3658 #ifdef SUPPORT_UCP
3659 if (ch > 127)
3660 och = UCD_OTHERCASE(ch);
3661 #else
3662 if (ch > 127)
3663 och = ch;
3664 #endif /* SUPPORT_UCP */
3665 else
3666 och = TABLE_GET(ch, md->fcc, ch);
3667 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3668 }
3669 }
3670 else
3671 #endif
3672 {
3673 register pcre_uint32 ch = ecode[1];
3674 c = *eptr++;
3675 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3676 RRETURN(MATCH_NOMATCH);
3677 ecode += 2;
3678 }
3679 break;
3680
3681 /* Match a negated single one-byte character repeatedly. This is almost a
3682 repeat of the code for a repeated single character, but I haven't found a
3683 nice way of commoning these up that doesn't require a test of the
3684 positive/negative option for each character match. Maybe that wouldn't add
3685 very much to the time taken, but character matching *is* what this is all
3686 about... */
3687
3688 case OP_NOTEXACT:
3689 case OP_NOTEXACTI:
3690 min = max = GET2(ecode, 1);
3691 ecode += 1 + IMM2_SIZE;
3692 goto REPEATNOTCHAR;
3693
3694 case OP_NOTUPTO:
3695 case OP_NOTUPTOI:
3696 case OP_NOTMINUPTO:
3697 case OP_NOTMINUPTOI:
3698 min = 0;
3699 max = GET2(ecode, 1);
3700 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3701 ecode += 1 + IMM2_SIZE;
3702 goto REPEATNOTCHAR;
3703
3704 case OP_NOTPOSSTAR:
3705 case OP_NOTPOSSTARI:
3706 possessive = TRUE;
3707 min = 0;
3708 max = INT_MAX;
3709 ecode++;
3710 goto REPEATNOTCHAR;
3711
3712 case OP_NOTPOSPLUS:
3713 case OP_NOTPOSPLUSI:
3714 possessive = TRUE;
3715 min = 1;
3716 max = INT_MAX;
3717 ecode++;
3718 goto REPEATNOTCHAR;
3719
3720 case OP_NOTPOSQUERY:
3721 case OP_NOTPOSQUERYI:
3722 possessive = TRUE;
3723 min = 0;
3724 max = 1;
3725 ecode++;
3726 goto REPEATNOTCHAR;
3727
3728 case OP_NOTPOSUPTO:
3729 case OP_NOTPOSUPTOI:
3730 possessive = TRUE;
3731 min = 0;
3732 max = GET2(ecode, 1);
3733 ecode += 1 + IMM2_SIZE;
3734 goto REPEATNOTCHAR;
3735
3736 case OP_NOTSTAR:
3737 case OP_NOTSTARI:
3738 case OP_NOTMINSTAR:
3739 case OP_NOTMINSTARI:
3740 case OP_NOTPLUS:
3741 case OP_NOTPLUSI:
3742 case OP_NOTMINPLUS:
3743 case OP_NOTMINPLUSI:
3744 case OP_NOTQUERY:
3745 case OP_NOTQUERYI:
3746 case OP_NOTMINQUERY:
3747 case OP_NOTMINQUERYI:
3748 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3749 minimize = (c & 1) != 0;
3750 min = rep_min[c]; /* Pick up values from tables; */
3751 max = rep_max[c]; /* zero for max => infinity */
3752 if (max == 0) max = INT_MAX;
3753
3754 /* Common code for all repeated negated single-character matches. */
3755
3756 REPEATNOTCHAR:
3757 GETCHARINCTEST(fc, ecode);
3758
3759 /* The code is duplicated for the caseless and caseful cases, for speed,
3760 since matching characters is likely to be quite common. First, ensure the
3761 minimum number of matches are present. If min = max, continue at the same
3762 level without recursing. Otherwise, if minimizing, keep trying the rest of
3763 the expression and advancing one matching character if failing, up to the
3764 maximum. Alternatively, if maximizing, find the maximum number of
3765 characters and work backwards. */
3766
3767 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3768 max, (char *)eptr));
3769
3770 if (op >= OP_NOTSTARI) /* Caseless */
3771 {
3772 #ifdef SUPPORT_UTF
3773 #ifdef SUPPORT_UCP
3774 if (utf && fc > 127)
3775 foc = UCD_OTHERCASE(fc);
3776 #else
3777 if (utf && fc > 127)
3778 foc = fc;
3779 #endif /* SUPPORT_UCP */
3780 else
3781 #endif /* SUPPORT_UTF */
3782 foc = TABLE_GET(fc, md->fcc, fc);
3783
3784 #ifdef SUPPORT_UTF
3785 if (utf)
3786 {
3787 register pcre_uint32 d;
3788 for (i = 1; i <= min; i++)
3789 {
3790 if (eptr >= md->end_subject)
3791 {
3792 SCHECK_PARTIAL();
3793 RRETURN(MATCH_NOMATCH);
3794 }
3795 GETCHARINC(d, eptr);
3796 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3797 }
3798 }
3799 else
3800 #endif /* SUPPORT_UTF */
3801 /* Not UTF mode */
3802 {
3803 for (i = 1; i <= min; i++)
3804 {
3805 if (eptr >= md->end_subject)
3806 {
3807 SCHECK_PARTIAL();
3808 RRETURN(MATCH_NOMATCH);
3809 }
3810 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3811 eptr++;
3812 }
3813 }
3814
3815 if (min == max) continue;
3816
3817 if (minimize)
3818 {
3819 #ifdef SUPPORT_UTF
3820 if (utf)
3821 {
3822 register pcre_uint32 d;
3823 for (fi = min;; fi++)
3824 {
3825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3827 if (fi >= max) RRETURN(MATCH_NOMATCH);
3828 if (eptr >= md->end_subject)
3829 {
3830 SCHECK_PARTIAL();
3831 RRETURN(MATCH_NOMATCH);
3832 }
3833 GETCHARINC(d, eptr);
3834 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3835 }
3836 }
3837 else
3838 #endif /*SUPPORT_UTF */
3839 /* Not UTF mode */
3840 {
3841 for (fi = min;; fi++)
3842 {
3843 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3844 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3845 if (fi >= max) RRETURN(MATCH_NOMATCH);
3846 if (eptr >= md->end_subject)
3847 {
3848 SCHECK_PARTIAL();
3849 RRETURN(MATCH_NOMATCH);
3850 }
3851 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3852 eptr++;
3853 }
3854 }
3855 /* Control never gets here */
3856 }
3857
3858 /* Maximize case */
3859
3860 else
3861 {
3862 pp = eptr;
3863
3864 #ifdef SUPPORT_UTF
3865 if (utf)
3866 {
3867 register pcre_uint32 d;
3868 for (i = min; i < max; i++)
3869 {
3870 int len = 1;
3871 if (eptr >= md->end_subject)
3872 {
3873 SCHECK_PARTIAL();
3874 break;
3875 }
3876 GETCHARLEN(d, eptr, len);
3877 if (fc == d || (unsigned int)foc == d) break;
3878 eptr += len;
3879 }
3880 if (possessive) continue; /* No backtracking */
3881 for(;;)
3882 {
3883 if (eptr == pp) goto TAIL_RECURSE;
3884 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3886 eptr--;
3887 BACKCHAR(eptr);
3888 }
3889 }
3890 else
3891 #endif /* SUPPORT_UTF */
3892 /* Not UTF mode */
3893 {
3894 for (i = min; i < max; i++)
3895 {
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 break;
3900 }
3901 if (fc == *eptr || foc == *eptr) break;
3902 eptr++;
3903 }
3904 if (possessive) continue; /* No backtracking */
3905 for (;;)
3906 {
3907 if (eptr == pp) goto TAIL_RECURSE;
3908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3910 eptr--;
3911 }
3912 }
3913 /* Control never gets here */
3914 }
3915 }
3916
3917 /* Caseful comparisons */
3918
3919 else
3920 {
3921 #ifdef SUPPORT_UTF
3922 if (utf)
3923 {
3924 register pcre_uint32 d;
3925 for (i = 1; i <= min; i++)
3926 {
3927 if (eptr >= md->end_subject)
3928 {
3929 SCHECK_PARTIAL();
3930 RRETURN(MATCH_NOMATCH);
3931 }
3932 GETCHARINC(d, eptr);
3933 if (fc == d) RRETURN(MATCH_NOMATCH);
3934 }
3935 }
3936 else
3937 #endif
3938 /* Not UTF mode */
3939 {
3940 for (i = 1; i <= min; i++)
3941 {
3942 if (eptr >= md->end_subject)
3943 {
3944 SCHECK_PARTIAL();
3945 RRETURN(MATCH_NOMATCH);
3946 }
3947 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3948 }
3949 }
3950
3951 if (min == max) continue;
3952
3953 if (minimize)
3954 {
3955 #ifdef SUPPORT_UTF
3956 if (utf)
3957 {
3958 register pcre_uint32 d;
3959 for (fi = min;; fi++)
3960 {
3961 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3962 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3963 if (fi >= max) RRETURN(MATCH_NOMATCH);
3964 if (eptr >= md->end_subject)
3965 {
3966 SCHECK_PARTIAL();
3967 RRETURN(MATCH_NOMATCH);
3968 }
3969 GETCHARINC(d, eptr);
3970 if (fc == d) RRETURN(MATCH_NOMATCH);
3971 }
3972 }
3973 else
3974 #endif
3975 /* Not UTF mode */
3976 {
3977 for (fi = min;; fi++)
3978 {
3979 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3981 if (fi >= max) RRETURN(MATCH_NOMATCH);
3982 if (eptr >= md->end_subject)
3983 {
3984 SCHECK_PARTIAL();
3985 RRETURN(MATCH_NOMATCH);
3986 }
3987 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3988 }
3989 }
3990 /* Control never gets here */
3991 }
3992
3993 /* Maximize case */
3994
3995 else
3996 {
3997 pp = eptr;
3998
3999 #ifdef SUPPORT_UTF
4000 if (utf)
4001 {
4002 register pcre_uint32 d;
4003 for (i = min; i < max; i++)
4004 {
4005 int len = 1;
4006 if (eptr >= md->end_subject)
4007 {
4008 SCHECK_PARTIAL();
4009 break;
4010 }
4011 GETCHARLEN(d, eptr, len);
4012 if (fc == d) break;
4013 eptr += len;
4014 }
4015 if (possessive) continue; /* No backtracking */
4016 for(;;)
4017 {
4018 if (eptr == pp) goto TAIL_RECURSE;
4019 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4020 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4021 eptr--;
4022 BACKCHAR(eptr);
4023 }
4024 }
4025 else
4026 #endif
4027 /* Not UTF mode */
4028 {
4029 for (i = min; i < max; i++)
4030 {
4031 if (eptr >= md->end_subject)
4032 {
4033 SCHECK_PARTIAL();
4034 break;
4035 }
4036 if (fc == *eptr) break;
4037 eptr++;
4038 }
4039 if (possessive) continue; /* No backtracking */
4040 for (;;)
4041 {
4042 if (eptr == pp) goto TAIL_RECURSE;
4043 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4045 eptr--;
4046 }
4047 }
4048 /* Control never gets here */
4049 }
4050 }
4051 /* Control never gets here */
4052
4053 /* Match a single character type repeatedly; several different opcodes
4054 share code. This is very similar to the code for single characters, but we
4055 repeat it in the interests of efficiency. */
4056
4057 case OP_TYPEEXACT:
4058 min = max = GET2(ecode, 1);
4059 minimize = TRUE;
4060 ecode += 1 + IMM2_SIZE;
4061 goto REPEATTYPE;
4062
4063 case OP_TYPEUPTO:
4064 case OP_TYPEMINUPTO:
4065 min = 0;
4066 max = GET2(ecode, 1);
4067 minimize = *ecode == OP_TYPEMINUPTO;
4068 ecode += 1 + IMM2_SIZE;
4069 goto REPEATTYPE;
4070
4071 case OP_TYPEPOSSTAR:
4072 possessive = TRUE;
4073 min = 0;
4074 max = INT_MAX;
4075 ecode++;
4076 goto REPEATTYPE;
4077
4078 case OP_TYPEPOSPLUS:
4079 possessive = TRUE;
4080 min = 1;
4081 max = INT_MAX;
4082 ecode++;
4083 goto REPEATTYPE;
4084
4085 case OP_TYPEPOSQUERY:
4086 possessive = TRUE;
4087 min = 0;
4088 max = 1;
4089 ecode++;
4090 goto REPEATTYPE;
4091
4092 case OP_TYPEPOSUPTO:
4093 possessive = TRUE;
4094 min = 0;
4095 max = GET2(ecode, 1);
4096 ecode += 1 + IMM2_SIZE;
4097 goto REPEATTYPE;
4098
4099 case OP_TYPESTAR:
4100 case OP_TYPEMINSTAR:
4101 case OP_TYPEPLUS:
4102 case OP_TYPEMINPLUS:
4103 case OP_TYPEQUERY:
4104 case OP_TYPEMINQUERY:
4105 c = *ecode++ - OP_TYPESTAR;
4106 minimize = (c & 1) != 0;
4107 min = rep_min[c]; /* Pick up values from tables; */
4108 max = rep_max[c]; /* zero for max => infinity */
4109 if (max == 0) max = INT_MAX;
4110
4111 /* Common code for all repeated single character type matches. Note that
4112 in UTF-8 mode, '.' matches a character of any length, but for the other
4113 character types, the valid characters are all one-byte long. */
4114
4115 REPEATTYPE:
4116 ctype = *ecode++; /* Code for the character type */
4117
4118 #ifdef SUPPORT_UCP
4119 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4120 {
4121 prop_fail_result = ctype == OP_NOTPROP;
4122 prop_type = *ecode++;
4123 prop_value = *ecode++;
4124 }
4125 else prop_type = -1;
4126 #endif
4127
4128 /* First, ensure the minimum number of matches are present. Use inline
4129 code for maximizing the speed, and do the type test once at the start
4130 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4131 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4132 and single-bytes. */
4133
4134 if (min > 0)
4135 {
4136 #ifdef SUPPORT_UCP
4137 if (prop_type >= 0)
4138 {
4139 switch(prop_type)
4140 {
4141 case PT_ANY:
4142 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4143 for (i = 1; i <= min; i++)
4144 {
4145 if (eptr >= md->end_subject)
4146 {
4147 SCHECK_PARTIAL();
4148 RRETURN(MATCH_NOMATCH);
4149 }
4150 GETCHARINCTEST(c, eptr);
4151 }
4152 break;
4153
4154 case PT_LAMP:
4155 for (i = 1; i <= min; i++)
4156 {
4157 int chartype;
4158 if (eptr >= md->end_subject)
4159 {
4160 SCHECK_PARTIAL();
4161 RRETURN(MATCH_NOMATCH);
4162 }
4163 GETCHARINCTEST(c, eptr);
4164 chartype = UCD_CHARTYPE(c);
4165 if ((chartype == ucp_Lu ||
4166 chartype == ucp_Ll ||
4167 chartype == ucp_Lt) == prop_fail_result)
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case PT_GC:
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 GETCHARINCTEST(c, eptr);
4181 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4182 RRETURN(MATCH_NOMATCH);
4183 }
4184 break;
4185
4186 case PT_PC:
4187 for (i = 1; i <= min; i++)
4188 {
4189 if (eptr >= md->end_subject)
4190 {
4191 SCHECK_PARTIAL();
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 GETCHARINCTEST(c, eptr);
4195 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4196 RRETURN(MATCH_NOMATCH);
4197 }
4198 break;
4199
4200 case PT_SC:
4201 for (i = 1; i <= min; i++)
4202 {
4203 if (eptr >= md->end_subject)
4204 {
4205 SCHECK_PARTIAL();
4206 RRETURN(MATCH_NOMATCH);
4207 }
4208 GETCHARINCTEST(c, eptr);
4209 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4210 RRETURN(MATCH_NOMATCH);
4211 }
4212 break;
4213
4214 case PT_ALNUM:
4215 for (i = 1; i <= min; i++)
4216 {
4217 int category;
4218 if (eptr >= md->end_subject)
4219 {
4220 SCHECK_PARTIAL();
4221 RRETURN(MATCH_NOMATCH);
4222 }
4223 GETCHARINCTEST(c, eptr);
4224 category = UCD_CATEGORY(c);
4225 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4226 RRETURN(MATCH_NOMATCH);
4227 }
4228 break;
4229
4230 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4231 which means that Perl space and POSIX space are now identical. PCRE
4232 was changed at release 8.34. */
4233
4234 case PT_SPACE: /* Perl space */
4235 case PT_PXSPACE: /* POSIX space */
4236 for (i = 1; i <= min; i++)
4237 {
4238 if (eptr >= md->end_subject)
4239 {
4240 SCHECK_PARTIAL();
4241 RRETURN(MATCH_NOMATCH);
4242 }
4243 GETCHARINCTEST(c, eptr);
4244 switch(c)
4245 {
4246 HSPACE_CASES:
4247 VSPACE_CASES:
4248 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4249 break;
4250
4251 default:
4252 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4253 RRETURN(MATCH_NOMATCH);
4254 break;
4255 }
4256 }
4257 break;
4258
4259 case PT_WORD:
4260 for (i = 1; i <= min; i++)
4261 {
4262 int category;
4263 if (eptr >= md->end_subject)
4264 {
4265 SCHECK_PARTIAL();
4266 RRETURN(MATCH_NOMATCH);
4267 }
4268 GETCHARINCTEST(c, eptr);
4269 category = UCD_CATEGORY(c);
4270 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4271 == prop_fail_result)
4272 RRETURN(MATCH_NOMATCH);
4273 }
4274 break;
4275
4276 case PT_CLIST:
4277 for (i = 1; i <= min; i++)
4278 {
4279 const pcre_uint32 *cp;
4280 if (eptr >= md->end_subject)
4281 {
4282 SCHECK_PARTIAL();
4283 RRETURN(MATCH_NOMATCH);
4284 }
4285 GETCHARINCTEST(c, eptr);
4286 cp = PRIV(ucd_caseless_sets) + prop_value;
4287 for (;;)
4288 {
4289 if (c < *cp)
4290 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4291 if (c == *cp++)
4292 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4293 }
4294 }
4295 break;
4296
4297 case PT_UCNC:
4298 for (i = 1; i <= min; i++)
4299 {
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 RRETURN(MATCH_NOMATCH);
4304 }
4305 GETCHARINCTEST(c, eptr);
4306 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4307 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4308 c >= 0xe000) == prop_fail_result)
4309 RRETURN(MATCH_NOMATCH);
4310 }
4311 break;
4312
4313 /* This should not occur */
4314
4315 default:
4316 RRETURN(PCRE_ERROR_INTERNAL);
4317 }
4318 }
4319
4320 /* Match extended Unicode sequences. We will get here only if the
4321 support is in the binary; otherwise a compile-time error occurs. */
4322
4323 else if (ctype == OP_EXTUNI)
4324 {
4325 for (i = 1; i <= min; i++)
4326 {
4327 if (eptr >= md->end_subject)
4328 {
4329 SCHECK_PARTIAL();
4330 RRETURN(MATCH_NOMATCH);
4331 }
4332 else
4333 {
4334 int lgb, rgb;
4335 GETCHARINCTEST(c, eptr);
4336 lgb = UCD_GRAPHBREAK(c);
4337 while (eptr < md->end_subject)
4338 {
4339 int len = 1;
4340 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4341 rgb = UCD_GRAPHBREAK(c);
4342 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4343 lgb = rgb;
4344 eptr += len;
4345 }
4346 }
4347 CHECK_PARTIAL();
4348 }
4349 }
4350
4351 else
4352 #endif /* SUPPORT_UCP */
4353
4354 /* Handle all other cases when the coding is UTF-8 */
4355
4356 #ifdef SUPPORT_UTF
4357 if (utf) switch(ctype)
4358 {
4359 case OP_ANY:
4360 for (i = 1; i <= min; i++)
4361 {
4362 if (eptr >= md->end_subject)
4363 {
4364 SCHECK_PARTIAL();
4365 RRETURN(MATCH_NOMATCH);
4366 }
4367 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4368 if (md->partial != 0 &&
4369 eptr + 1 >= md->end_subject &&
4370 NLBLOCK->nltype == NLTYPE_FIXED &&
4371 NLBLOCK->nllen == 2 &&
4372 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4373 {
4374 md->hitend = TRUE;
4375 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4376 }
4377 eptr++;
4378 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4379 }
4380 break;
4381
4382 case OP_ALLANY:
4383 for (i = 1; i <= min; i++)
4384 {
4385 if (eptr >= md->end_subject)
4386 {
4387 SCHECK_PARTIAL();
4388 RRETURN(MATCH_NOMATCH);
4389 }
4390 eptr++;
4391 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4392 }
4393 break;
4394
4395 case OP_ANYBYTE:
4396 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4397 eptr += min;
4398 break;
4399
4400 case OP_ANYNL:
4401 for (i = 1; i <= min; i++)
4402 {
4403 if (eptr >= md->end_subject)
4404 {
4405 SCHECK_PARTIAL();
4406 RRETURN(MATCH_NOMATCH);
4407 }
4408 GETCHARINC(c, eptr);
4409 switch(c)
4410 {
4411 default: RRETURN(MATCH_NOMATCH);
4412
4413 case CHAR_CR:
4414 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4415 break;
4416
4417 case CHAR_LF:
4418 break;
4419
4420 case CHAR_VT:
4421 case CHAR_FF:
4422 case CHAR_NEL:
4423 #ifndef EBCDIC
4424 case 0x2028:
4425 case 0x2029:
4426 #endif /* Not EBCDIC */
4427 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4428 break;
4429 }
4430 }
4431 break;
4432
4433 case OP_NOT_HSPACE:
4434 for (i = 1; i <= min; i++)
4435 {
4436 if (eptr >= md->end_subject)
4437 {
4438 SCHECK_PARTIAL();
4439 RRETURN(MATCH_NOMATCH);
4440 }
4441 GETCHARINC(c, eptr);
4442 switch(c)
4443 {
4444 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4445 default: break;
4446 }
4447 }
4448 break;
4449
4450 case OP_HSPACE:
4451 for (i = 1; i <= min; i++)
4452 {
4453 if (eptr >= md->end_subject)
4454 {
4455 SCHECK_PARTIAL();
4456 RRETURN(MATCH_NOMATCH);
4457 }
4458 GETCHARINC(c, eptr);
4459 switch(c)
4460 {
4461 HSPACE_CASES: break; /* Byte and multibyte cases */
4462 default: RRETURN(MATCH_NOMATCH);
4463 }
4464 }
4465 break;
4466
4467 case OP_NOT_VSPACE:
4468 for (i = 1; i <= min; i++)
4469 {
4470 if (eptr >= md->end_subject)
4471 {
4472 SCHECK_PARTIAL();
4473 RRETURN(MATCH_NOMATCH);
4474 }
4475 GETCHARINC(c, eptr);
4476 switch(c)
4477 {
4478 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4479 default: break;
4480 }
4481 }
4482 break;
4483
4484 case OP_VSPACE:
4485 for (i = 1; i <= min; i++)
4486 {
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 RRETURN(MATCH_NOMATCH);
4491 }
4492 GETCHARINC(c, eptr);
4493 switch(c)
4494 {
4495 VSPACE_CASES: break;
4496 default: RRETURN(MATCH_NOMATCH);
4497 }
4498 }
4499 break;
4500
4501 case OP_NOT_DIGIT:
4502 for (i = 1; i <= min; i++)
4503 {
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 RRETURN(MATCH_NOMATCH);
4508 }
4509 GETCHARINC(c, eptr);
4510 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4511 RRETURN(MATCH_NOMATCH);
4512 }
4513 break;
4514
4515 case OP_DIGIT:
4516 for (i = 1; i <= min; i++)
4517 {
4518 pcre_uint32 cc;
4519 if (eptr >= md->end_subject)
4520 {
4521 SCHECK_PARTIAL();
4522 RRETURN(MATCH_NOMATCH);
4523 }
4524 cc = RAWUCHAR(eptr);
4525 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4526 RRETURN(MATCH_NOMATCH);
4527 eptr++;
4528 /* No need to skip more bytes - we know it's a 1-byte character */
4529 }
4530 break;
4531
4532 case OP_NOT_WHITESPACE:
4533 for (i = 1; i <= min; i++)
4534 {
4535 pcre_uint32 cc;
4536 if (eptr >= md->end_subject)
4537 {
4538 SCHECK_PARTIAL();
4539 RRETURN(MATCH_NOMATCH);
4540 }
4541 cc = RAWUCHAR(eptr);
4542 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4543 RRETURN(MATCH_NOMATCH);
4544 eptr++;
4545 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4546 }
4547 break;
4548
4549 case OP_WHITESPACE:
4550 for (i = 1; i <= min; i++)
4551 {
4552 pcre_uint32 cc;
4553 if (eptr >= md->end_subject)
4554 {
4555 SCHECK_PARTIAL();
4556 RRETURN(MATCH_NOMATCH);
4557 }
4558 cc = RAWUCHAR(eptr);
4559 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4560 RRETURN(MATCH_NOMATCH);
4561 eptr++;
4562 /* No need to skip more bytes - we know it's a 1-byte character */
4563 }
4564 break;
4565
4566 case OP_NOT_WORDCHAR:
4567 for (i = 1; i <= min; i++)
4568 {
4569 pcre_uint32 cc;
4570 if (eptr >= md->end_subject)
4571 {
4572 SCHECK_PARTIAL();
4573 RRETURN(MATCH_NOMATCH);
4574 }
4575 cc = RAWUCHAR(eptr);
4576 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4577 RRETURN(MATCH_NOMATCH);
4578 eptr++;
4579 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4580 }
4581 break;
4582
4583 case OP_WORDCHAR:
4584 for (i = 1; i <= min; i++)
4585 {
4586 pcre_uint32 cc;
4587 if (eptr >= md->end_subject)
4588 {
4589 SCHECK_PARTIAL();
4590 RRETURN(MATCH_NOMATCH);
4591 }
4592 cc = RAWUCHAR(eptr);
4593 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4594 RRETURN(MATCH_NOMATCH);
4595 eptr++;
4596 /* No need to skip more bytes - we know it's a 1-byte character */
4597 }
4598 break;
4599
4600 default:
4601 RRETURN(PCRE_ERROR_INTERNAL);
4602 } /* End switch(ctype) */
4603
4604 else
4605 #endif /* SUPPORT_UTF */
4606
4607 /* Code for the non-UTF-8 case for minimum matching of operators other
4608 than OP_PROP and OP_NOTPROP. */
4609
4610 switch(ctype)
4611 {
4612 case OP_ANY:
4613 for (i = 1; i <= min; i++)
4614 {
4615 if (eptr >= md->end_subject)
4616 {
4617 SCHECK_PARTIAL();
4618 RRETURN(MATCH_NOMATCH);
4619 }
4620 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4621 if (md->partial != 0 &&
4622 eptr + 1 >= md->end_subject &&
4623 NLBLOCK->nltype == NLTYPE_FIXED &&
4624 NLBLOCK->nllen == 2 &&
4625 *eptr == NLBLOCK->nl[0])
4626 {
4627 md->hitend = TRUE;
4628 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4629 }
4630 eptr++;
4631 }
4632 break;
4633
4634 case OP_ALLANY:
4635 if (eptr > md->end_subject - min)
4636 {
4637 SCHECK_PARTIAL();
4638 RRETURN(MATCH_NOMATCH);
4639 }
4640 eptr += min;
4641 break;
4642
4643 case OP_ANYBYTE:
4644 if (eptr > md->end_subject - min)
4645 {
4646 SCHECK_PARTIAL();
4647 RRETURN(MATCH_NOMATCH);
4648 }
4649 eptr += min;
4650 break;
4651
4652 case OP_ANYNL:
4653 for (i = 1; i <= min; i++)
4654 {
4655 if (eptr >= md->end_subject)
4656 {
4657 SCHECK_PARTIAL();
4658 RRETURN(MATCH_NOMATCH);
4659 }
4660 switch(*eptr++)
4661 {
4662 default: RRETURN(MATCH_NOMATCH);
4663
4664 case CHAR_CR:
4665 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4666 break;
4667
4668 case CHAR_LF:
4669 break;
4670
4671 case CHAR_VT:
4672 case CHAR_FF:
4673 case CHAR_NEL:
4674 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4675 case 0x2028:
4676 case 0x2029:
4677 #endif
4678 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4679 break;
4680 }
4681 }
4682 break;
4683
4684 case OP_NOT_HSPACE:
4685 for (i = 1; i <= min; i++)
4686 {
4687 if (eptr >= md->end_subject)
4688 {
4689 SCHECK_PARTIAL();
4690 RRETURN(MATCH_NOMATCH);
4691 }
4692 switch(*eptr++)
4693 {
4694 default: break;
4695 HSPACE_BYTE_CASES:
4696 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4697 HSPACE_MULTIBYTE_CASES:
4698 #endif
4699 RRETURN(MATCH_NOMATCH);
4700 }
4701 }
4702 break;
4703
4704 case OP_HSPACE:
4705 for (i = 1; i <= min; i++)
4706 {
4707 if (eptr >= md->end_subject)
4708 {
4709 SCHECK_PARTIAL();
4710 RRETURN(MATCH_NOMATCH);
4711 }
4712 switch(*eptr++)
4713 {
4714 default: RRETURN(MATCH_NOMATCH);
4715 HSPACE_BYTE_CASES:
4716 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4717 HSPACE_MULTIBYTE_CASES:
4718 #endif
4719 break;
4720 }
4721 }
4722 break;
4723
4724 case OP_NOT_VSPACE:
4725 for (i = 1; i <= min; i++)
4726 {
4727 if (eptr >= md->end_subject)
4728 {
4729 SCHECK_PARTIAL();
4730 RRETURN(MATCH_NOMATCH);
4731 }
4732 switch(*eptr++)
4733 {
4734 VSPACE_BYTE_CASES:
4735 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4736 VSPACE_MULTIBYTE_CASES:
4737 #endif
4738 RRETURN(MATCH_NOMATCH);
4739 default: break;
4740 }
4741 }
4742 break;
4743
4744 case OP_VSPACE:
4745 for (i = 1; i <= min; i++)
4746 {
4747 if (eptr >= md->end_subject)
4748 {
4749 SCHECK_PARTIAL();
4750 RRETURN(MATCH_NOMATCH);
4751 }
4752 switch(*eptr++)
4753 {
4754 default: RRETURN(MATCH_NOMATCH);
4755 VSPACE_BYTE_CASES:
4756 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4757 VSPACE_MULTIBYTE_CASES:
4758 #endif
4759 break;
4760 }
4761 }
4762 break;
4763
4764 case OP_NOT_DIGIT:
4765 for (i = 1; i <= min; i++)
4766 {
4767 if (eptr >= md->end_subject)
4768 {
4769 SCHECK_PARTIAL();
4770 RRETURN(MATCH_NOMATCH);
4771 }
4772 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4773 RRETURN(MATCH_NOMATCH);
4774 eptr++;
4775 }
4776 break;
4777
4778 case OP_DIGIT:
4779 for (i = 1; i <= min; i++)
4780 {
4781 if (eptr >= md->end_subject)
4782 {
4783 SCHECK_PARTIAL();
4784 RRETURN(MATCH_NOMATCH);
4785 }
4786 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4787 RRETURN(MATCH_NOMATCH);
4788 eptr++;
4789 }
4790 break;
4791
4792 case OP_NOT_WHITESPACE:
4793 for (i = 1; i <= min; i++)
4794 {
4795 if (eptr >= md->end_subject)
4796 {
4797 SCHECK_PARTIAL();
4798 RRETURN(MATCH_NOMATCH);
4799 }
4800 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4801 RRETURN(MATCH_NOMATCH);
4802 eptr++;
4803 }
4804 break;
4805
4806 case OP_WHITESPACE:
4807 for (i = 1; i <= min; i++)
4808 {
4809 if (eptr >= md->end_subject)
4810 {
4811 SCHECK_PARTIAL();
4812 RRETURN(MATCH_NOMATCH);
4813 }
4814 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4815 RRETURN(MATCH_NOMATCH);
4816 eptr++;
4817 }
4818 break;
4819
4820 case OP_NOT_WORDCHAR:
4821 for (i = 1; i <= min; i++)
4822 {
4823 if (eptr >= md->end_subject)
4824 {
4825 SCHECK_PARTIAL();
4826 RRETURN(MATCH_NOMATCH);
4827 }
4828 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4829 RRETURN(MATCH_NOMATCH);
4830 eptr++;
4831 }
4832 break;
4833
4834 case OP_WORDCHAR:
4835 for (i = 1; i <= min; i++)
4836 {
4837 if (eptr >= md->end_subject)
4838 {
4839 SCHECK_PARTIAL();
4840 RRETURN(MATCH_NOMATCH);
4841 }
4842 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4843 RRETURN(MATCH_NOMATCH);
4844 eptr++;
4845 }
4846 break;
4847
4848 default:
4849 RRETURN(PCRE_ERROR_INTERNAL);
4850 }
4851 }
4852
4853 /* If min = max, continue at the same level without recursing */
4854
4855 if (min == max) continue;
4856
4857 /* If minimizing, we have to test the rest of the pattern before each
4858 subsequent match. Again, separate the UTF-8 case for speed, and also
4859 separate the UCP cases. */
4860
4861 if (minimize)
4862 {
4863 #ifdef SUPPORT_UCP
4864 if (prop_type >= 0)
4865 {
4866 switch(prop_type)
4867 {
4868 case PT_ANY:
4869 for (fi = min;; fi++)
4870 {
4871 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4873 if (fi >= max) RRETURN(MATCH_NOMATCH);
4874 if (eptr >= md->end_subject)
4875 {
4876 SCHECK_PARTIAL();
4877 RRETURN(MATCH_NOMATCH);
4878 }
4879 GETCHARINCTEST(c, eptr);
4880 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4881 }
4882 /* Control never gets here */
4883
4884 case PT_LAMP:
4885 for (fi = min;; fi++)
4886 {
4887 int chartype;
4888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4890 if (fi >= max) RRETURN(MATCH_NOMATCH);
4891 if (eptr >= md->end_subject)
4892 {
4893 SCHECK_PARTIAL();
4894 RRETURN(MATCH_NOMATCH);
4895 }
4896 GETCHARINCTEST(c, eptr);
4897 chartype = UCD_CHARTYPE(c);
4898 if ((chartype == ucp_Lu ||
4899 chartype == ucp_Ll ||
4900 chartype == ucp_Lt) == prop_fail_result)
4901 RRETURN(MATCH_NOMATCH);
4902 }
4903 /* Control never gets here */
4904
4905 case PT_GC:
4906 for (fi = min;; fi++)
4907 {
4908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4910 if (fi >= max) RRETURN(MATCH_NOMATCH);
4911 if (eptr >= md->end_subject)
4912 {
4913 SCHECK_PARTIAL();
4914 RRETURN(MATCH_NOMATCH);
4915 }
4916 GETCHARINCTEST(c, eptr);
4917 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4918 RRETURN(MATCH_NOMATCH);
4919 }
4920 /* Control never gets here */
4921
4922 case PT_PC:
4923 for (fi = min;; fi++)
4924 {
4925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4927 if (fi >= max) RRETURN(MATCH_NOMATCH);
4928 if (eptr >= md->end_subject)
4929 {
4930 SCHECK_PARTIAL();
4931 RRETURN(MATCH_NOMATCH);
4932 }
4933 GETCHARINCTEST(c, eptr);
4934 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4935 RRETURN(MATCH_NOMATCH);
4936 }
4937 /* Control never gets here */
4938
4939 case PT_SC:
4940 for (fi = min;; fi++)
4941 {
4942 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4943 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4944 if (fi >= max) RRETURN(MATCH_NOMATCH);
4945 if (eptr >= md->end_subject)
4946 {
4947 SCHECK_PARTIAL();
4948 RRETURN(MATCH_NOMATCH);
4949 }
4950 GETCHARINCTEST(c, eptr);
4951 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4952 RRETURN(MATCH_NOMATCH);
4953 }
4954 /* Control never gets here */
4955
4956 case PT_ALNUM:
4957 for (fi = min;; fi++)
4958 {
4959 int category;
4960 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4961 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4962 if (fi >= max) RRETURN(MATCH_NOMATCH);
4963 if (eptr >= md->end_subject)
4964 {
4965 SCHECK_PARTIAL();
4966 RRETURN(MATCH_NOMATCH);
4967 }
4968 GETCHARINCTEST(c, eptr);
4969 category = UCD_CATEGORY(c);
4970 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4971 RRETURN(MATCH_NOMATCH);
4972 }
4973 /* Control never gets here */
4974
4975 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4976 which means that Perl space and POSIX space are now identical. PCRE
4977 was changed at release 8.34. */
4978
4979 case PT_SPACE: /* Perl space */
4980 case PT_PXSPACE: /* POSIX space */
4981 for (fi = min;; fi++)
4982 {
4983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4985 if (fi >= max) RRETURN(MATCH_NOMATCH);
4986 if (eptr >= md->end_subject)
4987 {
4988 SCHECK_PARTIAL();
4989 RRETURN(MATCH_NOMATCH);
4990 }
4991 GETCHARINCTEST(c, eptr);
4992 switch(c)
4993 {
4994 HSPACE_CASES:
4995 VSPACE_CASES:
4996 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4997 break;
4998
4999 default:
5000 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5001 RRETURN(MATCH_NOMATCH);
5002 break;
5003 }
5004 }
5005 /* Control never gets here */
5006
5007 case PT_WORD:
5008 for (fi = min;; fi++)
5009 {
5010 int category;
5011 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5013 if (fi >= max) RRETURN(MATCH_NOMATCH);
5014 if (eptr >= md->end_subject)
5015 {
5016 SCHECK_PARTIAL();
5017 RRETURN(MATCH_NOMATCH);
5018 }
5019 GETCHARINCTEST(c, eptr);
5020 category = UCD_CATEGORY(c);
5021 if ((category == ucp_L ||
5022 category == ucp_N ||
5023 c == CHAR_UNDERSCORE)
5024 == prop_fail_result)
5025 RRETURN(MATCH_NOMATCH);
5026 }
5027 /* Control never gets here */
5028
5029 case PT_CLIST:
5030 for (fi = min;; fi++)
5031 {
5032 const pcre_uint32 *cp;
5033 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5035 if (fi >= max) RRETURN(MATCH_NOMATCH);
5036 if (eptr >= md->end_subject)
5037 {
5038 SCHECK_PARTIAL();
5039 RRETURN(MATCH_NOMATCH);
5040 }
5041 GETCHARINCTEST(c, eptr);
5042 cp = PRIV(ucd_caseless_sets) + prop_value;
5043 for (;;)
5044 {
5045 if (c < *cp)
5046 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5047 if (c == *cp++)
5048 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5049 }
5050 }
5051 /* Control never gets here */
5052
5053 case PT_UCNC:
5054 for (fi = min;; fi++)
5055 {
5056 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5058 if (fi >= max) RRETURN(MATCH_NOMATCH);
5059 if (eptr >= md->end_subject)
5060 {
5061 SCHECK_PARTIAL();
5062 RRETURN(MATCH_NOMATCH);
5063 }
5064 GETCHARINCTEST(c, eptr);
5065 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5066 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5067 c >= 0xe000) == prop_fail_result)
5068 RRETURN(MATCH_NOMATCH);
5069 }
5070 /* Control never gets here */
5071
5072 /* This should never occur */
5073 default:
5074 RRETURN(PCRE_ERROR_INTERNAL);
5075 }
5076 }
5077
5078 /* Match extended Unicode sequences. We will get here only if the
5079 support is in the binary; otherwise a compile-time error occurs. */
5080
5081 else if (ctype == OP_EXTUNI)
5082 {
5083 for (fi = min;; fi++)
5084 {
5085 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5086 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5087 if (fi >= max) RRETURN(MATCH_NOMATCH);
5088 if (eptr >= md->end_subject)
5089 {
5090 SCHECK_PARTIAL();
5091 RRETURN(MATCH_NOMATCH);
5092 }
5093 else
5094 {
5095 int lgb, rgb;
5096 GETCHARINCTEST(c, eptr);
5097 lgb = UCD_GRAPHBREAK(c);
5098 while (eptr < md->end_subject)
5099 {
5100 int len = 1;
5101 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5102 rgb = UCD_GRAPHBREAK(c);
5103 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5104 lgb = rgb;
5105 eptr += len;
5106 }
5107 }
5108 CHECK_PARTIAL();
5109 }
5110 }
5111 else
5112 #endif /* SUPPORT_UCP */
5113
5114 #ifdef SUPPORT_UTF
5115 if (utf)
5116 {
5117 for (fi = min;; fi++)
5118 {
5119 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5120 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5121 if (fi >= max) RRETURN(MATCH_NOMATCH);
5122 if (eptr >= md->end_subject)
5123 {
5124 SCHECK_PARTIAL();
5125 RRETURN(MATCH_NOMATCH);
5126 }
5127 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5128 RRETURN(MATCH_NOMATCH);
5129 GETCHARINC(c, eptr);
5130 switch(ctype)
5131 {
5132 case OP_ANY: /* This is the non-NL case */
5133 if (md->partial != 0 && /* Take care with CRLF partial */
5134 eptr >= md->end_subject &&
5135 NLBLOCK->nltype == NLTYPE_FIXED &&
5136 NLBLOCK->nllen == 2 &&
5137 c == NLBLOCK->nl[0])
5138 {
5139 md->hitend = TRUE;
5140 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5141 }
5142 break;
5143
5144 case OP_ALLANY:
5145 case OP_ANYBYTE:
5146 break;
5147
5148 case OP_ANYNL:
5149 switch(c)
5150 {
5151 default: RRETURN(MATCH_NOMATCH);
5152 case CHAR_CR:
5153 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5154 break;
5155
5156 case CHAR_LF:
5157 break;
5158
5159 case CHAR_VT:
5160 case CHAR_FF:
5161 case CHAR_NEL:
5162 #ifndef EBCDIC
5163 case 0x2028:
5164 case 0x2029:
5165 #endif /* Not EBCDIC */
5166 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5167 break;
5168 }
5169 break;
5170
5171 case OP_NOT_HSPACE:
5172 switch(c)
5173 {
5174 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5175 default: break;
5176 }
5177 break;
5178
5179 case OP_HSPACE:
5180 switch(c)
5181 {
5182 HSPACE_CASES: break;
5183 default: RRETURN(MATCH_NOMATCH);
5184 }
5185 break;
5186
5187 case OP_NOT_VSPACE:
5188 switch(c)
5189 {
5190 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5191 default: break;
5192 }
5193 break;
5194
5195 case OP_VSPACE:
5196 switch(c)
5197 {
5198 VSPACE_CASES: break;
5199 default: RRETURN(MATCH_NOMATCH);
5200 }
5201 break;
5202
5203 case OP_NOT_DIGIT:
5204 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5205 RRETURN(MATCH_NOMATCH);
5206 break;
5207
5208 case OP_DIGIT:
5209 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5210 RRETURN(MATCH_NOMATCH);
5211 break;
5212
5213 case OP_NOT_WHITESPACE:
5214 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5215 RRETURN(MATCH_NOMATCH);
5216 break;
5217
5218 case OP_WHITESPACE:
5219 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5220 RRETURN(MATCH_NOMATCH);
5221 break;
5222
5223 case OP_NOT_WORDCHAR:
5224 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5225 RRETURN(MATCH_NOMATCH);
5226 break;
5227
5228 case OP_WORDCHAR:
5229 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5230 RRETURN(MATCH_NOMATCH);
5231 break;
5232
5233 default:
5234 RRETURN(PCRE_ERROR_INTERNAL);
5235 }
5236 }
5237 }
5238 else
5239 #endif
5240 /* Not UTF mode */
5241 {
5242 for (fi = min;; fi++)
5243 {
5244 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5245 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5246 if (fi >= max) RRETURN(MATCH_NOMATCH);
5247 if (eptr >= md->end_subject)
5248 {
5249 SCHECK_PARTIAL();
5250 RRETURN(MATCH_NOMATCH);
5251 }
5252 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5253 RRETURN(MATCH_NOMATCH);
5254 c = *eptr++;
5255 switch(ctype)
5256 {
5257 case OP_ANY: /* This is the non-NL case */
5258 if (md->partial != 0 && /* Take care with CRLF partial */
5259 eptr >= md->end_subject &&
5260 NLBLOCK->nltype == NLTYPE_FIXED &&
5261 NLBLOCK->nllen == 2 &&
5262 c == NLBLOCK->nl[0])
5263 {
5264 md->hitend = TRUE;
5265 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5266 }
5267 break;
5268
5269 case OP_ALLANY:
5270 case OP_ANYBYTE:
5271 break;
5272
5273 case OP_ANYNL:
5274 switch(c)
5275 {
5276 default: RRETURN(MATCH_NOMATCH);
5277 case CHAR_CR:
5278 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5279 break;
5280
5281 case CHAR_LF:
5282 break;
5283
5284 case CHAR_VT:
5285 case CHAR_FF:
5286 case CHAR_NEL:
5287 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5288 case 0x2028:
5289 case 0x2029:
5290 #endif
5291 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5292 break;
5293 }
5294 break;
5295
5296 case OP_NOT_HSPACE:
5297 switch(c)
5298 {
5299 default: break;
5300 HSPACE_BYTE_CASES:
5301 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5302 HSPACE_MULTIBYTE_CASES:
5303 #endif
5304 RRETURN(MATCH_NOMATCH);
5305 }
5306 break;
5307
5308 case OP_HSPACE:
5309 switch(c)
5310 {
5311 default: RRETURN(MATCH_NOMATCH);
5312 HSPACE_BYTE_CASES:
5313 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5314 HSPACE_MULTIBYTE_CASES:
5315 #endif
5316 break;
5317 }
5318 break;
5319
5320 case OP_NOT_VSPACE:
5321 switch(c)
5322 {
5323 default: break;
5324 VSPACE_BYTE_CASES:
5325 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5326 VSPACE_MULTIBYTE_CASES:
5327 #endif
5328 RRETURN(MATCH_NOMATCH);
5329 }
5330 break;
5331
5332 case OP_VSPACE:
5333 switch(c)
5334 {
5335 default: RRETURN(MATCH_NOMATCH);
5336 VSPACE_BYTE_CASES:
5337 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5338 VSPACE_MULTIBYTE_CASES:
5339 #endif
5340 break;
5341 }
5342 break;
5343
5344 case OP_NOT_DIGIT:
5345 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5346 break;
5347
5348 case OP_DIGIT:
5349 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5350 break;
5351
5352 case OP_NOT_WHITESPACE:
5353 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5354 break;
5355
5356 case OP_WHITESPACE:
5357 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5358 break;
5359
5360 case OP_NOT_WORDCHAR:
5361 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5362 break;
5363
5364 case OP_WORDCHAR:
5365 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5366 break;
5367
5368 default:
5369 RRETURN(PCRE_ERROR_INTERNAL);
5370 }
5371 }
5372 }
5373 /* Control never gets here */
5374 }
5375
5376 /* If maximizing, it is worth using inline code for speed, doing the type
5377 test once at the start (i.e. keep it out of the loop). Again, keep the
5378 UTF-8 and UCP stuff separate. */
5379
5380 else
5381 {
5382 pp = eptr; /* Remember where we started */
5383
5384 #ifdef SUPPORT_UCP
5385 if (prop_type >= 0)
5386 {
5387 switch(prop_type)
5388 {
5389 case PT_ANY:
5390 for (i = min; i < max; i++)
5391 {
5392 int len = 1;
5393 if (eptr >= md->end_subject)
5394 {
5395 SCHECK_PARTIAL();
5396 break;
5397 }
5398 GETCHARLENTEST(c, eptr, len);
5399 if (prop_fail_result) break;
5400 eptr+= len;
5401 }
5402 break;
5403
5404 case PT_LAMP:
5405 for (i = min; i < max; i++)
5406 {
5407 int chartype;
5408 int len = 1;
5409 if (eptr >= md->end_subject)
5410 {
5411 SCHECK_PARTIAL();
5412 break;
5413 }
5414 GETCHARLENTEST(c, eptr, len);
5415 chartype = UCD_CHARTYPE(c);
5416 if ((chartype == ucp_Lu ||
5417 chartype == ucp_Ll ||
5418 chartype == ucp_Lt) == prop_fail_result)
5419 break;
5420 eptr+= len;
5421 }
5422 break;
5423
5424 case PT_GC:
5425 for (i = min; i < max; i++)
5426 {
5427 int len = 1;
5428 if (eptr >= md->end_subject)
5429 {
5430 SCHECK_PARTIAL();
5431 break;
5432 }
5433 GETCHARLENTEST(c, eptr, len);
5434 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5435 eptr+= len;
5436 }
5437 break;
5438
5439 case PT_PC:
5440 for (i = min; i < max; i++)
5441 {
5442 int len = 1;
5443 if (eptr >= md->end_subject)
5444 {
5445 SCHECK_PARTIAL();
5446 break;
5447 }
5448 GETCHARLENTEST(c, eptr, len);
5449 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5450 eptr+= len;
5451 }
5452 break;
5453
5454 case PT_SC:
5455 for (i = min; i < max; i++)
5456 {
5457 int len = 1;
5458 if (eptr >= md->end_subject)
5459 {
5460 SCHECK_PARTIAL();
5461 break;
5462 }
5463 GETCHARLENTEST(c, eptr, len);
5464 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5465 eptr+= len;
5466 }
5467 break;
5468
5469 case PT_ALNUM:
5470 for (i = min; i < max; i++)
5471 {
5472 int category;
5473 int len = 1;
5474 if (eptr >= md->end_subject)
5475 {
5476 SCHECK_PARTIAL();
5477 break;
5478 }
5479 GETCHARLENTEST(c, eptr, len);
5480 category = UCD_CATEGORY(c);
5481 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5482 break;
5483 eptr+= len;
5484 }
5485 break;
5486
5487 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5488 which means that Perl space and POSIX space are now identical. PCRE
5489 was changed at release 8.34. */
5490
5491 case PT_SPACE: /* Perl space */
5492 case PT_PXSPACE: /* POSIX space */
5493 for (i = min; i < max; i++)
5494 {
5495 int len = 1;
5496 if (eptr >= md->end_subject)
5497 {
5498 SCHECK_PARTIAL();
5499 break;
5500 }
5501 GETCHARLENTEST(c, eptr, len);
5502 switch(c)
5503 {
5504 HSPACE_CASES:
5505 VSPACE_CASES:
5506 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5507 break;
5508
5509 default:
5510 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5511 goto ENDLOOP99; /* Break the loop */
5512 break;
5513 }
5514 eptr+= len;
5515 }
5516 ENDLOOP99:
5517 break;
5518
5519 case PT_WORD:
5520 for (i = min; i < max; i++)
5521 {
5522 int category;
5523 int len = 1;
5524 if (eptr >= md->end_subject)
5525 {
5526 SCHECK_PARTIAL();
5527 break;
5528 }
5529 GETCHARLENTEST(c, eptr, len);
5530 category = UCD_CATEGORY(c);
5531 if ((category == ucp_L || category == ucp_N ||
5532 c == CHAR_UNDERSCORE) == prop_fail_result)
5533 break;
5534 eptr+= len;
5535 }
5536 break;
5537
5538 case PT_CLIST:
5539 for (i = min; i < max; i++)
5540 {
5541 const pcre_uint32 *cp;
5542 int len = 1;
5543 if (eptr >= md->end_subject)
5544 {
5545 SCHECK_PARTIAL();
5546 break;
5547 }
5548 GETCHARLENTEST(c, eptr, len);
5549 cp = PRIV(ucd_caseless_sets) + prop_value;
5550 for (;;)
5551 {
5552 if (c < *cp)
5553 { if (prop_fail_result) break; else goto GOT_MAX; }
5554 if (c == *cp++)
5555 { if (prop_fail_result) goto GOT_MAX; else break; }
5556 }
5557 eptr += len;
5558 }
5559 GOT_MAX:
5560 break;
5561
5562 case PT_UCNC:
5563 for (i = min; i < max; i++)
5564 {
5565 int len = 1;
5566 if (eptr >= md->end_subject)
5567 {
5568 SCHECK_PARTIAL();
5569 break;
5570 }
5571 GETCHARLENTEST(c, eptr, len);
5572 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5573 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5574 c >= 0xe000) == prop_fail_result)
5575 break;
5576 eptr += len;
5577 }
5578 break;
5579
5580 default:
5581 RRETURN(PCRE_ERROR_INTERNAL);
5582 }
5583
5584 /* eptr is now past the end of the maximum run */
5585
5586 if (possessive) continue; /* No backtracking */
5587 for(;;)
5588 {
5589 if (eptr == pp) goto TAIL_RECURSE;
5590 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5592 eptr--;
5593 if (utf) BACKCHAR(eptr);
5594 }
5595 }
5596
5597 /* Match extended Unicode grapheme clusters. We will get here only if the
5598 support is in the binary; otherwise a compile-time error occurs. */
5599
5600 else if (ctype == OP_EXTUNI)
5601 {
5602 for (i = min; i < max; i++)
5603 {
5604 if (eptr >= md->end_subject)
5605 {
5606 SCHECK_PARTIAL();
5607 break;
5608 }
5609 else
5610 {
5611 int lgb, rgb;
5612 GETCHARINCTEST(c, eptr);
5613 lgb = UCD_GRAPHBREAK(c);
5614 while (eptr < md->end_subject)
5615 {
5616 int len = 1;
5617 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5618 rgb = UCD_GRAPHBREAK(c);
5619 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5620 lgb = rgb;
5621 eptr += len;
5622 }
5623 }
5624 CHECK_PARTIAL();
5625 }
5626
5627 /* eptr is now past the end of the maximum run */
5628
5629 if (possessive) continue; /* No backtracking */
5630
5631 for(;;)
5632 {
5633 int lgb, rgb;
5634 PCRE_PUCHAR fptr;
5635
5636 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5637 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5638 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5639
5640 /* Backtracking over an extended grapheme cluster involves inspecting
5641 the previous two characters (if present) to see if a break is
5642 permitted between them. */
5643
5644 eptr--;
5645 if (!utf) c = *eptr; else
5646 {
5647 BACKCHAR(eptr);
5648 GETCHAR(c, eptr);
5649 }
5650 rgb = UCD_GRAPHBREAK(c);
5651
5652 for (;;)
5653 {
5654 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5655 fptr = eptr - 1;
5656 if (!utf) c = *fptr; else
5657 {
5658 BACKCHAR(fptr);
5659 GETCHAR(c, fptr);
5660 }
5661 lgb = UCD_GRAPHBREAK(c);
5662 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5663 eptr = fptr;
5664 rgb = lgb;
5665 }
5666 }
5667 }
5668
5669 else
5670 #endif /* SUPPORT_UCP */
5671
5672 #ifdef SUPPORT_UTF
5673 if (utf)
5674 {
5675 switch(ctype)
5676 {
5677 case OP_ANY:
5678 if (max < INT_MAX)
5679 {
5680 for (i = min; i < max; i++)
5681 {
5682 if (eptr >= md->end_subject)
5683 {
5684 SCHECK_PARTIAL();
5685 break;
5686 }
5687 if (IS_NEWLINE(eptr)) break;
5688 if (md->partial != 0 && /* Take care with CRLF partial */
5689 eptr + 1 >= md->end_subject &&
5690 NLBLOCK->nltype == NLTYPE_FIXED &&
5691 NLBLOCK->nllen == 2 &&
5692 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5693 {
5694 md->hitend = TRUE;
5695 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5696 }
5697 eptr++;
5698 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5699 }
5700 }
5701
5702 /* Handle unlimited UTF-8 repeat */
5703
5704 else
5705 {
5706 for (i = min; i < max; i++)
5707 {
5708 if (eptr >= md->end_subject)
5709 {
5710 SCHECK_PARTIAL();
5711 break;
5712 }
5713 if (IS_NEWLINE(eptr)) break;
5714 if (md->partial != 0 && /* Take care with CRLF partial */
5715 eptr + 1 >= md->end_subject &&
5716 NLBLOCK->nltype == NLTYPE_FIXED &&
5717 NLBLOCK->nllen == 2 &&
5718 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5719 {
5720 md->hitend = TRUE;
5721 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5722 }
5723 eptr++;
5724 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5725 }
5726 }
5727 break;
5728
5729 case OP_ALLANY:
5730 if (max < INT_MAX)
5731 {
5732 for (i = min; i < max; i++)
5733 {
5734 if (eptr >= md->end_subject)
5735 {
5736 SCHECK_PARTIAL();
5737 break;
5738 }
5739 eptr++;
5740 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5741 }
5742 }
5743 else
5744 {
5745 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5746 SCHECK_PARTIAL();
5747 }
5748 break;
5749
5750 /* The byte case is the same as non-UTF8 */
5751
5752 case OP_ANYBYTE:
5753 c = max - min;
5754 if (c > (unsigned int)(md->end_subject - eptr))
5755 {
5756 eptr = md->end_subject;
5757 SCHECK_PARTIAL();
5758 }
5759 else eptr += c;
5760 break;
5761
5762 case OP_ANYNL:
5763 for (i = min; i < max; i++)
5764 {
5765 int len = 1;
5766 if (eptr >= md->end_subject)
5767 {
5768 SCHECK_PARTIAL();
5769 break;
5770 }
5771 GETCHARLEN(c, eptr, len);
5772 if (c == CHAR_CR)
5773 {
5774 if (++eptr >= md->end_subject) break;
5775 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5776 }
5777 else
5778 {
5779 if (c != CHAR_LF &&
5780 (md->bsr_anycrlf ||
5781 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5782 #ifndef EBCDIC
5783 && c != 0x2028 && c != 0x2029
5784 #endif /* Not EBCDIC */
5785 )))
5786 break;
5787 eptr += len;
5788 }
5789 }
5790 break;
5791
5792 case OP_NOT_HSPACE:
5793 case OP_HSPACE:
5794 for (i = min; i < max; i++)
5795 {
5796 BOOL gotspace;
5797 int len = 1;
5798 if (eptr >= md->end_subject)
5799 {
5800 SCHECK_PARTIAL();
5801 break;
5802 }
5803 GETCHARLEN(c, eptr, len);
5804 switch(c)
5805 {
5806 HSPACE_CASES: gotspace = TRUE; break;
5807 default: gotspace = FALSE; break;
5808 }
5809 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5810 eptr += len;
5811 }
5812 break;
5813
5814 case OP_NOT_VSPACE:
5815 case OP_VSPACE:
5816 for (i = min; i < max; i++)
5817 {
5818 BOOL gotspace;
5819 int len = 1;
5820 if (eptr >= md->end_subject)
5821 {
5822 SCHECK_PARTIAL();
5823 break;
5824 }
5825 GETCHARLEN(c, eptr, len);
5826 switch(c)
5827 {
5828 VSPACE_CASES: gotspace = TRUE; break;
5829 default: gotspace = FALSE; break;
5830 }
5831 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5832 eptr += len;
5833 }
5834 break;
5835
5836 case OP_NOT_DIGIT:
5837 for (i = min; i < max; i++)
5838 {
5839 int len = 1;
5840 if (eptr >= md->end_subject)
5841 {
5842 SCHECK_PARTIAL();
5843 break;
5844 }
5845 GETCHARLEN(c, eptr, len);
5846 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5847 eptr+= len;
5848 }
5849 break;
5850
5851 case OP_DIGIT:
5852 for (i = min; i < max; i++)
5853 {
5854 int len = 1;
5855 if (eptr >= md->end_subject)
5856 {
5857 SCHECK_PARTIAL();
5858 break;
5859 }
5860 GETCHARLEN(c, eptr, len);
5861 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5862 eptr+= len;
5863 }
5864 break;
5865
5866 case OP_NOT_WHITESPACE:
5867 for (i = min; i < max; i++)
5868 {
5869 int len = 1;
5870 if (eptr >= md->end_subject)
5871 {
5872 SCHECK_PARTIAL();
5873 break;
5874 }
5875 GETCHARLEN(c, eptr, len);
5876 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5877 eptr+= len;
5878 }
5879 break;
5880
5881 case OP_WHITESPACE:
5882 for (i = min; i < max; i++)
5883 {
5884 int len = 1;
5885 if (eptr >= md->end_subject)
5886 {
5887 SCHECK_PARTIAL();
5888 break;
5889 }
5890 GETCHARLEN(c, eptr, len);
5891 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5892 eptr+= len;
5893 }
5894 break;
5895
5896 case OP_NOT_WORDCHAR:
5897 for (i = min; i < max; i++)
5898 {
5899 int len = 1;
5900 if (eptr >= md->end_subject)
5901 {
5902 SCHECK_PARTIAL();
5903 break;
5904 }
5905 GETCHARLEN(c, eptr, len);
5906 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5907 eptr+= len;
5908 }
5909 break;
5910
5911 case OP_WORDCHAR:
5912 for (i = min; i < max; i++)
5913 {
5914 int len = 1;
5915 if (eptr >= md->end_subject)
5916 {
5917 SCHECK_PARTIAL();
5918 break;
5919 }
5920 GETCHARLEN(c, eptr, len);
5921 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5922 eptr+= len;
5923 }
5924 break;
5925
5926 default:
5927 RRETURN(PCRE_ERROR_INTERNAL);
5928 }
5929
5930 if (possessive) continue; /* No backtracking */
5931 for(;;)
5932 {
5933 if (eptr == pp) goto TAIL_RECURSE;
5934 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5936 eptr--;
5937 BACKCHAR(eptr);
5938 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5939 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5940 }
5941 }
5942 else
5943 #endif /* SUPPORT_UTF */
5944 /* Not UTF mode */
5945 {
5946 switch(ctype)
5947 {
5948 case OP_ANY:
5949 for (i = min; i < max; i++)
5950 {
5951 if (eptr >= md->end_subject)
5952 {
5953 SCHECK_PARTIAL();
5954 break;
5955 }
5956 if (IS_NEWLINE(eptr)) break;
5957 if (md->partial != 0 && /* Take care with CRLF partial */
5958 eptr + 1 >= md->end_subject &&
5959 NLBLOCK->nltype == NLTYPE_FIXED &&
5960 NLBLOCK->nllen == 2 &&
5961 *eptr == NLBLOCK->nl[0])
5962 {
5963 md->hitend = TRUE;
5964 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5965 }
5966 eptr++;
5967 }
5968 break;
5969
5970 case OP_ALLANY:
5971 case OP_ANYBYTE:
5972 c = max - min;
5973 if (c > (unsigned int)(md->end_subject - eptr))
5974 {
5975 eptr = md->end_subject;
5976 SCHECK_PARTIAL();
5977 }
5978 else eptr += c;
5979 break;
5980
5981 case OP_ANYNL:
5982 for (i = min; i < max; i++)
5983 {
5984 if (eptr >= md->end_subject)
5985 {
5986 SCHECK_PARTIAL();
5987 break;
5988 }
5989 c = *eptr;
5990 if (c == CHAR_CR)
5991 {
5992 if (++eptr >= md->end_subject) break;
5993 if (*eptr == CHAR_LF) eptr++;
5994 }
5995 else
5996 {
5997 if (c != CHAR_LF && (md->bsr_anycrlf ||
5998 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5999 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6000 && c != 0x2028 && c != 0x2029
6001 #endif
6002 ))) break;
6003 eptr++;
6004 }
6005 }
6006 break;
6007
6008 case OP_NOT_HSPACE:
6009 for (i = min; i < max; i++)
6010 {
6011 if (eptr >= md->end_subject)
6012 {
6013 SCHECK_PARTIAL();
6014 break;
6015 }
6016 switch(*eptr)
6017 {
6018 default: eptr++; break;
6019 HSPACE_BYTE_CASES:
6020 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6021 HSPACE_MULTIBYTE_CASES:
6022 #endif
6023 goto ENDLOOP00;
6024 }
6025 }
6026 ENDLOOP00:
6027 break;
6028
6029 case OP_HSPACE:
6030 for (i = min; i < max; i++)
6031 {
6032 if (eptr >= md->end_subject)
6033 {
6034 SCHECK_PARTIAL();
6035 break;
6036 }
6037 switch(*eptr)
6038 {
6039 default: goto ENDLOOP01;
6040 HSPACE_BYTE_CASES:
6041 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6042 HSPACE_MULTIBYTE_CASES:
6043 #endif
6044 eptr++; break;
6045 }
6046 }
6047 ENDLOOP01:
6048 break;
6049
6050 case OP_NOT_VSPACE:
6051 for (i = min; i < max; i++)
6052 {
6053 if (eptr >= md->end_subject)
6054 {
6055 SCHECK_PARTIAL();
6056 break;
6057 }
6058 switch(*eptr)
6059 {
6060 default: eptr++; break;
6061 VSPACE_BYTE_CASES:
6062 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6063 VSPACE_MULTIBYTE_CASES:
6064 #endif
6065 goto ENDLOOP02;
6066 }
6067 }
6068 ENDLOOP02:
6069 break;
6070
6071 case OP_VSPACE:
6072 for (i = min; i < max; i++)
6073 {
6074 if (eptr >= md->end_subject)
6075 {
6076 SCHECK_PARTIAL();
6077 break;
6078 }
6079 switch(*eptr)
6080 {
6081 default: goto ENDLOOP03;
6082 VSPACE_BYTE_CASES:
6083 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6084 VSPACE_MULTIBYTE_CASES:
6085 #endif
6086 eptr++; break;
6087 }
6088 }
6089 ENDLOOP03:
6090 break;
6091
6092 case OP_NOT_DIGIT:
6093 for (i = min; i < max; i++)
6094 {
6095 if (eptr >= md->end_subject)
6096 {
6097 SCHECK_PARTIAL();
6098 break;
6099 }
6100 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6101 eptr++;
6102 }
6103 break;
6104
6105 case OP_DIGIT:
6106 for (i = min; i < max; i++)
6107 {
6108 if (eptr >= md->end_subject)
6109 {
6110 SCHECK_PARTIAL();
6111 break;
6112 }
6113 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6114 eptr++;
6115 }
6116 break;
6117
6118 case OP_NOT_WHITESPACE:
6119 for (i = min; i < max; i++)
6120 {
6121 if (eptr >= md->end_subject)
6122 {
6123 SCHECK_PARTIAL();
6124 break;
6125 }
6126 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6127 eptr++;
6128 }
6129 break;
6130
6131 case OP_WHITESPACE:
6132 for (i = min; i < max; i++)
6133 {
6134 if (eptr >= md->end_subject)
6135 {
6136 SCHECK_PARTIAL();
6137 break;
6138 }
6139 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6140 eptr++;
6141 }
6142 break;
6143
6144 case OP_NOT_WORDCHAR:
6145 for (i = min; i < max; i++)
6146 {
6147 if (eptr >= md->end_subject)
6148 {
6149 SCHECK_PARTIAL();
6150 break;
6151 }
6152 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6153 eptr++;
6154 }
6155 break;
6156
6157 case OP_WORDCHAR:
6158 for (i = min; i < max; i++)
6159 {
6160 if (eptr >= md->end_subject)
6161 {
6162 SCHECK_PARTIAL();
6163 break;
6164 }
6165 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6166 eptr++;
6167 }
6168 break;
6169
6170 default:
6171 RRETURN(PCRE_ERROR_INTERNAL);
6172 }
6173
6174 if (possessive) continue; /* No backtracking */
6175 for (;;)
6176 {
6177 if (eptr == pp) goto TAIL_RECURSE;
6178 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6179 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6180 eptr--;
6181 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6182 eptr[-1] == CHAR_CR) eptr--;
6183 }
6184 }
6185
6186 /* Control never gets here */
6187 }
6188
6189 /* There's been some horrible disaster. Arrival here can only mean there is
6190 something seriously wrong in the code above or the OP_xxx definitions. */
6191
6192 default:
6193 DPRINTF(("Unknown opcode %d\n", *ecode));
6194 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6195 }
6196
6197 /* Do not stick any code in here without much thought; it is assumed
6198 that "continue" in the code above comes out to here to repeat the main
6199 loop. */
6200
6201 } /* End of main loop */
6202 /* Control never reaches here */
6203
6204
6205 /* When compiling to use the heap rather than the stack for recursive calls to
6206 match(), the RRETURN() macro jumps here. The number that is saved in
6207 frame->Xwhere indicates which label we actually want to return to. */
6208
6209 #ifdef NO_RECURSE
6210 #define LBL(val) case val: goto L_RM##val;
6211 HEAP_RETURN:
6212 switch (frame->Xwhere)
6213 {
6214 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6215 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6216 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6217 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6218 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6219 LBL(65) LBL(66)
6220 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6221 LBL(20) LBL(21)
6222 #endif
6223 #ifdef SUPPORT_UTF
6224 LBL(16) LBL(18)
6225 LBL(22) LBL(23) LBL(28) LBL(30)
6226 LBL(32) LBL(34) LBL(42) LBL(46)
6227 #ifdef SUPPORT_UCP
6228 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6229 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6230 #endif /* SUPPORT_UCP */
6231 #endif /* SUPPORT_UTF */
6232 default:
6233 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6234 return PCRE_ERROR_INTERNAL;
6235 }
6236 #undef LBL
6237 #endif /* NO_RECURSE */
6238 }
6239
6240
6241 /***************************************************************************
6242 ****************************************************************************
6243 RECURSION IN THE match() FUNCTION
6244
6245 Undefine all the macros that were defined above to handle this. */
6246
6247 #ifdef NO_RECURSE
6248 #undef eptr
6249 #undef ecode
6250 #undef mstart
6251 #undef offset_top
6252 #undef eptrb
6253 #undef flags
6254
6255 #undef callpat
6256 #undef charptr
6257 #undef data
6258 #undef next
6259 #undef pp
6260 #undef prev
6261 #undef saved_eptr
6262
6263 #undef new_recursive
6264
6265 #undef cur_is_word
6266 #undef condition
6267 #undef prev_is_word
6268
6269 #undef ctype
6270 #undef length
6271 #undef max
6272 #undef min
6273 #undef number
6274 #undef offset
6275 #undef op
6276 #undef save_capture_last
6277 #undef save_offset1
6278 #undef save_offset2
6279 #undef save_offset3
6280 #undef stacksave
6281
6282 #undef newptrb
6283
6284 #endif
6285
6286 /* These two are defined as macros in both cases */
6287
6288 #undef fc
6289 #undef fi
6290
6291 /***************************************************************************
6292 ***************************************************************************/
6293
6294
6295 #ifdef NO_RECURSE
6296 /*************************************************
6297 * Release allocated heap frames *
6298 *************************************************/
6299
6300 /* This function releases all the allocated frames. The base frame is on the
6301 machine stack, and so must not be freed.
6302
6303 Argument: the address of the base frame
6304 Returns: nothing
6305 */
6306
6307 static void
6308 release_match_heapframes (heapframe *frame_base)
6309 {
6310 heapframe *nextframe = frame_base->Xnextframe;
6311 while (nextframe != NULL)
6312 {
6313 heapframe *oldframe = nextframe;
6314 nextframe = nextframe->Xnextframe;
6315 (PUBL(stack_free))(oldframe);
6316 }
6317 }
6318 #endif
6319
6320
6321 /*************************************************
6322 * Execute a Regular Expression *
6323 *************************************************/
6324
6325 /* This function applies a compiled re to a subject string and picks out
6326 portions of the string if it matches. Two elements in the vector are set for
6327 each substring: the offsets to the start and end of the substring.
6328
6329 Arguments:
6330 argument_re points to the compiled expression
6331 extra_data points to extra data or is NULL
6332 subject points to the subject string
6333 length length of subject string (may contain binary zeros)
6334 start_offset where to start in the subject string
6335 options option bits
6336 offsets points to a vector of ints to be filled in with offsets
6337 offsetcount the number of elements in the vector
6338
6339 Returns: > 0 => success; value is the number of elements filled in
6340 = 0 => success, but offsets is not big enough
6341 -1 => failed to match
6342 < -1 => some kind of unexpected problem
6343 */
6344
6345 #if defined COMPILE_PCRE8
6346 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6347 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6348 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6349 int offsetcount)
6350 #elif defined COMPILE_PCRE16
6351 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6352 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6353 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6354 int offsetcount)
6355 #elif defined COMPILE_PCRE32
6356 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6357 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6358 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6359 int offsetcount)
6360 #endif
6361 {
6362 int rc, ocount, arg_offset_max;
6363 int newline;
6364 BOOL using_temporary_offsets = FALSE;
6365 BOOL anchored;
6366 BOOL startline;
6367 BOOL firstline;
6368 BOOL utf;
6369 BOOL has_first_char = FALSE;
6370 BOOL has_req_char = FALSE;
6371 pcre_uchar first_char = 0;
6372 pcre_uchar first_char2 = 0;
6373 pcre_uchar req_char = 0;
6374 pcre_uchar req_char2 = 0;
6375 match_data match_block;
6376 match_data *md = &match_block;
6377 const pcre_uint8 *tables;
6378 const pcre_uint8 *start_bits = NULL;
6379 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6380 PCRE_PUCHAR end_subject;
6381 PCRE_PUCHAR start_partial = NULL;
6382 PCRE_PUCHAR match_partial = NULL;
6383 PCRE_PUCHAR req_char_ptr = start_match - 1;
6384
6385 const pcre_study_data *study;
6386 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6387
6388 #ifdef NO_RECURSE
6389 heapframe frame_zero;
6390 frame_zero.Xprevframe = NULL; /* Marks the top level */
6391 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6392 md->match_frames_base = &frame_zero;
6393 #endif
6394
6395 /* Check for the special magic call that measures the size of the stack used
6396 per recursive call of match(). Without the funny casting for sizeof, a Windows
6397 compiler gave this error: "unary minus operator applied to unsigned type,
6398 result still unsigned". Hopefully the cast fixes that. */
6399
6400 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6401 start_offset == -999)
6402 #ifdef NO_RECURSE
6403 return -((int)sizeof(heapframe));
6404 #else
6405 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6406 #endif
6407
6408 /* Plausibility checks */
6409
6410 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6411 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6412 return PCRE_ERROR_NULL;
6413 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6414 if (length < 0) return PCRE_ERROR_BADLENGTH;
6415 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6416
6417 /* Check that the first field in the block is the magic number. If it is not,
6418 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6419 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6420 means that the pattern is likely compiled with different endianness. */
6421
6422 if (re->magic_number != MAGIC_NUMBER)
6423 return re->magic_number == REVERSED_MAGIC_NUMBER?
6424 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6425 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6426
6427 /* These two settings are used in the code for checking a UTF-8 string that
6428 follows immediately afterwards. Other values in the md block are used only
6429 during "normal" pcre_exec() processing, not when the JIT support is in use,
6430 so they are set up later. */
6431
6432 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6433 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6434 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6435 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6436
6437 /* Check a UTF-8 string if required. Pass back the character offset and error
6438 code for an invalid string if a results vector is available. */
6439
6440 #ifdef SUPPORT_UTF
6441 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6442 {
6443 int erroroffset;
6444 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6445 if (errorcode != 0)
6446 {
6447 if (offsetcount >= 2)
6448 {
6449 offsets[0] = erroroffset;
6450 offsets[1] = errorcode;
6451 }
6452 #if defined COMPILE_PCRE8
6453 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6454 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6455 #elif defined COMPILE_PCRE16
6456 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6457 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6458 #elif defined COMPILE_PCRE32
6459 return PCRE_ERROR_BADUTF32;
6460 #endif
6461 }
6462 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6463 /* Check that a start_offset points to the start of a UTF character. */
6464 if (start_offset > 0 && start_offset < length &&
6465 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6466 return PCRE_ERROR_BADUTF8_OFFSET;
6467 #endif
6468 }
6469 #endif
6470
6471 /* If the pattern was successfully studied with JIT support, run the JIT
6472 executable instead of the rest of this function. Most options must be set at
6473 compile time for the JIT code to be usable. Fallback to the normal code path if
6474 an unsupported flag is set. */
6475
6476 #ifdef SUPPORT_JIT
6477 if (extra_data != NULL
6478 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6479 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6480 && extra_data->executable_jit != NULL
6481 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6482 {
6483 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6484 start_offset, options, offsets, offsetcount);
6485
6486 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6487 mode is not compiled. In this case we simply fall back to the interpreter. */
6488
6489 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6490 }
6491 #endif
6492
6493 /* Carry on with non-JIT matching. This information is for finding all the
6494 numbers associated with a given name, for condition testing. */
6495
6496 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6497 md->name_count = re->name_count;
6498 md->name_entry_size = re->name_entry_size;
6499
6500 /* Fish out the optional data from the extra_data structure, first setting
6501 the default values. */
6502
6503 study = NULL;
6504 md->match_limit = MATCH_LIMIT;
6505 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6506 md->callout_data = NULL;
6507
6508 /* The table pointer is always in native byte order. */
6509
6510 tables = re->tables;
6511
6512 /* The two limit values override the defaults, whatever their value. */
6513
6514 if (extra_data != NULL)
6515 {
6516 register unsigned int flags = extra_data->flags;
6517 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6518 study = (const pcre_study_data *)extra_data->study_data;
6519 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6520 md->match_limit = extra_data->match_limit;
6521 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6522 md->match_limit_recursion = extra_data->match_limit_recursion;
6523 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6524 md->callout_data = extra_data->callout_data;
6525 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6526 }
6527
6528 /* Limits in the regex override only if they are smaller. */
6529
6530 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6531 md->match_limit = re->limit_match;
6532
6533 if ((re->flags & PCRE_RLSET) != 0 &&
6534 re->limit_recursion < md->match_limit_recursion)
6535 md->match_limit_recursion = re->limit_recursion;
6536
6537 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6538 is a feature that makes it possible to save compiled regex and re-use them
6539 in other programs later. */
6540
6541 if (tables == NULL) tables = PRIV(default_tables);
6542
6543 /* Set up other data */
6544
6545 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6546 startline = (re->flags & PCRE_STARTLINE) != 0;
6547 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6548
6549 /* The code starts after the real_pcre block and the capture name table. */
6550
6551 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6552 re->name_count * re->name_entry_size;
6553
6554 md->start_subject = (PCRE_PUCHAR)subject;
6555 md->start_offset = start_offset;
6556 md->end_subject = md->start_subject + length;
6557 end_subject = md->end_subject;
6558
6559 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6560 md->use_ucp = (re->options & PCRE_UCP) != 0;
6561 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6562 md->ignore_skip_arg = 0;
6563
6564 /* Some options are unpacked into BOOL variables in the hope that testing
6565 them will be faster than individual option bits. */
6566
6567 md->notbol = (options & PCRE_NOTBOL) != 0;
6568 md->noteol = (options & PCRE_NOTEOL) != 0;
6569 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6570 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6571
6572 md->hitend = FALSE;
6573 md->mark = md->nomatch_mark = NULL; /* In case never set */
6574
6575 md->recursive = NULL; /* No recursion at top level */
6576 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6577
6578 md->lcc = tables + lcc_offset;
6579 md->fcc = tables + fcc_offset;
6580 md->ctypes = tables + ctypes_offset;
6581
6582 /* Handle different \R options. */
6583
6584 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6585 {
6586 case 0:
6587 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6588 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6589 else
6590 #ifdef BSR_ANYCRLF
6591 md->bsr_anycrlf = TRUE;
6592 #else
6593 md->bsr_anycrlf = FALSE;
6594 #endif
6595 break;
6596
6597 case PCRE_BSR_ANYCRLF:
6598 md->bsr_anycrlf = TRUE;
6599 break;
6600
6601 case PCRE_BSR_UNICODE:
6602 md->bsr_anycrlf = FALSE;
6603 break;
6604
6605 default: return PCRE_ERROR_BADNEWLINE;
6606 }
6607
6608 /* Handle different types of newline. The three bits give eight cases. If
6609 nothing is set at run time, whatever was used at compile time applies. */
6610
6611 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6612 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6613 {
6614 case 0: newline = NEWLINE; break; /* Compile-time default */
6615 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6616 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6617 case PCRE_NEWLINE_CR+
6618 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6619 case PCRE_NEWLINE_ANY: newline = -1; break;
6620 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6621 default: return PCRE_ERROR_BADNEWLINE;
6622 }
6623
6624 if (newline == -2)
6625 {
6626 md->nltype = NLTYPE_ANYCRLF;
6627 }
6628 else if (newline < 0)
6629 {
6630 md->nltype = NLTYPE_ANY;
6631 }
6632 else
6633 {
6634 md->nltype = NLTYPE_FIXED;
6635 if (newline > 255)
6636 {
6637 md->nllen = 2;
6638 md->nl[0] = (newline >> 8) & 255;
6639 md->nl[1] = newline & 255;
6640 }
6641 else
6642 {
6643 md->nllen = 1;
6644 md->nl[0] = newline;
6645 }
6646 }
6647
6648 /* Partial matching was originally supported only for a restricted set of
6649 regexes; from release 8.00 there are no restrictions, but the bits are still
6650 defined (though never set). So there's no harm in leaving this code. */
6651
6652 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6653 return PCRE_ERROR_BADPARTIAL;
6654
6655 /* If the expression has got more back references than the offsets supplied can
6656 hold, we get a temporary chunk of working store to use during the matching.
6657 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6658 of 3. */
6659
6660 ocount = offsetcount - (offsetcount % 3);
6661 arg_offset_max = (2*ocount)/3;
6662
6663 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6664 {
6665 ocount = re->top_backref * 3 + 3;
6666 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6667 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6668 using_temporary_offsets = TRUE;
6669 DPRINTF(("Got memory to hold back references\n"));
6670 }
6671 else md->offset_vector = offsets;
6672 md->offset_end = ocount;
6673 md->offset_max = (2*ocount)/3;
6674 md->capture_last = 0;
6675
6676 /* Reset the working variable associated with each extraction. These should
6677 never be used unless previously set, but they get saved and restored, and so we
6678 initialize them to avoid reading uninitialized locations. Also, unset the
6679 offsets for the matched string. This is really just for tidiness with callouts,
6680 in case they inspect these fields. */
6681
6682 if (md->offset_vector != NULL)
6683 {
6684 register int *iptr = md->offset_vector + ocount;
6685 register int *iend = iptr - re->top_bracket;
6686 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6687 while (--iptr >= iend) *iptr = -1;
6688 md->offset_vector[0] = md->offset_vector[1] = -1;
6689 }
6690
6691 /* Set up the first character to match, if available. The first_char value is
6692 never set for an anchored regular expression, but the anchoring may be forced
6693 at run time, so we have to test for anchoring. The first char may be unset for
6694 an unanchored pattern, of course. If there's no first char and the pattern was
6695 studied, there may be a bitmap of possible first characters. */
6696
6697 if (!anchored)
6698 {
6699 if ((re->flags & PCRE_FIRSTSET) != 0)
6700 {
6701 has_first_char = TRUE;
6702 first_char = first_char2 = (pcre_uchar)(re->first_char);
6703 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6704 {
6705 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6706 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6707 if (utf && first_char > 127)
6708 first_char2 = UCD_OTHERCASE(first_char);
6709 #endif
6710 }
6711 }
6712 else
6713 if (!startline && study != NULL &&
6714 (study->flags & PCRE_STUDY_MAPPED) != 0)
6715 start_bits = study->start_bits;
6716 }
6717
6718 /* For anchored or unanchored matches, there may be a "last known required
6719 character" set. */
6720
6721 if ((re->flags & PCRE_REQCHSET) != 0)
6722 {
6723 has_req_char = TRUE;
6724 req_char = req_char2 = (pcre_uchar)(re->req_char);
6725 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6726 {
6727 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6728 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6729 if (utf && req_char > 127)
6730 req_char2 = UCD_OTHERCASE(req_char);
6731 #endif
6732 }
6733 }
6734
6735
6736 /* ==========================================================================*/
6737
6738 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6739 the loop runs just once. */
6740
6741 for(;;)
6742 {
6743 PCRE_PUCHAR save_end_subject = end_subject;
6744 PCRE_PUCHAR new_start_match;
6745
6746 /* If firstline is TRUE, the start of the match is constrained to the first
6747 line of a multiline string. That is, the match must be before or at the first
6748 newline. Implement this by temporarily adjusting end_subject so that we stop
6749 scanning at a newline. If the match fails at the newline, later code breaks
6750 this loop. */
6751
6752 if (firstline)
6753 {
6754