/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1260 - (show annotations)
Wed Feb 27 15:41:22 2013 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 215884 byte(s)
Add \p{Xuc} to match characters identifiable by Universal Character Names.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
/* These three names are defined ahead of the inclusion of pcre_internal.h
below. NOTE(review): presumably the shared newline and subject-bounds macros in
that header expand in terms of them (NLBLOCK->PSSTART, NLBLOCK->PSEND) - confirm
against pcre_internal.h. */
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* Low 16 bits of md->capture_last: last capture number */
70 #define OVFLMASK 0xffff0000 /* High 16 bits of md->capture_last: overflow flag area */
71 #define OVFLBIT 0x00010000 /* The single bit within OVFLMASK that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1 /* The pattern matched */
84 #define MATCH_NOMATCH 0 /* The pattern failed to match */
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_COMMIT (-998) /* Given by OP_COMMIT when what follows it fails */
91 #define MATCH_KETRPOS (-997)
92 #define MATCH_ONCE (-996)
93 #define MATCH_PRUNE (-995) /* Given by OP_PRUNE/OP_PRUNE_ARG when what follows fails */
94 #define MATCH_SKIP (-994) /* Given by OP_SKIP, or by OP_MARK when a SKIP_ARG name matches */
95 #define MATCH_SKIP_ARG (-993) /* Given by OP_SKIP_ARG; the name is in md->start_match_ptr */
96 #define MATCH_THEN (-992) /* Given by OP_THEN/OP_THEN_ARG when what follows fails */
97
98 /* Maximum number of ints of offset to save on the stack for recursive calls.
99 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
100 because the offset vector is always a multiple of 3 long. */
101
102 #define REC_STACK_SAVE_MAX 30 /* Also the dimension of the stacksave[] array used by match() */
103
104 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
105
/* Two parallel six-entry tables; entry n of rep_min and entry n of rep_max
describe the same repeat. NOTE(review): the entries are presumably indexed by
(opcode - first common-repeat opcode), i.e. in the order *, *?, +, +?, ?, ?? -
confirm at the use sites, which are outside this view. */
106 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
107 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
108
109 #ifdef PCRE_DEBUG
110 /*************************************************
111 * Debugging function to print chars *
112 *************************************************/
113
114 /* Print a sequence of chars in printable format, stopping at the end of the
115 subject if requested.
116
117 Arguments:
118 p points to characters
119 length number to print
120 is_subject TRUE if printing from within md->start_subject
121 md pointer to matching data block (always dereferenced for md->utf)
122
123 Returns: nothing
124 */
125
126 static void
127 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
128 {
129 pcre_uint32 c;
130 BOOL utf = md->utf;
131 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
/* c is a pcre_uint32 and can exceed 255 when code units are wider than 8 bits.
isprint() is only defined for values representable as unsigned char (or EOF),
so larger values skip the isprint() call and are always shown in hex. */
132 while (length-- > 0)
133 if ((c = RAWUCHARINCTEST(p)) < 256 && isprint(c)) printf("%c", (char)c); else printf("\\x{%02x}", c);
134 }
135 #endif
136
137
138
139 /*************************************************
140 * Match a back-reference *
141 *************************************************/
142
143 /* Normally, if a back reference hasn't been set, the length that is passed is
144 negative, so the match always fails. However, in JavaScript compatibility mode,
145 the length passed is zero. Note that in caseless UTF-8 mode, the number of
146 subject bytes matched may be different to the number of reference bytes.
147
148 Arguments:
149 offset index into the offset vector
150 eptr pointer into the subject
151 length length of reference to be matched (number of bytes)
152 md points to match data block
153 caseless TRUE if caseless
154
155 Returns: >= 0 the number of subject bytes matched
156 -1 no match
157 -2 partial match; always given if at end subject
158 */
159
160 static int
161 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
162 BOOL caseless)
163 {
164 PCRE_PUCHAR eptr_start = eptr; /* Kept so the consumed length can be computed on return */
165 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; /* Start of the referenced text */
166 #ifdef SUPPORT_UTF
167 BOOL utf = md->utf;
168 #endif
169
170 #ifdef PCRE_DEBUG
171 if (eptr >= md->end_subject)
172 printf("matching subject <null>");
173 else
174 {
175 printf("matching subject ");
176 pchars(eptr, length, TRUE, md);
177 }
178 printf(" against backref ");
179 pchars(p, length, FALSE, md);
180 printf("\n");
181 #endif
182
183 /* Always fail if reference not set (and not JavaScript compatible - in that
184 case the length is passed as zero). */
185
186 if (length < 0) return -1;
187
188 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
189 properly if Unicode properties are supported. Otherwise, we can check only
190 ASCII characters. */
191
192 if (caseless)
193 {
194 #ifdef SUPPORT_UTF
195 #ifdef SUPPORT_UCP
196 if (utf)
197 {
198 /* Match characters up to the end of the reference. NOTE: the number of
199 data units matched may differ, because in UTF-8 there are some characters
200 whose upper and lower case versions have different numbers of bytes.
201 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
202 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
203 sequence of two of the latter. It is important, therefore, to check the
204 length along the reference, not along the subject (earlier code did this
205 wrong). */
206
207 PCRE_PUCHAR endptr = p + length;
208 while (p < endptr)
209 {
210 pcre_uint32 c, d;
211 const ucd_record *ur;
212 if (eptr >= md->end_subject) return -2; /* Partial match */
213 GETCHARINC(c, eptr);
214 GETCHARINC(d, p);
215 ur = GET_UCD(d);
/* The subject character c is accepted when it equals the reference character d
or d plus the other_case delta of d's record. Failing that, the caseless set
selected by ur->caseset is scanned for c. NOTE(review): the scan has no
explicit end test; it presumably relies on each set being in ascending order
and ending with a value that no character can exceed - confirm against the
definition of the ucd_caseless_sets table. */
216 if (c != d && c != d + ur->other_case)
217 {
218 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
219 for (;;)
220 {
221 if (c < *pp) return -1;
222 if (c == *pp++) break;
223 }
224 }
225 }
226 }
227 else
228 #endif
229 #endif
230
231 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
232 is no UCP support. */
/* Here "length" counts down along the reference, one code unit per iteration,
and both units are mapped through md->lcc before being compared.
NOTE(review): the third argument of TABLE_GET is presumably the value used
when the unit lies outside the table - confirm in pcre_internal.h. */
233 {
234 while (length-- > 0)
235 {
236 pcre_uint32 cc, cp;
237 if (eptr >= md->end_subject) return -2; /* Partial match */
238 cc = RAWUCHARTEST(eptr);
239 cp = RAWUCHARTEST(p);
240 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
241 p++;
242 eptr++;
243 }
244 }
245 }
246
247 /* In the caseful case, we can just compare the bytes, whether or not we
248 are in UTF-8 mode. */
249
250 else
251 {
252 while (length-- > 0)
253 {
254 if (eptr >= md->end_subject) return -2; /* Partial match */
255 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
256 }
257 }
258
259 return (int)(eptr - eptr_start); /* Distance advanced along the subject */
260 }
261
262
263
264 /***************************************************************************
265 ****************************************************************************
266 RECURSION IN THE match() FUNCTION
267
268 The match() function is highly recursive, though not every recursive call
269 increases the recursive depth. Nevertheless, some regular expressions can cause
270 it to recurse to a great depth. I was writing for Unix, so I just let it call
271 itself recursively. This uses the stack for saving everything that has to be
272 saved for a recursive call. On Unix, the stack can be large, and this works
273 fine.
274
275 It turns out that on some non-Unix-like systems there are problems with
276 programs that use a lot of stack. (This despite the fact that every last chip
277 has oodles of memory these days, and techniques for extending the stack have
278 been known for decades.) So....
279
280 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
281 calls by keeping local variables that need to be preserved in blocks of memory
282 obtained from malloc() instead of on the stack. Macros are used to
283 achieve this so that the actual code doesn't look very different to what it
284 always used to.
285
286 The original heap-recursive code used longjmp(). However, it seems that this
287 can be very slow on some operating systems. Following a suggestion from Stan
288 Switzer, the use of longjmp() has been abolished, at the cost of having to
289 provide a unique number for each call to RMATCH. There is no way of generating
290 a sequence of numbers at compile time in C. I have given them names, to make
291 them stand out more clearly.
292
293 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
294 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
295 tests. Furthermore, not using longjmp() means that local dynamic variables
296 don't have indeterminate values; this has meant that the frame size can be
297 reduced because the result can be "passed back" by straight setting of the
298 variable instead of being passed in the frame.
299 ****************************************************************************
300 ***************************************************************************/
301
302 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
303 below must be updated in sync. */
304
/* Each RMn is passed as the final ("rw") argument of one RMATCH call. In the
heap-frame version of RMATCH the name is stored in the frame's Xwhere field and
is also pasted into a resume label of the form L_RMn, so every RMATCH call site
must use a different name. */
305 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
306 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
307 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
308 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
309 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
310 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
311 RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
312
313 /* These versions of the macros use the stack, as normal. There are debugging
314 versions and production versions. Note that the "rw" argument of RMATCH isn't
315 actually used in this definition. */
316
317 #ifndef NO_RECURSE
/* Stack-recursion flavour. REGISTER expands to the "register" keyword. RMATCH
is a direct call of match() that stores the result in rrc; it is not
self-contained, because it takes rrc, mstart and rdepth from the scope in which
it is expanded (rdepth is passed on incremented by one). RRETURN is a plain
return. */
318 #define REGISTER register
319
/* Debugging versions: the same call and return, bracketed by printf() traces of
the source line. Note that this RRETURN evaluates its argument twice (once for
the printf and once for the return), so the argument must be free of side
effects. */
320 #ifdef PCRE_DEBUG
321 #define RMATCH(ra,rb,rc,rd,re,rw) \
322 { \
323 printf("match() called in line %d\n", __LINE__); \
324 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
325 printf("to line %d\n", __LINE__); \
326 }
327 #define RRETURN(ra) \
328 { \
329 printf("match() returned %d from line %d\n", ra, __LINE__); \
330 return ra; \
331 }
/* Production versions: no tracing. */
332 #else
333 #define RMATCH(ra,rb,rc,rd,re,rw) \
334 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
335 #define RRETURN(ra) return ra
336 #endif
337
338 #else
339
340
341 /* These versions of the macros manage a private stack on the heap. Note that
342 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
343 argument of match(), which never changes. */
344
/* Heap-frame flavour. REGISTER expands to nothing. */
345 #define REGISTER
346
/* RMATCH simulates a recursive call of match():
  1. It reuses the frame already linked at frame->Xnextframe, or, if there is
     none, obtains one from PUBL(stack_malloc) and links it in. An allocation
     failure is reported through RRETURN(PCRE_ERROR_NOMEMORY).
  2. It records rw in the CURRENT frame's Xwhere, so that the return path can
     find its way back to this call site.
  3. It fills in the new frame's argument fields (mstart is copied from the
     current frame, Xrdepth is the current depth plus one), makes the new frame
     current, and jumps to HEAP_RECURSE.
  4. The label L_<rw> that ends the macro is the point at which execution
     resumes after the simulated call.
The macro depends on "frame" and "mstart" being in scope where it is used. */
347 #define RMATCH(ra,rb,rc,rd,re,rw)\
348 {\
349 heapframe *newframe = frame->Xnextframe;\
350 if (newframe == NULL)\
351 {\
352 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
353 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
354 newframe->Xnextframe = NULL;\
355 frame->Xnextframe = newframe;\
356 }\
357 frame->Xwhere = rw;\
358 newframe->Xeptr = ra;\
359 newframe->Xecode = rb;\
360 newframe->Xmstart = mstart;\
361 newframe->Xoffset_top = rc;\
362 newframe->Xeptrb = re;\
363 newframe->Xrdepth = frame->Xrdepth + 1;\
364 newframe->Xprevframe = frame;\
365 frame = newframe;\
366 DPRINTF(("restarting from line %d\n", __LINE__));\
367 goto HEAP_RECURSE;\
368 L_##rw:\
369 DPRINTF(("jumped back to line %d\n", __LINE__));\
370 }
371
/* RRETURN simulates a return from match(): it steps back to the previous
frame. If there is one, the result is left in rrc and control goes to the
HEAP_RETURN label (defined outside this excerpt); if there is none, this was the
outermost frame and the function really returns. The frame being left is not
freed here; it stays linked through Xnextframe, where RMATCH can reuse it. */
372 #define RRETURN(ra)\
373 {\
374 heapframe *oldframe = frame;\
375 frame = oldframe->Xprevframe;\
376 if (frame != NULL)\
377 {\
378 rrc = ra;\
379 goto HEAP_RETURN;\
380 }\
381 return ra;\
382 }
383
384
385 /* Structure for remembering the local variables in a private frame */
386
/* One simulated stack frame of match() for the NO_RECURSE build. Each Xname
member stands in for the variable "name" of match(); the function reaches it
through a "#define name frame->Xname" macro. Frames form a doubly linked chain
that is maintained by the RMATCH and RRETURN macros. */
387 typedef struct heapframe {
388 struct heapframe *Xprevframe; /* Frame of the simulated caller; RRETURN steps back to it */
389 struct heapframe *Xnextframe; /* Frame already obtained for the next level, or NULL */
390
391 /* Function arguments that may change */
392
393 PCRE_PUCHAR Xeptr;
394 const pcre_uchar *Xecode;
395 PCRE_PUCHAR Xmstart;
396 int Xoffset_top;
397 eptrblock *Xeptrb;
398 unsigned int Xrdepth; /* RMATCH sets this to the caller's depth plus one */
399
400 /* Function local variables */
401
402 PCRE_PUCHAR Xcallpat;
403 #ifdef SUPPORT_UTF
404 PCRE_PUCHAR Xcharptr;
405 #endif
406 PCRE_PUCHAR Xdata;
407 PCRE_PUCHAR Xnext;
408 PCRE_PUCHAR Xpp;
409 PCRE_PUCHAR Xprev;
410 PCRE_PUCHAR Xsaved_eptr;
411
412 recursion_info Xnew_recursive;
413
414 BOOL Xcur_is_word;
415 BOOL Xcondition;
416 BOOL Xprev_is_word;
417
418 #ifdef SUPPORT_UCP
419 int Xprop_type;
420 unsigned int Xprop_value;
421 int Xprop_fail_result;
422 int Xoclength;
423 pcre_uchar Xocchars[6];
424 #endif
425
426 int Xcodelink;
427 int Xctype;
428 unsigned int Xfc;
429 int Xfi;
430 int Xlength;
431 int Xmax;
432 int Xmin;
433 unsigned int Xnumber;
434 int Xoffset;
435 unsigned int Xop;
436 pcre_int32 Xsave_capture_last;
437 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
438 int Xstacksave[REC_STACK_SAVE_MAX];
439
440 eptrblock Xnewptrb;
441
442 /* Where to jump back to */
443
444 int Xwhere; /* The RMn value that RMATCH stored before starting the next level */
445
446 } heapframe;
447
448 #endif
449
450
451 /***************************************************************************
452 ***************************************************************************/
453
454
455
456 /*************************************************
457 * Match from current position *
458 *************************************************/
459
460 /* This function is called recursively in many circumstances. Whenever it
461 returns a negative (error) response, the outer incarnation must also return the
462 same response. */
463
464 /* These macros pack up tests that are used for partial matching, and which
465 appear several times in the code. We set the "hit end" flag if the pointer is
466 at the end of the subject and also past the start of the subject (i.e.
467 something has been matched). For hard partial matching, we then return
468 immediately. The second one is used when we already know we are past the end of
469 the subject. */
470
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0), eptr has
reached md->end_subject, and eptr is beyond md->start_used_ptr, set md->hitend;
for md->partial > 1 also leave match() at once with PCRE_ERROR_PARTIAL. The
macro uses md, eptr and RRETURN from the scope in which it is expanded.
NOTE(review): the expansion is a bare "if" statement, so an "else" written
directly after a use of the macro would attach to that "if". */
471 #define CHECK_PARTIAL()\
472 if (md->partial != 0 && eptr >= md->end_subject && \
473 eptr > md->start_used_ptr) \
474 { \
475 md->hitend = TRUE; \
476 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
477 }
478
/* SCHECK_PARTIAL: the same, without the end-of-subject test, for call sites
that have already established that eptr is at or past the end. */
479 #define SCHECK_PARTIAL()\
480 if (md->partial != 0 && eptr > md->start_used_ptr) \
481 { \
482 md->hitend = TRUE; \
483 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
484 }
485
486
487 /* Performance note: It might be tempting to extract commonly used fields from
488 the md structure (e.g. utf, end_subject) into individual variables to improve
489 performance. Tests using gcc on a SPARC disproved this; in the first case, it
490 made performance worse.
491
492 Arguments:
493 eptr pointer to current character in subject
494 ecode pointer to current position in compiled code
495 mstart pointer to the current match start position (can be modified
496 by encountering \K)
497 offset_top current top pointer
498 md pointer to "static" info for the match
499 eptrb pointer to chain of blocks containing eptr at start of
500 brackets - for testing for empty matches
501 rdepth the recursion depth
502
503 Returns: MATCH_MATCH if matched ) these values are >= 0
504 MATCH_NOMATCH if failed to match )
505 a negative MATCH_xxx value for PRUNE, SKIP, etc
506 a negative PCRE_ERROR_xxx value if aborted by an error condition
507 (e.g. stopped by repeated call or recursion limit)
508 */
509
510 static int
511 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
512 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
513 unsigned int rdepth)
514 {
515 /* These variables do not need to be preserved over recursion in this function,
516 so they can be ordinary variables in all cases. Mark some of them with
517 "register" because they are used a lot in loops. */
518
519 register int rrc; /* Returns from recursive calls */
520 register int i; /* Used for loops not involving calls to RMATCH() */
521 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
522 register BOOL utf; /* Local copy of UTF flag for speed */
523
524 BOOL minimize, possessive; /* Quantifier options */
525 BOOL caseless;
526 int condcode;
527
528 /* When recursion is not being used, all "local" variables that have to be
529 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
530 frame on the stack here; subsequent instantiations are obtained from the heap
531 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
532 the top-level on the stack rather than malloc-ing them all gives a performance
533 boost in many cases where there is not much "recursion". */
534
535 #ifdef NO_RECURSE
536 heapframe *frame = (heapframe *)md->match_frames_base;
537
538 /* Copy in the original argument variables */
539
540 frame->Xeptr = eptr;
541 frame->Xecode = ecode;
542 frame->Xmstart = mstart;
543 frame->Xoffset_top = offset_top;
544 frame->Xeptrb = eptrb;
545 frame->Xrdepth = rdepth;
546
547 /* This is where control jumps back to in order to effect "recursion" */
548
549 HEAP_RECURSE:
550
551 /* Macros make the argument variables come from the current frame */
552
553 #define eptr frame->Xeptr
554 #define ecode frame->Xecode
555 #define mstart frame->Xmstart
556 #define offset_top frame->Xoffset_top
557 #define eptrb frame->Xeptrb
558 #define rdepth frame->Xrdepth
559
560 /* Ditto for the local variables */
561
562 #ifdef SUPPORT_UTF
563 #define charptr frame->Xcharptr
564 #endif
565 #define callpat frame->Xcallpat
566 #define codelink frame->Xcodelink
567 #define data frame->Xdata
568 #define next frame->Xnext
569 #define pp frame->Xpp
570 #define prev frame->Xprev
571 #define saved_eptr frame->Xsaved_eptr
572
573 #define new_recursive frame->Xnew_recursive
574
575 #define cur_is_word frame->Xcur_is_word
576 #define condition frame->Xcondition
577 #define prev_is_word frame->Xprev_is_word
578
579 #ifdef SUPPORT_UCP
580 #define prop_type frame->Xprop_type
581 #define prop_value frame->Xprop_value
582 #define prop_fail_result frame->Xprop_fail_result
583 #define oclength frame->Xoclength
584 #define occhars frame->Xocchars
585 #endif
586
587 #define ctype frame->Xctype
588 #define fc frame->Xfc
589 #define fi frame->Xfi
590 #define length frame->Xlength
591 #define max frame->Xmax
592 #define min frame->Xmin
593 #define number frame->Xnumber
594 #define offset frame->Xoffset
595 #define op frame->Xop
596 #define save_capture_last frame->Xsave_capture_last
597 #define save_offset1 frame->Xsave_offset1
598 #define save_offset2 frame->Xsave_offset2
599 #define save_offset3 frame->Xsave_offset3
600 #define stacksave frame->Xstacksave
601
602 #define newptrb frame->Xnewptrb
603
604 /* When recursion is being used, local variables are allocated on the stack and
605 get preserved during recursion in the normal way. In this environment, fi and
606 i, and fc and c, can be the same variables. */
607
608 #else /* NO_RECURSE not defined */
609 #define fi i
610 #define fc c
611
612 /* Many of the following variables are used only in small blocks of the code.
613 My normal style of coding would have declared them within each of those blocks.
614 However, in order to accommodate the version of this code that uses an external
615 "stack" implemented on the heap, it is easier to declare them all here, so the
616 declarations can be cut out in a block. The only declarations within blocks
617 below are for variables that do not have to be preserved over a recursive call
618 to RMATCH(). */
619
620 #ifdef SUPPORT_UTF
621 const pcre_uchar *charptr;
622 #endif
623 const pcre_uchar *callpat;
624 const pcre_uchar *data;
625 const pcre_uchar *next;
626 PCRE_PUCHAR pp;
627 const pcre_uchar *prev;
628 PCRE_PUCHAR saved_eptr;
629
630 recursion_info new_recursive;
631
632 BOOL cur_is_word;
633 BOOL condition;
634 BOOL prev_is_word;
635
636 #ifdef SUPPORT_UCP
637 int prop_type;
638 unsigned int prop_value;
639 int prop_fail_result;
640 int oclength;
641 pcre_uchar occhars[6];
642 #endif
643
644 int codelink;
645 int ctype;
646 int length;
647 int max;
648 int min;
649 unsigned int number;
650 int offset;
651 unsigned int op;
652 pcre_int32 save_capture_last;
653 int save_offset1, save_offset2, save_offset3;
654 int stacksave[REC_STACK_SAVE_MAX];
655
656 eptrblock newptrb;
657
658 /* There is a special fudge for calling match() in a way that causes it to
659 measure the size of its basic stack frame when the stack is being used for
660 recursion. The second argument (ecode) being NULL triggers this behaviour. It
661 cannot normally ever be NULL. The return is the negated value of the frame
662 size. */
663
664 if (ecode == NULL)
665 {
666 if (rdepth == 0)
667 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
668 else
669 {
670 int len = (char *)&rdepth - (char *)eptr;
671 return (len > 0)? -len : len;
672 }
673 }
674 #endif /* NO_RECURSE */
675
676 /* To save space on the stack and in the heap frame, I have doubled up on some
677 of the local variables that are used only in localised parts of the code, but
678 still need to be preserved over recursive calls of match(). These macros define
679 the alternative names that are used. */
680
681 #define allow_zero cur_is_word
682 #define cbegroup condition
683 #define code_offset codelink
684 #define condassert condition
685 #define matched_once prev_is_word
686 #define foc number
687 #define save_mark data
688
689 /* These statements are here to stop the compiler complaining about uninitialized
690 variables. */
691
692 #ifdef SUPPORT_UCP
693 prop_value = 0;
694 prop_fail_result = 0;
695 #endif
696
697
698 /* This label is used for tail recursion, which is used in a few cases even
699 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
700 used. Thanks to Ian Taylor for noticing this possibility and sending the
701 original patch. */
702
703 TAIL_RECURSE:
704
705 /* OK, now we can get on with the real code of the function. Recursive calls
706 are specified by the macro RMATCH and RRETURN is used to return. When
707 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
708 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
709 defined). However, RMATCH isn't like a function call because it's quite a
710 complicated macro. It has to be used in one particular way. This shouldn't,
711 however, impact performance when true recursion is being used. */
712
713 #ifdef SUPPORT_UTF
714 utf = md->utf; /* Local copy of the flag */
715 #else
716 utf = FALSE;
717 #endif
718
719 /* First check that we haven't called match() too many times, or that we
720 haven't exceeded the recursive call limit. */
721
722 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
723 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
724
725 /* At the start of a group with an unlimited repeat that may match an empty
726 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
727 done this way to save having to use another function argument, which would take
728 up space on the stack. See also MATCH_CONDASSERT below.
729
730 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
731 such remembered pointers, to be checked when we hit the closing ket, in order
732 to break infinite loops that match no characters. When match() is called in
733 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
734 NOT be used with tail recursion, because the memory block that is used is on
735 the stack, so a new one may be required for each match(). */
736
737 if (md->match_function_type == MATCH_CBEGROUP)
738 {
739 newptrb.epb_saved_eptr = eptr;
740 newptrb.epb_prev = eptrb;
741 eptrb = &newptrb;
742 md->match_function_type = 0;
743 }
744
745 /* Now start processing the opcodes. */
746
747 for (;;)
748 {
749 minimize = possessive = FALSE;
750 op = *ecode;
751
752 switch(op)
753 {
754 case OP_MARK:
755 md->nomatch_mark = ecode + 2;
756 md->mark = NULL; /* In case previously set by assertion */
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
758 eptrb, RM55);
759 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
760 md->mark == NULL) md->mark = ecode + 2;
761
762 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
763 argument, and we must check whether that argument matches this MARK's
764 argument. It is passed back in md->start_match_ptr (an overloading of that
765 variable). If it does match, we reset that variable to the current subject
766 position and return MATCH_SKIP. Otherwise, pass back the return code
767 unaltered. */
768
769 else if (rrc == MATCH_SKIP_ARG &&
770 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
771 {
772 md->start_match_ptr = eptr;
773 RRETURN(MATCH_SKIP);
774 }
775 RRETURN(rrc);
776
777 case OP_FAIL:
778 RRETURN(MATCH_NOMATCH);
779
780 /* COMMIT overrides PRUNE, SKIP, and THEN */
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
786 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
787 rrc != MATCH_THEN)
788 RRETURN(rrc);
789 RRETURN(MATCH_COMMIT);
790
791 /* PRUNE overrides THEN */
792
793 case OP_PRUNE:
794 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
795 eptrb, RM51);
796 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
797 RRETURN(MATCH_PRUNE);
798
799 case OP_PRUNE_ARG:
800 md->nomatch_mark = ecode + 2;
801 md->mark = NULL; /* In case previously set by assertion */
802 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
803 eptrb, RM56);
804 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
805 md->mark == NULL) md->mark = ecode + 2;
806 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
807 RRETURN(MATCH_PRUNE);
808
809 /* SKIP overrides PRUNE and THEN */
810
811 case OP_SKIP:
812 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
813 eptrb, RM53);
814 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
815 RRETURN(rrc);
816 md->start_match_ptr = eptr; /* Pass back current position */
817 RRETURN(MATCH_SKIP);
818
819 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
820 nomatch_mark. There is a flag that disables this opcode when re-matching a
821 pattern that ended with a SKIP for which there was not a matching MARK. */
822
823 case OP_SKIP_ARG:
824 if (md->ignore_skip_arg)
825 {
826 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
827 break;
828 }
829 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
830 eptrb, RM57);
831 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
832 RRETURN(rrc);
833
834 /* Pass back the current skip name by overloading md->start_match_ptr and
835 returning the special MATCH_SKIP_ARG return code. This will either be
836 caught by a matching MARK, or get to the top, where it causes a rematch
837 with the md->ignore_skip_arg flag set. */
838
839 md->start_match_ptr = ecode + 2;
840 RRETURN(MATCH_SKIP_ARG);
841
842 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
843 the branch in which it occurs can be determined. Overload the start of
844 match pointer to do this. */
845
846 case OP_THEN:
847 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
848 eptrb, RM54);
849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
850 md->start_match_ptr = ecode;
851 RRETURN(MATCH_THEN);
852
853 case OP_THEN_ARG:
854 md->nomatch_mark = ecode + 2;
855 md->mark = NULL; /* In case previously set by assertion */
856 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
857 md, eptrb, RM58);
858 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
859 md->mark == NULL) md->mark = ecode + 2;
860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
861 md->start_match_ptr = ecode;
862 RRETURN(MATCH_THEN);
863
864 /* Handle an atomic group that does not contain any capturing parentheses.
865 This can be handled like an assertion. Prior to 8.13, all atomic groups
866 were handled this way. In 8.13, the code was changed as below for ONCE, so
867 that backups pass through the group and thereby reset captured values.
868 However, this uses a lot more stack, so in 8.20, atomic groups that do not
869 contain any captures generate OP_ONCE_NC, which can be handled in the old,
870 less stack intensive way.
871
872 Check the alternative branches in turn - the matching won't pass the KET
873 for this kind of subpattern. If any one branch matches, we carry on as at
874 the end of a normal bracket, leaving the subject pointer, but resetting
875 the start-of-match value in case it was changed by \K. */
876
877 case OP_ONCE_NC:
878 prev = ecode;
879 saved_eptr = eptr;
880 save_mark = md->mark;
881 do
882 {
883 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
884 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
885 {
886 mstart = md->start_match_ptr;
887 break;
888 }
889 if (rrc == MATCH_THEN)
890 {
891 next = ecode + GET(ecode,1);
892 if (md->start_match_ptr < next &&
893 (*ecode == OP_ALT || *next == OP_ALT))
894 rrc = MATCH_NOMATCH;
895 }
896
897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
898 ecode += GET(ecode,1);
899 md->mark = save_mark;
900 }
901 while (*ecode == OP_ALT);
902
903 /* If we hit the end of the group (which could be repeated), fail */
904
905 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
906
907 /* Continue as from after the group, updating the offsets high water
908 mark, since extracts may have been taken. */
909
910 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
911
912 offset_top = md->end_offset_top;
913 eptr = md->end_match_ptr;
914
915 /* For a non-repeating ket, just continue at this level. This also
916 happens for a repeating ket if no characters were matched in the group.
917 This is the forcible breaking of infinite loops as implemented in Perl
918 5.005. */
919
920 if (*ecode == OP_KET || eptr == saved_eptr)
921 {
922 ecode += 1+LINK_SIZE;
923 break;
924 }
925
926 /* The repeating kets try the rest of the pattern or restart from the
927 preceding bracket, in the appropriate order. The second "call" of match()
928 uses tail recursion, to avoid using another stack frame. */
929
930 if (*ecode == OP_KETRMIN)
931 {
932 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
934 ecode = prev;
935 goto TAIL_RECURSE;
936 }
937 else /* OP_KETRMAX */
938 {
939 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
941 ecode += 1 + LINK_SIZE;
942 goto TAIL_RECURSE;
943 }
944 /* Control never gets here */
945
946 /* Handle a capturing bracket, other than those that are possessive with an
947 unlimited repeat. If there is space in the offset vector, save the current
948 subject position in the working slot at the top of the vector. We mustn't
949 change the current values of the data slot, because they may be set from a
950 previous iteration of this group, and be referred to by a reference inside
951 the group. A failure to match might occur after the group has succeeded,
952 if something later on doesn't match. For this reason, we need to restore
953 the working value and also the values of the final offsets, in case they
954 were set by a previous iteration of the same bracket.
955
956 If there isn't enough space in the offset vector, treat this as if it were
957 a non-capturing bracket. Don't worry about setting the flag for the error
958 case here; that is handled in the code for KET. */
959
960 case OP_CBRA:
961 case OP_SCBRA:
962 number = GET2(ecode, 1+LINK_SIZE);
963 offset = number << 1;
964
965 #ifdef PCRE_DEBUG
966 printf("start bracket %d\n", number);
967 printf("subject=");
968 pchars(eptr, 16, TRUE, md);
969 printf("\n");
970 #endif
971
972 if (offset < md->offset_max)
973 {
974 save_offset1 = md->offset_vector[offset];
975 save_offset2 = md->offset_vector[offset+1];
976 save_offset3 = md->offset_vector[md->offset_end - number];
977 save_capture_last = md->capture_last;
978 save_mark = md->mark;
979
980 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
981 md->offset_vector[md->offset_end - number] =
982 (int)(eptr - md->start_subject);
983
984 for (;;)
985 {
986 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
987 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
988 eptrb, RM1);
989 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
990
991 /* If we backed up to a THEN, check whether it is within the current
992 branch by comparing the address of the THEN that is passed back with
993 the end of the branch. If it is within the current branch, and the
994 branch is one of two or more alternatives (it either starts or ends
995 with OP_ALT), we have reached the limit of THEN's action, so convert
996 the return code to NOMATCH, which will cause normal backtracking to
997 happen from now on. Otherwise, THEN is passed back to an outer
998 alternative. This implements Perl's treatment of parenthesized groups,
999 where a group not containing | does not affect the current alternative,
1000 that is, (X) is NOT the same as (X|(*F)). */
1001
1002 if (rrc == MATCH_THEN)
1003 {
1004 next = ecode + GET(ecode,1);
1005 if (md->start_match_ptr < next &&
1006 (*ecode == OP_ALT || *next == OP_ALT))
1007 rrc = MATCH_NOMATCH;
1008 }
1009
1010 /* Anything other than NOMATCH is passed back. */
1011
1012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1013 md->capture_last = save_capture_last;
1014 ecode += GET(ecode, 1);
1015 md->mark = save_mark;
1016 if (*ecode != OP_ALT) break;
1017 }
1018
1019 DPRINTF(("bracket %d failed\n", number));
1020 md->offset_vector[offset] = save_offset1;
1021 md->offset_vector[offset+1] = save_offset2;
1022 md->offset_vector[md->offset_end - number] = save_offset3;
1023
1024 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1025
1026 RRETURN(rrc);
1027 }
1028
1029 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1030 as a non-capturing bracket. */
1031
1032 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034
1035 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1036
1037 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1038 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1039
1040 /* Non-capturing or atomic group, except for possessive with unlimited
1041 repeat and ONCE group with no captures. Loop for all the alternatives.
1042
1043 When we get to the final alternative within the brackets, we used to return
1044 the result of a recursive call to match() whatever happened so it was
1045 possible to reduce stack usage by turning this into a tail recursion,
1046 except in the case of a possibly empty group. However, now that there is
1047 the possibility of (*THEN) occurring in the final alternative, this
1048 optimization is no longer always possible.
1049
1050 We can optimize if we know there are no (*THEN)s in the pattern; at present
1051 this is the best that can be done.
1052
1053 MATCH_ONCE is returned when the end of an atomic group is successfully
1054 reached, but subsequent matching fails. It passes back up the tree (causing
1055 captured values to be reset) until the original atomic group level is
1056 reached. This is tested by comparing md->once_target with the start of the
1057 group. At this point, the return is converted into MATCH_NOMATCH so that
1058 previous backup points can be taken. */
1059
1060 case OP_ONCE:
1061 case OP_BRA:
1062 case OP_SBRA:
1063 DPRINTF(("start non-capturing bracket\n"));
1064
1065 for (;;)
1066 {
1067 if (op >= OP_SBRA || op == OP_ONCE)
1068 md->match_function_type = MATCH_CBEGROUP;
1069
1070 /* If this is not a possibly empty group, and there are no (*THEN)s in
1071 the pattern, and this is the final alternative, optimize as described
1072 above. */
1073
1074 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1075 {
1076 ecode += PRIV(OP_lengths)[*ecode];
1077 goto TAIL_RECURSE;
1078 }
1079
1080 /* In all other cases, we have to make another call to match(). */
1081
1082 save_mark = md->mark;
1083 save_capture_last = md->capture_last;
1084 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1085 RM2);
1086
1087 /* See comment in the code for capturing groups above about handling
1088 THEN. */
1089
1090 if (rrc == MATCH_THEN)
1091 {
1092 next = ecode + GET(ecode,1);
1093 if (md->start_match_ptr < next &&
1094 (*ecode == OP_ALT || *next == OP_ALT))
1095 rrc = MATCH_NOMATCH;
1096 }
1097
1098 if (rrc != MATCH_NOMATCH)
1099 {
1100 if (rrc == MATCH_ONCE)
1101 {
1102 const pcre_uchar *scode = ecode;
1103 if (*scode != OP_ONCE) /* If not at start, find it */
1104 {
1105 while (*scode == OP_ALT) scode += GET(scode, 1);
1106 scode -= GET(scode, 1);
1107 }
1108 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1109 }
1110 RRETURN(rrc);
1111 }
1112 ecode += GET(ecode, 1);
1113 md->mark = save_mark;
1114 if (*ecode != OP_ALT) break;
1115 md->capture_last = save_capture_last;
1116 }
1117
1118 RRETURN(MATCH_NOMATCH);
1119
1120 /* Handle possessive capturing brackets with an unlimited repeat. We come
1121 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1122 handled similarly to the normal case above. However, the matching is
1123 different. The end of these brackets will always be OP_KETRPOS, which
1124 returns MATCH_KETRPOS without going further in the pattern. By this means
1125 we can handle the group by iteration rather than recursion, thereby
1126 reducing the amount of stack needed. */
1127
1128 case OP_CBRAPOS:
1129 case OP_SCBRAPOS:
1130 allow_zero = FALSE;
1131
1132 POSSESSIVE_CAPTURE:
1133 number = GET2(ecode, 1+LINK_SIZE);
1134 offset = number << 1;
1135
1136 #ifdef PCRE_DEBUG
1137 printf("start possessive bracket %d\n", number);
1138 printf("subject=");
1139 pchars(eptr, 16, TRUE, md);
1140 printf("\n");
1141 #endif
1142
1143 if (offset < md->offset_max)
1144 {
1145 matched_once = FALSE;
1146 code_offset = (int)(ecode - md->start_code);
1147
1148 save_offset1 = md->offset_vector[offset];
1149 save_offset2 = md->offset_vector[offset+1];
1150 save_offset3 = md->offset_vector[md->offset_end - number];
1151 save_capture_last = md->capture_last;
1152
1153 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1154
1155 /* Each time round the loop, save the current subject position for use
1156 when the group matches. For MATCH_MATCH, the group has matched, so we
1157 restart it with a new subject starting position, remembering that we had
1158 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1159 usual. If we haven't matched any alternatives in any iteration, check to
1160 see if a previous iteration matched. If so, the group has matched;
1161 continue from afterwards. Otherwise it has failed; restore the previous
1162 capture values before returning NOMATCH. */
1163
1164 for (;;)
1165 {
1166 md->offset_vector[md->offset_end - number] =
1167 (int)(eptr - md->start_subject);
1168 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1169 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1170 eptrb, RM63);
1171 if (rrc == MATCH_KETRPOS)
1172 {
1173 offset_top = md->end_offset_top;
1174 eptr = md->end_match_ptr;
1175 ecode = md->start_code + code_offset;
1176 save_capture_last = md->capture_last;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 md->capture_last = save_capture_last;
1194 ecode += GET(ecode, 1);
1195 if (*ecode != OP_ALT) break;
1196 }
1197
1198 if (!matched_once)
1199 {
1200 md->offset_vector[offset] = save_offset1;
1201 md->offset_vector[offset+1] = save_offset2;
1202 md->offset_vector[md->offset_end - number] = save_offset3;
1203 }
1204
1205 if (allow_zero || matched_once)
1206 {
1207 ecode += 1 + LINK_SIZE;
1208 break;
1209 }
1210
1211 RRETURN(MATCH_NOMATCH);
1212 }
1213
1214 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1215 as a non-capturing bracket. */
1216
1217 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1218 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1219
1220 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1221
1222 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1223 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1224
1225 /* Non-capturing possessive bracket with unlimited repeat. We come here
1226 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1227 without the capturing complication. It is written out separately for speed
1228 and cleanliness. */
1229
1230 case OP_BRAPOS:
1231 case OP_SBRAPOS:
1232 allow_zero = FALSE;
1233
1234 POSSESSIVE_NON_CAPTURE:
1235 matched_once = FALSE;
1236 code_offset = (int)(ecode - md->start_code);
1237 save_capture_last = md->capture_last;
1238
1239 for (;;)
1240 {
1241 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1242 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1243 eptrb, RM48);
1244 if (rrc == MATCH_KETRPOS)
1245 {
1246 offset_top = md->end_offset_top;
1247 eptr = md->end_match_ptr;
1248 ecode = md->start_code + code_offset;
1249 matched_once = TRUE;
1250 continue;
1251 }
1252
1253 /* See comment in the code for capturing groups above about handling
1254 THEN. */
1255
1256 if (rrc == MATCH_THEN)
1257 {
1258 next = ecode + GET(ecode,1);
1259 if (md->start_match_ptr < next &&
1260 (*ecode == OP_ALT || *next == OP_ALT))
1261 rrc = MATCH_NOMATCH;
1262 }
1263
1264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1265 ecode += GET(ecode, 1);
1266 if (*ecode != OP_ALT) break;
1267 md->capture_last = save_capture_last;
1268 }
1269
1270 if (matched_once || allow_zero)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275 RRETURN(MATCH_NOMATCH);
1276
1277 /* Control never reaches here. */
1278
1279 /* Conditional group: compilation checked that there are no more than
1280 two branches. If the condition is false, skipping the first branch takes us
1281 past the end if there is only one branch, but that's OK because that is
1282 exactly what going to the ket would do. */
1283
1284 case OP_COND:
1285 case OP_SCOND:
1286 codelink = GET(ecode, 1);
1287
1288 /* Because of the way auto-callout works during compile, a callout item is
1289 inserted between OP_COND and an assertion condition. */
1290
1291 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1292 {
1293 if (PUBL(callout) != NULL)
1294 {
1295 PUBL(callout_block) cb;
1296 cb.version = 2; /* Version 2 of the callout block */
1297 cb.callout_number = ecode[LINK_SIZE+2];
1298 cb.offset_vector = md->offset_vector;
1299 #if defined COMPILE_PCRE8
1300 cb.subject = (PCRE_SPTR)md->start_subject;
1301 #elif defined COMPILE_PCRE16
1302 cb.subject = (PCRE_SPTR16)md->start_subject;
1303 #elif defined COMPILE_PCRE32
1304 cb.subject = (PCRE_SPTR32)md->start_subject;
1305 #endif
1306 cb.subject_length = (int)(md->end_subject - md->start_subject);
1307 cb.start_match = (int)(mstart - md->start_subject);
1308 cb.current_position = (int)(eptr - md->start_subject);
1309 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1310 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1311 cb.capture_top = offset_top/2;
1312 cb.capture_last = md->capture_last & CAPLMASK;
1313 /* Internal change requires this for API compatibility. */
1314 if (cb.capture_last == 0) cb.capture_last = -1;
1315 cb.callout_data = md->callout_data;
1316 cb.mark = md->nomatch_mark;
1317 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1318 if (rrc < 0) RRETURN(rrc);
1319 }
1320 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1321 }
1322
1323 condcode = ecode[LINK_SIZE+1];
1324
1325 /* Now see what the actual condition is */
1326
1327 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1328 {
1329 if (md->recursive == NULL) /* Not recursing => FALSE */
1330 {
1331 condition = FALSE;
1332 ecode += GET(ecode, 1);
1333 }
1334 else
1335 {
1336 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1337 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1338
1339 /* If the test is for recursion into a specific subpattern, and it is
1340 false, but the test was set up by name, scan the table to see if the
1341 name refers to any other numbers, and test them. The condition is true
1342 if any one is set. */
1343
1344 if (!condition && condcode == OP_NRREF)
1345 {
1346 pcre_uchar *slotA = md->name_table;
1347 for (i = 0; i < md->name_count; i++)
1348 {
1349 if (GET2(slotA, 0) == recno) break;
1350 slotA += md->name_entry_size;
1351 }
1352
1353 /* Found a name for the number - there can be only one; duplicate
1354 names for different numbers are allowed, but not vice versa. First
1355 scan down for duplicates. */
1356
1357 if (i < md->name_count)
1358 {
1359 pcre_uchar *slotB = slotA;
1360 while (slotB > md->name_table)
1361 {
1362 slotB -= md->name_entry_size;
1363 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1364 {
1365 condition = GET2(slotB, 0) == md->recursive->group_num;
1366 if (condition) break;
1367 }
1368 else break;
1369 }
1370
1371 /* Scan up for duplicates */
1372
1373 if (!condition)
1374 {
1375 slotB = slotA;
1376 for (i++; i < md->name_count; i++)
1377 {
1378 slotB += md->name_entry_size;
1379 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1380 {
1381 condition = GET2(slotB, 0) == md->recursive->group_num;
1382 if (condition) break;
1383 }
1384 else break;
1385 }
1386 }
1387 }
1388 }
1389
1390 /* Choose branch according to the condition */
1391
1392 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1393 }
1394 }
1395
1396 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1397 {
1398 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1399 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1400
1401 /* If the numbered capture is unset, but the reference was by name,
1402 scan the table to see if the name refers to any other numbers, and test
1403 them. The condition is true if any one is set. This is tediously similar
1404 to the code above, but not close enough to try to amalgamate. */
1405
1406 if (!condition && condcode == OP_NCREF)
1407 {
1408 unsigned int refno = offset >> 1;
1409 pcre_uchar *slotA = md->name_table;
1410
1411 for (i = 0; i < md->name_count; i++)
1412 {
1413 if (GET2(slotA, 0) == refno) break;
1414 slotA += md->name_entry_size;
1415 }
1416
1417 /* Found a name for the number - there can be only one; duplicate names
1418 for different numbers are allowed, but not vice versa. First scan down
1419 for duplicates. */
1420
1421 if (i < md->name_count)
1422 {
1423 pcre_uchar *slotB = slotA;
1424 while (slotB > md->name_table)
1425 {
1426 slotB -= md->name_entry_size;
1427 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1428 {
1429 offset = GET2(slotB, 0) << 1;
1430 condition = offset < offset_top &&
1431 md->offset_vector[offset] >= 0;
1432 if (condition) break;
1433 }
1434 else break;
1435 }
1436
1437 /* Scan up for duplicates */
1438
1439 if (!condition)
1440 {
1441 slotB = slotA;
1442 for (i++; i < md->name_count; i++)
1443 {
1444 slotB += md->name_entry_size;
1445 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1446 {
1447 offset = GET2(slotB, 0) << 1;
1448 condition = offset < offset_top &&
1449 md->offset_vector[offset] >= 0;
1450 if (condition) break;
1451 }
1452 else break;
1453 }
1454 }
1455 }
1456 }
1457
1458 /* Choose branch according to the condition */
1459
1460 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1461 }
1462
1463 else if (condcode == OP_DEF) /* DEFINE - always false */
1464 {
1465 condition = FALSE;
1466 ecode += GET(ecode, 1);
1467 }
1468
1469 /* The condition is an assertion. Call match() to evaluate it - setting
1470 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1471 an assertion. */
1472
1473 else
1474 {
1475 md->match_function_type = MATCH_CONDASSERT;
1476 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1477 if (rrc == MATCH_MATCH)
1478 {
1479 if (md->end_offset_top > offset_top)
1480 offset_top = md->end_offset_top; /* Captures may have happened */
1481 condition = TRUE;
1482 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1483 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1484 }
1485
1486 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1487 assertion; it is therefore treated as NOMATCH. */
1488
1489 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1490 {
1491 RRETURN(rrc); /* Need braces because of following else */
1492 }
1493 else
1494 {
1495 condition = FALSE;
1496 ecode += codelink;
1497 }
1498 }
1499
1500 /* We are now at the branch that is to be obeyed. As there is only one, can
1501 use tail recursion to avoid using another stack frame, except when there is
1502 unlimited repeat of a possibly empty group. In the latter case, a recursive
1503 call to match() is always required, unless the second alternative doesn't
1504 exist, in which case we can just plough on. Note that, for compatibility
1505 with Perl, the | in a conditional group is NOT treated as creating two
1506 alternatives. If a THEN is encountered in the branch, it propagates out to
1507 the enclosing alternative (unless nested in a deeper set of alternatives,
1508 of course). */
1509
1510 if (condition || *ecode == OP_ALT)
1511 {
1512 if (op != OP_SCOND)
1513 {
1514 ecode += 1 + LINK_SIZE;
1515 goto TAIL_RECURSE;
1516 }
1517
1518 md->match_function_type = MATCH_CBEGROUP;
1519 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1520 RRETURN(rrc);
1521 }
1522
1523 /* Condition false & no alternative; continue after the group. */
1524
1525 else
1526 {
1527 ecode += 1 + LINK_SIZE;
1528 }
1529 break;
1530
1531
1532 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1533 to close any currently open capturing brackets. */
1534
1535 case OP_CLOSE:
1536 number = GET2(ecode, 1); /* Must be less than 65536 */
1537 offset = number << 1;
1538
1539 #ifdef PCRE_DEBUG
1540 printf("end bracket %d at *ACCEPT", number);
1541 printf("\n");
1542 #endif
1543
1544 md->capture_last = (md->capture_last & OVFLMASK) | number;
1545 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1546 {
1547 md->offset_vector[offset] =
1548 md->offset_vector[md->offset_end - number];
1549 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1550 if (offset_top <= offset) offset_top = offset + 2;
1551 }
1552 ecode += 1 + IMM2_SIZE;
1553 break;
1554
1555
1556 /* End of the pattern, either real or forced. */
1557
1558 case OP_END:
1559 case OP_ACCEPT:
1560 case OP_ASSERT_ACCEPT:
1561
1562 /* If we have matched an empty string, fail if not in an assertion and not
1563 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1564 is set and we have matched at the start of the subject. In both cases,
1565 backtracking will then try other alternatives, if any. */
1566
1567 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1568 md->recursive == NULL &&
1569 (md->notempty ||
1570 (md->notempty_atstart &&
1571 mstart == md->start_subject + md->start_offset)))
1572 RRETURN(MATCH_NOMATCH);
1573
1574 /* Otherwise, we have a match. */
1575
1576 md->end_match_ptr = eptr; /* Record where we ended */
1577 md->end_offset_top = offset_top; /* and how many extracts were taken */
1578 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1579
1580 /* For some reason, the macros don't work properly if an expression is
1581 given as the argument to RRETURN when the heap is in use. */
1582
1583 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1584 RRETURN(rrc);
1585
1586 /* Assertion brackets. Check the alternative branches in turn - the
1587 matching won't pass the KET for an assertion. If any one branch matches,
1588 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1589 start of each branch to move the current point backwards, so the code at
1590 this level is identical to the lookahead case. When the assertion is part
1591 of a condition, we want to return immediately afterwards. The caller of
1592 this incarnation of the match() function will have set MATCH_CONDASSERT in
1593 md->match_function_type, and one of these opcodes will be the first opcode
1594 that is processed. We use a local variable that is preserved over calls to
1595 match() to remember this case. */
1596
1597 case OP_ASSERT:
1598 case OP_ASSERTBACK:
1599 save_mark = md->mark;
1600 if (md->match_function_type == MATCH_CONDASSERT)
1601 {
1602 condassert = TRUE;
1603 md->match_function_type = 0;
1604 }
1605 else condassert = FALSE;
1606
1607 do
1608 {
1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1610 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1611 {
1612 mstart = md->start_match_ptr; /* In case \K reset it */
1613 break;
1614 }
1615 md->mark = save_mark;
1616
1617 /* A COMMIT failure must fail the entire assertion, without trying any
1618 subsequent branches. */
1619
1620 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1621
1622 /* PCRE does not allow THEN to escape beyond an assertion; it
1623 is treated as NOMATCH. */
1624
1625 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1626 ecode += GET(ecode, 1);
1627 }
1628 while (*ecode == OP_ALT);
1629
1630 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1631
1632 /* If checking an assertion for a condition, return MATCH_MATCH. */
1633
1634 if (condassert) RRETURN(MATCH_MATCH);
1635
1636 /* Continue from after the assertion, updating the offsets high water
1637 mark, since extracts may have been taken during the assertion. */
1638
1639 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1640 ecode += 1 + LINK_SIZE;
1641 offset_top = md->end_offset_top;
1642 continue;
1643
1644 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1645 PRUNE, or COMMIT means we must assume failure without checking subsequent
1646 branches. */
1647
1648 case OP_ASSERT_NOT:
1649 case OP_ASSERTBACK_NOT:
1650 save_mark = md->mark;
1651 if (md->match_function_type == MATCH_CONDASSERT)
1652 {
1653 condassert = TRUE;
1654 md->match_function_type = 0;
1655 }
1656 else condassert = FALSE;
1657
1658 do
1659 {
1660 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1661 md->mark = save_mark;
1662 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1663 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1664 {
1665 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1666 break;
1667 }
1668
1669 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1670 as NOMATCH. */
1671
1672 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1673 ecode += GET(ecode,1);
1674 }
1675 while (*ecode == OP_ALT);
1676
1677 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1678
1679 ecode += 1 + LINK_SIZE;
1680 continue;
1681
1682 /* Move the subject pointer back. This occurs only at the start of
1683 each branch of a lookbehind assertion. If we are too close to the start to
1684 move back, this match function fails. When working with UTF-8 we move
1685 back a number of characters, not bytes. */
1686
1687 case OP_REVERSE:
1688 #ifdef SUPPORT_UTF
1689 if (utf)
1690 {
1691 i = GET(ecode, 1);
1692 while (i-- > 0)
1693 {
1694 eptr--;
1695 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1696 BACKCHAR(eptr);
1697 }
1698 }
1699 else
1700 #endif
1701
1702 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1703
1704 {
1705 eptr -= GET(ecode, 1);
1706 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1707 }
1708
1709 /* Save the earliest consulted character, then skip to next op code */
1710
1711 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1712 ecode += 1 + LINK_SIZE;
1713 break;
1714
1715 /* The callout item calls an external function, if one is provided, passing
1716 details of the match so far. This is mainly for debugging, though the
1717 function is able to force a failure. */
1718
1719 case OP_CALLOUT:
1720 if (PUBL(callout) != NULL)
1721 {
1722 PUBL(callout_block) cb;
1723 cb.version = 2; /* Version 2 of the callout block */
1724 cb.callout_number = ecode[1];
1725 cb.offset_vector = md->offset_vector;
1726 #if defined COMPILE_PCRE8
1727 cb.subject = (PCRE_SPTR)md->start_subject;
1728 #elif defined COMPILE_PCRE16
1729 cb.subject = (PCRE_SPTR16)md->start_subject;
1730 #elif defined COMPILE_PCRE32
1731 cb.subject = (PCRE_SPTR32)md->start_subject;
1732 #endif
1733 cb.subject_length = (int)(md->end_subject - md->start_subject);
1734 cb.start_match = (int)(mstart - md->start_subject);
1735 cb.current_position = (int)(eptr - md->start_subject);
1736 cb.pattern_position = GET(ecode, 2);
1737 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1738 cb.capture_top = offset_top/2;
1739 cb.capture_last = md->capture_last & CAPLMASK;
1740 /* Internal change requires this for API compatibility. */
1741 if (cb.capture_last == 0) cb.capture_last = -1;
1742 cb.callout_data = md->callout_data;
1743 cb.mark = md->nomatch_mark;
1744 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1745 if (rrc < 0) RRETURN(rrc);
1746 }
1747 ecode += 2 + 2*LINK_SIZE;
1748 break;
1749
1750 /* Recursion either matches the current regex, or some subexpression. The
1751 offset data is the offset to the starting bracket from the start of the
1752 whole pattern. (This is so that it works from duplicated subpatterns.)
1753
1754 The state of the capturing groups is preserved over recursion, and
1755 re-instated afterwards. We don't know how many are started and not yet
1756 finished (offset_top records the completed total) so we just have to save
1757 all the potential data. There may be up to 65535 such values, which is too
1758 large to put on the stack, but using malloc for small numbers seems
1759 expensive. As a compromise, the stack is used when there are no more than
1760 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1761
1762 There are also other values that have to be saved. We use a chained
1763 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1764 for the original version of this logic. It has, however, been hacked around
1765 a lot, so he is not to blame for the current way it works. */
1766
1767 case OP_RECURSE:
1768 {
1769 recursion_info *ri;
1770 unsigned int recno;
1771
1772 callpat = md->start_code + GET(ecode, 1);
1773 recno = (callpat == md->start_code)? 0 :
1774 GET2(callpat, 1 + LINK_SIZE);
1775
1776 /* Check for repeating a recursion without advancing the subject pointer.
1777 This should catch convoluted mutual recursions. (Some simple cases are
1778 caught at compile time.) */
1779
1780 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1781 if (recno == ri->group_num && eptr == ri->subject_position)
1782 RRETURN(PCRE_ERROR_RECURSELOOP);
1783
1784 /* Add to "recursing stack" */
1785
1786 new_recursive.group_num = recno;
1787 new_recursive.saved_capture_last = md->capture_last;
1788 new_recursive.subject_position = eptr;
1789 new_recursive.prevrec = md->recursive;
1790 md->recursive = &new_recursive;
1791
1792 /* Where to continue from afterwards */
1793
1794 ecode += 1 + LINK_SIZE;
1795
1796 /* Now save the offset data */
1797
1798 new_recursive.saved_max = md->offset_end;
1799 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1800 new_recursive.offset_save = stacksave;
1801 else
1802 {
1803 new_recursive.offset_save =
1804 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1805 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1806 }
1807 memcpy(new_recursive.offset_save, md->offset_vector,
1808 new_recursive.saved_max * sizeof(int));
1809
1810 /* OK, now we can do the recursion. After processing each alternative,
1811 restore the offset data and the last captured value. If there were nested
1812 recursions, md->recursive might be changed, so reset it before looping.
1813 */
1814
1815 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1816 cbegroup = (*callpat >= OP_SBRA);
1817 do
1818 {
1819 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1820 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1821 md, eptrb, RM6);
1822 memcpy(md->offset_vector, new_recursive.offset_save,
1823 new_recursive.saved_max * sizeof(int));
1824 md->capture_last = new_recursive.saved_capture_last;
1825 md->recursive = new_recursive.prevrec;
1826 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1827 {
1828 DPRINTF(("Recursion matched\n"));
1829 if (new_recursive.offset_save != stacksave)
1830 (PUBL(free))(new_recursive.offset_save);
1831
1832 /* Set where we got to in the subject, and reset the start in case
1833 it was changed by \K. This *is* propagated back out of a recursion,
1834 for Perl compatibility. */
1835
1836 eptr = md->end_match_ptr;
1837 mstart = md->start_match_ptr;
1838 goto RECURSION_MATCHED; /* Exit loop; end processing */
1839 }
1840
1841 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1842 is treated as NOMATCH. */
1843
1844 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1845 rrc != MATCH_COMMIT)
1846 {
1847 DPRINTF(("Recursion gave error %d\n", rrc));
1848 if (new_recursive.offset_save != stacksave)
1849 (PUBL(free))(new_recursive.offset_save);
1850 RRETURN(rrc);
1851 }
1852
1853 md->recursive = &new_recursive;
1854 callpat += GET(callpat, 1);
1855 }
1856 while (*callpat == OP_ALT);
1857
1858 DPRINTF(("Recursion didn't match\n"));
1859 md->recursive = new_recursive.prevrec;
1860 if (new_recursive.offset_save != stacksave)
1861 (PUBL(free))(new_recursive.offset_save);
1862 RRETURN(MATCH_NOMATCH);
1863 }
1864
1865 RECURSION_MATCHED:
1866 break;
1867
1868 /* An alternation is the end of a branch; scan along to find the end of the
1869 bracketed group and go to there. */
1870
1871 case OP_ALT:
1872 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1873 break;
1874
1875 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1876 indicating that it may occur zero times. It may repeat infinitely, or not
1877 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1878 with fixed upper repeat limits are compiled as a number of copies, with the
1879 optional ones preceded by BRAZERO or BRAMINZERO. */
1880
1881 case OP_BRAZERO:
1882 next = ecode + 1;
1883 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1885 do next += GET(next, 1); while (*next == OP_ALT);
1886 ecode = next + 1 + LINK_SIZE;
1887 break;
1888
1889 case OP_BRAMINZERO:
1890 next = ecode + 1;
1891 do next += GET(next, 1); while (*next == OP_ALT);
1892 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1894 ecode++;
1895 break;
1896
1897 case OP_SKIPZERO:
1898 next = ecode+1;
1899 do next += GET(next,1); while (*next == OP_ALT);
1900 ecode = next + 1 + LINK_SIZE;
1901 break;
1902
1903 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1904 here; just jump to the group, with allow_zero set TRUE. */
1905
1906 case OP_BRAPOSZERO:
1907 op = *(++ecode);
1908 allow_zero = TRUE;
1909 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1910 goto POSSESSIVE_NON_CAPTURE;
1911
1912 /* End of a group, repeated or non-repeating. */
1913
1914 case OP_KET:
1915 case OP_KETRMIN:
1916 case OP_KETRMAX:
1917 case OP_KETRPOS:
1918 prev = ecode - GET(ecode, 1);
1919
1920 /* If this was a group that remembered the subject start, in order to break
1921 infinite repeats of empty string matches, retrieve the subject start from
1922 the chain. Otherwise, set it NULL. */
1923
1924 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1925 {
1926 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1927 eptrb = eptrb->epb_prev; /* Backup to previous group */
1928 }
1929 else saved_eptr = NULL;
1930
1931 /* If we are at the end of an assertion group or a non-capturing atomic
1932 group, stop matching and return MATCH_MATCH, but record the current high
1933 water mark for use by positive assertions. We also need to record the match
1934 start in case it was changed by \K. */
1935
1936 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1937 *prev == OP_ONCE_NC)
1938 {
1939 md->end_match_ptr = eptr; /* For ONCE_NC */
1940 md->end_offset_top = offset_top;
1941 md->start_match_ptr = mstart;
1942 RRETURN(MATCH_MATCH); /* Sets md->mark */
1943 }
1944
1945 /* For capturing groups we have to check the group number back at the start
1946 and if necessary complete handling an extraction by setting the offsets and
1947 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1948 into group 0, so it won't be picked up here. Instead, we catch it when the
1949 OP_END is reached. Other recursion is handled here. We just have to record
1950 the current subject position and start match pointer and give a MATCH
1951 return. */
1952
1953 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1954 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1955 {
1956 number = GET2(prev, 1+LINK_SIZE);
1957 offset = number << 1;
1958
1959 #ifdef PCRE_DEBUG
1960 printf("end bracket %d", number);
1961 printf("\n");
1962 #endif
1963
1964 /* Handle a recursively called group. */
1965
1966 if (md->recursive != NULL && md->recursive->group_num == number)
1967 {
1968 md->end_match_ptr = eptr;
1969 md->start_match_ptr = mstart;
1970 RRETURN(MATCH_MATCH);
1971 }
1972
1973 /* Deal with capturing */
1974
1975 md->capture_last = (md->capture_last & OVFLMASK) | number;
1976 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1977 {
1978 /* If offset is greater than offset_top, it means that we are
1979 "skipping" a capturing group, and that group's offsets must be marked
1980 unset. In earlier versions of PCRE, all the offsets were unset at the
1981 start of matching, but this doesn't work because atomic groups and
1982 assertions can cause a value to be set that should later be unset.
1983 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1984 part of the atomic group, but this is not on the final matching path,
1985 so must be unset when 2 is set. (If there is no group 2, there is no
1986 problem, because offset_top will then be 2, indicating no capture.) */
1987
1988 if (offset > offset_top)
1989 {
1990 register int *iptr = md->offset_vector + offset_top;
1991 register int *iend = md->offset_vector + offset;
1992 while (iptr < iend) *iptr++ = -1;
1993 }
1994
1995 /* Now make the extraction */
1996
1997 md->offset_vector[offset] =
1998 md->offset_vector[md->offset_end - number];
1999 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2000 if (offset_top <= offset) offset_top = offset + 2;
2001 }
2002 }
2003
2004 /* For an ordinary non-repeating ket, just continue at this level. This
2005 also happens for a repeating ket if no characters were matched in the
2006 group. This is the forcible breaking of infinite loops as implemented in
2007 Perl 5.005. For a non-repeating atomic group that includes captures,
2008 establish a backup point by processing the rest of the pattern at a lower
2009 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2010 original OP_ONCE level, thereby bypassing intermediate backup points, but
2011 resetting any captures that happened along the way. */
2012
2013 if (*ecode == OP_KET || eptr == saved_eptr)
2014 {
2015 if (*prev == OP_ONCE)
2016 {
2017 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2020 RRETURN(MATCH_ONCE);
2021 }
2022 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2023 break;
2024 }
2025
2026 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2027 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2028 at a time from the outer level, thus saving stack. */
2029
2030 if (*ecode == OP_KETRPOS)
2031 {
2032 md->end_match_ptr = eptr;
2033 md->end_offset_top = offset_top;
2034 RRETURN(MATCH_KETRPOS);
2035 }
2036
2037 /* The normal repeating kets try the rest of the pattern or restart from
2038 the preceding bracket, in the appropriate order. In the second case, we can
2039 use tail recursion to avoid using another stack frame, unless we have an
2040 atomic group or an unlimited repeat of a group that can match an empty
2041 string. */
2042
2043 if (*ecode == OP_KETRMIN)
2044 {
2045 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2046 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2047 if (*prev == OP_ONCE)
2048 {
2049 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2051 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2052 RRETURN(MATCH_ONCE);
2053 }
2054 if (*prev >= OP_SBRA) /* Could match an empty string */
2055 {
2056 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2057 RRETURN(rrc);
2058 }
2059 ecode = prev;
2060 goto TAIL_RECURSE;
2061 }
2062 else /* OP_KETRMAX */
2063 {
2064 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2065 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 if (*prev == OP_ONCE)
2068 {
2069 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2071 md->once_target = prev;
2072 RRETURN(MATCH_ONCE);
2073 }
2074 ecode += 1 + LINK_SIZE;
2075 goto TAIL_RECURSE;
2076 }
2077 /* Control never gets here */
2078
2079 /* Not multiline mode: start of subject assertion, unless notbol. */
2080
2081 case OP_CIRC:
2082 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2083
2084 /* Start of subject assertion */
2085
2086 case OP_SOD:
2087 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2088 ecode++;
2089 break;
2090
2091 /* Multiline mode: start of subject unless notbol, or after any newline. */
2092
2093 case OP_CIRCM:
2094 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2095 if (eptr != md->start_subject &&
2096 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2097 RRETURN(MATCH_NOMATCH);
2098 ecode++;
2099 break;
2100
2101 /* Start of match assertion */
2102
2103 case OP_SOM:
2104 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2105 ecode++;
2106 break;
2107
2108 /* Reset the start of match point */
2109
2110 case OP_SET_SOM:
2111 mstart = eptr;
2112 ecode++;
2113 break;
2114
2115 /* Multiline mode: assert before any newline, or before end of subject
2116 unless noteol is set. */
2117
2118 case OP_DOLLM:
2119 if (eptr < md->end_subject)
2120 {
2121 if (!IS_NEWLINE(eptr))
2122 {
2123 if (md->partial != 0 &&
2124 eptr + 1 >= md->end_subject &&
2125 NLBLOCK->nltype == NLTYPE_FIXED &&
2126 NLBLOCK->nllen == 2 &&
2127 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2128 {
2129 md->hitend = TRUE;
2130 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2131 }
2132 RRETURN(MATCH_NOMATCH);
2133 }
2134 }
2135 else
2136 {
2137 if (md->noteol) RRETURN(MATCH_NOMATCH);
2138 SCHECK_PARTIAL();
2139 }
2140 ecode++;
2141 break;
2142
2143 /* Not multiline mode: assert before a terminating newline or before end of
2144 subject unless noteol is set. */
2145
2146 case OP_DOLL:
2147 if (md->noteol) RRETURN(MATCH_NOMATCH);
2148 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2149
2150 /* ... else fall through for endonly */
2151
2152 /* End of subject assertion (\z) */
2153
2154 case OP_EOD:
2155 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2156 SCHECK_PARTIAL();
2157 ecode++;
2158 break;
2159
2160 /* End of subject or ending \n assertion (\Z) */
2161
2162 case OP_EODN:
2163 ASSERT_NL_OR_EOS:
2164 if (eptr < md->end_subject &&
2165 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2166 {
2167 if (md->partial != 0 &&
2168 eptr + 1 >= md->end_subject &&
2169 NLBLOCK->nltype == NLTYPE_FIXED &&
2170 NLBLOCK->nllen == 2 &&
2171 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2172 {
2173 md->hitend = TRUE;
2174 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2175 }
2176 RRETURN(MATCH_NOMATCH);
2177 }
2178
2179 /* Either at end of string or \n before end. */
2180
2181 SCHECK_PARTIAL();
2182 ecode++;
2183 break;
2184
2185 /* Word boundary assertions */
2186
2187 case OP_NOT_WORD_BOUNDARY:
2188 case OP_WORD_BOUNDARY:
2189 {
2190
2191 /* Find out if the previous and current characters are "word" characters.
2192 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2193 be "non-word" characters. Remember the earliest consulted character for
2194 partial matching. */
2195
2196 #ifdef SUPPORT_UTF
2197 if (utf)
2198 {
2199 /* Get status of previous character */
2200
2201 if (eptr == md->start_subject) prev_is_word = FALSE; else
2202 {
2203 PCRE_PUCHAR lastptr = eptr - 1;
2204 BACKCHAR(lastptr);
2205 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2206 GETCHAR(c, lastptr);
2207 #ifdef SUPPORT_UCP
2208 if (md->use_ucp)
2209 {
2210 if (c == '_') prev_is_word = TRUE; else
2211 {
2212 int cat = UCD_CATEGORY(c);
2213 prev_is_word = (cat == ucp_L || cat == ucp_N);
2214 }
2215 }
2216 else
2217 #endif
2218 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2219 }
2220
2221 /* Get status of next character */
2222
2223 if (eptr >= md->end_subject)
2224 {
2225 SCHECK_PARTIAL();
2226 cur_is_word = FALSE;
2227 }
2228 else
2229 {
2230 GETCHAR(c, eptr);
2231 #ifdef SUPPORT_UCP
2232 if (md->use_ucp)
2233 {
2234 if (c == '_') cur_is_word = TRUE; else
2235 {
2236 int cat = UCD_CATEGORY(c);
2237 cur_is_word = (cat == ucp_L || cat == ucp_N);
2238 }
2239 }
2240 else
2241 #endif
2242 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2243 }
2244 }
2245 else
2246 #endif
2247
2248 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2249 consistency with the behaviour of \w we do use it in this case. */
2250
2251 {
2252 /* Get status of previous character */
2253
2254 if (eptr == md->start_subject) prev_is_word = FALSE; else
2255 {
2256 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2257 #ifdef SUPPORT_UCP
2258 if (md->use_ucp)
2259 {
2260 c = eptr[-1];
2261 if (c == '_') prev_is_word = TRUE; else
2262 {
2263 int cat = UCD_CATEGORY(c);
2264 prev_is_word = (cat == ucp_L || cat == ucp_N);
2265 }
2266 }
2267 else
2268 #endif
2269 prev_is_word = MAX_255(eptr[-1])
2270 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2271 }
2272
2273 /* Get status of next character */
2274
2275 if (eptr >= md->end_subject)
2276 {
2277 SCHECK_PARTIAL();
2278 cur_is_word = FALSE;
2279 }
2280 else
2281 #ifdef SUPPORT_UCP
2282 if (md->use_ucp)
2283 {
2284 c = *eptr;
2285 if (c == '_') cur_is_word = TRUE; else
2286 {
2287 int cat = UCD_CATEGORY(c);
2288 cur_is_word = (cat == ucp_L || cat == ucp_N);
2289 }
2290 }
2291 else
2292 #endif
2293 cur_is_word = MAX_255(*eptr)
2294 && ((md->ctypes[*eptr] & ctype_word) != 0);
2295 }
2296
2297 /* Now see if the situation is what we want */
2298
2299 if ((*ecode++ == OP_WORD_BOUNDARY)?
2300 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2301 RRETURN(MATCH_NOMATCH);
2302 }
2303 break;
2304
2305 /* Match any single character type except newline; have to take care with
2306 CRLF newlines and partial matching. */
2307
2308 case OP_ANY:
2309 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2310 if (md->partial != 0 &&
2311 eptr + 1 >= md->end_subject &&
2312 NLBLOCK->nltype == NLTYPE_FIXED &&
2313 NLBLOCK->nllen == 2 &&
2314 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2315 {
2316 md->hitend = TRUE;
2317 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2318 }
2319
2320 /* Fall through */
2321
2322 /* Match any single character whatsoever. */
2323
2324 case OP_ALLANY:
2325 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2326 { /* not be updated before SCHECK_PARTIAL. */
2327 SCHECK_PARTIAL();
2328 RRETURN(MATCH_NOMATCH);
2329 }
2330 eptr++;
2331 #ifdef SUPPORT_UTF
2332 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2333 #endif
2334 ecode++;
2335 break;
2336
2337 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2338 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2339
2340 case OP_ANYBYTE:
2341 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2342 { /* not be updated before SCHECK_PARTIAL. */
2343 SCHECK_PARTIAL();
2344 RRETURN(MATCH_NOMATCH);
2345 }
2346 eptr++;
2347 ecode++;
2348 break;
2349
2350 case OP_NOT_DIGIT:
2351 if (eptr >= md->end_subject)
2352 {
2353 SCHECK_PARTIAL();
2354 RRETURN(MATCH_NOMATCH);
2355 }
2356 GETCHARINCTEST(c, eptr);
2357 if (
2358 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2359 c < 256 &&
2360 #endif
2361 (md->ctypes[c] & ctype_digit) != 0
2362 )
2363 RRETURN(MATCH_NOMATCH);
2364 ecode++;
2365 break;
2366
2367 case OP_DIGIT:
2368 if (eptr >= md->end_subject)
2369 {
2370 SCHECK_PARTIAL();
2371 RRETURN(MATCH_NOMATCH);
2372 }
2373 GETCHARINCTEST(c, eptr);
2374 if (
2375 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2376 c > 255 ||
2377 #endif
2378 (md->ctypes[c] & ctype_digit) == 0
2379 )
2380 RRETURN(MATCH_NOMATCH);
2381 ecode++;
2382 break;
2383
2384 case OP_NOT_WHITESPACE:
2385 if (eptr >= md->end_subject)
2386 {
2387 SCHECK_PARTIAL();
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 GETCHARINCTEST(c, eptr);
2391 if (
2392 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2393 c < 256 &&
2394 #endif
2395 (md->ctypes[c] & ctype_space) != 0
2396 )
2397 RRETURN(MATCH_NOMATCH);
2398 ecode++;
2399 break;
2400
2401 case OP_WHITESPACE:
2402 if (eptr >= md->end_subject)
2403 {
2404 SCHECK_PARTIAL();
2405 RRETURN(MATCH_NOMATCH);
2406 }
2407 GETCHARINCTEST(c, eptr);
2408 if (
2409 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2410 c > 255 ||
2411 #endif
2412 (md->ctypes[c] & ctype_space) == 0
2413 )
2414 RRETURN(MATCH_NOMATCH);
2415 ecode++;
2416 break;
2417
2418 case OP_NOT_WORDCHAR:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 RRETURN(MATCH_NOMATCH);
2423 }
2424 GETCHARINCTEST(c, eptr);
2425 if (
2426 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2427 c < 256 &&
2428 #endif
2429 (md->ctypes[c] & ctype_word) != 0
2430 )
2431 RRETURN(MATCH_NOMATCH);
2432 ecode++;
2433 break;
2434
2435 case OP_WORDCHAR:
2436 if (eptr >= md->end_subject)
2437 {
2438 SCHECK_PARTIAL();
2439 RRETURN(MATCH_NOMATCH);
2440 }
2441 GETCHARINCTEST(c, eptr);
2442 if (
2443 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2444 c > 255 ||
2445 #endif
2446 (md->ctypes[c] & ctype_word) == 0
2447 )
2448 RRETURN(MATCH_NOMATCH);
2449 ecode++;
2450 break;
2451
2452 case OP_ANYNL:
2453 if (eptr >= md->end_subject)
2454 {
2455 SCHECK_PARTIAL();
2456 RRETURN(MATCH_NOMATCH);
2457 }
2458 GETCHARINCTEST(c, eptr);
2459 switch(c)
2460 {
2461 default: RRETURN(MATCH_NOMATCH);
2462
2463 case CHAR_CR:
2464 if (eptr >= md->end_subject)
2465 {
2466 SCHECK_PARTIAL();
2467 }
2468 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2469 break;
2470
2471 case CHAR_LF:
2472 break;
2473
2474 case CHAR_VT:
2475 case CHAR_FF:
2476 case CHAR_NEL:
2477 #ifndef EBCDIC
2478 case 0x2028:
2479 case 0x2029:
2480 #endif /* Not EBCDIC */
2481 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2482 break;
2483 }
2484 ecode++;
2485 break;
2486
2487 case OP_NOT_HSPACE:
2488 if (eptr >= md->end_subject)
2489 {
2490 SCHECK_PARTIAL();
2491 RRETURN(MATCH_NOMATCH);
2492 }
2493 GETCHARINCTEST(c, eptr);
2494 switch(c)
2495 {
2496 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2497 default: break;
2498 }
2499 ecode++;
2500 break;
2501
2502 case OP_HSPACE:
2503 if (eptr >= md->end_subject)
2504 {
2505 SCHECK_PARTIAL();
2506 RRETURN(MATCH_NOMATCH);
2507 }
2508 GETCHARINCTEST(c, eptr);
2509 switch(c)
2510 {
2511 HSPACE_CASES: break; /* Byte and multibyte cases */
2512 default: RRETURN(MATCH_NOMATCH);
2513 }
2514 ecode++;
2515 break;
2516
2517 case OP_NOT_VSPACE:
2518 if (eptr >= md->end_subject)
2519 {
2520 SCHECK_PARTIAL();
2521 RRETURN(MATCH_NOMATCH);
2522 }
2523 GETCHARINCTEST(c, eptr);
2524 switch(c)
2525 {
2526 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2527 default: break;
2528 }
2529 ecode++;
2530 break;
2531
2532 case OP_VSPACE:
2533 if (eptr >= md->end_subject)
2534 {
2535 SCHECK_PARTIAL();
2536 RRETURN(MATCH_NOMATCH);
2537 }
2538 GETCHARINCTEST(c, eptr);
2539 switch(c)
2540 {
2541 VSPACE_CASES: break;
2542 default: RRETURN(MATCH_NOMATCH);
2543 }
2544 ecode++;
2545 break;
2546
2547 #ifdef SUPPORT_UCP
2548 /* Check the next character by Unicode property. We will get here only
2549 if the support is in the binary; otherwise a compile-time error occurs. */
2550
2551 case OP_PROP:
2552 case OP_NOTPROP:
2553 if (eptr >= md->end_subject)
2554 {
2555 SCHECK_PARTIAL();
2556 RRETURN(MATCH_NOMATCH);
2557 }
2558 GETCHARINCTEST(c, eptr);
2559 {
2560 const pcre_uint32 *cp;
2561 const ucd_record *prop = GET_UCD(c);
2562
2563 switch(ecode[1])
2564 {
2565 case PT_ANY:
2566 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2567 break;
2568
2569 case PT_LAMP:
2570 if ((prop->chartype == ucp_Lu ||
2571 prop->chartype == ucp_Ll ||
2572 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2573 RRETURN(MATCH_NOMATCH);
2574 break;
2575
2576 case PT_GC:
2577 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2580
2581 case PT_PC:
2582 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2583 RRETURN(MATCH_NOMATCH);
2584 break;
2585
2586 case PT_SC:
2587 if ((ecode[2] != prop->script) == (op == OP_PROP))
2588 RRETURN(MATCH_NOMATCH);
2589 break;
2590
2591 /* These are specials */
2592
2593 case PT_ALNUM:
2594 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2595 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2596 RRETURN(MATCH_NOMATCH);
2597 break;
2598
2599 case PT_SPACE: /* Perl space */
2600 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2601 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2602 == (op == OP_NOTPROP))
2603 RRETURN(MATCH_NOMATCH);
2604 break;
2605
2606 case PT_PXSPACE: /* POSIX space */
2607 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2608 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2609 c == CHAR_FF || c == CHAR_CR)
2610 == (op == OP_NOTPROP))
2611 RRETURN(MATCH_NOMATCH);
2612 break;
2613
2614 case PT_WORD:
2615 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2616 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2617 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2618 RRETURN(MATCH_NOMATCH);
2619 break;
2620
2621 case PT_CLIST:
2622 cp = PRIV(ucd_caseless_sets) + ecode[2];
2623 for (;;)
2624 {
2625 if (c < *cp)
2626 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2627 if (c == *cp++)
2628 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2629 }
2630 break;
2631
2632 case PT_UCNC:
2633 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2634 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2635 c >= 0xe000) == (op == OP_NOTPROP))
2636 RRETURN(MATCH_NOMATCH);
2637 break;
2638
2639 /* This should never occur */
2640
2641 default:
2642 RRETURN(PCRE_ERROR_INTERNAL);
2643 }
2644
2645 ecode += 3;
2646 }
2647 break;
2648
2649 /* Match an extended Unicode sequence. We will get here only if the support
2650 is in the binary; otherwise a compile-time error occurs. */
2651
2652 case OP_EXTUNI:
2653 if (eptr >= md->end_subject)
2654 {
2655 SCHECK_PARTIAL();
2656 RRETURN(MATCH_NOMATCH);
2657 }
2658 else
2659 {
2660 int lgb, rgb;
2661 GETCHARINCTEST(c, eptr);
2662 lgb = UCD_GRAPHBREAK(c);
2663 while (eptr < md->end_subject)
2664 {
2665 int len = 1;
2666 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2667 rgb = UCD_GRAPHBREAK(c);
2668 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2669 lgb = rgb;
2670 eptr += len;
2671 }
2672 }
2673 CHECK_PARTIAL();
2674 ecode++;
2675 break;
2676 #endif /* SUPPORT_UCP */
2677
2678
2679 /* Match a back reference, possibly repeatedly. Look past the end of the
2680 item to see if there is repeat information following. The code is similar
2681 to that for character classes, but repeated for efficiency. Then obey
2682 similar code to character type repeats - written out again for speed.
2683 However, if the referenced string is the empty string, always treat
2684 it as matched, any number of times (otherwise there could be infinite
2685 loops). */
2686
2687 case OP_REF:
2688 case OP_REFI:
2689 caseless = op == OP_REFI;
2690 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2691 ecode += 1 + IMM2_SIZE;
2692
2693 /* If the reference is unset, there are two possibilities:
2694
2695 (a) In the default, Perl-compatible state, set the length negative;
2696 this ensures that every attempt at a match fails. We can't just fail
2697 here, because of the possibility of quantifiers with zero minima.
2698
2699 (b) If the JavaScript compatibility flag is set, set the length to zero
2700 so that the back reference matches an empty string.
2701
2702 Otherwise, set the length to the length of what was matched by the
2703 referenced subpattern. */
2704
2705 if (offset >= offset_top || md->offset_vector[offset] < 0)
2706 length = (md->jscript_compat)? 0 : -1;
2707 else
2708 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2709
2710 /* Set up for repetition, or handle the non-repeated case */
2711
2712 switch (*ecode)
2713 {
2714 case OP_CRSTAR:
2715 case OP_CRMINSTAR:
2716 case OP_CRPLUS:
2717 case OP_CRMINPLUS:
2718 case OP_CRQUERY:
2719 case OP_CRMINQUERY:
2720 c = *ecode++ - OP_CRSTAR;
2721 minimize = (c & 1) != 0;
2722 min = rep_min[c]; /* Pick up values from tables; */
2723 max = rep_max[c]; /* zero for max => infinity */
2724 if (max == 0) max = INT_MAX;
2725 break;
2726
2727 case OP_CRRANGE:
2728 case OP_CRMINRANGE:
2729 minimize = (*ecode == OP_CRMINRANGE);
2730 min = GET2(ecode, 1);
2731 max = GET2(ecode, 1 + IMM2_SIZE);
2732 if (max == 0) max = INT_MAX;
2733 ecode += 1 + 2 * IMM2_SIZE;
2734 break;
2735
2736 default: /* No repeat follows */
2737 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2738 {
2739 if (length == -2) eptr = md->end_subject; /* Partial match */
2740 CHECK_PARTIAL();
2741 RRETURN(MATCH_NOMATCH);
2742 }
2743 eptr += length;
2744 continue; /* With the main loop */
2745 }
2746
2747 /* Handle repeated back references. If the length of the reference is
2748 zero, just continue with the main loop. If the length is negative, it
2749 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2750 zero, we can continue at the same level without recursion. For any other
2751 minimum, carrying on will result in NOMATCH. */
2752
2753 if (length == 0) continue;
2754 if (length < 0 && min == 0) continue;
2755
2756 /* First, ensure the minimum number of matches are present. We get back
2757 the length of the reference string explicitly rather than passing the
2758 address of eptr, so that eptr can be a register variable. */
2759
2760 for (i = 1; i <= min; i++)
2761 {
2762 int slength;
2763 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2764 {
2765 if (slength == -2) eptr = md->end_subject; /* Partial match */
2766 CHECK_PARTIAL();
2767 RRETURN(MATCH_NOMATCH);
2768 }
2769 eptr += slength;
2770 }
2771
2772 /* If min = max, continue at the same level without recursion.
2773 They are not both allowed to be zero. */
2774
2775 if (min == max) continue;
2776
2777 /* If minimizing, keep trying and advancing the pointer */
2778
2779 if (minimize)
2780 {
2781 for (fi = min;; fi++)
2782 {
2783 int slength;
2784 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2786 if (fi >= max) RRETURN(MATCH_NOMATCH);
2787 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2788 {
2789 if (slength == -2) eptr = md->end_subject; /* Partial match */
2790 CHECK_PARTIAL();
2791 RRETURN(MATCH_NOMATCH);
2792 }
2793 eptr += slength;
2794 }
2795 /* Control never gets here */
2796 }
2797
2798 /* If maximizing, find the longest string and work backwards */
2799
2800 else
2801 {
2802 pp = eptr;
2803 for (i = min; i < max; i++)
2804 {
2805 int slength;
2806 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2807 {
2808 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2809 the soft partial matching case. */
2810
2811 if (slength == -2 && md->partial != 0 &&
2812 md->end_subject > md->start_used_ptr)
2813 {
2814 md->hitend = TRUE;
2815 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2816 }
2817 break;
2818 }
2819 eptr += slength;
2820 }
2821
2822 while (eptr >= pp)
2823 {
2824 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2825 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2826 eptr -= length;
2827 }
2828 RRETURN(MATCH_NOMATCH);
2829 }
2830 /* Control never gets here */
2831
2832 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2833 used when all the characters in the class have values in the range 0-255,
2834 and either the matching is caseful, or the characters are in the range
2835 0-127 when UTF-8 processing is enabled. The only difference between
2836 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2837 encountered.
2838
2839 First, look past the end of the item to see if there is repeat information
2840 following. Then obey similar code to character type repeats - written out
2841 again for speed. */
2842
2843 case OP_NCLASS:
2844 case OP_CLASS:
2845 {
2846 /* The data variable is saved across frames, so the byte map needs to
2847 be stored there. */
2848 #define BYTE_MAP ((pcre_uint8 *)data)
2849 data = ecode + 1; /* Save for matching */
2850 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2851
2852 switch (*ecode)
2853 {
2854 case OP_CRSTAR:
2855 case OP_CRMINSTAR:
2856 case OP_CRPLUS:
2857 case OP_CRMINPLUS:
2858 case OP_CRQUERY:
2859 case OP_CRMINQUERY:
2860 c = *ecode++ - OP_CRSTAR;
2861 minimize = (c & 1) != 0;
2862 min = rep_min[c]; /* Pick up values from tables; */
2863 max = rep_max[c]; /* zero for max => infinity */
2864 if (max == 0) max = INT_MAX;
2865 break;
2866
2867 case OP_CRRANGE:
2868 case OP_CRMINRANGE:
2869 minimize = (*ecode == OP_CRMINRANGE);
2870 min = GET2(ecode, 1);
2871 max = GET2(ecode, 1 + IMM2_SIZE);
2872 if (max == 0) max = INT_MAX;
2873 ecode += 1 + 2 * IMM2_SIZE;
2874 break;
2875
2876 default: /* No repeat follows */
2877 min = max = 1;
2878 break;
2879 }
2880
2881 /* First, ensure the minimum number of matches are present. */
2882
2883 #ifdef SUPPORT_UTF
2884 if (utf)
2885 {
2886 for (i = 1; i <= min; i++)
2887 {
2888 if (eptr >= md->end_subject)
2889 {
2890 SCHECK_PARTIAL();
2891 RRETURN(MATCH_NOMATCH);
2892 }
2893 GETCHARINC(c, eptr);
2894 if (c > 255)
2895 {
2896 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2897 }
2898 else
2899 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2900 }
2901 }
2902 else
2903 #endif
2904 /* Not UTF mode */
2905 {
2906 for (i = 1; i <= min; i++)
2907 {
2908 if (eptr >= md->end_subject)
2909 {
2910 SCHECK_PARTIAL();
2911 RRETURN(MATCH_NOMATCH);
2912 }
2913 c = *eptr++;
2914 #ifndef COMPILE_PCRE8
2915 if (c > 255)
2916 {
2917 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2918 }
2919 else
2920 #endif
2921 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2922 }
2923 }
2924
2925 /* If max == min we can continue with the main loop without the
2926 need to recurse. */
2927
2928 if (min == max) continue;
2929
2930 /* If minimizing, keep testing the rest of the expression and advancing
2931 the pointer while it matches the class. */
2932
2933 if (minimize)
2934 {
2935 #ifdef SUPPORT_UTF
2936 if (utf)
2937 {
2938 for (fi = min;; fi++)
2939 {
2940 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2942 if (fi >= max) RRETURN(MATCH_NOMATCH);
2943 if (eptr >= md->end_subject)
2944 {
2945 SCHECK_PARTIAL();
2946 RRETURN(MATCH_NOMATCH);
2947 }
2948 GETCHARINC(c, eptr);
2949 if (c > 255)
2950 {
2951 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2952 }
2953 else
2954 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2955 }
2956 }
2957 else
2958 #endif
2959 /* Not UTF mode */
2960 {
2961 for (fi = min;; fi++)
2962 {
2963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2965 if (fi >= max) RRETURN(MATCH_NOMATCH);
2966 if (eptr >= md->end_subject)
2967 {
2968 SCHECK_PARTIAL();
2969 RRETURN(MATCH_NOMATCH);
2970 }
2971 c = *eptr++;
2972 #ifndef COMPILE_PCRE8
2973 if (c > 255)
2974 {
2975 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2976 }
2977 else
2978 #endif
2979 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2980 }
2981 }
2982 /* Control never gets here */
2983 }
2984
2985 /* If maximizing, find the longest possible run, then work backwards. */
2986
2987 else
2988 {
2989 pp = eptr;
2990
2991 #ifdef SUPPORT_UTF
2992 if (utf)
2993 {
2994 for (i = min; i < max; i++)
2995 {
2996 int len = 1;
2997 if (eptr >= md->end_subject)
2998 {
2999 SCHECK_PARTIAL();
3000 break;
3001 }
3002 GETCHARLEN(c, eptr, len);
3003 if (c > 255)
3004 {
3005 if (op == OP_CLASS) break;
3006 }
3007 else
3008 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3009 eptr += len;
3010 }
3011 for (;;)
3012 {
3013 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015 if (eptr-- == pp) break; /* Stop if tried at original pos */
3016 BACKCHAR(eptr);
3017 }
3018 }
3019 else
3020 #endif
3021 /* Not UTF mode */
3022 {
3023 for (i = min; i < max; i++)
3024 {
3025 if (eptr >= md->end_subject)
3026 {
3027 SCHECK_PARTIAL();
3028 break;
3029 }
3030 c = *eptr;
3031 #ifndef COMPILE_PCRE8
3032 if (c > 255)
3033 {
3034 if (op == OP_CLASS) break;
3035 }
3036 else
3037 #endif
3038 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3039 eptr++;
3040 }
3041 while (eptr >= pp)
3042 {
3043 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3045 eptr--;
3046 }
3047 }
3048
3049 RRETURN(MATCH_NOMATCH);
3050 }
3051 #undef BYTE_MAP
3052 }
3053 /* Control never gets here */
3054
3055
3056 /* Match an extended character class. This opcode is encountered only
3057 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3058 mode, because Unicode properties are supported in non-UTF-8 mode. */
3059
3060 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3061 case OP_XCLASS:
3062 {
3063 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3064 ecode += GET(ecode, 1); /* Advance past the item */
3065
3066 switch (*ecode)
3067 {
3068 case OP_CRSTAR:
3069 case OP_CRMINSTAR:
3070 case OP_CRPLUS:
3071 case OP_CRMINPLUS:
3072 case OP_CRQUERY:
3073 case OP_CRMINQUERY:
3074 c = *ecode++ - OP_CRSTAR;
3075 minimize = (c & 1) != 0;
3076 min = rep_min[c]; /* Pick up values from tables; */
3077 max = rep_max[c]; /* zero for max => infinity */
3078 if (max == 0) max = INT_MAX;
3079 break;
3080
3081 case OP_CRRANGE:
3082 case OP_CRMINRANGE:
3083 minimize = (*ecode == OP_CRMINRANGE);
3084 min = GET2(ecode, 1);
3085 max = GET2(ecode, 1 + IMM2_SIZE);
3086 if (max == 0) max = INT_MAX;
3087 ecode += 1 + 2 * IMM2_SIZE;
3088 break;
3089
3090 default: /* No repeat follows */
3091 min = max = 1;
3092 break;
3093 }
3094
3095 /* First, ensure the minimum number of matches are present. */
3096
3097 for (i = 1; i <= min; i++)
3098 {
3099 if (eptr >= md->end_subject)
3100 {
3101 SCHECK_PARTIAL();
3102 RRETURN(MATCH_NOMATCH);
3103 }
3104 GETCHARINCTEST(c, eptr);
3105 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3106 }
3107
3108 /* If max == min we can continue with the main loop without the
3109 need to recurse. */
3110
3111 if (min == max) continue;
3112
3113 /* If minimizing, keep testing the rest of the expression and advancing
3114 the pointer while it matches the class. */
3115
3116 if (minimize)
3117 {
3118 for (fi = min;; fi++)
3119 {
3120 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3122 if (fi >= max) RRETURN(MATCH_NOMATCH);
3123 if (eptr >= md->end_subject)
3124 {
3125 SCHECK_PARTIAL();
3126 RRETURN(MATCH_NOMATCH);
3127 }
3128 GETCHARINCTEST(c, eptr);
3129 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3130 }
3131 /* Control never gets here */
3132 }
3133
3134 /* If maximizing, find the longest possible run, then work backwards. */
3135
3136 else
3137 {
3138 pp = eptr;
3139 for (i = min; i < max; i++)
3140 {
3141 int len = 1;
3142 if (eptr >= md->end_subject)
3143 {
3144 SCHECK_PARTIAL();
3145 break;
3146 }
3147 #ifdef SUPPORT_UTF
3148 GETCHARLENTEST(c, eptr, len);
3149 #else
3150 c = *eptr;
3151 #endif
3152 if (!PRIV(xclass)(c, data, utf)) break;
3153 eptr += len;
3154 }
3155 for(;;)
3156 {
3157 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3158 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3159 if (eptr-- == pp) break; /* Stop if tried at original pos */
3160 #ifdef SUPPORT_UTF
3161 if (utf) BACKCHAR(eptr);
3162 #endif
3163 }
3164 RRETURN(MATCH_NOMATCH);
3165 }
3166
3167 /* Control never gets here */
3168 }
3169 #endif /* End of XCLASS */
3170
3171 /* Match a single character, casefully */
3172
3173 case OP_CHAR:
3174 #ifdef SUPPORT_UTF
3175 if (utf)
3176 {
3177 length = 1;
3178 ecode++;
3179 GETCHARLEN(fc, ecode, length);
3180 if (length > md->end_subject - eptr)
3181 {
3182 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3183 RRETURN(MATCH_NOMATCH);
3184 }
3185 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3186 }
3187 else
3188 #endif
3189 /* Not UTF mode */
3190 {
3191 if (md->end_subject - eptr < 1)
3192 {
3193 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3194 RRETURN(MATCH_NOMATCH);
3195 }
3196 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3197 ecode += 2;
3198 }
3199 break;
3200
3201 /* Match a single character, caselessly. If we are at the end of the
3202 subject, give up immediately. */
3203
3204 case OP_CHARI:
3205 if (eptr >= md->end_subject)
3206 {
3207 SCHECK_PARTIAL();
3208 RRETURN(MATCH_NOMATCH);
3209 }
3210
3211 #ifdef SUPPORT_UTF
3212 if (utf)
3213 {
3214 length = 1;
3215 ecode++;
3216 GETCHARLEN(fc, ecode, length);
3217
3218 /* If the pattern character's value is < 128, we have only one byte, and
3219 we know that its other case must also be one byte long, so we can use the
3220 fast lookup table. We know that there is at least one byte left in the
3221 subject. */
3222
3223 if (fc < 128)
3224 {
3225 pcre_uint32 cc = RAWUCHAR(eptr);
3226 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3227 ecode++;
3228 eptr++;
3229 }
3230
3231 /* Otherwise we must pick up the subject character. Note that we cannot
3232 use the value of "length" to check for sufficient bytes left, because the
3233 other case of the character may have more or fewer bytes. */
3234
3235 else
3236 {
3237 pcre_uint32 dc;
3238 GETCHARINC(dc, eptr);
3239 ecode += length;
3240
3241 /* If we have Unicode property support, we can use it to test the other
3242 case of the character, if there is one. */
3243
3244 if (fc != dc)
3245 {
3246 #ifdef SUPPORT_UCP
3247 if (dc != UCD_OTHERCASE(fc))
3248 #endif
3249 RRETURN(MATCH_NOMATCH);
3250 }
3251 }
3252 }
3253 else
3254 #endif /* SUPPORT_UTF */
3255
3256 /* Not UTF mode */
3257 {
3258 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3259 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3260 eptr++;
3261 ecode += 2;
3262 }
3263 break;
3264
3265 /* Match a single character repeatedly. */
3266
3267 case OP_EXACT:
3268 case OP_EXACTI:
3269 min = max = GET2(ecode, 1);
3270 ecode += 1 + IMM2_SIZE;
3271 goto REPEATCHAR;
3272
3273 case OP_POSUPTO:
3274 case OP_POSUPTOI:
3275 possessive = TRUE;
3276 /* Fall through */
3277
3278 case OP_UPTO:
3279 case OP_UPTOI:
3280 case OP_MINUPTO:
3281 case OP_MINUPTOI:
3282 min = 0;
3283 max = GET2(ecode, 1);
3284 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3285 ecode += 1 + IMM2_SIZE;
3286 goto REPEATCHAR;
3287
3288 case OP_POSSTAR:
3289 case OP_POSSTARI:
3290 possessive = TRUE;
3291 min = 0;
3292 max = INT_MAX;
3293 ecode++;
3294 goto REPEATCHAR;
3295
3296 case OP_POSPLUS:
3297 case OP_POSPLUSI:
3298 possessive = TRUE;
3299 min = 1;
3300 max = INT_MAX;
3301 ecode++;
3302 goto REPEATCHAR;
3303
3304 case OP_POSQUERY:
3305 case OP_POSQUERYI:
3306 possessive = TRUE;
3307 min = 0;
3308 max = 1;
3309 ecode++;
3310 goto REPEATCHAR;
3311
3312 case OP_STAR:
3313 case OP_STARI:
3314 case OP_MINSTAR:
3315 case OP_MINSTARI:
3316 case OP_PLUS:
3317 case OP_PLUSI:
3318 case OP_MINPLUS:
3319 case OP_MINPLUSI:
3320 case OP_QUERY:
3321 case OP_QUERYI:
3322 case OP_MINQUERY:
3323 case OP_MINQUERYI:
3324 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3325 minimize = (c & 1) != 0;
3326 min = rep_min[c]; /* Pick up values from tables; */
3327 max = rep_max[c]; /* zero for max => infinity */
3328 if (max == 0) max = INT_MAX;
3329
3330 /* Common code for all repeated single-character matches. */
3331
3332 REPEATCHAR:
3333 #ifdef SUPPORT_UTF
3334 if (utf)
3335 {
3336 length = 1;
3337 charptr = ecode;
3338 GETCHARLEN(fc, ecode, length);
3339 ecode += length;
3340
3341 /* Handle multibyte character matching specially here. There is
3342 support for caseless matching if UCP support is present. */
3343
3344 if (length > 1)
3345 {
3346 #ifdef SUPPORT_UCP
3347 pcre_uint32 othercase;
3348 if (op >= OP_STARI && /* Caseless */
3349 (othercase = UCD_OTHERCASE(fc)) != fc)
3350 oclength = PRIV(ord2utf)(othercase, occhars);
3351 else oclength = 0;
3352 #endif /* SUPPORT_UCP */
3353
3354 for (i = 1; i <= min; i++)
3355 {
3356 if (eptr <= md->end_subject - length &&
3357 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3358 #ifdef SUPPORT_UCP
3359 else if (oclength > 0 &&
3360 eptr <= md->end_subject - oclength &&
3361 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3362 #endif /* SUPPORT_UCP */
3363 else
3364 {
3365 CHECK_PARTIAL();
3366 RRETURN(MATCH_NOMATCH);
3367 }
3368 }
3369
3370 if (min == max) continue;
3371
3372 if (minimize)
3373 {
3374 for (fi = min;; fi++)
3375 {
3376 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3377 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3378 if (fi >= max) RRETURN(MATCH_NOMATCH);
3379 if (eptr <= md->end_subject - length &&
3380 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3381 #ifdef SUPPORT_UCP
3382 else if (oclength > 0 &&
3383 eptr <= md->end_subject - oclength &&
3384 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3385 #endif /* SUPPORT_UCP */
3386 else
3387 {
3388 CHECK_PARTIAL();
3389 RRETURN(MATCH_NOMATCH);
3390 }
3391 }
3392 /* Control never gets here */
3393 }
3394
3395 else /* Maximize */
3396 {
3397 pp = eptr;
3398 for (i = min; i < max; i++)
3399 {
3400 if (eptr <= md->end_subject - length &&
3401 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3402 #ifdef SUPPORT_UCP
3403 else if (oclength > 0 &&
3404 eptr <= md->end_subject - oclength &&
3405 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3406 #endif /* SUPPORT_UCP */
3407 else
3408 {
3409 CHECK_PARTIAL();
3410 break;
3411 }
3412 }
3413
3414 if (possessive) continue;
3415
3416 for(;;)
3417 {
3418 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3419 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3420 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3421 #ifdef SUPPORT_UCP
3422 eptr--;
3423 BACKCHAR(eptr);
3424 #else /* without SUPPORT_UCP */
3425 eptr -= length;
3426 #endif /* SUPPORT_UCP */
3427 }
3428 }
3429 /* Control never gets here */
3430 }
3431
3432 /* If the length of a UTF-8 character is 1, we fall through here, and
3433 obey the code as for non-UTF-8 characters below, though in this case the
3434 value of fc will always be < 128. */
3435 }
3436 else
3437 #endif /* SUPPORT_UTF */
3438 /* When not in UTF-8 mode, load a single-byte character. */
3439 fc = *ecode++;
3440
3441 /* The value of fc at this point is always one character, though we may
3442 or may not be in UTF mode. The code is duplicated for the caseless and
3443 caseful cases, for speed, since matching characters is likely to be quite
3444 common. First, ensure the minimum number of matches are present. If min =
3445 max, continue at the same level without recursing. Otherwise, if
3446 minimizing, keep trying the rest of the expression and advancing one
3447 matching character if failing, up to the maximum. Alternatively, if
3448 maximizing, find the maximum number of characters and work backwards. */
3449
3450 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3451 max, (char *)eptr));
3452
3453 if (op >= OP_STARI) /* Caseless */
3454 {
3455 #ifdef COMPILE_PCRE8
3456 /* fc must be < 128 if UTF is enabled. */
3457 foc = md->fcc[fc];
3458 #else
3459 #ifdef SUPPORT_UTF
3460 #ifdef SUPPORT_UCP
3461 if (utf && fc > 127)
3462 foc = UCD_OTHERCASE(fc);
3463 #else
3464 if (utf && fc > 127)
3465 foc = fc;
3466 #endif /* SUPPORT_UCP */
3467 else
3468 #endif /* SUPPORT_UTF */
3469 foc = TABLE_GET(fc, md->fcc, fc);
3470 #endif /* COMPILE_PCRE8 */
3471
3472 for (i = 1; i <= min; i++)
3473 {
3474 pcre_uint32 cc; /* Faster than pcre_uchar */
3475 if (eptr >= md->end_subject)
3476 {
3477 SCHECK_PARTIAL();
3478 RRETURN(MATCH_NOMATCH);
3479 }
3480 cc = RAWUCHARTEST(eptr);
3481 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3482 eptr++;
3483 }
3484 if (min == max) continue;
3485 if (minimize)
3486 {
3487 for (fi = min;; fi++)
3488 {
3489 pcre_uint32 cc; /* Faster than pcre_uchar */
3490 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3491 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3492 if (fi >= max) RRETURN(MATCH_NOMATCH);
3493 if (eptr >= md->end_subject)
3494 {
3495 SCHECK_PARTIAL();
3496 RRETURN(MATCH_NOMATCH);
3497 }
3498 cc = RAWUCHARTEST(eptr);
3499 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3500 eptr++;
3501 }
3502 /* Control never gets here */
3503 }
3504 else /* Maximize */
3505 {
3506 pp = eptr;
3507 for (i = min; i < max; i++)
3508 {
3509 pcre_uint32 cc; /* Faster than pcre_uchar */
3510 if (eptr >= md->end_subject)
3511 {
3512 SCHECK_PARTIAL();
3513 break;
3514 }
3515 cc = RAWUCHARTEST(eptr);
3516 if (fc != cc && foc != cc) break;
3517 eptr++;
3518 }
3519
3520 if (possessive) continue;
3521
3522 while (eptr >= pp)
3523 {
3524 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3525 eptr--;
3526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3527 }
3528 RRETURN(MATCH_NOMATCH);
3529 }
3530 /* Control never gets here */
3531 }
3532
3533 /* Caseful comparisons (includes all multi-byte characters) */
3534
3535 else
3536 {
3537 for (i = 1; i <= min; i++)
3538 {
3539 if (eptr >= md->end_subject)
3540 {
3541 SCHECK_PARTIAL();
3542 RRETURN(MATCH_NOMATCH);
3543 }
3544 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3545 }
3546
3547 if (min == max) continue;
3548
3549 if (minimize)
3550 {
3551 for (fi = min;; fi++)
3552 {
3553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3554 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3555 if (fi >= max) RRETURN(MATCH_NOMATCH);
3556 if (eptr >= md->end_subject)
3557 {
3558 SCHECK_PARTIAL();
3559 RRETURN(MATCH_NOMATCH);
3560 }
3561 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3562 }
3563 /* Control never gets here */
3564 }
3565 else /* Maximize */
3566 {
3567 pp = eptr;
3568 for (i = min; i < max; i++)
3569 {
3570 if (eptr >= md->end_subject)
3571 {
3572 SCHECK_PARTIAL();
3573 break;
3574 }
3575 if (fc != RAWUCHARTEST(eptr)) break;
3576 eptr++;
3577 }
3578 if (possessive) continue;
3579
3580 while (eptr >= pp)
3581 {
3582 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3583 eptr--;
3584 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3585 }
3586 RRETURN(MATCH_NOMATCH);
3587 }
3588 }
3589 /* Control never gets here */
3590
3591 /* Match a negated single one-byte character. The character we are
3592 checking can be multibyte. */
3593
3594 case OP_NOT:
3595 case OP_NOTI:
3596 if (eptr >= md->end_subject)
3597 {
3598 SCHECK_PARTIAL();
3599 RRETURN(MATCH_NOMATCH);
3600 }
3601 #ifdef SUPPORT_UTF
3602 if (utf)
3603 {
3604 register pcre_uint32 ch, och;
3605
3606 ecode++;
3607 GETCHARINC(ch, ecode);
3608 GETCHARINC(c, eptr);
3609
3610 if (op == OP_NOT)
3611 {
3612 if (ch == c) RRETURN(MATCH_NOMATCH);
3613 }
3614 else
3615 {
3616 #ifdef SUPPORT_UCP
3617 if (ch > 127)
3618 och = UCD_OTHERCASE(ch);
3619 #else
3620 if (ch > 127)
3621 och = ch;
3622 #endif /* SUPPORT_UCP */
3623 else
3624 och = TABLE_GET(ch, md->fcc, ch);
3625 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3626 }
3627 }
3628 else
3629 #endif
3630 {
3631 register pcre_uint32 ch = ecode[1];
3632 c = *eptr++;
3633 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3634 RRETURN(MATCH_NOMATCH);
3635 ecode += 2;
3636 }
3637 break;
3638
3639 /* Match a negated single one-byte character repeatedly. This is almost a
3640 repeat of the code for a repeated single character, but I haven't found a
3641 nice way of commoning these up that doesn't require a test of the
3642 positive/negative option for each character match. Maybe that wouldn't add
3643 very much to the time taken, but character matching *is* what this is all
3644 about... */
3645
3646 case OP_NOTEXACT:
3647 case OP_NOTEXACTI:
3648 min = max = GET2(ecode, 1);
3649 ecode += 1 + IMM2_SIZE;
3650 goto REPEATNOTCHAR;
3651
3652 case OP_NOTUPTO:
3653 case OP_NOTUPTOI:
3654 case OP_NOTMINUPTO:
3655 case OP_NOTMINUPTOI:
3656 min = 0;
3657 max = GET2(ecode, 1);
3658 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3659 ecode += 1 + IMM2_SIZE;
3660 goto REPEATNOTCHAR;
3661
3662 case OP_NOTPOSSTAR:
3663 case OP_NOTPOSSTARI:
3664 possessive = TRUE;
3665 min = 0;
3666 max = INT_MAX;
3667 ecode++;
3668 goto REPEATNOTCHAR;
3669
3670 case OP_NOTPOSPLUS:
3671 case OP_NOTPOSPLUSI:
3672 possessive = TRUE;
3673 min = 1;
3674 max = INT_MAX;
3675 ecode++;
3676 goto REPEATNOTCHAR;
3677
3678 case OP_NOTPOSQUERY:
3679 case OP_NOTPOSQUERYI:
3680 possessive = TRUE;
3681 min = 0;
3682 max = 1;
3683 ecode++;
3684 goto REPEATNOTCHAR;
3685
3686 case OP_NOTPOSUPTO:
3687 case OP_NOTPOSUPTOI:
3688 possessive = TRUE;
3689 min = 0;
3690 max = GET2(ecode, 1);
3691 ecode += 1 + IMM2_SIZE;
3692 goto REPEATNOTCHAR;
3693
3694 case OP_NOTSTAR:
3695 case OP_NOTSTARI:
3696 case OP_NOTMINSTAR:
3697 case OP_NOTMINSTARI:
3698 case OP_NOTPLUS:
3699 case OP_NOTPLUSI:
3700 case OP_NOTMINPLUS:
3701 case OP_NOTMINPLUSI:
3702 case OP_NOTQUERY:
3703 case OP_NOTQUERYI:
3704 case OP_NOTMINQUERY:
3705 case OP_NOTMINQUERYI:
3706 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3707 minimize = (c & 1) != 0;
3708 min = rep_min[c]; /* Pick up values from tables; */
3709 max = rep_max[c]; /* zero for max => infinity */
3710 if (max == 0) max = INT_MAX;
3711
3712 /* Common code for all repeated negated single-character matches. */
3713
3714 REPEATNOTCHAR:
3715 GETCHARINCTEST(fc, ecode);
3716
3717 /* The code is duplicated for the caseless and caseful cases, for speed,
3718 since matching characters is likely to be quite common. First, ensure the
3719 minimum number of matches are present. If min = max, continue at the same
3720 level without recursing. Otherwise, if minimizing, keep trying the rest of
3721 the expression and advancing one matching character if failing, up to the
3722 maximum. Alternatively, if maximizing, find the maximum number of
3723 characters and work backwards. */
3724
3725 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3726 max, (char *)eptr));
3727
3728 if (op >= OP_NOTSTARI) /* Caseless */
3729 {
3730 #ifdef SUPPORT_UTF
3731 #ifdef SUPPORT_UCP
3732 if (utf && fc > 127)
3733 foc = UCD_OTHERCASE(fc);
3734 #else
3735 if (utf && fc > 127)
3736 foc = fc;
3737 #endif /* SUPPORT_UCP */
3738 else
3739 #endif /* SUPPORT_UTF */
3740 foc = TABLE_GET(fc, md->fcc, fc);
3741
3742 #ifdef SUPPORT_UTF
3743 if (utf)
3744 {
3745 register pcre_uint32 d;
3746 for (i = 1; i <= min; i++)
3747 {
3748 if (eptr >= md->end_subject)
3749 {
3750 SCHECK_PARTIAL();
3751 RRETURN(MATCH_NOMATCH);
3752 }
3753 GETCHARINC(d, eptr);
3754 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3755 }
3756 }
3757 else
3758 #endif
3759 /* Not UTF mode */
3760 {
3761 for (i = 1; i <= min; i++)
3762 {
3763 if (eptr >= md->end_subject)
3764 {
3765 SCHECK_PARTIAL();
3766 RRETURN(MATCH_NOMATCH);
3767 }
3768 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3769 eptr++;
3770 }
3771 }
3772
3773 if (min == max) continue;
3774
3775 if (minimize)
3776 {
3777 #ifdef SUPPORT_UTF
3778 if (utf)
3779 {
3780 register pcre_uint32 d;
3781 for (fi = min;; fi++)
3782 {
3783 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3785 if (fi >= max) RRETURN(MATCH_NOMATCH);
3786 if (eptr >= md->end_subject)
3787 {
3788 SCHECK_PARTIAL();
3789 RRETURN(MATCH_NOMATCH);
3790 }
3791 GETCHARINC(d, eptr);
3792 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3793 }
3794 }
3795 else
3796 #endif
3797 /* Not UTF mode */
3798 {
3799 for (fi = min;; fi++)
3800 {
3801 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3803 if (fi >= max) RRETURN(MATCH_NOMATCH);
3804 if (eptr >= md->end_subject)
3805 {
3806 SCHECK_PARTIAL();
3807 RRETURN(MATCH_NOMATCH);
3808 }
3809 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3810 eptr++;
3811 }
3812 }
3813 /* Control never gets here */
3814 }
3815
3816 /* Maximize case */
3817
3818 else
3819 {
3820 pp = eptr;
3821
3822 #ifdef SUPPORT_UTF
3823 if (utf)
3824 {
3825 register pcre_uint32 d;
3826 for (i = min; i < max; i++)
3827 {
3828 int len = 1;
3829 if (eptr >= md->end_subject)
3830 {
3831 SCHECK_PARTIAL();
3832 break;
3833 }
3834 GETCHARLEN(d, eptr, len);
3835 if (fc == d || (unsigned int)foc == d) break;
3836 eptr += len;
3837 }
3838 if (possessive) continue;
3839 for(;;)
3840 {
3841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3843 if (eptr-- == pp) break; /* Stop if tried at original pos */
3844 BACKCHAR(eptr);
3845 }
3846 }
3847 else
3848 #endif
3849 /* Not UTF mode */
3850 {
3851 for (i = min; i < max; i++)
3852 {
3853 if (eptr >= md->end_subject)
3854 {
3855 SCHECK_PARTIAL();
3856 break;
3857 }
3858 if (fc == *eptr || foc == *eptr) break;
3859 eptr++;
3860 }
3861 if (possessive) continue;
3862 while (eptr >= pp)
3863 {
3864 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3866 eptr--;
3867 }
3868 }
3869
3870 RRETURN(MATCH_NOMATCH);
3871 }
3872 /* Control never gets here */
3873 }
3874
3875 /* Caseful comparisons */
3876
3877 else
3878 {
3879 #ifdef SUPPORT_UTF
3880 if (utf)
3881 {
3882 register pcre_uint32 d;
3883 for (i = 1; i <= min; i++)
3884 {
3885 if (eptr >= md->end_subject)
3886 {
3887 SCHECK_PARTIAL();
3888 RRETURN(MATCH_NOMATCH);
3889 }
3890 GETCHARINC(d, eptr);
3891 if (fc == d) RRETURN(MATCH_NOMATCH);
3892 }
3893 }
3894 else
3895 #endif
3896 /* Not UTF mode */
3897 {
3898 for (i = 1; i <= min; i++)
3899 {
3900 if (eptr >= md->end_subject)
3901 {
3902 SCHECK_PARTIAL();
3903 RRETURN(MATCH_NOMATCH);
3904 }
3905 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3906 }
3907 }
3908
3909 if (min == max) continue;
3910
3911 if (minimize)
3912 {
3913 #ifdef SUPPORT_UTF
3914 if (utf)
3915 {
3916 register pcre_uint32 d;
3917 for (fi = min;; fi++)
3918 {
3919 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3921 if (fi >= max) RRETURN(MATCH_NOMATCH);
3922 if (eptr >= md->end_subject)
3923 {
3924 SCHECK_PARTIAL();
3925 RRETURN(MATCH_NOMATCH);
3926 }
3927 GETCHARINC(d, eptr);
3928 if (fc == d) RRETURN(MATCH_NOMATCH);
3929 }
3930 }
3931 else
3932 #endif
3933 /* Not UTF mode */
3934 {
3935 for (fi = min;; fi++)
3936 {
3937 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3939 if (fi >= max) RRETURN(MATCH_NOMATCH);
3940 if (eptr >= md->end_subject)
3941 {
3942 SCHECK_PARTIAL();
3943 RRETURN(MATCH_NOMATCH);
3944 }
3945 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3946 }
3947 }
3948 /* Control never gets here */
3949 }
3950
3951 /* Maximize case */
3952
3953 else
3954 {
3955 pp = eptr;
3956
3957 #ifdef SUPPORT_UTF
3958 if (utf)
3959 {
3960 register pcre_uint32 d;
3961 for (i = min; i < max; i++)
3962 {
3963 int len = 1;
3964 if (eptr >= md->end_subject)
3965 {
3966 SCHECK_PARTIAL();
3967 break;
3968 }
3969 GETCHARLEN(d, eptr, len);
3970 if (fc == d) break;
3971 eptr += len;
3972 }
3973 if (possessive) continue;
3974 for(;;)
3975 {
3976 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3978 if (eptr-- == pp) break; /* Stop if tried at original pos */
3979 BACKCHAR(eptr);
3980 }
3981 }
3982 else
3983 #endif
3984 /* Not UTF mode */
3985 {
3986 for (i = min; i < max; i++)
3987 {
3988 if (eptr >= md->end_subject)
3989 {
3990 SCHECK_PARTIAL();
3991 break;
3992 }
3993 if (fc == *eptr) break;
3994 eptr++;
3995 }
3996 if (possessive) continue;
3997 while (eptr >= pp)
3998 {
3999 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4001 eptr--;
4002 }
4003 }
4004
4005 RRETURN(MATCH_NOMATCH);
4006 }
4007 }
4008 /* Control never gets here */
4009
4010 /* Match a single character type repeatedly; several different opcodes
4011 share code. This is very similar to the code for single characters, but we
4012 repeat it in the interests of efficiency. */
4013
4014 case OP_TYPEEXACT:
4015 min = max = GET2(ecode, 1);
4016 minimize = TRUE;
4017 ecode += 1 + IMM2_SIZE;
4018 goto REPEATTYPE;
4019
4020 case OP_TYPEUPTO:
4021 case OP_TYPEMINUPTO:
4022 min = 0;
4023 max = GET2(ecode, 1);
4024 minimize = *ecode == OP_TYPEMINUPTO;
4025 ecode += 1 + IMM2_SIZE;
4026 goto REPEATTYPE;
4027
4028 case OP_TYPEPOSSTAR:
4029 possessive = TRUE;
4030 min = 0;
4031 max = INT_MAX;
4032 ecode++;
4033 goto REPEATTYPE;
4034
4035 case OP_TYPEPOSPLUS:
4036 possessive = TRUE;
4037 min = 1;
4038 max = INT_MAX;
4039 ecode++;
4040 goto REPEATTYPE;
4041
4042 case OP_TYPEPOSQUERY:
4043 possessive = TRUE;
4044 min = 0;
4045 max = 1;
4046 ecode++;
4047 goto REPEATTYPE;
4048
4049 case OP_TYPEPOSUPTO:
4050 possessive = TRUE;
4051 min = 0;
4052 max = GET2(ecode, 1);
4053 ecode += 1 + IMM2_SIZE;
4054 goto REPEATTYPE;
4055
4056 case OP_TYPESTAR:
4057 case OP_TYPEMINSTAR:
4058 case OP_TYPEPLUS:
4059 case OP_TYPEMINPLUS:
4060 case OP_TYPEQUERY:
4061 case OP_TYPEMINQUERY:
4062 c = *ecode++ - OP_TYPESTAR;
4063 minimize = (c & 1) != 0;
4064 min = rep_min[c]; /* Pick up values from tables; */
4065 max = rep_max[c]; /* zero for max => infinity */
4066 if (max == 0) max = INT_MAX;
4067
4068 /* Common code for all repeated single character type matches. Note that
4069 in UTF-8 mode, '.' matches a character of any length, but for the other
4070 character types, the valid characters are all one-byte long. */
4071
4072 REPEATTYPE:
4073 ctype = *ecode++; /* Code for the character type */
4074
4075 #ifdef SUPPORT_UCP
4076 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4077 {
4078 prop_fail_result = ctype == OP_NOTPROP;
4079 prop_type = *ecode++;
4080 prop_value = *ecode++;
4081 }
4082 else prop_type = -1;
4083 #endif
4084
4085 /* First, ensure the minimum number of matches are present. Use inline
4086 code for maximizing the speed, and do the type test once at the start
4087 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4088 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4089 and single-bytes. */
4090
4091 if (min > 0)
4092 {
4093 #ifdef SUPPORT_UCP
4094 if (prop_type >= 0)
4095 {
4096 switch(prop_type)
4097 {
4098 case PT_ANY:
4099 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4100 for (i = 1; i <= min; i++)
4101 {
4102 if (eptr >= md->end_subject)
4103 {
4104 SCHECK_PARTIAL();
4105 RRETURN(MATCH_NOMATCH);
4106 }
4107 GETCHARINCTEST(c, eptr);
4108 }
4109 break;
4110
4111 case PT_LAMP:
4112 for (i = 1; i <= min; i++)
4113 {
4114 int chartype;
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 GETCHARINCTEST(c, eptr);
4121 chartype = UCD_CHARTYPE(c);
4122 if ((chartype == ucp_Lu ||
4123 chartype == ucp_Ll ||
4124 chartype == ucp_Lt) == prop_fail_result)
4125 RRETURN(MATCH_NOMATCH);
4126 }
4127 break;
4128
4129 case PT_GC:
4130 for (i = 1; i <= min; i++)
4131 {
4132 if (eptr >= md->end_subject)
4133 {
4134 SCHECK_PARTIAL();
4135 RRETURN(MATCH_NOMATCH);
4136 }
4137 GETCHARINCTEST(c, eptr);
4138 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4139 RRETURN(MATCH_NOMATCH);
4140 }
4141 break;
4142
4143 case PT_PC:
4144 for (i = 1; i <= min; i++)
4145 {
4146 if (eptr >= md->end_subject)
4147 {
4148 SCHECK_PARTIAL();
4149 RRETURN(MATCH_NOMATCH);
4150 }
4151 GETCHARINCTEST(c, eptr);
4152 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4153 RRETURN(MATCH_NOMATCH);
4154 }
4155 break;
4156
4157 case PT_SC:
4158 for (i = 1; i <= min; i++)
4159 {
4160 if (eptr >= md->end_subject)
4161 {
4162 SCHECK_PARTIAL();
4163 RRETURN(MATCH_NOMATCH);
4164 }
4165 GETCHARINCTEST(c, eptr);
4166 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4167 RRETURN(MATCH_NOMATCH);
4168 }
4169 break;
4170
4171 case PT_ALNUM:
4172 for (i = 1; i <= min; i++)
4173 {
4174 int category;
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 GETCHARINCTEST(c, eptr);
4181 category = UCD_CATEGORY(c);
4182 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 break;
4186
4187 case PT_SPACE: /* Perl space */
4188 for (i = 1; i <= min; i++)
4189 {
4190 if (eptr >= md->end_subject)
4191 {
4192 SCHECK_PARTIAL();
4193 RRETURN(MATCH_NOMATCH);
4194 }
4195 GETCHARINCTEST(c, eptr);
4196 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4197 c == CHAR_FF || c == CHAR_CR)
4198 == prop_fail_result)
4199 RRETURN(MATCH_NOMATCH);
4200 }
4201 break;
4202
4203 case PT_PXSPACE: /* POSIX space */
4204 for (i = 1; i <= min; i++)
4205 {
4206 if (eptr >= md->end_subject)
4207 {
4208 SCHECK_PARTIAL();
4209 RRETURN(MATCH_NOMATCH);
4210 }
4211 GETCHARINCTEST(c, eptr);
4212 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4213 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4214 == prop_fail_result)
4215 RRETURN(MATCH_NOMATCH);
4216 }
4217 break;
4218
4219 case PT_WORD:
4220 for (i = 1; i <= min; i++)
4221 {
4222 int category;
4223 if (eptr >= md->end_subject)
4224 {
4225 SCHECK_PARTIAL();
4226 RRETURN(MATCH_NOMATCH);
4227 }
4228 GETCHARINCTEST(c, eptr);
4229 category = UCD_CATEGORY(c);
4230 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4231 == prop_fail_result)
4232 RRETURN(MATCH_NOMATCH);
4233 }
4234 break;
4235
4236 case PT_CLIST:
4237 for (i = 1; i <= min; i++)
4238 {
4239 const pcre_uint32 *cp;
4240 if (eptr >= md->end_subject)
4241 {
4242 SCHECK_PARTIAL();
4243 RRETURN(MATCH_NOMATCH);
4244 }
4245 GETCHARINCTEST(c, eptr);
4246 cp = PRIV(ucd_caseless_sets) + prop_value;
4247 for (;;)
4248 {
4249 if (c < *cp)
4250 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4251 if (c == *cp++)
4252 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4253 }
4254 }
4255 break;
4256
4257 case PT_UCNC:
4258 for (i = 1; i <= min; i++)
4259 {
4260 if (eptr >= md->end_subject)
4261 {
4262 SCHECK_PARTIAL();
4263 RRETURN(MATCH_NOMATCH);
4264 }
4265 GETCHARINCTEST(c, eptr);
4266 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4267 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4268 c >= 0xe000) == prop_fail_result)
4269 RRETURN(MATCH_NOMATCH);
4270 }
4271 break;
4272
4273 /* This should not occur */
4274
4275 default:
4276 RRETURN(PCRE_ERROR_INTERNAL);
4277 }
4278 }
4279
4280 /* Match extended Unicode sequences. We will get here only if the
4281 support is in the binary; otherwise a compile-time error occurs. */
4282
4283 else if (ctype == OP_EXTUNI)
4284 {
4285 for (i = 1; i <= min; i++)
4286 {
4287 if (eptr >= md->end_subject)
4288 {
4289 SCHECK_PARTIAL();
4290 RRETURN(MATCH_NOMATCH);
4291 }
4292 else
4293 {
4294 int lgb, rgb;
4295 GETCHARINCTEST(c, eptr);
4296 lgb = UCD_GRAPHBREAK(c);
4297 while (eptr < md->end_subject)
4298 {
4299 int len = 1;
4300 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4301 rgb = UCD_GRAPHBREAK(c);
4302 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4303 lgb = rgb;
4304 eptr += len;
4305 }
4306 }
4307 CHECK_PARTIAL();
4308 }
4309 }
4310
4311 else
4312 #endif /* SUPPORT_UCP */
4313
4314 /* Handle all other cases when the coding is UTF-8 */
4315
4316 #ifdef SUPPORT_UTF
4317 if (utf) switch(ctype)
4318 {
4319 case OP_ANY:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 RRETURN(MATCH_NOMATCH);
4326 }
4327 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4328 if (md->partial != 0 &&
4329 eptr + 1 >= md->end_subject &&
4330 NLBLOCK->nltype == NLTYPE_FIXED &&
4331 NLBLOCK->nllen == 2 &&
4332 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4333 {
4334 md->hitend = TRUE;
4335 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4336 }
4337 eptr++;
4338 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4339 }
4340 break;
4341
4342 case OP_ALLANY:
4343 for (i = 1; i <= min; i++)
4344 {
4345 if (eptr >= md->end_subject)
4346 {
4347 SCHECK_PARTIAL();
4348 RRETURN(MATCH_NOMATCH);
4349 }
4350 eptr++;
4351 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4352 }
4353 break;
4354
4355 case OP_ANYBYTE:
4356 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4357 eptr += min;
4358 break;
4359
4360 case OP_ANYNL:
4361 for (i = 1; i <= min; i++)
4362 {
4363 if (eptr >= md->end_subject)
4364 {
4365 SCHECK_PARTIAL();
4366 RRETURN(MATCH_NOMATCH);
4367 }
4368 GETCHARINC(c, eptr);
4369 switch(c)
4370 {
4371 default: RRETURN(MATCH_NOMATCH);
4372
4373 case CHAR_CR:
4374 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4375 break;
4376
4377 case CHAR_LF:
4378 break;
4379
4380 case CHAR_VT:
4381 case CHAR_FF:
4382 case CHAR_NEL:
4383 #ifndef EBCDIC
4384 case 0x2028:
4385 case 0x2029:
4386 #endif /* Not EBCDIC */
4387 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4388 break;
4389 }
4390 }
4391 break;
4392
4393 case OP_NOT_HSPACE:
4394 for (i = 1; i <= min; i++)
4395 {
4396 if (eptr >= md->end_subject)
4397 {
4398 SCHECK_PARTIAL();
4399 RRETURN(MATCH_NOMATCH);
4400 }
4401 GETCHARINC(c, eptr);
4402 switch(c)
4403 {
4404 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4405 default: break;
4406 }
4407 }
4408 break;
4409
4410 case OP_HSPACE:
4411 for (i = 1; i <= min; i++)
4412 {
4413 if (eptr >= md->end_subject)
4414 {
4415 SCHECK_PARTIAL();
4416 RRETURN(MATCH_NOMATCH);
4417 }
4418 GETCHARINC(c, eptr);
4419 switch(c)
4420 {
4421 HSPACE_CASES: break; /* Byte and multibyte cases */
4422 default: RRETURN(MATCH_NOMATCH);
4423 }
4424 }
4425 break;
4426
4427 case OP_NOT_VSPACE:
4428 for (i = 1; i <= min; i++)
4429 {
4430 if (eptr >= md->end_subject)
4431 {
4432 SCHECK_PARTIAL();
4433 RRETURN(MATCH_NOMATCH);
4434 }
4435 GETCHARINC(c, eptr);
4436 switch(c)
4437 {
4438 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4439 default: break;
4440 }
4441 }
4442 break;
4443
4444 case OP_VSPACE:
4445 for (i = 1; i <= min; i++)
4446 {
4447 if (eptr >= md->end_subject)
4448 {
4449 SCHECK_PARTIAL();
4450 RRETURN(MATCH_NOMATCH);
4451 }
4452 GETCHARINC(c, eptr);
4453 switch(c)
4454 {
4455 VSPACE_CASES: break;
4456 default: RRETURN(MATCH_NOMATCH);
4457 }
4458 }
4459 break;
4460
4461 case OP_NOT_DIGIT:
4462 for (i = 1; i <= min; i++)
4463 {
4464 if (eptr >= md->end_subject)
4465 {
4466 SCHECK_PARTIAL();
4467 RRETURN(MATCH_NOMATCH);
4468 }
4469 GETCHARINC(c, eptr);
4470 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4471 RRETURN(MATCH_NOMATCH);
4472 }
4473 break;
4474
4475 case OP_DIGIT:
4476 for (i = 1; i <= min; i++)
4477 {
4478 pcre_uint32 cc;
4479 if (eptr >= md->end_subject)
4480 {
4481 SCHECK_PARTIAL();
4482 RRETURN(MATCH_NOMATCH);
4483 }
4484 cc = RAWUCHAR(eptr);
4485 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4486 RRETURN(MATCH_NOMATCH);
4487 eptr++;
4488 /* No need to skip more bytes - we know it's a 1-byte character */
4489 }
4490 break;
4491
4492 case OP_NOT_WHITESPACE:
4493 for (i = 1; i <= min; i++)
4494 {
4495 pcre_uint32 cc;
4496 if (eptr >= md->end_subject)
4497 {
4498 SCHECK_PARTIAL();
4499 RRETURN(MATCH_NOMATCH);
4500 }
4501 cc = RAWUCHAR(eptr);
4502 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4503 RRETURN(MATCH_NOMATCH);
4504 eptr++;
4505 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4506 }
4507 break;
4508
4509 case OP_WHITESPACE:
4510 for (i = 1; i <= min; i++)
4511 {
4512 pcre_uint32 cc;
4513 if (eptr >= md->end_subject)
4514 {
4515 SCHECK_PARTIAL();
4516 RRETURN(MATCH_NOMATCH);
4517 }
4518 cc = RAWUCHAR(eptr);
4519 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4520 RRETURN(MATCH_NOMATCH);
4521 eptr++;
4522 /* No need to skip more bytes - we know it's a 1-byte character */
4523 }
4524 break;
4525
4526 case OP_NOT_WORDCHAR:
4527 for (i = 1; i <= min; i++)
4528 {
4529 pcre_uint32 cc;
4530 if (eptr >= md->end_subject)
4531 {
4532 SCHECK_PARTIAL();
4533 RRETURN(MATCH_NOMATCH);
4534 }
4535 cc = RAWUCHAR(eptr);
4536 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4537 RRETURN(MATCH_NOMATCH);
4538 eptr++;
4539 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4540 }
4541 break;
4542
4543 case OP_WORDCHAR:
4544 for (i = 1; i <= min; i++)
4545 {
4546 pcre_uint32 cc;
4547 if (eptr >= md->end_subject)
4548 {
4549 SCHECK_PARTIAL();
4550 RRETURN(MATCH_NOMATCH);
4551 }
4552 cc = RAWUCHAR(eptr);
4553 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4554 RRETURN(MATCH_NOMATCH);
4555 eptr++;
4556 /* No need to skip more bytes - we know it's a 1-byte character */
4557 }
4558 break;
4559
4560 default:
4561 RRETURN(PCRE_ERROR_INTERNAL);
4562 } /* End switch(ctype) */
4563
4564 else
4565 #endif /* SUPPORT_UTF */
4566
4567 /* Code for the non-UTF case for minimum matching of operators other
4568 than OP_PROP and OP_NOTPROP. */
4569
4570 switch(ctype)
4571 {
4572 case OP_ANY:
4573 for (i = 1; i <= min; i++)
4574 {
4575 if (eptr >= md->end_subject)
4576 {
4577 SCHECK_PARTIAL();
4578 RRETURN(MATCH_NOMATCH);
4579 }
4580 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4581 if (md->partial != 0 &&
4582 eptr + 1 >= md->end_subject &&
4583 NLBLOCK->nltype == NLTYPE_FIXED &&
4584 NLBLOCK->nllen == 2 &&
4585 *eptr == NLBLOCK->nl[0])
4586 {
4587 md->hitend = TRUE;
4588 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4589 }
4590 eptr++;
4591 }
4592 break;
4593
4594 case OP_ALLANY:
4595 if (eptr > md->end_subject - min)
4596 {
4597 SCHECK_PARTIAL();
4598 RRETURN(MATCH_NOMATCH);
4599 }
4600 eptr += min;
4601 break;
4602
4603 case OP_ANYBYTE:
4604 if (eptr > md->end_subject - min)
4605 {
4606 SCHECK_PARTIAL();
4607 RRETURN(MATCH_NOMATCH);
4608 }
4609 eptr += min;
4610 break;
4611
4612 case OP_ANYNL:
4613 for (i = 1; i <= min; i++)
4614 {
4615 if (eptr >= md->end_subject)
4616 {
4617 SCHECK_PARTIAL();
4618 RRETURN(MATCH_NOMATCH);
4619 }
4620 switch(*eptr++)
4621 {
4622 default: RRETURN(MATCH_NOMATCH);
4623
4624 case CHAR_CR:
4625 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4626 break;
4627
4628 case CHAR_LF:
4629 break;
4630
4631 case CHAR_VT:
4632 case CHAR_FF:
4633 case CHAR_NEL:
4634 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4635 case 0x2028:
4636 case 0x2029:
4637 #endif
4638 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4639 break;
4640 }
4641 }
4642 break;
4643
4644 case OP_NOT_HSPACE:
4645 for (i = 1; i <= min; i++)
4646 {
4647 if (eptr >= md->end_subject)
4648 {
4649 SCHECK_PARTIAL();
4650 RRETURN(MATCH_NOMATCH);
4651 }
4652 switch(*eptr++)
4653 {
4654 default: break;
4655 HSPACE_BYTE_CASES:
4656 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4657 HSPACE_MULTIBYTE_CASES:
4658 #endif
4659 RRETURN(MATCH_NOMATCH);
4660 }
4661 }
4662 break;
4663
4664 case OP_HSPACE:
4665 for (i = 1; i <= min; i++)
4666 {
4667 if (eptr >= md->end_subject)
4668 {
4669 SCHECK_PARTIAL();
4670 RRETURN(MATCH_NOMATCH);
4671 }
4672 switch(*eptr++)
4673 {
4674 default: RRETURN(MATCH_NOMATCH);
4675 HSPACE_BYTE_CASES:
4676 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4677 HSPACE_MULTIBYTE_CASES:
4678 #endif
4679 break;
4680 }
4681 }
4682 break;
4683
4684 case OP_NOT_VSPACE:
4685 for (i = 1; i <= min; i++)
4686 {
4687 if (eptr >= md->end_subject)
4688 {
4689 SCHECK_PARTIAL();
4690 RRETURN(MATCH_NOMATCH);
4691 }
4692 switch(*eptr++)
4693 {
4694 VSPACE_BYTE_CASES:
4695 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4696 VSPACE_MULTIBYTE_CASES:
4697 #endif
4698 RRETURN(MATCH_NOMATCH);
4699 default: break;
4700 }
4701 }
4702 break;
4703
4704 case OP_VSPACE:
4705 for (i = 1; i <= min; i++)
4706 {
4707 if (eptr >= md->end_subject)
4708 {
4709 SCHECK_PARTIAL();
4710 RRETURN(MATCH_NOMATCH);
4711 }
4712 switch(*eptr++)
4713 {
4714 default: RRETURN(MATCH_NOMATCH);
4715 VSPACE_BYTE_CASES:
4716 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4717 VSPACE_MULTIBYTE_CASES:
4718 #endif
4719 break;
4720 }
4721 }
4722 break;
4723
4724 case OP_NOT_DIGIT:
4725 for (i = 1; i <= min; i++)
4726 {
4727 if (eptr >= md->end_subject)
4728 {
4729 SCHECK_PARTIAL();
4730 RRETURN(MATCH_NOMATCH);
4731 }
4732 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4733 RRETURN(MATCH_NOMATCH);
4734 eptr++;
4735 }
4736 break;
4737
4738 case OP_DIGIT:
4739 for (i = 1; i <= min; i++)
4740 {
4741 if (eptr >= md->end_subject)
4742 {
4743 SCHECK_PARTIAL();
4744 RRETURN(MATCH_NOMATCH);
4745 }
4746 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4747 RRETURN(MATCH_NOMATCH);
4748 eptr++;
4749 }
4750 break;
4751
4752 case OP_NOT_WHITESPACE:
4753 for (i = 1; i <= min; i++)
4754 {
4755 if (eptr >= md->end_subject)
4756 {
4757 SCHECK_PARTIAL();
4758 RRETURN(MATCH_NOMATCH);
4759 }
4760 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4761 RRETURN(MATCH_NOMATCH);
4762 eptr++;
4763 }
4764 break;
4765
4766 case OP_WHITESPACE:
4767 for (i = 1; i <= min; i++)
4768 {
4769 if (eptr >= md->end_subject)
4770 {
4771 SCHECK_PARTIAL();
4772 RRETURN(MATCH_NOMATCH);
4773 }
4774 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4775 RRETURN(MATCH_NOMATCH);
4776 eptr++;
4777 }
4778 break;
4779
4780 case OP_NOT_WORDCHAR:
4781 for (i = 1; i <= min; i++)
4782 {
4783 if (eptr >= md->end_subject)
4784 {
4785 SCHECK_PARTIAL();
4786 RRETURN(MATCH_NOMATCH);
4787 }
4788 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4789 RRETURN(MATCH_NOMATCH);
4790 eptr++;
4791 }
4792 break;
4793
4794 case OP_WORDCHAR:
4795 for (i = 1; i <= min; i++)
4796 {
4797 if (eptr >= md->end_subject)
4798 {
4799 SCHECK_PARTIAL();
4800 RRETURN(MATCH_NOMATCH);
4801 }
4802 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4803 RRETURN(MATCH_NOMATCH);
4804 eptr++;
4805 }
4806 break;
4807
4808 default:
4809 RRETURN(PCRE_ERROR_INTERNAL);
4810 }
4811 }
4812
4813 /* If min = max, continue at the same level without recursing */
4814
4815 if (min == max) continue;
4816
4817 /* If minimizing, we have to test the rest of the pattern before each
4818 subsequent match. Again, separate the UTF case for speed, and also
4819 separate the UCP cases. */
4820
4821 if (minimize)
4822 {
4823 #ifdef SUPPORT_UCP
4824 if (prop_type >= 0)
4825 {
4826 switch(prop_type)
4827 {
4828 case PT_ANY:
4829 for (fi = min;; fi++)
4830 {
4831 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4833 if (fi >= max) RRETURN(MATCH_NOMATCH);
4834 if (eptr >= md->end_subject)
4835 {
4836 SCHECK_PARTIAL();
4837 RRETURN(MATCH_NOMATCH);
4838 }
4839 GETCHARINCTEST(c, eptr);
4840 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4841 }
4842 /* Control never gets here */
4843
4844 case PT_LAMP:
4845 for (fi = min;; fi++)
4846 {
4847 int chartype;
4848 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4850 if (fi >= max) RRETURN(MATCH_NOMATCH);
4851 if (eptr >= md->end_subject)
4852 {
4853 SCHECK_PARTIAL();
4854 RRETURN(MATCH_NOMATCH);
4855 }
4856 GETCHARINCTEST(c, eptr);
4857 chartype = UCD_CHARTYPE(c);
4858 if ((chartype == ucp_Lu ||
4859 chartype == ucp_Ll ||
4860 chartype == ucp_Lt) == prop_fail_result)
4861 RRETURN(MATCH_NOMATCH);
4862 }
4863 /* Control never gets here */
4864
4865 case PT_GC:
4866 for (fi = min;; fi++)
4867 {
4868 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4869 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4870 if (fi >= max) RRETURN(MATCH_NOMATCH);
4871 if (eptr >= md->end_subject)
4872 {
4873 SCHECK_PARTIAL();
4874 RRETURN(MATCH_NOMATCH);
4875 }
4876 GETCHARINCTEST(c, eptr);
4877 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4878 RRETURN(MATCH_NOMATCH);
4879 }
4880 /* Control never gets here */
4881
4882 case PT_PC:
4883 for (fi = min;; fi++)
4884 {
4885 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4887 if (fi >= max) RRETURN(MATCH_NOMATCH);
4888 if (eptr >= md->end_subject)
4889 {
4890 SCHECK_PARTIAL();
4891 RRETURN(MATCH_NOMATCH);
4892 }
4893 GETCHARINCTEST(c, eptr);
4894 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4895 RRETURN(MATCH_NOMATCH);
4896 }
4897 /* Control never gets here */
4898
4899 case PT_SC:
4900 for (fi = min;; fi++)
4901 {
4902 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4904 if (fi >= max) RRETURN(MATCH_NOMATCH);
4905 if (eptr >= md->end_subject)
4906 {
4907 SCHECK_PARTIAL();
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 GETCHARINCTEST(c, eptr);
4911 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4912 RRETURN(MATCH_NOMATCH);
4913 }
4914 /* Control never gets here */
4915
4916 case PT_ALNUM:
4917 for (fi = min;; fi++)
4918 {
4919 int category;
4920 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4922 if (fi >= max) RRETURN(MATCH_NOMATCH);
4923 if (eptr >= md->end_subject)
4924 {
4925 SCHECK_PARTIAL();
4926 RRETURN(MATCH_NOMATCH);
4927 }
4928 GETCHARINCTEST(c, eptr);
4929 category = UCD_CATEGORY(c);
4930 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4931 RRETURN(MATCH_NOMATCH);
4932 }
4933 /* Control never gets here */
4934
4935 case PT_SPACE: /* Perl space */
4936 for (fi = min;; fi++)
4937 {
4938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4940 if (fi >= max) RRETURN(MATCH_NOMATCH);
4941 if (eptr >= md->end_subject)
4942 {
4943 SCHECK_PARTIAL();
4944 RRETURN(MATCH_NOMATCH);
4945 }
4946 GETCHARINCTEST(c, eptr);
4947 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4948 c == CHAR_FF || c == CHAR_CR)
4949 == prop_fail_result)
4950 RRETURN(MATCH_NOMATCH);
4951 }
4952 /* Control never gets here */
4953
4954 case PT_PXSPACE: /* POSIX space */
4955 for (fi = min;; fi++)
4956 {
4957 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4958 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4959 if (fi >= max) RRETURN(MATCH_NOMATCH);
4960 if (eptr >= md->end_subject)
4961 {
4962 SCHECK_PARTIAL();
4963 RRETURN(MATCH_NOMATCH);
4964 }
4965 GETCHARINCTEST(c, eptr);
4966 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4967 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4968 == prop_fail_result)
4969 RRETURN(MATCH_NOMATCH);
4970 }
4971 /* Control never gets here */
4972
4973 case PT_WORD:
4974 for (fi = min;; fi++)
4975 {
4976 int category;
4977 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4979 if (fi >= max) RRETURN(MATCH_NOMATCH);
4980 if (eptr >= md->end_subject)
4981 {
4982 SCHECK_PARTIAL();
4983 RRETURN(MATCH_NOMATCH);
4984 }
4985 GETCHARINCTEST(c, eptr);
4986 category = UCD_CATEGORY(c);
4987 if ((category == ucp_L ||
4988 category == ucp_N ||
4989 c == CHAR_UNDERSCORE)
4990 == prop_fail_result)
4991 RRETURN(MATCH_NOMATCH);
4992 }
4993 /* Control never gets here */
4994
4995 case PT_CLIST:
4996 for (fi = min;; fi++)
4997 {
4998 const pcre_uint32 *cp;
4999 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5001 if (fi >= max) RRETURN(MATCH_NOMATCH);
5002 if (eptr >= md->end_subject)
5003 {
5004 SCHECK_PARTIAL();
5005 RRETURN(MATCH_NOMATCH);
5006 }
5007 GETCHARINCTEST(c, eptr);
5008 cp = PRIV(ucd_caseless_sets) + prop_value;
5009 for (;;)
5010 {
5011 if (c < *cp)
5012 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5013 if (c == *cp++)
5014 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5015 }
5016 }
5017 /* Control never gets here */
5018
5019 case PT_UCNC:
5020 for (fi = min;; fi++)
5021 {
5022 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5024 if (fi >= max) RRETURN(MATCH_NOMATCH);
5025 if (eptr >= md->end_subject)
5026 {
5027 SCHECK_PARTIAL();
5028 RRETURN(MATCH_NOMATCH);
5029 }
5030 GETCHARINCTEST(c, eptr);
5031 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5032 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5033 c >= 0xe000) == prop_fail_result)
5034 RRETURN(MATCH_NOMATCH);
5035 }
5036 /* Control never gets here */
5037
5038 /* This should never occur */
5039 default:
5040 RRETURN(PCRE_ERROR_INTERNAL);
5041 }
5042 }
5043
5044 /* Match extended Unicode sequences. We will get here only if the
5045 support is in the binary; otherwise a compile-time error occurs. */
5046
5047 else if (ctype == OP_EXTUNI)
5048 {
5049 for (fi = min;; fi++)
5050 {
5051 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5053 if (fi >= max) RRETURN(MATCH_NOMATCH);
5054 if (eptr >= md->end_subject)
5055 {
5056 SCHECK_PARTIAL();
5057 RRETURN(MATCH_NOMATCH);
5058 }
5059 else
5060 {
5061 int lgb, rgb;
5062 GETCHARINCTEST(c, eptr);
5063 lgb = UCD_GRAPHBREAK(c);
5064 while (eptr < md->end_subject)
5065 {
5066 int len = 1;
5067 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5068 rgb = UCD_GRAPHBREAK(c);
5069 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5070 lgb = rgb;
5071 eptr += len;
5072 }
5073 }
5074 CHECK_PARTIAL();
5075 }
5076 }
5077 else
5078 #endif /* SUPPORT_UCP */
5079
5080 #ifdef SUPPORT_UTF
5081 if (utf)
5082 {
5083 for (fi = min;; fi++)
5084 {
5085 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5086 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5087 if (fi >= max) RRETURN(MATCH_NOMATCH);
5088 if (eptr >= md->end_subject)
5089 {
5090 SCHECK_PARTIAL();
5091 RRETURN(MATCH_NOMATCH);
5092 }
5093 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5094 RRETURN(MATCH_NOMATCH);
5095 GETCHARINC(c, eptr);
5096 switch(ctype)
5097 {
5098 case OP_ANY: /* This is the non-NL case */
5099 if (md->partial != 0 && /* Take care with CRLF partial */
5100 eptr >= md->end_subject &&
5101 NLBLOCK->nltype == NLTYPE_FIXED &&
5102 NLBLOCK->nllen == 2 &&
5103 c == NLBLOCK->nl[0])
5104 {
5105 md->hitend = TRUE;
5106 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5107 }
5108 break;
5109
5110 case OP_ALLANY:
5111 case OP_ANYBYTE:
5112 break;
5113
5114 case OP_ANYNL:
5115 switch(c)
5116 {
5117 default: RRETURN(MATCH_NOMATCH);
5118 case CHAR_CR:
5119 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5120 break;
5121
5122 case CHAR_LF:
5123 break;
5124
5125 case CHAR_VT:
5126 case CHAR_FF:
5127 case CHAR_NEL:
5128 #ifndef EBCDIC
5129 case 0x2028:
5130 case 0x2029:
5131 #endif /* Not EBCDIC */
5132 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5133 break;
5134 }
5135 break;
5136
5137 case OP_NOT_HSPACE:
5138 switch(c)
5139 {
5140 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5141 default: break;
5142 }
5143 break;
5144
5145 case OP_HSPACE:
5146 switch(c)
5147 {
5148 HSPACE_CASES: break;
5149 default: RRETURN(MATCH_NOMATCH);
5150 }
5151 break;
5152
5153 case OP_NOT_VSPACE:
5154 switch(c)
5155 {
5156 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5157 default: break;
5158 }
5159 break;
5160
5161 case OP_VSPACE:
5162 switch(c)
5163 {
5164 VSPACE_CASES: break;
5165 default: RRETURN(MATCH_NOMATCH);
5166 }
5167 break;
5168
5169 case OP_NOT_DIGIT:
5170 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5171 RRETURN(MATCH_NOMATCH);
5172 break;
5173
5174 case OP_DIGIT:
5175 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5176 RRETURN(MATCH_NOMATCH);
5177 break;
5178
5179 case OP_NOT_WHITESPACE:
5180 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5181 RRETURN(MATCH_NOMATCH);
5182 break;
5183
5184 case OP_WHITESPACE:
5185 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5186 RRETURN(MATCH_NOMATCH);
5187 break;
5188
5189 case OP_NOT_WORDCHAR:
5190 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5191 RRETURN(MATCH_NOMATCH);
5192 break;
5193
5194 case OP_WORDCHAR:
5195 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5196 RRETURN(MATCH_NOMATCH);
5197 break;
5198
5199 default:
5200 RRETURN(PCRE_ERROR_INTERNAL);
5201 }
5202 }
5203 }
5204 else
5205 #endif
5206 /* Not UTF mode */
5207 {
5208 for (fi = min;; fi++)
5209 {
5210 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5211 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5212 if (fi >= max) RRETURN(MATCH_NOMATCH);
5213 if (eptr >= md->end_subject)
5214 {
5215 SCHECK_PARTIAL();
5216 RRETURN(MATCH_NOMATCH);
5217 }
5218 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5219 RRETURN(MATCH_NOMATCH);
5220 c = *eptr++;
5221 switch(ctype)
5222 {
5223 case OP_ANY: /* This is the non-NL case */
5224 if (md->partial != 0 && /* Take care with CRLF partial */
5225 eptr >= md->end_subject &&
5226 NLBLOCK->nltype == NLTYPE_FIXED &&
5227 NLBLOCK->nllen == 2 &&
5228 c == NLBLOCK->nl[0])
5229 {
5230 md->hitend = TRUE;
5231 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5232 }
5233 break;
5234
5235 case OP_ALLANY:
5236 case OP_ANYBYTE:
5237 break;
5238
5239 case OP_ANYNL:
5240 switch(c)
5241 {
5242 default: RRETURN(MATCH_NOMATCH);
5243 case CHAR_CR:
5244 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5245 break;
5246
5247 case CHAR_LF:
5248 break;
5249
5250 case CHAR_VT:
5251 case CHAR_FF:
5252 case CHAR_NEL:
5253 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5254 case 0x2028:
5255 case 0x2029:
5256 #endif
5257 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5258 break;
5259 }
5260 break;
5261
5262 case OP_NOT_HSPACE:
5263 switch(c)
5264 {
5265 default: break;
5266 HSPACE_BYTE_CASES:
5267 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5268 HSPACE_MULTIBYTE_CASES:
5269 #endif
5270 RRETURN(MATCH_NOMATCH);
5271 }
5272 break;
5273
5274 case OP_HSPACE:
5275 switch(c)
5276 {
5277 default: RRETURN(MATCH_NOMATCH);
5278 HSPACE_BYTE_CASES:
5279 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5280 HSPACE_MULTIBYTE_CASES:
5281 #endif
5282 break;
5283 }
5284 break;
5285
5286 case OP_NOT_VSPACE:
5287 switch(c)
5288 {
5289 default: break;
5290 VSPACE_BYTE_CASES:
5291 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5292 VSPACE_MULTIBYTE_CASES:
5293 #endif
5294 RRETURN(MATCH_NOMATCH);
5295 }
5296 break;
5297
5298 case OP_VSPACE:
5299 switch(c)
5300 {
5301 default: RRETURN(MATCH_NOMATCH);
5302 VSPACE_BYTE_CASES:
5303 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5304 VSPACE_MULTIBYTE_CASES:
5305 #endif
5306 break;
5307 }
5308 break;
5309
5310 case OP_NOT_DIGIT:
5311 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5312 break;
5313
5314 case OP_DIGIT:
5315 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5316 break;
5317
5318 case OP_NOT_WHITESPACE:
5319 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5320 break;
5321
5322 case OP_WHITESPACE:
5323 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5324 break;
5325
5326 case OP_NOT_WORDCHAR:
5327 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5328 break;
5329
5330 case OP_WORDCHAR:
5331 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5332 break;
5333
5334 default:
5335 RRETURN(PCRE_ERROR_INTERNAL);
5336 }
5337 }
5338 }
5339 /* Control never gets here */
5340 }
5341
5342 /* If maximizing, it is worth using inline code for speed, doing the type
5343 test once at the start (i.e. keep it out of the loop). Again, keep the
5344 UTF-8 and UCP stuff separate. */
5345
5346 else
5347 {
5348 pp = eptr; /* Remember where we started */
5349
5350 #ifdef SUPPORT_UCP
5351 if (prop_type >= 0)
5352 {
5353 switch(prop_type)
5354 {
5355 case PT_ANY:
5356 for (i = min; i < max; i++)
5357 {
5358 int len = 1;
5359 if (eptr >= md->end_subject)
5360 {
5361 SCHECK_PARTIAL();
5362 break;
5363 }
5364 GETCHARLENTEST(c, eptr, len);
5365 if (prop_fail_result) break;
5366 eptr+= len;
5367 }
5368 break;
5369
5370 case PT_LAMP:
5371 for (i = min; i < max; i++)
5372 {
5373 int chartype;
5374 int len = 1;
5375 if (eptr >= md->end_subject)
5376 {
5377 SCHECK_PARTIAL();
5378 break;
5379 }
5380 GETCHARLENTEST(c, eptr, len);
5381 chartype = UCD_CHARTYPE(c);
5382 if ((chartype == ucp_Lu ||
5383 chartype == ucp_Ll ||
5384 chartype == ucp_Lt) == prop_fail_result)
5385 break;
5386 eptr+= len;
5387 }
5388 break;
5389
5390 case PT_GC:
5391 for (i = min; i < max; i++)
5392 {
5393 int len = 1;
5394 if (eptr >= md->end_subject)
5395 {
5396 SCHECK_PARTIAL();
5397 break;
5398 }
5399 GETCHARLENTEST(c, eptr, len);
5400 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5401 eptr+= len;
5402 }
5403 break;
5404
5405 case PT_PC:
5406 for (i = min; i < max; i++)
5407 {
5408 int len = 1;
5409 if (eptr >= md->end_subject)
5410 {
5411 SCHECK_PARTIAL();
5412 break;
5413 }
5414 GETCHARLENTEST(c, eptr, len);
5415 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5416 eptr+= len;
5417 }
5418 break;
5419
5420 case PT_SC:
5421 for (i = min; i < max; i++)
5422 {
5423 int len = 1;
5424 if (eptr >= md->end_subject)
5425 {
5426 SCHECK_PARTIAL();
5427 break;
5428 }
5429 GETCHARLENTEST(c, eptr, len);
5430 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5431 eptr+= len;
5432 }
5433 break;
5434
5435 case PT_ALNUM:
5436 for (i = min; i < max; i++)
5437 {
5438 int category;
5439 int len = 1;
5440 if (eptr >= md->end_subject)
5441 {
5442 SCHECK_PARTIAL();
5443 break;
5444 }
5445 GETCHARLENTEST(c, eptr, len);
5446 category = UCD_CATEGORY(c);
5447 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5448 break;
5449 eptr+= len;
5450 }
5451 break;
5452
5453 case PT_SPACE: /* Perl space */
5454 for (i = min; i < max; i++)
5455 {
5456 int len = 1;
5457 if (eptr >= md->end_subject)
5458 {
5459 SCHECK_PARTIAL();
5460 break;
5461 }
5462 GETCHARLENTEST(c, eptr, len);
5463 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5464 c == CHAR_FF || c == CHAR_CR)
5465 == prop_fail_result)
5466 break;
5467 eptr+= len;
5468 }
5469 break;
5470
5471 case PT_PXSPACE: /* POSIX space */
5472 for (i = min; i < max; i++)
5473 {
5474 int len = 1;
5475 if (eptr >= md->end_subject)
5476 {
5477 SCHECK_PARTIAL();
5478 break;
5479 }
5480 GETCHARLENTEST(c, eptr, len);
5481 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5482 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5483 == prop_fail_result)
5484 break;
5485 eptr+= len;
5486 }
5487 break;
5488
5489 case PT_WORD:
5490 for (i = min; i < max; i++)
5491 {
5492 int category;
5493 int len = 1;
5494 if (eptr >= md->end_subject)
5495 {
5496 SCHECK_PARTIAL();
5497 break;
5498 }
5499 GETCHARLENTEST(c, eptr, len);
5500 category = UCD_CATEGORY(c);
5501 if ((category == ucp_L || category == ucp_N ||
5502 c == CHAR_UNDERSCORE) == prop_fail_result)
5503 break;
5504 eptr+= len;
5505 }
5506 break;
5507
5508 case PT_CLIST:
5509 for (i = min; i < max; i++)
5510 {
5511 const pcre_uint32 *cp;
5512 int len = 1;
5513 if (eptr >= md->end_subject)
5514 {
5515 SCHECK_PARTIAL();
5516 break;
5517 }
5518 GETCHARLENTEST(c, eptr, len);
5519 cp = PRIV(ucd_caseless_sets) + prop_value;
5520 for (;;)
5521 {
5522 if (c < *cp)
5523 { if (prop_fail_result) break; else goto GOT_MAX; }
5524 if (c == *cp++)
5525 { if (prop_fail_result) goto GOT_MAX; else break; }
5526 }
5527 eptr += len;
5528 }
5529 GOT_MAX:
5530 break;
5531
5532 case PT_UCNC:
5533 for (i = min; i < max; i++)
5534 {
5535 int len = 1;
5536 if (eptr >= md->end_subject)
5537 {
5538 SCHECK_PARTIAL();
5539 break;
5540 }
5541 GETCHARLENTEST(c, eptr, len);
5542 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5543 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5544 c >= 0xe000) == prop_fail_result)
5545 break;
5546 eptr += len;
5547 }
5548 break;
5549
5550 default:
5551 RRETURN(PCRE_ERROR_INTERNAL);
5552 }
5553
5554 /* eptr is now past the end of the maximum run */
5555
5556 if (possessive) continue;
5557 for(;;)
5558 {
5559 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5561 if (eptr-- == pp) break; /* Stop if tried at original pos */
5562 if (utf) BACKCHAR(eptr);
5563 }
5564 }
5565
5566 /* Match extended Unicode sequences. We will get here only if the
5567 support is in the binary; otherwise a compile-time error occurs. */
5568
5569 else if (ctype == OP_EXTUNI)
5570 {
5571 for (i = min; i < max; i++)
5572 {
5573 if (eptr >= md->end_subject)
5574 {
5575 SCHECK_PARTIAL();
5576 break;
5577 }
5578 else
5579 {
5580 int lgb, rgb;
5581 GETCHARINCTEST(c, eptr);
5582 lgb = UCD_GRAPHBREAK(c);
5583 while (eptr < md->end_subject)
5584 {
5585 int len = 1;
5586 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5587 rgb = UCD_GRAPHBREAK(c);
5588 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5589 lgb = rgb;
5590 eptr += len;
5591 }
5592 }
5593 CHECK_PARTIAL();
5594 }
5595
5596 /* eptr is now past the end of the maximum run */
5597
5598 if (possessive) continue;
5599
5600 for(;;)
5601 {
5602 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5603 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5604 if (eptr-- == pp) break; /* Stop if tried at original pos */
5605 for (;;) /* Move back over one extended */
5606 {
5607 if (!utf) c = *eptr; else
5608 {
5609 BACKCHAR(eptr);
5610 GETCHAR(c, eptr);
5611 }
5612 if (UCD_CATEGORY(c) != ucp_M) break;
5613 eptr--;
5614 }
5615 }
5616 }
5617
5618 else
5619 #endif /* SUPPORT_UCP */
5620
5621 #ifdef SUPPORT_UTF
5622 if (utf)
5623 {
5624 switch(ctype)
5625 {
5626 case OP_ANY:
5627 if (max < INT_MAX)
5628 {
5629 for (i = min; i < max; i++)
5630 {
5631 if (eptr >= md->end_subject)
5632 {
5633 SCHECK_PARTIAL();
5634 break;
5635 }
5636 if (IS_NEWLINE(eptr)) break;
5637 if (md->partial != 0 && /* Take care with CRLF partial */
5638 eptr + 1 >= md->end_subject &&
5639 NLBLOCK->nltype == NLTYPE_FIXED &&
5640 NLBLOCK->nllen == 2 &&
5641 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5642 {
5643 md->hitend = TRUE;
5644 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5645 }
5646 eptr++;
5647 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5648 }
5649 }
5650
5651 /* Handle unlimited UTF-8 repeat */
5652
5653 else
5654 {
5655 for (i = min; i < max; i++)
5656 {
5657 if (eptr >= md->end_subject)
5658 {
5659 SCHECK_PARTIAL();
5660 break;
5661 }
5662 if (IS_NEWLINE(eptr)) break;
5663 if (md->partial != 0 && /* Take care with CRLF partial */
5664 eptr + 1 >= md->end_subject &&
5665 NLBLOCK->nltype == NLTYPE_FIXED &&
5666 NLBLOCK->nllen == 2 &&
5667 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5668 {
5669 md->hitend = TRUE;
5670 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5671 }
5672 eptr++;
5673 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5674 }
5675 }
5676 break;
5677
5678 case OP_ALLANY:
5679 if (max < INT_MAX)
5680 {
5681 for (i = min; i < max; i++)
5682 {
5683 if (eptr >= md->end_subject)
5684 {
5685 SCHECK_PARTIAL();
5686 break;
5687 }
5688 eptr++;
5689 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5690 }
5691 }
5692 else
5693 {
5694 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5695 SCHECK_PARTIAL();
5696 }
5697 break;
5698
5699 /* The byte case is the same as in non-UTF mode */
5700
5701 case OP_ANYBYTE:
5702 c = max - min;
5703 if (c > (unsigned int)(md->end_subject - eptr))
5704 {
5705 eptr = md->end_subject;
5706 SCHECK_PARTIAL();
5707 }
5708 else eptr += c;
5709 break;
5710
5711 case OP_ANYNL:
5712 for (i = min; i < max; i++)
5713 {
5714 int len = 1;
5715 if (eptr >= md->end_subject)
5716 {
5717 SCHECK_PARTIAL();
5718 break;
5719 }
5720 GETCHARLEN(c, eptr, len);
5721 if (c == CHAR_CR)
5722 {
5723 if (++eptr >= md->end_subject) break;
5724 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5725 }
5726 else
5727 {
5728 if (c != CHAR_LF &&
5729 (md->bsr_anycrlf ||
5730 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5731 #ifndef EBCDIC
5732 && c != 0x2028 && c != 0x2029
5733 #endif /* Not EBCDIC */
5734 )))
5735 break;
5736 eptr += len;
5737 }
5738 }
5739 break;
5740
5741 case OP_NOT_HSPACE:
5742 case OP_HSPACE:
5743 for (i = min; i < max; i++)
5744 {
5745 BOOL gotspace;
5746 int len = 1;
5747 if (eptr >= md->end_subject)
5748 {
5749 SCHECK_PARTIAL();
5750 break;
5751 }
5752 GETCHARLEN(c, eptr, len);
5753 switch(c)
5754 {
5755 HSPACE_CASES: gotspace = TRUE; break;
5756 default: gotspace = FALSE; break;
5757 }
5758 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5759 eptr += len;
5760 }
5761 break;
5762
5763 case OP_NOT_VSPACE:
5764 case OP_VSPACE:
5765 for (i = min; i < max; i++)
5766 {
5767 BOOL gotspace;
5768 int len = 1;
5769 if (eptr >= md->end_subject)
5770 {
5771 SCHECK_PARTIAL();
5772 break;
5773 }
5774 GETCHARLEN(c, eptr, len);
5775 switch(c)
5776 {
5777 VSPACE_CASES: gotspace = TRUE; break;
5778 default: gotspace = FALSE; break;
5779 }
5780 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5781 eptr += len;
5782 }
5783 break;
5784
5785 case OP_NOT_DIGIT:
5786 for (i = min; i < max; i++)
5787 {
5788 int len = 1;
5789 if (eptr >= md->end_subject)
5790 {
5791 SCHECK_PARTIAL();
5792 break;
5793 }
5794 GETCHARLEN(c, eptr, len);
5795 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5796 eptr+= len;
5797 }
5798 break;
5799
5800 case OP_DIGIT:
5801 for (i = min; i < max; i++)
5802 {
5803 int len = 1;
5804 if (eptr >= md->end_subject)
5805 {
5806 SCHECK_PARTIAL();
5807 break;
5808 }
5809 GETCHARLEN(c, eptr, len);
5810 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5811 eptr+= len;
5812 }
5813 break;
5814
5815 case OP_NOT_WHITESPACE:
5816 for (i = min; i < max; i++)
5817 {
5818 int len = 1;
5819 if (eptr >= md->end_subject)
5820 {
5821 SCHECK_PARTIAL();
5822 break;
5823 }
5824 GETCHARLEN(c, eptr, len);
5825 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5826 eptr+= len;
5827 }
5828 break;
5829
5830 case OP_WHITESPACE:
5831 for (i = min; i < max; i++)
5832 {
5833 int len = 1;
5834 if (eptr >= md->end_subject)
5835 {
5836 SCHECK_PARTIAL();
5837 break;
5838 }
5839 GETCHARLEN(c, eptr, len);
5840 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5841 eptr+= len;
5842 }
5843 break;
5844
5845 case OP_NOT_WORDCHAR:
5846 for (i = min; i < max; i++)
5847 {
5848 int len = 1;
5849 if (eptr >= md->end_subject)
5850 {
5851 SCHECK_PARTIAL();
5852 break;
5853 }
5854 GETCHARLEN(c, eptr, len);
5855 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5856 eptr+= len;
5857 }
5858 break;
5859
5860 case OP_WORDCHAR:
5861 for (i = min; i < max; i++)
5862 {
5863 int len = 1;
5864 if (eptr >= md->end_subject)
5865 {
5866 SCHECK_PARTIAL();
5867 break;
5868 }
5869 GETCHARLEN(c, eptr, len);
5870 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5871 eptr+= len;
5872 }
5873 break;
5874
5875 default:
5876 RRETURN(PCRE_ERROR_INTERNAL);
5877 }
5878
5879 /* eptr is now past the end of the maximum run. If possessive, we are
5880 done (no backing up). Otherwise, match at this position; anything other
5881 than no match is immediately returned. For nomatch, back up one
5882 character, unless we are matching \R and the last thing matched was
5883 \r\n, in which case, back up two bytes. */
5884
5885 if (possessive) continue;
5886 for(;;)
5887 {
5888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5890 if (eptr-- == pp) break; /* Stop if tried at original pos */
5891 BACKCHAR(eptr);
5892 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5893 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5894 }
5895 }
5896 else
5897 #endif /* SUPPORT_UTF */
5898 /* Not UTF mode */
5899 {
5900 switch(ctype)
5901 {
5902 case OP_ANY:
5903 for (i = min; i < max; i++)
5904 {
5905 if (eptr >= md->end_subject)
5906 {
5907 SCHECK_PARTIAL();
5908 break;
5909 }
5910 if (IS_NEWLINE(eptr)) break;
5911 if (md->partial != 0 && /* Take care with CRLF partial */
5912 eptr + 1 >= md->end_subject &&
5913 NLBLOCK->nltype == NLTYPE_FIXED &&
5914 NLBLOCK->nllen == 2 &&
5915 *eptr == NLBLOCK->nl[0])
5916 {
5917 md->hitend = TRUE;
5918 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5919 }
5920 eptr++;
5921 }
5922 break;
5923
5924 case OP_ALLANY:
5925 case OP_ANYBYTE:
5926 c = max - min;
5927 if (c > (unsigned int)(md->end_subject - eptr))
5928 {
5929 eptr = md->end_subject;
5930 SCHECK_PARTIAL();
5931 }
5932 else eptr += c;
5933 break;
5934
5935 case OP_ANYNL:
5936 for (i = min; i < max; i++)
5937 {
5938 if (eptr >= md->end_subject)
5939 {
5940 SCHECK_PARTIAL();
5941 break;
5942 }
5943 c = *eptr;
5944 if (c == CHAR_CR)
5945 {
5946 if (++eptr >= md->end_subject) break;
5947 if (*eptr == CHAR_LF) eptr++;
5948 }
5949 else
5950 {
5951 if (c != CHAR_LF && (md->bsr_anycrlf ||
5952 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5953 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5954 && c != 0x2028 && c != 0x2029
5955 #endif
5956 ))) break;
5957 eptr++;
5958 }
5959 }
5960 break;
5961
5962 case OP_NOT_HSPACE:
5963 for (i = min; i < max; i++)
5964 {
5965 if (eptr >= md->end_subject)
5966 {
5967 SCHECK_PARTIAL();
5968 break;
5969 }
5970 switch(*eptr)
5971 {
5972 default: eptr++; break;
5973 HSPACE_BYTE_CASES:
5974 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5975 HSPACE_MULTIBYTE_CASES:
5976 #endif
5977 goto ENDLOOP00;
5978 }
5979 }
5980 ENDLOOP00:
5981 break;
5982
5983 case OP_HSPACE:
5984 for (i = min; i < max; i++)
5985 {
5986 if (eptr >= md->end_subject)
5987 {
5988 SCHECK_PARTIAL();
5989 break;
5990 }
5991 switch(*eptr)
5992 {
5993 default: goto ENDLOOP01;
5994 HSPACE_BYTE_CASES:
5995 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5996 HSPACE_MULTIBYTE_CASES:
5997 #endif
5998 eptr++; break;
5999 }
6000 }
6001 ENDLOOP01:
6002 break;
6003
6004 case OP_NOT_VSPACE:
6005 for (i = min; i < max; i++)
6006 {
6007 if (eptr >= md->end_subject)
6008 {
6009 SCHECK_PARTIAL();
6010 break;
6011 }
6012 switch(*eptr)
6013 {
6014 default: eptr++; break;
6015 VSPACE_BYTE_CASES:
6016 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6017 VSPACE_MULTIBYTE_CASES:
6018 #endif
6019 goto ENDLOOP02;
6020 }
6021 }
6022 ENDLOOP02:
6023 break;
6024
6025 case OP_VSPACE:
6026 for (i = min; i < max; i++)
6027 {
6028 if (eptr >= md->end_subject)
6029 {
6030 SCHECK_PARTIAL();
6031 break;
6032 }
6033 switch(*eptr)
6034 {
6035 default: goto ENDLOOP03;
6036 VSPACE_BYTE_CASES:
6037 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6038 VSPACE_MULTIBYTE_CASES:
6039 #endif
6040 eptr++; break;
6041 }
6042 }
6043 ENDLOOP03:
6044 break;
6045
6046 case OP_NOT_DIGIT:
6047 for (i = min; i < max; i++)
6048 {
6049 if (eptr >= md->end_subject)
6050 {
6051 SCHECK_PARTIAL();
6052 break;
6053 }
6054 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6055 eptr++;
6056 }
6057 break;
6058
6059 case OP_DIGIT:
6060 for (i = min; i < max; i++)
6061 {
6062 if (eptr >= md->end_subject)
6063 {
6064 SCHECK_PARTIAL();
6065 break;
6066 }
6067 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6068 eptr++;
6069 }
6070 break;
6071
6072 case OP_NOT_WHITESPACE:
6073 for (i = min; i < max; i++)
6074 {
6075 if (eptr >= md->end_subject)
6076 {
6077 SCHECK_PARTIAL();
6078 break;
6079 }
6080 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6081 eptr++;
6082 }
6083 break;
6084
6085 case OP_WHITESPACE:
6086 for (i = min; i < max; i++)
6087 {
6088 if (eptr >= md->end_subject)
6089 {
6090 SCHECK_PARTIAL();
6091 break;
6092 }
6093 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6094 eptr++;
6095 }
6096 break;
6097
6098 case OP_NOT_WORDCHAR:
6099 for (i = min; i < max; i++)
6100 {
6101 if (eptr >= md->end_subject)
6102 {
6103 SCHECK_PARTIAL();
6104 break;
6105 }
6106 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6107 eptr++;
6108 }
6109 break;
6110
6111 case OP_WORDCHAR:
6112 for (i = min; i < max; i++)
6113 {
6114 if (eptr >= md->end_subject)
6115 {
6116 SCHECK_PARTIAL();
6117 break;
6118 }
6119 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6120 eptr++;
6121 }
6122 break;
6123
6124 default:
6125 RRETURN(PCRE_ERROR_INTERNAL);
6126 }
6127
6128 /* eptr is now past the end of the maximum run. If possessive, we are
6129 done (no backing up). Otherwise, match at this position; anything other
6130 than no match is immediately returned. For nomatch, back up one
6131 character (byte), unless we are matching \R and the last thing matched
6132 was \r\n, in which case, back up two bytes. */
6133
6134 if (possessive) continue;
6135 while (eptr >= pp)
6136 {
6137 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6138 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6139 eptr--;
6140 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6141 eptr[-1] == CHAR_CR) eptr--;
6142 }
6143 }
6144
6145 /* Get here if we can't make it match with any permitted repetitions */
6146
6147 RRETURN(MATCH_NOMATCH);
6148 }
6149 /* Control never gets here */
6150
6151 /* There's been some horrible disaster. Arrival here can only mean there is
6152 something seriously wrong in the code above or the OP_xxx definitions. */
6153
6154 default:
6155 DPRINTF(("Unknown opcode %d\n", *ecode));
6156 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6157 }
6158
6159 /* Do not stick any code in here without much thought; it is assumed
6160 that "continue" in the code above comes out to here to repeat the main
6161 loop. */
6162
6163 } /* End of main loop */
6164 /* Control never reaches here */
6165
6166
6167 /* When compiling to use the heap rather than the stack for recursive calls to
6168 match(), the RRETURN() macro jumps here. The number that is saved in
6169 frame->Xwhere indicates which label we actually want to return to. */
6170
6171 #ifdef NO_RECURSE
6172 #define LBL(val) case val: goto L_RM##val;
6173 HEAP_RETURN:
6174 switch (frame->Xwhere)
6175 {
6176 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6177 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6178 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6179 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6180 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6181 LBL(65) LBL(66)
6182 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6183 LBL(21)
6184 #endif
6185 #ifdef SUPPORT_UTF
6186 LBL(16) LBL(18) LBL(20)
6187 LBL(22) LBL(23) LBL(28) LBL(30)
6188 LBL(32) LBL(34) LBL(42) LBL(46)
6189 #ifdef SUPPORT_UCP
6190 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6191 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6192 #endif /* SUPPORT_UCP */
6193 #endif /* SUPPORT_UTF */
6194 default:
6195 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6196 return PCRE_ERROR_INTERNAL;
6197 }
6198 #undef LBL
6199 #endif /* NO_RECURSE */
6200 }
6201
6202
6203 /***************************************************************************
6204 ****************************************************************************
6205 RECURSION IN THE match() FUNCTION
6206
6207 Undefine all the macros that were defined above to handle this. */
6208
6209 #ifdef NO_RECURSE
6210 #undef eptr
6211 #undef ecode
6212 #undef mstart
6213 #undef offset_top
6214 #undef eptrb
6215 #undef flags
6216
6217 #undef callpat
6218 #undef charptr
6219 #undef data
6220 #undef next
6221 #undef pp
6222 #undef prev
6223 #undef saved_eptr
6224
6225 #undef new_recursive
6226
6227 #undef cur_is_word
6228 #undef condition
6229 #undef prev_is_word
6230
6231 #undef ctype
6232 #undef length
6233 #undef max
6234 #undef min
6235 #undef number
6236 #undef offset
6237 #undef op
6238 #undef save_capture_last
6239 #undef save_offset1
6240 #undef save_offset2
6241 #undef save_offset3
6242 #undef stacksave
6243
6244 #undef newptrb
6245
6246 #endif
6247
6248 /* These two are defined as macros in both cases */
6249
6250 #undef fc
6251 #undef fi
6252
6253 /***************************************************************************
6254 ***************************************************************************/
6255
6256
6257 #ifdef NO_RECURSE
6258 /*************************************************
6259 * Release allocated heap frames *
6260 *************************************************/
6261
6262 /* This function releases all the allocated frames. The base frame is on the
6263 machine stack, and so must not be freed.
6264
6265 Argument: the address of the base frame
6266 Returns: nothing
6267 */
6268
6269 static void
6270 release_match_heapframes (heapframe *frame_base)
6271 {
6272 heapframe *nextframe = frame_base->Xnextframe;
6273 while (nextframe != NULL)
6274 {
6275 heapframe *oldframe = nextframe;
6276 nextframe = nextframe->Xnextframe;
6277 (PUBL(stack_free))(oldframe);
6278 }
6279 }
6280 #endif
6281
6282
6283 /*************************************************
6284 * Execute a Regular Expression *
6285 *************************************************/
6286
6287 /* This function applies a compiled re to a subject string and picks out
6288 portions of the string if it matches. Two elements in the vector are set for
6289 each substring: the offsets to the start and end of the substring.
6290
6291 Arguments:
6292 argument_re points to the compiled expression
6293 extra_data points to extra data or is NULL
6294 subject points to the subject string
6295 length length of subject string (may contain binary zeros)
6296 start_offset where to start in the subject string
6297 options option bits
6298 offsets points to a vector of ints to be filled in with offsets
6299 offsetcount the number of elements in the vector
6300
6301 Returns: > 0 => success; value is the number of elements filled in
6302 = 0 => success, but offsets is not big enough
6303 -1 => failed to match
6304 < -1 => some kind of unexpected problem
6305 */
6306
6307 #if defined COMPILE_PCRE8
6308 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6309 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6310 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6311 int offsetcount)
6312 #elif defined COMPILE_PCRE16
6313 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6314 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6315 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6316 int offsetcount)
6317 #elif defined COMPILE_PCRE32
6318 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6319 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6320 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6321 int offsetcount)
6322 #endif
6323 {
6324 int rc, ocount, arg_offset_max;
6325 int newline;
6326 BOOL using_temporary_offsets = FALSE;
6327 BOOL anchored;
6328 BOOL startline;
6329 BOOL firstline;
6330 BOOL utf;
6331 BOOL has_first_char = FALSE;
6332 BOOL has_req_char = FALSE;
6333 pcre_uchar first_char = 0;
6334 pcre_uchar first_char2 = 0;
6335 pcre_uchar req_char = 0;
6336 pcre_uchar req_char2 = 0;
6337 match_data match_block;
6338 match_data *md = &match_block;
6339 const pcre_uint8 *tables;
6340 const pcre_uint8 *start_bits = NULL;
6341 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6342 PCRE_PUCHAR end_subject;
6343 PCRE_PUCHAR start_partial = NULL;
6344 PCRE_PUCHAR match_partial;
6345 PCRE_PUCHAR req_char_ptr = start_match - 1;
6346
6347 const pcre_study_data *study;
6348 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6349
6350 #ifdef NO_RECURSE
6351 heapframe frame_zero;
6352 frame_zero.Xprevframe = NULL; /* Marks the top level */
6353 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6354 md->match_frames_base = &frame_zero;
6355 #endif
6356
6357 /* Check for the special magic call that measures the size of the stack used
6358 per recursive call of match(). Without the funny casting for sizeof, a Windows
6359 compiler gave this error: "unary minus operator applied to unsigned type,
6360 result still unsigned". Hopefully the cast fixes that. */
6361
6362 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6363 start_offset == -999)
6364 #ifdef NO_RECURSE
6365 return -((int)sizeof(heapframe));
6366 #else
6367 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6368 #endif
6369
6370 /* Plausibility checks */
6371
6372 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6373 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6374 return PCRE_ERROR_NULL;
6375 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6376 if (length < 0) return PCRE_ERROR_BADLENGTH;
6377 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6378
6379 /* Check that the first field in the block is the magic number. If it is not,
6380 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6381 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6382 means that the pattern is likely compiled with different endianness. */
6383
6384 if (re->magic_number != MAGIC_NUMBER)
6385 return re->magic_number == REVERSED_MAGIC_NUMBER?
6386 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6387 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6388
6389 /* These two settings are used in the code for checking a UTF-8 string that
6390 follows immediately afterwards. Other values in the md block are used only
6391 during "normal" pcre_exec() processing, not when the JIT support is in use,
6392 so they are set up later. */
6393
6394 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6395 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6396 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6397 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6398
6399 /* Check a UTF-8 string if required. Pass back the character offset and error
6400 code for an invalid string if a results vector is available. */
6401
6402 #ifdef SUPPORT_UTF
6403 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6404 {
6405 int erroroffset;
6406 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6407 if (errorcode != 0)
6408 {
6409 if (offsetcount >= 2)
6410 {
6411 offsets[0] = erroroffset;
6412 offsets[1] = errorcode;
6413 }
6414 #if defined COMPILE_PCRE8
6415 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6416 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6417 #elif defined COMPILE_PCRE16
6418 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6419 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6420 #elif defined COMPILE_PCRE32
6421 return PCRE_ERROR_BADUTF32;
6422 #endif
6423 }
6424 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6425 /* Check that a start_offset points to the start of a UTF character. */
6426 if (start_offset > 0 && start_offset < length &&
6427 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6428 return PCRE_ERROR_BADUTF8_OFFSET;
6429 #endif
6430 }
6431 #endif
6432
6433 /* If the pattern was successfully studied with JIT support, run the JIT
6434 executable instead of the rest of this function. Most options must be set at
6435 compile time for the JIT code to be usable. Fall back to the normal code path if
6436 an unsupported flag is set. */
6437
6438 #ifdef SUPPORT_JIT
6439 if (extra_data != NULL
6440 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6441 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6442 && extra_data->executable_jit != NULL
6443 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6444 {
6445 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6446 start_offset, options, offsets, offsetcount);
6447
6448 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6449 mode is not compiled. In this case we simply fall back to the interpreter. */
6450
6451 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6452 }
6453 #endif
6454
6455 /* Carry on with non-JIT matching. This information is for finding all the
6456 numbers associated with a given name, for condition testing. */
6457
6458 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6459 md->name_count = re->name_count;
6460 md->name_entry_size = re->name_entry_size;
6461
6462 /* Fish out the optional data from the extra_data structure, first setting
6463 the default values. */
6464
6465 study = NULL;
6466 md->match_limit = MATCH_LIMIT;
6467 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6468 md->callout_data = NULL;
6469
6470 /* The table pointer is always in native byte order. */
6471
6472 tables = re->tables;
6473
6474 if (extra_data != NULL)
6475 {
6476 register unsigned int flags = extra_data->flags;
6477 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6478 study = (const pcre_study_data *)extra_data->study_data;
6479 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6480 md->match_limit = extra_data->match_limit;
6481 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6482 md->match_limit_recursion = extra_data->match_limit_recursion;
6483 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6484 md->callout_data = extra_data->callout_data;
6485 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6486 }
6487
6488 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6489 is a feature that makes it possible to save compiled regex and re-use them
6490 in other programs later. */
6491
6492 if (tables == NULL) tables = PRIV(default_tables);
6493
6494 /* Set up other data */
6495
6496 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6497 startline = (re->flags & PCRE_STARTLINE) != 0;
6498 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6499
6500 /* The code starts after the real_pcre block and the capture name table. */
6501
6502 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6503 re->name_count * re->name_entry_size;
6504
6505 md->start_subject = (PCRE_PUCHAR)subject;
6506 md->start_offset = start_offset;
6507 md->end_subject = md->start_subject + length;
6508 end_subject = md->end_subject;
6509
6510 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6511 md->use_ucp = (re->options & PCRE_UCP) != 0;
6512 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6513 md->ignore_skip_arg = FALSE;
6514
6515 /* Some options are unpacked into BOOL variables in the hope that testing
6516 them will be faster than individual option bits. */
6517
6518 md->notbol = (options & PCRE_NOTBOL) != 0;
6519 md->noteol = (options & PCRE_NOTEOL) != 0;
6520 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6521 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6522
6523 md->hitend = FALSE;
6524 md->mark = md->nomatch_mark = NULL; /* In case never set */
6525
6526 md->recursive = NULL; /* No recursion at top level */
6527 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6528
6529 md->lcc = tables + lcc_offset;
6530 md->fcc = tables + fcc_offset;
6531 md->ctypes = tables + ctypes_offset;
6532
6533 /* Handle different \R options. */
6534
6535 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6536 {
6537 case 0:
6538 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6539 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6540 else
6541 #ifdef BSR_ANYCRLF
6542 md->bsr_anycrlf = TRUE;
6543 #else
6544 md->bsr_anycrlf = FALSE;
6545 #endif
6546 break;
6547
6548 case PCRE_BSR_ANYCRLF:
6549 md->bsr_anycrlf = TRUE;
6550 break;
6551
6552 case PCRE_BSR_UNICODE:
6553 md->bsr_anycrlf = FALSE;
6554 break;
6555
6556 default: return PCRE_ERROR_BADNEWLINE;
6557 }
6558
6559 /* Handle different types of newline. The three bits give eight cases. If
6560 nothing is set at run time, whatever was used at compile time applies. */
6561
6562 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6563 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6564 {
6565 case 0: newline = NEWLINE; break; /* Compile-time default */
6566 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6567 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6568 case PCRE_NEWLINE_CR+
6569 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6570 case PCRE_NEWLINE_ANY: newline = -1; break;
6571 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6572 default: return PCRE_ERROR_BADNEWLINE;
6573 }
6574
6575 if (newline == -2)
6576 {
6577 md->nltype = NLTYPE_ANYCRLF;
6578 }
6579 else if (newline < 0)
6580 {
6581 md->nltype = NLTYPE_ANY;
6582 }
6583 else
6584 {
6585 md->nltype = NLTYPE_FIXED;
6586 if (newline > 255)
6587 {
6588 md->nllen = 2;
6589 md->nl[0] = (newline >> 8) & 255;
6590 md->nl[1] = newline & 255;
6591 }
6592 else
6593 {
6594 md->nllen = 1;
6595 md->nl[0] = newline;
6596 }
6597 }
6598
6599 /* Partial matching was originally supported only for a restricted set of
6600 regexes; from release 8.00 there are no restrictions, but the bits are still
6601 defined (though never set). So there's no harm in leaving this code. */
6602
6603 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6604 return PCRE_ERROR_BADPARTIAL;
6605
6606 /* If the expression has got more back references than the offsets supplied can
6607 hold, we get a temporary chunk of working store to use during the matching.
6608 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6609 of 3. */
6610
6611 ocount = offsetcount - (offsetcount % 3);
6612 arg_offset_max = (2*ocount)/3;
6613
6614 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6615 {
6616 ocount = re->top_backref * 3 + 3;
6617 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6618 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6619 using_temporary_offsets = TRUE;
6620 DPRINTF(("Got memory to hold back references\n"));
6621 }
6622 else md->offset_vector = offsets;
6623 md->offset_end = ocount;
6624 md->offset_max = (2*ocount)/3;
6625 md->capture_last = 0;
6626
6627 /* Reset the working variable associated with each extraction. These should
6628 never be used unless previously set, but they get saved and restored, and so we
6629 initialize them to avoid reading uninitialized locations. Also, unset the
6630 offsets for the matched string. This is really just for tidiness with callouts,
6631 in case they inspect these fields. */
6632
6633 if (md->offset_vector != NULL)
6634 {
6635 register int *iptr = md->offset_vector + ocount;
6636 register int *iend = iptr - re->top_bracket;
6637 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6638 while (--iptr >= iend) *iptr = -1;
6639 md->offset_vector[0] = md->offset_vector[1] = -1;
6640 }
6641
6642 /* Set up the first character to match, if available. The first_char value is
6643 never set for an anchored regular expression, but the anchoring may be forced
6644 at run time, so we have to test for anchoring. The first char may be unset for
6645 an unanchored pattern, of course. If there's no first char and the pattern was
6646 studied, there may be a bitmap of possible first characters. */
6647
6648 if (!anchored)
6649 {
6650 if ((re->flags & PCRE_FIRSTSET) != 0)
6651 {
6652 has_first_char = TRUE;
6653 first_char = first_char2 = (pcre_uchar)(re->first_char);
6654 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6655 {
6656 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6657 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6658 if (utf && first_char > 127)
6659 first_char2 = UCD_OTHERCASE(first_char);
6660 #endif
6661 }
6662 }
6663 else
6664 if (!startline && study != NULL &&
6665 (study->flags & PCRE_STUDY_MAPPED) != 0)
6666 start_bits = study->start_bits;
6667 }
6668
6669 /* For anchored or unanchored matches, there may be a "last known required
6670 character" set. */
6671
6672 if ((re->flags & PCRE_REQCHSET) != 0)
6673 {
6674 has_req_char = TRUE;
6675 req_char = req_char2 = (pcre_uchar)(re->req_char);
6676 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6677 {
6678 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6679 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6680 if (utf && req_char > 127)
6681 req_char2 = UCD_OTHERCASE(req_char);
6682 #endif
6683 }
6684 }
6685
6686
6687 /* ==========================================================================*/
6688
6689 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6690 the loop runs just once. */
6691
6692 for(;;)
6693 {
6694 PCRE_PUCHAR save_end_subject = end_subject;
6695 PCRE_PUCHAR new_start_match;
6696
6697 /* If firstline is TRUE, the start of the match is constrained to the first
6698 line of a multiline string. That is, the match must be before or at the first
6699 newline. Implement this by temporarily adjusting end_subject so that we stop
6700 scanning at a newline. If the match fails at the newline, later code breaks
6701 this loop. */
6702
6703 if (firstline)
6704 {
6705 PCRE_PUCHAR t = start_match;
6706 #ifdef SUPPORT_UTF
6707 if (utf)
6708 {
6709 while (t < md->end_subject && !IS_NEWLINE(t))
6710 {
6711 t++;
6712 ACROSSCHAR(t < end_subject, *t, t++);
6713 }
6714 }
6715 else
6716 #endif
6717 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6718 end_subject = t;
6719 }
6720
6721 /* There are some optimizations that avoid running the match if a known
6722 starting point is not found, or if a known later character is not present.
6723 However, there is an option that disables these, for testing and for ensuring
6724 that all callouts do actually occur. The option can be set in the regex by
6725 (*NO_START_OPT) or passed in match-time options. */
6726
6727 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6728 {
6729 /* Advance to a unique first char if there is one. */
6730
6731 if (has_first_char)
6732 {
6733 pcre_uchar smc;
6734
6735 if (first_char != first_char2)
6736 while (start_match < end_subject &&
6737 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6738 start_match++;
6739 else
6740 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6741 start_match++;
6742 }
6743
6744 /* Or to just after a linebreak for a multiline match */
6745
6746 else if (startline)
6747 {
6748 if (start_match > md->start_subject + start_offset)
6749 {
6750 #ifdef SUPPORT_UTF
6751 if (utf)
6752 {
6753 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6754 {
6755 start_match++;
6756 ACROSSCHAR(start_match < end_subject, *start_match,
6757 start_match++);
6758 }
6759 }