/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1298 - (show annotations)
Fri Mar 22 16:13:13 2013 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 217104 byte(s)
Fix COMMIT in recursion; document backtracking verbs in assertions and 
subroutines.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. */
94 #define MATCH_COMMIT (-996)
95 #define MATCH_PRUNE (-995)
96 #define MATCH_SKIP (-994)
97 #define MATCH_SKIP_ARG (-993)
98 #define MATCH_THEN (-992)
99 #define MATCH_BACKTRACK_MAX MATCH_THEN
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
109
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #ifdef SUPPORT_UTF
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #ifdef SUPPORT_UTF
199 #ifdef SUPPORT_UCP
200 if (utf)
201 {
202 /* Match characters up to the end of the reference. NOTE: the number of
203 data units matched may differ, because in UTF-8 there are some characters
204 whose upper and lower case versions code have different numbers of bytes.
205 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
206 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
207 sequence of two of the latter. It is important, therefore, to check the
208 length along the reference, not along the subject (earlier code did this
209 wrong). */
210
211 PCRE_PUCHAR endptr = p + length;
212 while (p < endptr)
213 {
214 pcre_uint32 c, d;
215 const ucd_record *ur;
216 if (eptr >= md->end_subject) return -2; /* Partial match */
217 GETCHARINC(c, eptr);
218 GETCHARINC(d, p);
219 ur = GET_UCD(d);
220 if (c != d && c != d + ur->other_case)
221 {
222 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
223 for (;;)
224 {
225 if (c < *pp) return -1;
226 if (c == *pp++) break;
227 }
228 }
229 }
230 }
231 else
232 #endif
233 #endif
234
235 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
236 is no UCP support. */
237 {
238 while (length-- > 0)
239 {
240 pcre_uint32 cc, cp;
241 if (eptr >= md->end_subject) return -2; /* Partial match */
242 cc = RAWUCHARTEST(eptr);
243 cp = RAWUCHARTEST(p);
244 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
245 p++;
246 eptr++;
247 }
248 }
249 }
250
251 /* In the caseful case, we can just compare the bytes, whether or not we
252 are in UTF-8 mode. */
253
254 else
255 {
256 while (length-- > 0)
257 {
258 if (eptr >= md->end_subject) return -2; /* Partial match */
259 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
260 }
261 }
262
263 return (int)(eptr - eptr_start);
264 }
265
266
267
268 /***************************************************************************
269 ****************************************************************************
270 RECURSION IN THE match() FUNCTION
271
272 The match() function is highly recursive, though not every recursive call
273 increases the recursive depth. Nevertheless, some regular expressions can cause
274 it to recurse to a great depth. I was writing for Unix, so I just let it call
275 itself recursively. This uses the stack for saving everything that has to be
276 saved for a recursive call. On Unix, the stack can be large, and this works
277 fine.
278
279 It turns out that on some non-Unix-like systems there are problems with
280 programs that use a lot of stack. (This despite the fact that every last chip
281 has oodles of memory these days, and techniques for extending the stack have
282 been known for decades.) So....
283
284 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
285 calls by keeping local variables that need to be preserved in blocks of memory
286 obtained from malloc() instead of on the stack. Macros are used to
287 achieve this so that the actual code doesn't look very different to what it
288 always used to.
289
290 The original heap-recursive code used longjmp(). However, it seems that this
291 can be very slow on some operating systems. Following a suggestion from Stan
292 Switzer, the use of longjmp() has been abolished, at the cost of having to
293 provide a unique number for each call to RMATCH. There is no way of generating
294 a sequence of numbers at compile time in C. I have given them names, to make
295 them stand out more clearly.
296
297 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
298 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
299 tests. Furthermore, not using longjmp() means that local dynamic variables
300 don't have indeterminate values; this has meant that the frame size can be
301 reduced because the result can be "passed back" by straight setting of the
302 variable instead of being passed in the frame.
303 ****************************************************************************
304 ***************************************************************************/
305
306 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
307 below must be updated in sync. */
308
309 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
310 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
311 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
312 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
313 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
314 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
315 RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
316
317 /* These versions of the macros use the stack, as normal. There are debugging
318 versions and production versions. Note that the "rw" argument of RMATCH isn't
319 actually used in this definition. */
320
321 #ifndef NO_RECURSE
322 #define REGISTER register
323
/* Stack-based RMATCH and RRETURN. RMATCH makes a real recursive call of
match(), handing on the caller's mstart and rdepth+1, and leaves the result in
rrc. RRETURN is an ordinary return. When PCRE_DEBUG is defined, each call and
each return is additionally traced with printf(). */

#ifndef PCRE_DEBUG

#define RRETURN(ra) return ra
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)

#else   /* PCRE_DEBUG */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#endif  /* PCRE_DEBUG */
341
342 #else
343
344
345 /* These versions of the macros manage a private stack on the heap. Note that
346 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
347 argument of match(), which never changes. */
348
349 #define REGISTER
350
/* Heap-based RMATCH: "call" match() without using the C stack.

The frame for the next level is taken from frame->Xnextframe if one is already
chained on; otherwise a new one is obtained via PUBL(stack_malloc) and linked
in (failure to get memory is reported with RRETURN(PCRE_ERROR_NOMEMORY)). The
return point rw is recorded in the current frame, the argument values are
copied into the new frame, the new frame becomes current, and control jumps to
HEAP_RECURSE. The label L_<rw> is where execution resumes afterwards. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;                      /* Where to come back to */\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Heap-based RRETURN: step back to the previous frame. If there is none, this
was the outermost level, so really return from match(); otherwise pass the
result back in rrc and jump to HEAP_RETURN. Note that ra is evaluated only
after the frame pointer has been moved back. */

#define RRETURN(ra)\
  {\
  frame = frame->Xprevframe;\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
387
388
389 /* Structure for remembering the local variables in a private frame */
390
391 typedef struct heapframe {
392 struct heapframe *Xprevframe;
393 struct heapframe *Xnextframe;
394
395 /* Function arguments that may change */
396
397 PCRE_PUCHAR Xeptr;
398 const pcre_uchar *Xecode;
399 PCRE_PUCHAR Xmstart;
400 int Xoffset_top;
401 eptrblock *Xeptrb;
402 unsigned int Xrdepth;
403
404 /* Function local variables */
405
406 PCRE_PUCHAR Xcallpat;
407 #ifdef SUPPORT_UTF
408 PCRE_PUCHAR Xcharptr;
409 #endif
410 PCRE_PUCHAR Xdata;
411 PCRE_PUCHAR Xnext;
412 PCRE_PUCHAR Xpp;
413 PCRE_PUCHAR Xprev;
414 PCRE_PUCHAR Xsaved_eptr;
415
416 recursion_info Xnew_recursive;
417
418 BOOL Xcur_is_word;
419 BOOL Xcondition;
420 BOOL Xprev_is_word;
421
422 #ifdef SUPPORT_UCP
423 int Xprop_type;
424 unsigned int Xprop_value;
425 int Xprop_fail_result;
426 int Xoclength;
427 pcre_uchar Xocchars[6];
428 #endif
429
430 int Xcodelink;
431 int Xctype;
432 unsigned int Xfc;
433 int Xfi;
434 int Xlength;
435 int Xmax;
436 int Xmin;
437 unsigned int Xnumber;
438 int Xoffset;
439 unsigned int Xop;
440 pcre_int32 Xsave_capture_last;
441 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
442 int Xstacksave[REC_STACK_SAVE_MAX];
443
444 eptrblock Xnewptrb;
445
446 /* Where to jump back to */
447
448 int Xwhere;
449
450 } heapframe;
451
452 #endif
453
454
455 /***************************************************************************
456 ***************************************************************************/
457
458
459
460 /*************************************************
461 * Match from current position *
462 *************************************************/
463
464 /* This function is called recursively in many circumstances. Whenever it
465 returns a negative (error) response, the outer incarnation must also return the
466 same response. */
467
/* These macros pack up the partial-matching tests that appear several times
in the code. Both do nothing unless md->partial is non-zero and the subject
pointer is past md->start_used_ptr (i.e. something has been matched). When
they fire, the "hit end" flag is set and, for hard partial matching
(md->partial > 1), PCRE_ERROR_PARTIAL is returned immediately.

CHECK_PARTIAL also requires the pointer to be at the end of the subject.
SCHECK_PARTIAL leaves that test out; it is for places where we already know
we are past the end of the subject. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr && \
      eptr >= md->end_subject) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
489
490
491 /* Performance note: It might be tempting to extract commonly used fields from
492 the md structure (e.g. utf, end_subject) into individual variables to improve
493 performance. Tests using gcc on a SPARC disproved this; in the first case, it
494 made performance worse.
495
496 Arguments:
497 eptr pointer to current character in subject
498 ecode pointer to current position in compiled code
499 mstart pointer to the current match start position (can be modified
500 by encountering \K)
501 offset_top current top pointer
502 md pointer to "static" info for the match
503 eptrb pointer to chain of blocks containing eptr at start of
504 brackets - for testing for empty matches
505 rdepth the recursion depth
506
507 Returns: MATCH_MATCH if matched ) these values are >= 0
508 MATCH_NOMATCH if failed to match )
509 a negative MATCH_xxx value for PRUNE, SKIP, etc
510 a negative PCRE_ERROR_xxx value if aborted by an error condition
511 (e.g. stopped by repeated call or recursion limit)
512 */
513
514 static int
515 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
516 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
517 unsigned int rdepth)
518 {
519 /* These variables do not need to be preserved over recursion in this function,
520 so they can be ordinary variables in all cases. Mark some of them with
521 "register" because they are used a lot in loops. */
522
523 register int rrc; /* Returns from recursive calls */
524 register int i; /* Used for loops not involving calls to RMATCH() */
525 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
526 register BOOL utf; /* Local copy of UTF flag for speed */
527
528 BOOL minimize, possessive; /* Quantifier options */
529 BOOL caseless;
530 int condcode;
531
532 /* When recursion is not being used, all "local" variables that have to be
533 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
534 frame on the stack here; subsequent instantiations are obtained from the heap
535 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
536 the top-level on the stack rather than malloc-ing them all gives a performance
537 boost in many cases where there is not much "recursion". */
538
539 #ifdef NO_RECURSE
540 heapframe *frame = (heapframe *)md->match_frames_base;
541
542 /* Copy in the original argument variables */
543
544 frame->Xeptr = eptr;
545 frame->Xecode = ecode;
546 frame->Xmstart = mstart;
547 frame->Xoffset_top = offset_top;
548 frame->Xeptrb = eptrb;
549 frame->Xrdepth = rdepth;
550
551 /* This is where control jumps back to to effect "recursion" */
552
553 HEAP_RECURSE:
554
555 /* Macros make the argument variables come from the current frame */
556
557 #define eptr frame->Xeptr
558 #define ecode frame->Xecode
559 #define mstart frame->Xmstart
560 #define offset_top frame->Xoffset_top
561 #define eptrb frame->Xeptrb
562 #define rdepth frame->Xrdepth
563
564 /* Ditto for the local variables */
565
566 #ifdef SUPPORT_UTF
567 #define charptr frame->Xcharptr
568 #endif
569 #define callpat frame->Xcallpat
570 #define codelink frame->Xcodelink
571 #define data frame->Xdata
572 #define next frame->Xnext
573 #define pp frame->Xpp
574 #define prev frame->Xprev
575 #define saved_eptr frame->Xsaved_eptr
576
577 #define new_recursive frame->Xnew_recursive
578
579 #define cur_is_word frame->Xcur_is_word
580 #define condition frame->Xcondition
581 #define prev_is_word frame->Xprev_is_word
582
583 #ifdef SUPPORT_UCP
584 #define prop_type frame->Xprop_type
585 #define prop_value frame->Xprop_value
586 #define prop_fail_result frame->Xprop_fail_result
587 #define oclength frame->Xoclength
588 #define occhars frame->Xocchars
589 #endif
590
591 #define ctype frame->Xctype
592 #define fc frame->Xfc
593 #define fi frame->Xfi
594 #define length frame->Xlength
595 #define max frame->Xmax
596 #define min frame->Xmin
597 #define number frame->Xnumber
598 #define offset frame->Xoffset
599 #define op frame->Xop
600 #define save_capture_last frame->Xsave_capture_last
601 #define save_offset1 frame->Xsave_offset1
602 #define save_offset2 frame->Xsave_offset2
603 #define save_offset3 frame->Xsave_offset3
604 #define stacksave frame->Xstacksave
605
606 #define newptrb frame->Xnewptrb
607
608 /* When recursion is being used, local variables are allocated on the stack and
609 get preserved during recursion in the normal way. In this environment, fi and
610 i, and fc and c, can be the same variables. */
611
612 #else /* NO_RECURSE not defined */
613 #define fi i
614 #define fc c
615
616 /* Many of the following variables are used only in small blocks of the code.
617 My normal style of coding would have declared them within each of those blocks.
618 However, in order to accommodate the version of this code that uses an external
619 "stack" implemented on the heap, it is easier to declare them all here, so the
620 declarations can be cut out in a block. The only declarations within blocks
621 below are for variables that do not have to be preserved over a recursive call
622 to RMATCH(). */
623
624 #ifdef SUPPORT_UTF
625 const pcre_uchar *charptr;
626 #endif
627 const pcre_uchar *callpat;
628 const pcre_uchar *data;
629 const pcre_uchar *next;
630 PCRE_PUCHAR pp;
631 const pcre_uchar *prev;
632 PCRE_PUCHAR saved_eptr;
633
634 recursion_info new_recursive;
635
636 BOOL cur_is_word;
637 BOOL condition;
638 BOOL prev_is_word;
639
640 #ifdef SUPPORT_UCP
641 int prop_type;
642 unsigned int prop_value;
643 int prop_fail_result;
644 int oclength;
645 pcre_uchar occhars[6];
646 #endif
647
648 int codelink;
649 int ctype;
650 int length;
651 int max;
652 int min;
653 unsigned int number;
654 int offset;
655 unsigned int op;
656 pcre_int32 save_capture_last;
657 int save_offset1, save_offset2, save_offset3;
658 int stacksave[REC_STACK_SAVE_MAX];
659
660 eptrblock newptrb;
661
662 /* There is a special fudge for calling match() in a way that causes it to
663 measure the size of its basic stack frame when the stack is being used for
664 recursion. The second argument (ecode) being NULL triggers this behaviour. It
665 cannot normally ever be NULL. The return is the negated value of the frame
666 size. */
667
668 if (ecode == NULL)
669 {
670 if (rdepth == 0)
671 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
672 else
673 {
674 int len = (char *)&rdepth - (char *)eptr;
675 return (len > 0)? -len : len;
676 }
677 }
678 #endif /* NO_RECURSE */
679
680 /* To save space on the stack and in the heap frame, I have doubled up on some
681 of the local variables that are used only in localised parts of the code, but
682 still need to be preserved over recursive calls of match(). These macros define
683 the alternative names that are used. */
684
685 #define allow_zero cur_is_word
686 #define cbegroup condition
687 #define code_offset codelink
688 #define condassert condition
689 #define matched_once prev_is_word
690 #define foc number
691 #define save_mark data
692
693 /* These statements are here to stop the compiler complaining about uninitialized
694 variables. */
695
696 #ifdef SUPPORT_UCP
697 prop_value = 0;
698 prop_fail_result = 0;
699 #endif
700
701
702 /* This label is used for tail recursion, which is used in a few cases even
703 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
704 used. Thanks to Ian Taylor for noticing this possibility and sending the
705 original patch. */
706
707 TAIL_RECURSE:
708
709 /* OK, now we can get on with the real code of the function. Recursive calls
710 are specified by the macro RMATCH and RRETURN is used to return. When
711 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
712 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
713 defined). However, RMATCH isn't like a function call because it's quite a
714 complicated macro. It has to be used in one particular way. This shouldn't,
715 however, impact performance when true recursion is being used. */
716
717 #ifdef SUPPORT_UTF
718 utf = md->utf; /* Local copy of the flag */
719 #else
720 utf = FALSE;
721 #endif
722
723 /* First check that we haven't called match() too many times, or that we
724 haven't exceeded the recursive call limit. */
725
726 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
727 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
728
729 /* At the start of a group with an unlimited repeat that may match an empty
730 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
731 done this way to save having to use another function argument, which would take
732 up space on the stack. See also MATCH_CONDASSERT below.
733
734 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
735 such remembered pointers, to be checked when we hit the closing ket, in order
736 to break infinite loops that match no characters. When match() is called in
737 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
738 NOT be used with tail recursion, because the memory block that is used is on
739 the stack, so a new one may be required for each match(). */
740
741 if (md->match_function_type == MATCH_CBEGROUP)
742 {
743 newptrb.epb_saved_eptr = eptr;
744 newptrb.epb_prev = eptrb;
745 eptrb = &newptrb;
746 md->match_function_type = 0;
747 }
748
749 /* Now start processing the opcodes. */
750
751 for (;;)
752 {
753 minimize = possessive = FALSE;
754 op = *ecode;
755
756 switch(op)
757 {
758 case OP_MARK:
759 md->nomatch_mark = ecode + 2;
760 md->mark = NULL; /* In case previously set by assertion */
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
762 eptrb, RM55);
763 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
764 md->mark == NULL) md->mark = ecode + 2;
765
766 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
767 argument, and we must check whether that argument matches this MARK's
768 argument. It is passed back in md->start_match_ptr (an overloading of that
769 variable). If it does match, we reset that variable to the current subject
770 position and return MATCH_SKIP. Otherwise, pass back the return code
771 unaltered. */
772
773 else if (rrc == MATCH_SKIP_ARG &&
774 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
775 {
776 md->start_match_ptr = eptr;
777 RRETURN(MATCH_SKIP);
778 }
779 RRETURN(rrc);
780
781 case OP_FAIL:
782 RRETURN(MATCH_NOMATCH);
783
784 case OP_COMMIT:
785 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
786 eptrb, RM52);
787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
788 RRETURN(MATCH_COMMIT);
789
790 case OP_PRUNE:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
792 eptrb, RM51);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 RRETURN(MATCH_PRUNE);
795
796 case OP_PRUNE_ARG:
797 md->nomatch_mark = ecode + 2;
798 md->mark = NULL; /* In case previously set by assertion */
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM56);
801 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
802 md->mark == NULL) md->mark = ecode + 2;
803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
804 RRETURN(MATCH_PRUNE);
805
806 case OP_SKIP:
807 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
808 eptrb, RM53);
809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
810 md->start_match_ptr = eptr; /* Pass back current position */
811 RRETURN(MATCH_SKIP);
812
813 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
814 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
815 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
816 that failed and any that precede it (either they also failed, or were not
817 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
818 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
819 set to the count of the one that failed. */
820
821 case OP_SKIP_ARG:
822 md->skip_arg_count++;
823 if (md->skip_arg_count <= md->ignore_skip_arg)
824 {
825 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
826 break;
827 }
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
829 eptrb, RM57);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831
832 /* Pass back the current skip name by overloading md->start_match_ptr and
833 returning the special MATCH_SKIP_ARG return code. This will either be
834 caught by a matching MARK, or get to the top, where it causes a rematch
835 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
836
837 md->start_match_ptr = ecode + 2;
838 RRETURN(MATCH_SKIP_ARG);
839
840 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
841 the branch in which it occurs can be determined. Overload the start of
842 match pointer to do this. */
843
844 case OP_THEN:
845 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
846 eptrb, RM54);
847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848 md->start_match_ptr = ecode;
849 RRETURN(MATCH_THEN);
850
851 case OP_THEN_ARG:
852 md->nomatch_mark = ecode + 2;
853 md->mark = NULL; /* In case previously set by assertion */
854 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
855 md, eptrb, RM58);
856 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
857 md->mark == NULL) md->mark = ecode + 2;
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 md->start_match_ptr = ecode;
860 RRETURN(MATCH_THEN);
861
862 /* Handle an atomic group that does not contain any capturing parentheses.
863 This can be handled like an assertion. Prior to 8.13, all atomic groups
864 were handled this way. In 8.13, the code was changed as below for ONCE, so
865 that backups pass through the group and thereby reset captured values.
866 However, this uses a lot more stack, so in 8.20, atomic groups that do not
867 contain any captures generate OP_ONCE_NC, which can be handled in the old,
868 less stack intensive way.
869
870 Check the alternative branches in turn - the matching won't pass the KET
871 for this kind of subpattern. If any one branch matches, we carry on as at
872 the end of a normal bracket, leaving the subject pointer, but resetting
873 the start-of-match value in case it was changed by \K. */
874
875 case OP_ONCE_NC:
876 prev = ecode;
877 saved_eptr = eptr;
878 save_mark = md->mark;
879 do
880 {
881 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
882 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
883 {
884 mstart = md->start_match_ptr;
885 break;
886 }
887 if (rrc == MATCH_THEN)
888 {
889 next = ecode + GET(ecode,1);
890 if (md->start_match_ptr < next &&
891 (*ecode == OP_ALT || *next == OP_ALT))
892 rrc = MATCH_NOMATCH;
893 }
894
895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896 ecode += GET(ecode,1);
897 md->mark = save_mark;
898 }
899 while (*ecode == OP_ALT);
900
901 /* If we hit the end of the group (which could be repeated), fail */
902
903 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
904
905 /* Continue as from after the group, updating the offsets high water
906 mark, since extracts may have been taken. */
907
908 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
909
910 offset_top = md->end_offset_top;
911 eptr = md->end_match_ptr;
912
913 /* For a non-repeating ket, just continue at this level. This also
914 happens for a repeating ket if no characters were matched in the group.
915 This is the forcible breaking of infinite loops as implemented in Perl
916 5.005. */
917
918 if (*ecode == OP_KET || eptr == saved_eptr)
919 {
920 ecode += 1+LINK_SIZE;
921 break;
922 }
923
924 /* The repeating kets try the rest of the pattern or restart from the
925 preceding bracket, in the appropriate order. The second "call" of match()
926 uses tail recursion, to avoid using another stack frame. */
927
928 if (*ecode == OP_KETRMIN)
929 {
930 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
932 ecode = prev;
933 goto TAIL_RECURSE;
934 }
935 else /* OP_KETRMAX */
936 {
937 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
939 ecode += 1 + LINK_SIZE;
940 goto TAIL_RECURSE;
941 }
942 /* Control never gets here */
943
944 /* Handle a capturing bracket, other than those that are possessive with an
945 unlimited repeat. If there is space in the offset vector, save the current
946 subject position in the working slot at the top of the vector. We mustn't
947 change the current values of the data slot, because they may be set from a
948 previous iteration of this group, and be referred to by a reference inside
949 the group. A failure to match might occur after the group has succeeded,
950 if something later on doesn't match. For this reason, we need to restore
951 the working value and also the values of the final offsets, in case they
952 were set by a previous iteration of the same bracket.
953
954 If there isn't enough space in the offset vector, treat this as if it were
955 a non-capturing bracket. Don't worry about setting the flag for the error
956 case here; that is handled in the code for KET. */
957
958 case OP_CBRA:
959 case OP_SCBRA:
960 number = GET2(ecode, 1+LINK_SIZE);
961 offset = number << 1;
962
963 #ifdef PCRE_DEBUG
964 printf("start bracket %d\n", number);
965 printf("subject=");
966 pchars(eptr, 16, TRUE, md);
967 printf("\n");
968 #endif
969
970 if (offset < md->offset_max)
971 {
972 save_offset1 = md->offset_vector[offset];
973 save_offset2 = md->offset_vector[offset+1];
974 save_offset3 = md->offset_vector[md->offset_end - number];
975 save_capture_last = md->capture_last;
976 save_mark = md->mark;
977
978 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
979 md->offset_vector[md->offset_end - number] =
980 (int)(eptr - md->start_subject);
981
982 for (;;)
983 {
984 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
986 eptrb, RM1);
987 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
988
989 /* If we backed up to a THEN, check whether it is within the current
990 branch by comparing the address of the THEN that is passed back with
991 the end of the branch. If it is within the current branch, and the
992 branch is one of two or more alternatives (it either starts or ends
993 with OP_ALT), we have reached the limit of THEN's action, so convert
994 the return code to NOMATCH, which will cause normal backtracking to
995 happen from now on. Otherwise, THEN is passed back to an outer
996 alternative. This implements Perl's treatment of parenthesized groups,
997 where a group not containing | does not affect the current alternative,
998 that is, (X) is NOT the same as (X|(*F)). */
999
1000 if (rrc == MATCH_THEN)
1001 {
1002 next = ecode + GET(ecode,1);
1003 if (md->start_match_ptr < next &&
1004 (*ecode == OP_ALT || *next == OP_ALT))
1005 rrc = MATCH_NOMATCH;
1006 }
1007
1008 /* Anything other than NOMATCH is passed back. */
1009
1010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011 md->capture_last = save_capture_last;
1012 ecode += GET(ecode, 1);
1013 md->mark = save_mark;
1014 if (*ecode != OP_ALT) break;
1015 }
1016
1017 DPRINTF(("bracket %d failed\n", number));
1018 md->offset_vector[offset] = save_offset1;
1019 md->offset_vector[offset+1] = save_offset2;
1020 md->offset_vector[md->offset_end - number] = save_offset3;
1021
1022 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1023
1024 RRETURN(rrc);
1025 }
1026
1027 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1028 as a non-capturing bracket. */
1029
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1032
1033 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1034
1035 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1037
1038 /* Non-capturing or atomic group, except for possessive with unlimited
1039 repeat and ONCE group with no captures. Loop for all the alternatives.
1040
1041 When we get to the final alternative within the brackets, we used to return
1042 the result of a recursive call to match() whatever happened so it was
1043 possible to reduce stack usage by turning this into a tail recursion,
1044 except in the case of a possibly empty group. However, now that there is
1045 the possibility of (*THEN) occurring in the final alternative, this
1046 optimization is no longer always possible.
1047
1048 We can optimize if we know there are no (*THEN)s in the pattern; at present
1049 this is the best that can be done.
1050
1051 MATCH_ONCE is returned when the end of an atomic group is successfully
1052 reached, but subsequent matching fails. It passes back up the tree (causing
1053 captured values to be reset) until the original atomic group level is
1054 reached. This is tested by comparing md->once_target with the start of the
1055 group. At this point, the return is converted into MATCH_NOMATCH so that
1056 previous backup points can be taken. */
1057
1058 case OP_ONCE:
1059 case OP_BRA:
1060 case OP_SBRA:
1061 DPRINTF(("start non-capturing bracket\n"));
1062
1063 for (;;)
1064 {
1065 if (op >= OP_SBRA || op == OP_ONCE)
1066 md->match_function_type = MATCH_CBEGROUP;
1067
1068 /* If this is not a possibly empty group, and there are no (*THEN)s in
1069 the pattern, and this is the final alternative, optimize as described
1070 above. */
1071
1072 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1073 {
1074 ecode += PRIV(OP_lengths)[*ecode];
1075 goto TAIL_RECURSE;
1076 }
1077
1078 /* In all other cases, we have to make another call to match(). */
1079
1080 save_mark = md->mark;
1081 save_capture_last = md->capture_last;
1082 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1083 RM2);
1084
1085 /* See comment in the code for capturing groups above about handling
1086 THEN. */
1087
1088 if (rrc == MATCH_THEN)
1089 {
1090 next = ecode + GET(ecode,1);
1091 if (md->start_match_ptr < next &&
1092 (*ecode == OP_ALT || *next == OP_ALT))
1093 rrc = MATCH_NOMATCH;
1094 }
1095
1096 if (rrc != MATCH_NOMATCH)
1097 {
1098 if (rrc == MATCH_ONCE)
1099 {
1100 const pcre_uchar *scode = ecode;
1101 if (*scode != OP_ONCE) /* If not at start, find it */
1102 {
1103 while (*scode == OP_ALT) scode += GET(scode, 1);
1104 scode -= GET(scode, 1);
1105 }
1106 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1107 }
1108 RRETURN(rrc);
1109 }
1110 ecode += GET(ecode, 1);
1111 md->mark = save_mark;
1112 if (*ecode != OP_ALT) break;
1113 md->capture_last = save_capture_last;
1114 }
1115
1116 RRETURN(MATCH_NOMATCH);
1117
1118 /* Handle possessive capturing brackets with an unlimited repeat. We come
1119 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1120 handled similarly to the normal case above. However, the matching is
1121 different. The end of these brackets will always be OP_KETRPOS, which
1122 returns MATCH_KETRPOS without going further in the pattern. By this means
1123 we can handle the group by iteration rather than recursion, thereby
1124 reducing the amount of stack needed. */
1125
1126 case OP_CBRAPOS:
1127 case OP_SCBRAPOS:
1128 allow_zero = FALSE;
1129
1130 POSSESSIVE_CAPTURE:
1131 number = GET2(ecode, 1+LINK_SIZE);
1132 offset = number << 1;
1133
1134 #ifdef PCRE_DEBUG
1135 printf("start possessive bracket %d\n", number);
1136 printf("subject=");
1137 pchars(eptr, 16, TRUE, md);
1138 printf("\n");
1139 #endif
1140
1141 if (offset < md->offset_max)
1142 {
1143 matched_once = FALSE;
1144 code_offset = (int)(ecode - md->start_code);
1145
1146 save_offset1 = md->offset_vector[offset];
1147 save_offset2 = md->offset_vector[offset+1];
1148 save_offset3 = md->offset_vector[md->offset_end - number];
1149 save_capture_last = md->capture_last;
1150
1151 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1152
1153 /* Each time round the loop, save the current subject position for use
1154 when the group matches. For MATCH_MATCH, the group has matched, so we
1155 restart it with a new subject starting position, remembering that we had
1156 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1157 usual. If we haven't matched any alternatives in any iteration, check to
1158 see if a previous iteration matched. If so, the group has matched;
1159 continue from afterwards. Otherwise it has failed; restore the previous
1160 capture values before returning NOMATCH. */
1161
1162 for (;;)
1163 {
1164 md->offset_vector[md->offset_end - number] =
1165 (int)(eptr - md->start_subject);
1166 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1167 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1168 eptrb, RM63);
1169 if (rrc == MATCH_KETRPOS)
1170 {
1171 offset_top = md->end_offset_top;
1172 eptr = md->end_match_ptr;
1173 ecode = md->start_code + code_offset;
1174 save_capture_last = md->capture_last;
1175 matched_once = TRUE;
1176 continue;
1177 }
1178
1179 /* See comment in the code for capturing groups above about handling
1180 THEN. */
1181
1182 if (rrc == MATCH_THEN)
1183 {
1184 next = ecode + GET(ecode,1);
1185 if (md->start_match_ptr < next &&
1186 (*ecode == OP_ALT || *next == OP_ALT))
1187 rrc = MATCH_NOMATCH;
1188 }
1189
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 md->capture_last = save_capture_last;
1192 ecode += GET(ecode, 1);
1193 if (*ecode != OP_ALT) break;
1194 }
1195
1196 if (!matched_once)
1197 {
1198 md->offset_vector[offset] = save_offset1;
1199 md->offset_vector[offset+1] = save_offset2;
1200 md->offset_vector[md->offset_end - number] = save_offset3;
1201 }
1202
1203 if (allow_zero || matched_once)
1204 {
1205 ecode += 1 + LINK_SIZE;
1206 break;
1207 }
1208
1209 RRETURN(MATCH_NOMATCH);
1210 }
1211
1212 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1213 as a non-capturing bracket. */
1214
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1216 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217
1218 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222
1223 /* Non-capturing possessive bracket with unlimited repeat. We come here
1224 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1225 without the capturing complication. It is written out separately for speed
1226 and cleanliness. */
1227
1228 case OP_BRAPOS:
1229 case OP_SBRAPOS:
1230 allow_zero = FALSE;
1231
1232 POSSESSIVE_NON_CAPTURE:
1233 matched_once = FALSE;
1234 code_offset = (int)(ecode - md->start_code);
1235 save_capture_last = md->capture_last;
1236
1237 for (;;)
1238 {
1239 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1240 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1241 eptrb, RM48);
1242 if (rrc == MATCH_KETRPOS)
1243 {
1244 offset_top = md->end_offset_top;
1245 eptr = md->end_match_ptr;
1246 ecode = md->start_code + code_offset;
1247 matched_once = TRUE;
1248 continue;
1249 }
1250
1251 /* See comment in the code for capturing groups above about handling
1252 THEN. */
1253
1254 if (rrc == MATCH_THEN)
1255 {
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1260 }
1261
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1266 }
1267
1268 if (matched_once || allow_zero)
1269 {
1270 ecode += 1 + LINK_SIZE;
1271 break;
1272 }
1273 RRETURN(MATCH_NOMATCH);
1274
1275 /* Control never reaches here. */
1276
1277 /* Conditional group: compilation checked that there are no more than
1278 two branches. If the condition is false, skipping the first branch takes us
1279 past the end if there is only one branch, but that's OK because that is
1280 exactly what going to the ket would do. */
1281
1282 case OP_COND:
1283 case OP_SCOND:
1284 codelink = GET(ecode, 1);
1285
1286 /* Because of the way auto-callout works during compile, a callout item is
1287 inserted between OP_COND and an assertion condition. */
1288
1289 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1290 {
1291 if (PUBL(callout) != NULL)
1292 {
1293 PUBL(callout_block) cb;
1294 cb.version = 2; /* Version 1 of the callout block */
1295 cb.callout_number = ecode[LINK_SIZE+2];
1296 cb.offset_vector = md->offset_vector;
1297 #if defined COMPILE_PCRE8
1298 cb.subject = (PCRE_SPTR)md->start_subject;
1299 #elif defined COMPILE_PCRE16
1300 cb.subject = (PCRE_SPTR16)md->start_subject;
1301 #elif defined COMPILE_PCRE32
1302 cb.subject = (PCRE_SPTR32)md->start_subject;
1303 #endif
1304 cb.subject_length = (int)(md->end_subject - md->start_subject);
1305 cb.start_match = (int)(mstart - md->start_subject);
1306 cb.current_position = (int)(eptr - md->start_subject);
1307 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1308 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1309 cb.capture_top = offset_top/2;
1310 cb.capture_last = md->capture_last & CAPLMASK;
1311 /* Internal change requires this for API compatibility. */
1312 if (cb.capture_last == 0) cb.capture_last = -1;
1313 cb.callout_data = md->callout_data;
1314 cb.mark = md->nomatch_mark;
1315 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1316 if (rrc < 0) RRETURN(rrc);
1317 }
1318 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1319 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1320 }
1321
1322 condcode = ecode[LINK_SIZE+1];
1323
1324 /* Now see what the actual condition is */
1325
1326 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1327 {
1328 if (md->recursive == NULL) /* Not recursing => FALSE */
1329 {
1330 condition = FALSE;
1331 ecode += GET(ecode, 1);
1332 }
1333 else
1334 {
1335 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1336 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1337
1338 /* If the test is for recursion into a specific subpattern, and it is
1339 false, but the test was set up by name, scan the table to see if the
1340 name refers to any other numbers, and test them. The condition is true
1341 if any one is set. */
1342
1343 if (!condition && condcode == OP_NRREF)
1344 {
1345 pcre_uchar *slotA = md->name_table;
1346 for (i = 0; i < md->name_count; i++)
1347 {
1348 if (GET2(slotA, 0) == recno) break;
1349 slotA += md->name_entry_size;
1350 }
1351
1352 /* Found a name for the number - there can be only one; duplicate
1353 names for different numbers are allowed, but not vice versa. First
1354 scan down for duplicates. */
1355
1356 if (i < md->name_count)
1357 {
1358 pcre_uchar *slotB = slotA;
1359 while (slotB > md->name_table)
1360 {
1361 slotB -= md->name_entry_size;
1362 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1363 {
1364 condition = GET2(slotB, 0) == md->recursive->group_num;
1365 if (condition) break;
1366 }
1367 else break;
1368 }
1369
1370 /* Scan up for duplicates */
1371
1372 if (!condition)
1373 {
1374 slotB = slotA;
1375 for (i++; i < md->name_count; i++)
1376 {
1377 slotB += md->name_entry_size;
1378 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1379 {
1380 condition = GET2(slotB, 0) == md->recursive->group_num;
1381 if (condition) break;
1382 }
1383 else break;
1384 }
1385 }
1386 }
1387 }
1388
1389 /* Choose branch according to the condition */
1390
1391 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1392 }
1393 }
1394
1395 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1396 {
1397 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1398 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1399
1400 /* If the numbered capture is unset, but the reference was by name,
1401 scan the table to see if the name refers to any other numbers, and test
1402 them. The condition is true if any one is set. This is tediously similar
1403 to the code above, but not close enough to try to amalgamate. */
1404
1405 if (!condition && condcode == OP_NCREF)
1406 {
1407 unsigned int refno = offset >> 1;
1408 pcre_uchar *slotA = md->name_table;
1409
1410 for (i = 0; i < md->name_count; i++)
1411 {
1412 if (GET2(slotA, 0) == refno) break;
1413 slotA += md->name_entry_size;
1414 }
1415
1416 /* Found a name for the number - there can be only one; duplicate names
1417 for different numbers are allowed, but not vice versa. First scan down
1418 for duplicates. */
1419
1420 if (i < md->name_count)
1421 {
1422 pcre_uchar *slotB = slotA;
1423 while (slotB > md->name_table)
1424 {
1425 slotB -= md->name_entry_size;
1426 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1427 {
1428 offset = GET2(slotB, 0) << 1;
1429 condition = offset < offset_top &&
1430 md->offset_vector[offset] >= 0;
1431 if (condition) break;
1432 }
1433 else break;
1434 }
1435
1436 /* Scan up for duplicates */
1437
1438 if (!condition)
1439 {
1440 slotB = slotA;
1441 for (i++; i < md->name_count; i++)
1442 {
1443 slotB += md->name_entry_size;
1444 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1445 {
1446 offset = GET2(slotB, 0) << 1;
1447 condition = offset < offset_top &&
1448 md->offset_vector[offset] >= 0;
1449 if (condition) break;
1450 }
1451 else break;
1452 }
1453 }
1454 }
1455 }
1456
1457 /* Choose branch according to the condition */
1458
1459 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1460 }
1461
1462 else if (condcode == OP_DEF) /* DEFINE - always false */
1463 {
1464 condition = FALSE;
1465 ecode += GET(ecode, 1);
1466 }
1467
1468 /* The condition is an assertion. Call match() to evaluate it - setting
1469 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1470 an assertion. */
1471
1472 else
1473 {
1474 md->match_function_type = MATCH_CONDASSERT;
1475 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1476 if (rrc == MATCH_MATCH)
1477 {
1478 if (md->end_offset_top > offset_top)
1479 offset_top = md->end_offset_top; /* Captures may have happened */
1480 condition = TRUE;
1481 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1482 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1483 }
1484
1485 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1486 assertion; it is therefore treated as NOMATCH. */
1487
1488 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1489 {
1490 RRETURN(rrc); /* Need braces because of following else */
1491 }
1492 else
1493 {
1494 condition = FALSE;
1495 ecode += codelink;
1496 }
1497 }
1498
1499 /* We are now at the branch that is to be obeyed. As there is only one, can
1500 use tail recursion to avoid using another stack frame, except when there is
1501 unlimited repeat of a possibly empty group. In the latter case, a recursive
1502 call to match() is always required, unless the second alternative doesn't
1503 exist, in which case we can just plough on. Note that, for compatibility
1504 with Perl, the | in a conditional group is NOT treated as creating two
1505 alternatives. If a THEN is encountered in the branch, it propagates out to
1506 the enclosing alternative (unless nested in a deeper set of alternatives,
1507 of course). */
1508
1509 if (condition || *ecode == OP_ALT)
1510 {
1511 if (op != OP_SCOND)
1512 {
1513 ecode += 1 + LINK_SIZE;
1514 goto TAIL_RECURSE;
1515 }
1516
1517 md->match_function_type = MATCH_CBEGROUP;
1518 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1519 RRETURN(rrc);
1520 }
1521
1522 /* Condition false & no alternative; continue after the group. */
1523
1524 else
1525 {
1526 ecode += 1 + LINK_SIZE;
1527 }
1528 break;
1529
1530
1531 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1532 to close any currently open capturing brackets. */
1533
1534 case OP_CLOSE:
1535 number = GET2(ecode, 1); /* Must be less than 65536 */
1536 offset = number << 1;
1537
1538 #ifdef PCRE_DEBUG
1539 printf("end bracket %d at *ACCEPT", number);
1540 printf("\n");
1541 #endif
1542
1543 md->capture_last = (md->capture_last & OVFLMASK) | number;
1544 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1545 {
1546 md->offset_vector[offset] =
1547 md->offset_vector[md->offset_end - number];
1548 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1549 if (offset_top <= offset) offset_top = offset + 2;
1550 }
1551 ecode += 1 + IMM2_SIZE;
1552 break;
1553
1554
1555 /* End of the pattern, either real or forced. */
1556
1557 case OP_END:
1558 case OP_ACCEPT:
1559 case OP_ASSERT_ACCEPT:
1560
1561 /* If we have matched an empty string, fail if not in an assertion and not
1562 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1563 is set and we have matched at the start of the subject. In both cases,
1564 backtracking will then try other alternatives, if any. */
1565
1566 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1567 md->recursive == NULL &&
1568 (md->notempty ||
1569 (md->notempty_atstart &&
1570 mstart == md->start_subject + md->start_offset)))
1571 RRETURN(MATCH_NOMATCH);
1572
1573 /* Otherwise, we have a match. */
1574
1575 md->end_match_ptr = eptr; /* Record where we ended */
1576 md->end_offset_top = offset_top; /* and how many extracts were taken */
1577 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1578
1579 /* For some reason, the macros don't work properly if an expression is
1580 given as the argument to RRETURN when the heap is in use. */
1581
1582 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1583 RRETURN(rrc);
1584
1585 /* Assertion brackets. Check the alternative branches in turn - the
1586 matching won't pass the KET for an assertion. If any one branch matches,
1587 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1588 start of each branch to move the current point backwards, so the code at
1589 this level is identical to the lookahead case. When the assertion is part
1590 of a condition, we want to return immediately afterwards. The caller of
1591 this incarnation of the match() function will have set MATCH_CONDASSERT in
1592 md->match_function_type, and one of these opcodes will be the first opcode
1593 that is processed. We use a local variable that is preserved over calls to
1594 match() to remember this case. */
1595
1596 case OP_ASSERT:
1597 case OP_ASSERTBACK:
1598 save_mark = md->mark;
1599 if (md->match_function_type == MATCH_CONDASSERT)
1600 {
1601 condassert = TRUE;
1602 md->match_function_type = 0;
1603 }
1604 else condassert = FALSE;
1605
1606 /* Loop for each branch */
1607
1608 do
1609 {
1610 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1611 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1612 {
1613 mstart = md->start_match_ptr; /* In case \K reset it */
1614 break;
1615 }
1616 md->mark = save_mark;
1617
1618 /* See comment in the code for capturing groups above about handling
1619 THEN. */
1620
1621 if (rrc == MATCH_THEN)
1622 {
1623 next = ecode + GET(ecode,1);
1624 if (md->start_match_ptr < next &&
1625 (*ecode == OP_ALT || *next == OP_ALT))
1626 rrc = MATCH_NOMATCH;
1627 }
1628
1629 /* Anything other than NOMATCH causes the assertion to fail. This
1630 includes COMMIT, SKIP, and PRUNE. However, this consistent approach does
1631 not always have exactly the same effect as in Perl. */
1632
1633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1634 ecode += GET(ecode, 1);
1635 }
1636 while (*ecode == OP_ALT);
1637
1638 /* If we have tried all the alternative branches, the assertion has
1639 failed. */
1640
1641 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1642
1643 /* If checking an assertion for a condition, return MATCH_MATCH. */
1644
1645 if (condassert) RRETURN(MATCH_MATCH);
1646
1647 /* Continue from after a successful assertion, updating the offsets high
1648 water mark, since extracts may have been taken during the assertion. */
1649
1650 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1651 ecode += 1 + LINK_SIZE;
1652 offset_top = md->end_offset_top;
1653 continue;
1654
1655 /* Negative assertion: all branches must fail to match for the assertion to
1656 succeed. */
1657
1658 case OP_ASSERT_NOT:
1659 case OP_ASSERTBACK_NOT:
1660 save_mark = md->mark;
1661 if (md->match_function_type == MATCH_CONDASSERT)
1662 {
1663 condassert = TRUE;
1664 md->match_function_type = 0;
1665 }
1666 else condassert = FALSE;
1667
1668 /* Loop for each alternative branch. */
1669
1670 do
1671 {
1672 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1673 md->mark = save_mark;
1674
1675 /* A successful match means the assertion has failed. */
1676
1677 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1678
1679 /* See comment in the code for capturing groups above about handling
1680 THEN. */
1681
1682 if (rrc == MATCH_THEN)
1683 {
1684 next = ecode + GET(ecode,1);
1685 if (md->start_match_ptr < next &&
1686 (*ecode == OP_ALT || *next == OP_ALT))
1687 rrc = MATCH_NOMATCH;
1688 }
1689
1690 /* No match on a branch means we must carry on and try the next branch.
1691 Anything else, in particular, SKIP, PRUNE, etc. causes a failure in the
1692 enclosing branch. This is a consistent approach, but does not always have
1693 the same effect as in Perl. */
1694
1695 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1696 ecode += GET(ecode,1);
1697 }
1698 while (*ecode == OP_ALT);
1699
1700 /* All branches in the assertion failed to match. */
1701
1702 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1703 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1704 continue;
1705
1706 /* Move the subject pointer back. This occurs only at the start of
1707 each branch of a lookbehind assertion. If we are too close to the start to
1708 move back, this match function fails. When working with UTF-8 we move
1709 back a number of characters, not bytes. */
1710
1711 case OP_REVERSE:
1712 #ifdef SUPPORT_UTF
1713 if (utf)
1714 {
1715 i = GET(ecode, 1);
1716 while (i-- > 0)
1717 {
1718 eptr--;
1719 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1720 BACKCHAR(eptr);
1721 }
1722 }
1723 else
1724 #endif
1725
1726 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1727
1728 {
1729 eptr -= GET(ecode, 1);
1730 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1731 }
1732
1733 /* Save the earliest consulted character, then skip to next op code */
1734
1735 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1736 ecode += 1 + LINK_SIZE;
1737 break;
1738
1739 /* The callout item calls an external function, if one is provided, passing
1740 details of the match so far. This is mainly for debugging, though the
1741 function is able to force a failure. */
1742
1743 case OP_CALLOUT:
1744 if (PUBL(callout) != NULL)
1745 {
1746 PUBL(callout_block) cb;
1747 cb.version = 2; /* Version 1 of the callout block */
1748 cb.callout_number = ecode[1];
1749 cb.offset_vector = md->offset_vector;
1750 #if defined COMPILE_PCRE8
1751 cb.subject = (PCRE_SPTR)md->start_subject;
1752 #elif defined COMPILE_PCRE16
1753 cb.subject = (PCRE_SPTR16)md->start_subject;
1754 #elif defined COMPILE_PCRE32
1755 cb.subject = (PCRE_SPTR32)md->start_subject;
1756 #endif
1757 cb.subject_length = (int)(md->end_subject - md->start_subject);
1758 cb.start_match = (int)(mstart - md->start_subject);
1759 cb.current_position = (int)(eptr - md->start_subject);
1760 cb.pattern_position = GET(ecode, 2);
1761 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1762 cb.capture_top = offset_top/2;
1763 cb.capture_last = md->capture_last & CAPLMASK;
1764 /* Internal change requires this for API compatibility. */
1765 if (cb.capture_last == 0) cb.capture_last = -1;
1766 cb.callout_data = md->callout_data;
1767 cb.mark = md->nomatch_mark;
1768 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1769 if (rrc < 0) RRETURN(rrc);
1770 }
1771 ecode += 2 + 2*LINK_SIZE;
1772 break;
1773
1774 /* Recursion either matches the current regex, or some subexpression. The
1775 offset data is the offset to the starting bracket from the start of the
1776 whole pattern. (This is so that it works from duplicated subpatterns.)
1777
1778 The state of the capturing groups is preserved over recursion, and
1779 re-instated afterwards. We don't know how many are started and not yet
1780 finished (offset_top records the completed total) so we just have to save
1781 all the potential data. There may be up to 65535 such values, which is too
1782 large to put on the stack, but using malloc for small numbers seems
1783 expensive. As a compromise, the stack is used when there are no more than
1784 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1785
1786 There are also other values that have to be saved. We use a chained
1787 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1788 for the original version of this logic. It has, however, been hacked around
1789 a lot, so he is not to blame for the current way it works. */
1790
1791 case OP_RECURSE:
1792 {
1793 recursion_info *ri;
1794 unsigned int recno;
1795
1796 callpat = md->start_code + GET(ecode, 1);
1797 recno = (callpat == md->start_code)? 0 :
1798 GET2(callpat, 1 + LINK_SIZE);
1799
1800 /* Check for repeating a recursion without advancing the subject pointer.
1801 This should catch convoluted mutual recursions. (Some simple cases are
1802 caught at compile time.) */
1803
1804 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1805 if (recno == ri->group_num && eptr == ri->subject_position)
1806 RRETURN(PCRE_ERROR_RECURSELOOP);
1807
1808 /* Add to "recursing stack" */
1809
1810 new_recursive.group_num = recno;
1811 new_recursive.saved_capture_last = md->capture_last;
1812 new_recursive.subject_position = eptr;
1813 new_recursive.prevrec = md->recursive;
1814 md->recursive = &new_recursive;
1815
1816 /* Where to continue from afterwards */
1817
1818 ecode += 1 + LINK_SIZE;
1819
1820 /* Now save the offset data */
1821
1822 new_recursive.saved_max = md->offset_end;
1823 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1824 new_recursive.offset_save = stacksave;
1825 else
1826 {
1827 new_recursive.offset_save =
1828 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1829 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1830 }
1831 memcpy(new_recursive.offset_save, md->offset_vector,
1832 new_recursive.saved_max * sizeof(int));
1833
1834 /* OK, now we can do the recursion. After processing each alternative,
1835 restore the offset data and the last captured value. If there were nested
1836 recursions, md->recursive might be changed, so reset it before looping.
1837 */
1838
1839 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1840 cbegroup = (*callpat >= OP_SBRA);
1841 do
1842 {
1843 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1844 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1845 md, eptrb, RM6);
1846 memcpy(md->offset_vector, new_recursive.offset_save,
1847 new_recursive.saved_max * sizeof(int));
1848 md->capture_last = new_recursive.saved_capture_last;
1849 md->recursive = new_recursive.prevrec;
1850 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1851 {
1852 DPRINTF(("Recursion matched\n"));
1853 if (new_recursive.offset_save != stacksave)
1854 (PUBL(free))(new_recursive.offset_save);
1855
1856 /* Set where we got to in the subject, and reset the start in case
1857 it was changed by \K. This *is* propagated back out of a recursion,
1858 for Perl compatibility. */
1859
1860 eptr = md->end_match_ptr;
1861 mstart = md->start_match_ptr;
1862 goto RECURSION_MATCHED; /* Exit loop; end processing */
1863 }
1864
1865 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1866 recursion; they cause a NOMATCH for the entire recursion. These codes
1867 are defined in a range that can be tested for. */
1868
1869 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1870 RRETURN(MATCH_NOMATCH);
1871
1872 /* Any return code other than NOMATCH is an error. */
1873
1874 if (rrc != MATCH_NOMATCH)
1875 {
1876 DPRINTF(("Recursion gave error %d\n", rrc));
1877 if (new_recursive.offset_save != stacksave)
1878 (PUBL(free))(new_recursive.offset_save);
1879 RRETURN(rrc);
1880 }
1881
1882 md->recursive = &new_recursive;
1883 callpat += GET(callpat, 1);
1884 }
1885 while (*callpat == OP_ALT);
1886
1887 DPRINTF(("Recursion didn't match\n"));
1888 md->recursive = new_recursive.prevrec;
1889 if (new_recursive.offset_save != stacksave)
1890 (PUBL(free))(new_recursive.offset_save);
1891 RRETURN(MATCH_NOMATCH);
1892 }
1893
1894 RECURSION_MATCHED:
1895 break;
1896
1897 /* An alternation is the end of a branch; scan along to find the end of the
1898 bracketed group and go to there. */
1899
1900 case OP_ALT:
1901 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1902 break;
1903
1904 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1905 indicating that it may occur zero times. It may repeat infinitely, or not
1906 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1907 with fixed upper repeat limits are compiled as a number of copies, with the
1908 optional ones preceded by BRAZERO or BRAMINZERO. */
1909
1910 case OP_BRAZERO:
1911 next = ecode + 1;
1912 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1914 do next += GET(next, 1); while (*next == OP_ALT);
1915 ecode = next + 1 + LINK_SIZE;
1916 break;
1917
1918 case OP_BRAMINZERO:
1919 next = ecode + 1;
1920 do next += GET(next, 1); while (*next == OP_ALT);
1921 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1922 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1923 ecode++;
1924 break;
1925
1926 case OP_SKIPZERO:
1927 next = ecode+1;
1928 do next += GET(next,1); while (*next == OP_ALT);
1929 ecode = next + 1 + LINK_SIZE;
1930 break;
1931
1932 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1933 here; just jump to the group, with allow_zero set TRUE. */
1934
1935 case OP_BRAPOSZERO:
1936 op = *(++ecode);
1937 allow_zero = TRUE;
1938 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1939 goto POSSESSIVE_NON_CAPTURE;
1940
1941 /* End of a group, repeated or non-repeating. */
1942
1943 case OP_KET:
1944 case OP_KETRMIN:
1945 case OP_KETRMAX:
1946 case OP_KETRPOS:
1947 prev = ecode - GET(ecode, 1);
1948
1949 /* If this was a group that remembered the subject start, in order to break
1950 infinite repeats of empty string matches, retrieve the subject start from
1951 the chain. Otherwise, set it NULL. */
1952
1953 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1954 {
1955 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1956 eptrb = eptrb->epb_prev; /* Backup to previous group */
1957 }
1958 else saved_eptr = NULL;
1959
1960 /* If we are at the end of an assertion group or a non-capturing atomic
1961 group, stop matching and return MATCH_MATCH, but record the current high
1962 water mark for use by positive assertions. We also need to record the match
1963 start in case it was changed by \K. */
1964
1965 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1966 *prev == OP_ONCE_NC)
1967 {
1968 md->end_match_ptr = eptr; /* For ONCE_NC */
1969 md->end_offset_top = offset_top;
1970 md->start_match_ptr = mstart;
1971 RRETURN(MATCH_MATCH); /* Sets md->mark */
1972 }
1973
1974 /* For capturing groups we have to check the group number back at the start
1975 and if necessary complete handling an extraction by setting the offsets and
1976 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1977 into group 0, so it won't be picked up here. Instead, we catch it when the
1978 OP_END is reached. Other recursion is handled here. We just have to record
1979 the current subject position and start match pointer and give a MATCH
1980 return. */
1981
1982 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1983 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1984 {
1985 number = GET2(prev, 1+LINK_SIZE);
1986 offset = number << 1;
1987
1988 #ifdef PCRE_DEBUG
1989 printf("end bracket %d", number);
1990 printf("\n");
1991 #endif
1992
1993 /* Handle a recursively called group. */
1994
1995 if (md->recursive != NULL && md->recursive->group_num == number)
1996 {
1997 md->end_match_ptr = eptr;
1998 md->start_match_ptr = mstart;
1999 RRETURN(MATCH_MATCH);
2000 }
2001
2002 /* Deal with capturing */
2003
2004 md->capture_last = (md->capture_last & OVFLMASK) | number;
2005 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
2006 {
2007 /* If offset is greater than offset_top, it means that we are
2008 "skipping" a capturing group, and that group's offsets must be marked
2009 unset. In earlier versions of PCRE, all the offsets were unset at the
2010 start of matching, but this doesn't work because atomic groups and
2011 assertions can cause a value to be set that should later be unset.
2012 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
2013 part of the atomic group, but this is not on the final matching path,
2014 so must be unset when 2 is set. (If there is no group 2, there is no
2015 problem, because offset_top will then be 2, indicating no capture.) */
2016
2017 if (offset > offset_top)
2018 {
2019 register int *iptr = md->offset_vector + offset_top;
2020 register int *iend = md->offset_vector + offset;
2021 while (iptr < iend) *iptr++ = -1;
2022 }
2023
2024 /* Now make the extraction */
2025
2026 md->offset_vector[offset] =
2027 md->offset_vector[md->offset_end - number];
2028 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2029 if (offset_top <= offset) offset_top = offset + 2;
2030 }
2031 }
2032
2033 /* For an ordinary non-repeating ket, just continue at this level. This
2034 also happens for a repeating ket if no characters were matched in the
2035 group. This is the forcible breaking of infinite loops as implemented in
2036 Perl 5.005. For a non-repeating atomic group that includes captures,
2037 establish a backup point by processing the rest of the pattern at a lower
2038 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2039 original OP_ONCE level, thereby bypassing intermediate backup points, but
2040 resetting any captures that happened along the way. */
2041
2042 if (*ecode == OP_KET || eptr == saved_eptr)
2043 {
2044 if (*prev == OP_ONCE)
2045 {
2046 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2049 RRETURN(MATCH_ONCE);
2050 }
2051 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2052 break;
2053 }
2054
2055 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2056 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2057 at a time from the outer level, thus saving stack. */
2058
2059 if (*ecode == OP_KETRPOS)
2060 {
2061 md->end_match_ptr = eptr;
2062 md->end_offset_top = offset_top;
2063 RRETURN(MATCH_KETRPOS);
2064 }
2065
2066 /* The normal repeating kets try the rest of the pattern or restart from
2067 the preceding bracket, in the appropriate order. In the second case, we can
2068 use tail recursion to avoid using another stack frame, unless we have an
2069 atomic group or an unlimited repeat of a group that can match an empty
2070 string. */
2071
2072 if (*ecode == OP_KETRMIN)
2073 {
2074 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2075 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2076 if (*prev == OP_ONCE)
2077 {
2078 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2079 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2080 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2081 RRETURN(MATCH_ONCE);
2082 }
2083 if (*prev >= OP_SBRA) /* Could match an empty string */
2084 {
2085 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2086 RRETURN(rrc);
2087 }
2088 ecode = prev;
2089 goto TAIL_RECURSE;
2090 }
2091 else /* OP_KETRMAX */
2092 {
2093 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2094 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2096 if (*prev == OP_ONCE)
2097 {
2098 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2099 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2100 md->once_target = prev;
2101 RRETURN(MATCH_ONCE);
2102 }
2103 ecode += 1 + LINK_SIZE;
2104 goto TAIL_RECURSE;
2105 }
2106 /* Control never gets here */
2107
2108 /* Not multiline mode: start of subject assertion, unless notbol. */
2109
2110 case OP_CIRC:
2111 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2112
2113 /* Start of subject assertion */
2114
2115 case OP_SOD:
2116 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2117 ecode++;
2118 break;
2119
2120 /* Multiline mode: start of subject unless notbol, or after any newline. */
2121
2122 case OP_CIRCM:
2123 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2124 if (eptr != md->start_subject &&
2125 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2126 RRETURN(MATCH_NOMATCH);
2127 ecode++;
2128 break;
2129
2130 /* Start of match assertion */
2131
2132 case OP_SOM:
2133 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2134 ecode++;
2135 break;
2136
2137 /* Reset the start of match point */
2138
2139 case OP_SET_SOM:
2140 mstart = eptr;
2141 ecode++;
2142 break;
2143
2144 /* Multiline mode: assert before any newline, or before end of subject
2145 unless noteol is set. */
2146
2147 case OP_DOLLM:
2148 if (eptr < md->end_subject)
2149 {
2150 if (!IS_NEWLINE(eptr))
2151 {
2152 if (md->partial != 0 &&
2153 eptr + 1 >= md->end_subject &&
2154 NLBLOCK->nltype == NLTYPE_FIXED &&
2155 NLBLOCK->nllen == 2 &&
2156 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2157 {
2158 md->hitend = TRUE;
2159 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2160 }
2161 RRETURN(MATCH_NOMATCH);
2162 }
2163 }
2164 else
2165 {
2166 if (md->noteol) RRETURN(MATCH_NOMATCH);
2167 SCHECK_PARTIAL();
2168 }
2169 ecode++;
2170 break;
2171
2172 /* Not multiline mode: assert before a terminating newline or before end of
2173 subject unless noteol is set. */
2174
2175 case OP_DOLL:
2176 if (md->noteol) RRETURN(MATCH_NOMATCH);
2177 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2178
2179 /* ... else fall through for endonly */
2180
2181 /* End of subject assertion (\z) */
2182
2183 case OP_EOD:
2184 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2185 SCHECK_PARTIAL();
2186 ecode++;
2187 break;
2188
2189 /* End of subject or ending \n assertion (\Z) */
2190
2191 case OP_EODN:
2192 ASSERT_NL_OR_EOS:
2193 if (eptr < md->end_subject &&
2194 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2195 {
2196 if (md->partial != 0 &&
2197 eptr + 1 >= md->end_subject &&
2198 NLBLOCK->nltype == NLTYPE_FIXED &&
2199 NLBLOCK->nllen == 2 &&
2200 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2201 {
2202 md->hitend = TRUE;
2203 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2204 }
2205 RRETURN(MATCH_NOMATCH);
2206 }
2207
2208 /* Either at end of string or \n before end. */
2209
2210 SCHECK_PARTIAL();
2211 ecode++;
2212 break;
2213
2214 /* Word boundary assertions */
2215
2216 case OP_NOT_WORD_BOUNDARY:
2217 case OP_WORD_BOUNDARY:
2218 {
2219
2220 /* Find out if the previous and current characters are "word" characters.
2221 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2222 be "non-word" characters. Remember the earliest consulted character for
2223 partial matching. */
2224
2225 #ifdef SUPPORT_UTF
2226 if (utf)
2227 {
2228 /* Get status of previous character */
2229
2230 if (eptr == md->start_subject) prev_is_word = FALSE; else
2231 {
2232 PCRE_PUCHAR lastptr = eptr - 1;
2233 BACKCHAR(lastptr);
2234 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2235 GETCHAR(c, lastptr);
2236 #ifdef SUPPORT_UCP
2237 if (md->use_ucp)
2238 {
2239 if (c == '_') prev_is_word = TRUE; else
2240 {
2241 int cat = UCD_CATEGORY(c);
2242 prev_is_word = (cat == ucp_L || cat == ucp_N);
2243 }
2244 }
2245 else
2246 #endif
2247 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2248 }
2249
2250 /* Get status of next character */
2251
2252 if (eptr >= md->end_subject)
2253 {
2254 SCHECK_PARTIAL();
2255 cur_is_word = FALSE;
2256 }
2257 else
2258 {
2259 GETCHAR(c, eptr);
2260 #ifdef SUPPORT_UCP
2261 if (md->use_ucp)
2262 {
2263 if (c == '_') cur_is_word = TRUE; else
2264 {
2265 int cat = UCD_CATEGORY(c);
2266 cur_is_word = (cat == ucp_L || cat == ucp_N);
2267 }
2268 }
2269 else
2270 #endif
2271 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2272 }
2273 }
2274 else
2275 #endif
2276
2277 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2278 consistency with the behaviour of \w we do use it in this case. */
2279
2280 {
2281 /* Get status of previous character */
2282
2283 if (eptr == md->start_subject) prev_is_word = FALSE; else
2284 {
2285 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2286 #ifdef SUPPORT_UCP
2287 if (md->use_ucp)
2288 {
2289 c = eptr[-1];
2290 if (c == '_') prev_is_word = TRUE; else
2291 {
2292 int cat = UCD_CATEGORY(c);
2293 prev_is_word = (cat == ucp_L || cat == ucp_N);
2294 }
2295 }
2296 else
2297 #endif
2298 prev_is_word = MAX_255(eptr[-1])
2299 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2300 }
2301
2302 /* Get status of next character */
2303
2304 if (eptr >= md->end_subject)
2305 {
2306 SCHECK_PARTIAL();
2307 cur_is_word = FALSE;
2308 }
2309 else
2310 #ifdef SUPPORT_UCP
2311 if (md->use_ucp)
2312 {
2313 c = *eptr;
2314 if (c == '_') cur_is_word = TRUE; else
2315 {
2316 int cat = UCD_CATEGORY(c);
2317 cur_is_word = (cat == ucp_L || cat == ucp_N);
2318 }
2319 }
2320 else
2321 #endif
2322 cur_is_word = MAX_255(*eptr)
2323 && ((md->ctypes[*eptr] & ctype_word) != 0);
2324 }
2325
2326 /* Now see if the situation is what we want */
2327
2328 if ((*ecode++ == OP_WORD_BOUNDARY)?
2329 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2330 RRETURN(MATCH_NOMATCH);
2331 }
2332 break;
2333
2334 /* Match any single character type except newline; have to take care with
2335 CRLF newlines and partial matching. */
2336
2337 case OP_ANY:
2338 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2339 if (md->partial != 0 &&
2340 eptr + 1 >= md->end_subject &&
2341 NLBLOCK->nltype == NLTYPE_FIXED &&
2342 NLBLOCK->nllen == 2 &&
2343 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2344 {
2345 md->hitend = TRUE;
2346 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2347 }
2348
2349 /* Fall through */
2350
2351 /* Match any single character whatsoever. */
2352
2353 case OP_ALLANY:
2354 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2355 { /* not be updated before SCHECK_PARTIAL. */
2356 SCHECK_PARTIAL();
2357 RRETURN(MATCH_NOMATCH);
2358 }
2359 eptr++;
2360 #ifdef SUPPORT_UTF
2361 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2362 #endif
2363 ecode++;
2364 break;
2365
2366 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2367 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2368
2369 case OP_ANYBYTE:
2370 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2371 { /* not be updated before SCHECK_PARTIAL. */
2372 SCHECK_PARTIAL();
2373 RRETURN(MATCH_NOMATCH);
2374 }
2375 eptr++;
2376 ecode++;
2377 break;
2378
2379 case OP_NOT_DIGIT:
2380 if (eptr >= md->end_subject)
2381 {
2382 SCHECK_PARTIAL();
2383 RRETURN(MATCH_NOMATCH);
2384 }
2385 GETCHARINCTEST(c, eptr);
2386 if (
2387 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2388 c < 256 &&
2389 #endif
2390 (md->ctypes[c] & ctype_digit) != 0
2391 )
2392 RRETURN(MATCH_NOMATCH);
2393 ecode++;
2394 break;
2395
2396 case OP_DIGIT:
2397 if (eptr >= md->end_subject)
2398 {
2399 SCHECK_PARTIAL();
2400 RRETURN(MATCH_NOMATCH);
2401 }
2402 GETCHARINCTEST(c, eptr);
2403 if (
2404 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2405 c > 255 ||
2406 #endif
2407 (md->ctypes[c] & ctype_digit) == 0
2408 )
2409 RRETURN(MATCH_NOMATCH);
2410 ecode++;
2411 break;
2412
2413 case OP_NOT_WHITESPACE:
2414 if (eptr >= md->end_subject)
2415 {
2416 SCHECK_PARTIAL();
2417 RRETURN(MATCH_NOMATCH);
2418 }
2419 GETCHARINCTEST(c, eptr);
2420 if (
2421 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2422 c < 256 &&
2423 #endif
2424 (md->ctypes[c] & ctype_space) != 0
2425 )
2426 RRETURN(MATCH_NOMATCH);
2427 ecode++;
2428 break;
2429
2430 case OP_WHITESPACE:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 RRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 if (
2438 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2439 c > 255 ||
2440 #endif
2441 (md->ctypes[c] & ctype_space) == 0
2442 )
2443 RRETURN(MATCH_NOMATCH);
2444 ecode++;
2445 break;
2446
2447 case OP_NOT_WORDCHAR:
2448 if (eptr >= md->end_subject)
2449 {
2450 SCHECK_PARTIAL();
2451 RRETURN(MATCH_NOMATCH);
2452 }
2453 GETCHARINCTEST(c, eptr);
2454 if (
2455 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2456 c < 256 &&
2457 #endif
2458 (md->ctypes[c] & ctype_word) != 0
2459 )
2460 RRETURN(MATCH_NOMATCH);
2461 ecode++;
2462 break;
2463
2464 case OP_WORDCHAR:
2465 if (eptr >= md->end_subject)
2466 {
2467 SCHECK_PARTIAL();
2468 RRETURN(MATCH_NOMATCH);
2469 }
2470 GETCHARINCTEST(c, eptr);
2471 if (
2472 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2473 c > 255 ||
2474 #endif
2475 (md->ctypes[c] & ctype_word) == 0
2476 )
2477 RRETURN(MATCH_NOMATCH);
2478 ecode++;
2479 break;
2480
2481 case OP_ANYNL:
2482 if (eptr >= md->end_subject)
2483 {
2484 SCHECK_PARTIAL();
2485 RRETURN(MATCH_NOMATCH);
2486 }
2487 GETCHARINCTEST(c, eptr);
2488 switch(c)
2489 {
2490 default: RRETURN(MATCH_NOMATCH);
2491
2492 case CHAR_CR:
2493 if (eptr >= md->end_subject)
2494 {
2495 SCHECK_PARTIAL();
2496 }
2497 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2498 break;
2499
2500 case CHAR_LF:
2501 break;
2502
2503 case CHAR_VT:
2504 case CHAR_FF:
2505 case CHAR_NEL:
2506 #ifndef EBCDIC
2507 case 0x2028:
2508 case 0x2029:
2509 #endif /* Not EBCDIC */
2510 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2511 break;
2512 }
2513 ecode++;
2514 break;
2515
2516 case OP_NOT_HSPACE:
2517 if (eptr >= md->end_subject)
2518 {
2519 SCHECK_PARTIAL();
2520 RRETURN(MATCH_NOMATCH);
2521 }
2522 GETCHARINCTEST(c, eptr);
2523 switch(c)
2524 {
2525 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2526 default: break;
2527 }
2528 ecode++;
2529 break;
2530
2531 case OP_HSPACE:
2532 if (eptr >= md->end_subject)
2533 {
2534 SCHECK_PARTIAL();
2535 RRETURN(MATCH_NOMATCH);
2536 }
2537 GETCHARINCTEST(c, eptr);
2538 switch(c)
2539 {
2540 HSPACE_CASES: break; /* Byte and multibyte cases */
2541 default: RRETURN(MATCH_NOMATCH);
2542 }
2543 ecode++;
2544 break;
2545
2546 case OP_NOT_VSPACE:
2547 if (eptr >= md->end_subject)
2548 {
2549 SCHECK_PARTIAL();
2550 RRETURN(MATCH_NOMATCH);
2551 }
2552 GETCHARINCTEST(c, eptr);
2553 switch(c)
2554 {
2555 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2556 default: break;
2557 }
2558 ecode++;
2559 break;
2560
2561 case OP_VSPACE:
2562 if (eptr >= md->end_subject)
2563 {
2564 SCHECK_PARTIAL();
2565 RRETURN(MATCH_NOMATCH);
2566 }
2567 GETCHARINCTEST(c, eptr);
2568 switch(c)
2569 {
2570 VSPACE_CASES: break;
2571 default: RRETURN(MATCH_NOMATCH);
2572 }
2573 ecode++;
2574 break;
2575
2576 #ifdef SUPPORT_UCP
2577 /* Check the next character by Unicode property. We will get here only
2578 if the support is in the binary; otherwise a compile-time error occurs. */
2579
2580 case OP_PROP:
2581 case OP_NOTPROP:
2582 if (eptr >= md->end_subject)
2583 {
2584 SCHECK_PARTIAL();
2585 RRETURN(MATCH_NOMATCH);
2586 }
2587 GETCHARINCTEST(c, eptr);
2588 {
2589 const pcre_uint32 *cp;
2590 const ucd_record *prop = GET_UCD(c);
2591
2592 switch(ecode[1])
2593 {
2594 case PT_ANY:
2595 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2596 break;
2597
2598 case PT_LAMP:
2599 if ((prop->chartype == ucp_Lu ||
2600 prop->chartype == ucp_Ll ||
2601 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2604
2605 case PT_GC:
2606 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2607 RRETURN(MATCH_NOMATCH);
2608 break;
2609
2610 case PT_PC:
2611 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2612 RRETURN(MATCH_NOMATCH);
2613 break;
2614
2615 case PT_SC:
2616 if ((ecode[2] != prop->script) == (op == OP_PROP))
2617 RRETURN(MATCH_NOMATCH);
2618 break;
2619
2620 /* These are specials */
2621
2622 case PT_ALNUM:
2623 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2624 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2625 RRETURN(MATCH_NOMATCH);
2626 break;
2627
2628 case PT_SPACE: /* Perl space */
2629 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2630 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2631 == (op == OP_NOTPROP))
2632 RRETURN(MATCH_NOMATCH);
2633 break;
2634
2635 case PT_PXSPACE: /* POSIX space */
2636 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2637 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2638 c == CHAR_FF || c == CHAR_CR)
2639 == (op == OP_NOTPROP))
2640 RRETURN(MATCH_NOMATCH);
2641 break;
2642
2643 case PT_WORD:
2644 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2645 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2646 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2647 RRETURN(MATCH_NOMATCH);
2648 break;
2649
2650 case PT_CLIST:
2651 cp = PRIV(ucd_caseless_sets) + ecode[2];
2652 for (;;)
2653 {
2654 if (c < *cp)
2655 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2656 if (c == *cp++)
2657 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2658 }
2659 break;
2660
2661 case PT_UCNC:
2662 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2663 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2664 c >= 0xe000) == (op == OP_NOTPROP))
2665 RRETURN(MATCH_NOMATCH);
2666 break;
2667
2668 /* This should never occur */
2669
2670 default:
2671 RRETURN(PCRE_ERROR_INTERNAL);
2672 }
2673
2674 ecode += 3;
2675 }
2676 break;
2677
2678 /* Match an extended Unicode sequence. We will get here only if the support
2679 is in the binary; otherwise a compile-time error occurs. */
2680
2681 case OP_EXTUNI:
2682 if (eptr >= md->end_subject)
2683 {
2684 SCHECK_PARTIAL();
2685 RRETURN(MATCH_NOMATCH);
2686 }
2687 else
2688 {
2689 int lgb, rgb;
2690 GETCHARINCTEST(c, eptr);
2691 lgb = UCD_GRAPHBREAK(c);
2692 while (eptr < md->end_subject)
2693 {
2694 int len = 1;
2695 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2696 rgb = UCD_GRAPHBREAK(c);
2697 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2698 lgb = rgb;
2699 eptr += len;
2700 }
2701 }
2702 CHECK_PARTIAL();
2703 ecode++;
2704 break;
2705 #endif /* SUPPORT_UCP */
2706
2707
2708 /* Match a back reference, possibly repeatedly. Look past the end of the
2709 item to see if there is repeat information following. The code is similar
2710 to that for character classes, but repeated for efficiency. Then obey
2711 similar code to character type repeats - written out again for speed.
2712 However, if the referenced string is the empty string, always treat
2713 it as matched, any number of times (otherwise there could be infinite
2714 loops). */
2715
2716 case OP_REF:
2717 case OP_REFI:
2718 caseless = op == OP_REFI;
2719 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2720 ecode += 1 + IMM2_SIZE;
2721
2722 /* If the reference is unset, there are two possibilities:
2723
2724 (a) In the default, Perl-compatible state, set the length negative;
2725 this ensures that every attempt at a match fails. We can't just fail
2726 here, because of the possibility of quantifiers with zero minima.
2727
2728 (b) If the JavaScript compatibility flag is set, set the length to zero
2729 so that the back reference matches an empty string.
2730
2731 Otherwise, set the length to the length of what was matched by the
2732 referenced subpattern. */
2733
2734 if (offset >= offset_top || md->offset_vector[offset] < 0)
2735 length = (md->jscript_compat)? 0 : -1;
2736 else
2737 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2738
2739 /* Set up for repetition, or handle the non-repeated case */
2740
2741 switch (*ecode)
2742 {
2743 case OP_CRSTAR:
2744 case OP_CRMINSTAR:
2745 case OP_CRPLUS:
2746 case OP_CRMINPLUS:
2747 case OP_CRQUERY:
2748 case OP_CRMINQUERY:
2749 c = *ecode++ - OP_CRSTAR;
2750 minimize = (c & 1) != 0;
2751 min = rep_min[c]; /* Pick up values from tables; */
2752 max = rep_max[c]; /* zero for max => infinity */
2753 if (max == 0) max = INT_MAX;
2754 break;
2755
2756 case OP_CRRANGE:
2757 case OP_CRMINRANGE:
2758 minimize = (*ecode == OP_CRMINRANGE);
2759 min = GET2(ecode, 1);
2760 max = GET2(ecode, 1 + IMM2_SIZE);
2761 if (max == 0) max = INT_MAX;
2762 ecode += 1 + 2 * IMM2_SIZE;
2763 break;
2764
2765 default: /* No repeat follows */
2766 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2767 {
2768 if (length == -2) eptr = md->end_subject; /* Partial match */
2769 CHECK_PARTIAL();
2770 RRETURN(MATCH_NOMATCH);
2771 }
2772 eptr += length;
2773 continue; /* With the main loop */
2774 }
2775
2776 /* Handle repeated back references. If the length of the reference is
2777 zero, just continue with the main loop. If the length is negative, it
2778 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2779 zero, we can continue at the same level without recursion. For any other
2780 minimum, carrying on will result in NOMATCH. */
2781
2782 if (length == 0) continue;
2783 if (length < 0 && min == 0) continue;
2784
2785 /* First, ensure the minimum number of matches are present. We get back
2786 the length of the reference string explicitly rather than passing the
2787 address of eptr, so that eptr can be a register variable. */
2788
2789 for (i = 1; i <= min; i++)
2790 {
2791 int slength;
2792 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2793 {
2794 if (slength == -2) eptr = md->end_subject; /* Partial match */
2795 CHECK_PARTIAL();
2796 RRETURN(MATCH_NOMATCH);
2797 }
2798 eptr += slength;
2799 }
2800
2801 /* If min = max, continue at the same level without recursion.
2802 They are not both allowed to be zero. */
2803
2804 if (min == max) continue;
2805
2806 /* If minimizing, keep trying and advancing the pointer */
2807
2808 if (minimize)
2809 {
2810 for (fi = min;; fi++)
2811 {
2812 int slength;
2813 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2815 if (fi >= max) RRETURN(MATCH_NOMATCH);
2816 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2817 {
2818 if (slength == -2) eptr = md->end_subject; /* Partial match */
2819 CHECK_PARTIAL();
2820 RRETURN(MATCH_NOMATCH);
2821 }
2822 eptr += slength;
2823 }
2824 /* Control never gets here */
2825 }
2826
2827 /* If maximizing, find the longest string and work backwards */
2828
2829 else
2830 {
2831 pp = eptr;
2832 for (i = min; i < max; i++)
2833 {
2834 int slength;
2835 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2836 {
2837 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2838 the soft partial matching case. */
2839
2840 if (slength == -2 && md->partial != 0 &&
2841 md->end_subject > md->start_used_ptr)
2842 {
2843 md->hitend = TRUE;
2844 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2845 }
2846 break;
2847 }
2848 eptr += slength;
2849 }
2850
2851 while (eptr >= pp)
2852 {
2853 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2854 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2855 eptr -= length;
2856 }
2857 RRETURN(MATCH_NOMATCH);
2858 }
2859 /* Control never gets here */
2860
2861 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2862 used when all the characters in the class have values in the range 0-255,
2863 and either the matching is caseful, or the characters are in the range
2864 0-127 when UTF-8 processing is enabled. The only difference between
2865 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2866 encountered.
2867
2868 First, look past the end of the item to see if there is repeat information
2869 following. Then obey similar code to character type repeats - written out
2870 again for speed. */
2871
2872 case OP_NCLASS:
2873 case OP_CLASS:
2874 {
2875 /* The data variable is saved across frames, so the byte map needs to
2876 be stored there. */
2877 #define BYTE_MAP ((pcre_uint8 *)data)
2878 data = ecode + 1; /* Save for matching */
2879 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2880
2881 switch (*ecode)
2882 {
2883 case OP_CRSTAR:
2884 case OP_CRMINSTAR:
2885 case OP_CRPLUS:
2886 case OP_CRMINPLUS:
2887 case OP_CRQUERY:
2888 case OP_CRMINQUERY:
2889 c = *ecode++ - OP_CRSTAR;
2890 minimize = (c & 1) != 0;
2891 min = rep_min[c]; /* Pick up values from tables; */
2892 max = rep_max[c]; /* zero for max => infinity */
2893 if (max == 0) max = INT_MAX;
2894 break;
2895
2896 case OP_CRRANGE:
2897 case OP_CRMINRANGE:
2898 minimize = (*ecode == OP_CRMINRANGE);
2899 min = GET2(ecode, 1);
2900 max = GET2(ecode, 1 + IMM2_SIZE);
2901 if (max == 0) max = INT_MAX;
2902 ecode += 1 + 2 * IMM2_SIZE;
2903 break;
2904
2905 default: /* No repeat follows */
2906 min = max = 1;
2907 break;
2908 }
2909
2910 /* First, ensure the minimum number of matches are present. */
2911
2912 #ifdef SUPPORT_UTF
2913 if (utf)
2914 {
2915 for (i = 1; i <= min; i++)
2916 {
2917 if (eptr >= md->end_subject)
2918 {
2919 SCHECK_PARTIAL();
2920 RRETURN(MATCH_NOMATCH);
2921 }
2922 GETCHARINC(c, eptr);
2923 if (c > 255)
2924 {
2925 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2926 }
2927 else
2928 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2929 }
2930 }
2931 else
2932 #endif
2933 /* Not UTF mode */
2934 {
2935 for (i = 1; i <= min; i++)
2936 {
2937 if (eptr >= md->end_subject)
2938 {
2939 SCHECK_PARTIAL();
2940 RRETURN(MATCH_NOMATCH);
2941 }
2942 c = *eptr++;
2943 #ifndef COMPILE_PCRE8
2944 if (c > 255)
2945 {
2946 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2947 }
2948 else
2949 #endif
2950 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2951 }
2952 }
2953
2954 /* If max == min we can continue with the main loop without the
2955 need to recurse. */
2956
2957 if (min == max) continue;
2958
2959 /* If minimizing, keep testing the rest of the expression and advancing
2960 the pointer while it matches the class. */
2961
2962 if (minimize)
2963 {
2964 #ifdef SUPPORT_UTF
2965 if (utf)
2966 {
2967 for (fi = min;; fi++)
2968 {
2969 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2971 if (fi >= max) RRETURN(MATCH_NOMATCH);
2972 if (eptr >= md->end_subject)
2973 {
2974 SCHECK_PARTIAL();
2975 RRETURN(MATCH_NOMATCH);
2976 }
2977 GETCHARINC(c, eptr);
2978 if (c > 255)
2979 {
2980 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2981 }
2982 else
2983 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2984 }
2985 }
2986 else
2987 #endif
2988 /* Not UTF mode */
2989 {
2990 for (fi = min;; fi++)
2991 {
2992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2994 if (fi >= max) RRETURN(MATCH_NOMATCH);
2995 if (eptr >= md->end_subject)
2996 {
2997 SCHECK_PARTIAL();
2998 RRETURN(MATCH_NOMATCH);
2999 }
3000 c = *eptr++;
3001 #ifndef COMPILE_PCRE8
3002 if (c > 255)
3003 {
3004 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3005 }
3006 else
3007 #endif
3008 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3009 }
3010 }
3011 /* Control never gets here */
3012 }
3013
3014 /* If maximizing, find the longest possible run, then work backwards. */
3015
3016 else
3017 {
3018 pp = eptr;
3019
3020 #ifdef SUPPORT_UTF
3021 if (utf)
3022 {
3023 for (i = min; i < max; i++)
3024 {
3025 int len = 1;
3026 if (eptr >= md->end_subject)
3027 {
3028 SCHECK_PARTIAL();
3029 break;
3030 }
3031 GETCHARLEN(c, eptr, len);
3032 if (c > 255)
3033 {
3034 if (op == OP_CLASS) break;
3035 }
3036 else
3037 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3038 eptr += len;
3039 }
3040 for (;;)
3041 {
3042 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3044 if (eptr-- == pp) break; /* Stop if tried at original pos */
3045 BACKCHAR(eptr);
3046 }
3047 }
3048 else
3049 #endif
3050 /* Not UTF mode */
3051 {
3052 for (i = min; i < max; i++)
3053 {
3054 if (eptr >= md->end_subject)
3055 {
3056 SCHECK_PARTIAL();
3057 break;
3058 }
3059 c = *eptr;
3060 #ifndef COMPILE_PCRE8
3061 if (c > 255)
3062 {
3063 if (op == OP_CLASS) break;
3064 }
3065 else
3066 #endif
3067 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3068 eptr++;
3069 }
3070 while (eptr >= pp)
3071 {
3072 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3073 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3074 eptr--;
3075 }
3076 }
3077
3078 RRETURN(MATCH_NOMATCH);
3079 }
3080 #undef BYTE_MAP
3081 }
3082 /* Control never gets here */
3083
3084
3085 /* Match an extended character class. This opcode is encountered only
3086 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3087 mode, because Unicode properties are supported in non-UTF-8 mode. */
3088
3089 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3090 case OP_XCLASS:
3091 {
3092 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3093 ecode += GET(ecode, 1); /* Advance past the item */
3094
3095 switch (*ecode)
3096 {
3097 case OP_CRSTAR:
3098 case OP_CRMINSTAR:
3099 case OP_CRPLUS:
3100 case OP_CRMINPLUS:
3101 case OP_CRQUERY:
3102 case OP_CRMINQUERY:
3103 c = *ecode++ - OP_CRSTAR;
3104 minimize = (c & 1) != 0;
3105 min = rep_min[c]; /* Pick up values from tables; */
3106 max = rep_max[c]; /* zero for max => infinity */
3107 if (max == 0) max = INT_MAX;
3108 break;
3109
3110 case OP_CRRANGE:
3111 case OP_CRMINRANGE:
3112 minimize = (*ecode == OP_CRMINRANGE);
3113 min = GET2(ecode, 1);
3114 max = GET2(ecode, 1 + IMM2_SIZE);
3115 if (max == 0) max = INT_MAX;
3116 ecode += 1 + 2 * IMM2_SIZE;
3117 break;
3118
3119 default: /* No repeat follows */
3120 min = max = 1;
3121 break;
3122 }
3123
3124 /* First, ensure the minimum number of matches are present. */
3125
3126 for (i = 1; i <= min; i++)
3127 {
3128 if (eptr >= md->end_subject)
3129 {
3130 SCHECK_PARTIAL();
3131 RRETURN(MATCH_NOMATCH);
3132 }
3133 GETCHARINCTEST(c, eptr);
3134 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3135 }
3136
3137 /* If max == min we can continue with the main loop without the
3138 need to recurse. */
3139
3140 if (min == max) continue;
3141
3142 /* If minimizing, keep testing the rest of the expression and advancing
3143 the pointer while it matches the class. */
3144
3145 if (minimize)
3146 {
3147 for (fi = min;; fi++)
3148 {
3149 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3150 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3151 if (fi >= max) RRETURN(MATCH_NOMATCH);
3152 if (eptr >= md->end_subject)
3153 {
3154 SCHECK_PARTIAL();
3155 RRETURN(MATCH_NOMATCH);
3156 }
3157 GETCHARINCTEST(c, eptr);
3158 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3159 }
3160 /* Control never gets here */
3161 }
3162
3163 /* If maximizing, find the longest possible run, then work backwards. */
3164
3165 else
3166 {
3167 pp = eptr;
3168 for (i = min; i < max; i++)
3169 {
3170 int len = 1;
3171 if (eptr >= md->end_subject)
3172 {
3173 SCHECK_PARTIAL();
3174 break;
3175 }
3176 #ifdef SUPPORT_UTF
3177 GETCHARLENTEST(c, eptr, len);
3178 #else
3179 c = *eptr;
3180 #endif
3181 if (!PRIV(xclass)(c, data, utf)) break;
3182 eptr += len;
3183 }
3184 for(;;)
3185 {
3186 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3187 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3188 if (eptr-- == pp) break; /* Stop if tried at original pos */
3189 #ifdef SUPPORT_UTF
3190 if (utf) BACKCHAR(eptr);
3191 #endif
3192 }
3193 RRETURN(MATCH_NOMATCH);
3194 }
3195
3196 /* Control never gets here */
3197 }
3198 #endif /* End of XCLASS */
3199
3200 /* Match a single character, casefully */
3201
3202 case OP_CHAR:
3203 #ifdef SUPPORT_UTF
3204 if (utf)
3205 {
3206 length = 1;
3207 ecode++;
3208 GETCHARLEN(fc, ecode, length);
3209 if (length > md->end_subject - eptr)
3210 {
3211 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3212 RRETURN(MATCH_NOMATCH);
3213 }
3214 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3215 }
3216 else
3217 #endif
3218 /* Not UTF mode */
3219 {
3220 if (md->end_subject - eptr < 1)
3221 {
3222 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3223 RRETURN(MATCH_NOMATCH);
3224 }
3225 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3226 ecode += 2;
3227 }
3228 break;
3229
3230 /* Match a single character, caselessly. If we are at the end of the
3231 subject, give up immediately. */
3232
3233 case OP_CHARI:
3234 if (eptr >= md->end_subject)
3235 {
3236 SCHECK_PARTIAL();
3237 RRETURN(MATCH_NOMATCH);
3238 }
3239
3240 #ifdef SUPPORT_UTF
3241 if (utf)
3242 {
3243 length = 1;
3244 ecode++;
3245 GETCHARLEN(fc, ecode, length);
3246
3247 /* If the pattern character's value is < 128, we have only one byte, and
3248 we know that its other case must also be one byte long, so we can use the
3249 fast lookup table. We know that there is at least one byte left in the
3250 subject. */
3251
3252 if (fc < 128)
3253 {
3254 pcre_uint32 cc = RAWUCHAR(eptr);
3255 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3256 ecode++;
3257 eptr++;
3258 }
3259
3260 /* Otherwise we must pick up the subject character. Note that we cannot
3261 use the value of "length" to check for sufficient bytes left, because the
3262 other case of the character may have more or fewer bytes. */
3263
3264 else
3265 {
3266 pcre_uint32 dc;
3267 GETCHARINC(dc, eptr);
3268 ecode += length;
3269
3270 /* If we have Unicode property support, we can use it to test the other
3271 case of the character, if there is one. */
3272
3273 if (fc != dc)
3274 {
3275 #ifdef SUPPORT_UCP
3276 if (dc != UCD_OTHERCASE(fc))
3277 #endif
3278 RRETURN(MATCH_NOMATCH);
3279 }
3280 }
3281 }
3282 else
3283 #endif /* SUPPORT_UTF */
3284
3285 /* Not UTF mode */
3286 {
3287 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3288 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3289 eptr++;
3290 ecode += 2;
3291 }
3292 break;
3293
3294 /* Match a single character repeatedly. */
3295
3296 case OP_EXACT:
3297 case OP_EXACTI:
3298 min = max = GET2(ecode, 1);
3299 ecode += 1 + IMM2_SIZE;
3300 goto REPEATCHAR;
3301
3302 case OP_POSUPTO:
3303 case OP_POSUPTOI:
3304 possessive = TRUE;
3305 /* Fall through */
3306
3307 case OP_UPTO:
3308 case OP_UPTOI:
3309 case OP_MINUPTO:
3310 case OP_MINUPTOI:
3311 min = 0;
3312 max = GET2(ecode, 1);
3313 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3314 ecode += 1 + IMM2_SIZE;
3315 goto REPEATCHAR;
3316
3317 case OP_POSSTAR:
3318 case OP_POSSTARI:
3319 possessive = TRUE;
3320 min = 0;
3321 max = INT_MAX;
3322 ecode++;
3323 goto REPEATCHAR;
3324
3325 case OP_POSPLUS:
3326 case OP_POSPLUSI:
3327 possessive = TRUE;
3328 min = 1;
3329 max = INT_MAX;
3330 ecode++;
3331 goto REPEATCHAR;
3332
3333 case OP_POSQUERY:
3334 case OP_POSQUERYI:
3335 possessive = TRUE;
3336 min = 0;
3337 max = 1;
3338 ecode++;
3339 goto REPEATCHAR;
3340
3341 case OP_STAR:
3342 case OP_STARI:
3343 case OP_MINSTAR:
3344 case OP_MINSTARI:
3345 case OP_PLUS:
3346 case OP_PLUSI:
3347 case OP_MINPLUS:
3348 case OP_MINPLUSI:
3349 case OP_QUERY:
3350 case OP_QUERYI:
3351 case OP_MINQUERY:
3352 case OP_MINQUERYI:
3353 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3354 minimize = (c & 1) != 0;
3355 min = rep_min[c]; /* Pick up values from tables; */
3356 max = rep_max[c]; /* zero for max => infinity */
3357 if (max == 0) max = INT_MAX;
3358
3359 /* Common code for all repeated single-character matches. */
3360
3361 REPEATCHAR:
3362 #ifdef SUPPORT_UTF
3363 if (utf)
3364 {
3365 length = 1;
3366 charptr = ecode;
3367 GETCHARLEN(fc, ecode, length);
3368 ecode += length;
3369
3370 /* Handle multibyte character matching specially here. There is
3371 support for caseless matching if UCP support is present. */
3372
3373 if (length > 1)
3374 {
3375 #ifdef SUPPORT_UCP
3376 pcre_uint32 othercase;
3377 if (op >= OP_STARI && /* Caseless */
3378 (othercase = UCD_OTHERCASE(fc)) != fc)
3379 oclength = PRIV(ord2utf)(othercase, occhars);
3380 else oclength = 0;
3381 #endif /* SUPPORT_UCP */
3382
3383 for (i = 1; i <= min; i++)
3384 {
3385 if (eptr <= md->end_subject - length &&
3386 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3387 #ifdef SUPPORT_UCP
3388 else if (oclength > 0 &&
3389 eptr <= md->end_subject - oclength &&
3390 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3391 #endif /* SUPPORT_UCP */
3392 else
3393 {
3394 CHECK_PARTIAL();
3395 RRETURN(MATCH_NOMATCH);
3396 }
3397 }
3398
3399 if (min == max) continue;
3400
3401 if (minimize)
3402 {
3403 for (fi = min;; fi++)
3404 {
3405 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3406 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3407 if (fi >= max) RRETURN(MATCH_NOMATCH);
3408 if (eptr <= md->end_subject - length &&
3409 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3410 #ifdef SUPPORT_UCP
3411 else if (oclength > 0 &&
3412 eptr <= md->end_subject - oclength &&
3413 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3414 #endif /* SUPPORT_UCP */
3415 else
3416 {
3417 CHECK_PARTIAL();
3418 RRETURN(MATCH_NOMATCH);
3419 }
3420 }
3421 /* Control never gets here */
3422 }
3423
3424 else /* Maximize */
3425 {
3426 pp = eptr;
3427 for (i = min; i < max; i++)
3428 {
3429 if (eptr <= md->end_subject - length &&
3430 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3431 #ifdef SUPPORT_UCP
3432 else if (oclength > 0 &&
3433 eptr <= md->end_subject - oclength &&
3434 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3435 #endif /* SUPPORT_UCP */
3436 else
3437 {
3438 CHECK_PARTIAL();
3439 break;
3440 }
3441 }
3442
3443 if (possessive) continue;
3444
3445 for(;;)
3446 {
3447 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3449 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3450 #ifdef SUPPORT_UCP
3451 eptr--;
3452 BACKCHAR(eptr);
3453 #else /* without SUPPORT_UCP */
3454 eptr -= length;
3455 #endif /* SUPPORT_UCP */
3456 }
3457 }
3458 /* Control never gets here */
3459 }
3460
3461 /* If the length of a UTF-8 character is 1, we fall through here, and
3462 obey the code as for non-UTF-8 characters below, though in this case the
3463 value of fc will always be < 128. */
3464 }
3465 else
3466 #endif /* SUPPORT_UTF */
3467 /* When not in UTF-8 mode, load a single-byte character. */
3468 fc = *ecode++;
3469
3470 /* The value of fc at this point is always one character, though we may
3471 or may not be in UTF mode. The code is duplicated for the caseless and
3472 caseful cases, for speed, since matching characters is likely to be quite
3473 common. First, ensure the minimum number of matches are present. If min =
3474 max, continue at the same level without recursing. Otherwise, if
3475 minimizing, keep trying the rest of the expression and advancing one
3476 matching character if failing, up to the maximum. Alternatively, if
3477 maximizing, find the maximum number of characters and work backwards. */
3478
3479 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3480 max, (char *)eptr));
3481
3482 if (op >= OP_STARI) /* Caseless */
3483 {
3484 #ifdef COMPILE_PCRE8
3485 /* fc must be < 128 if UTF is enabled. */
3486 foc = md->fcc[fc];
3487 #else
3488 #ifdef SUPPORT_UTF
3489 #ifdef SUPPORT_UCP
3490 if (utf && fc > 127)
3491 foc = UCD_OTHERCASE(fc);
3492 #else
3493 if (utf && fc > 127)
3494 foc = fc;
3495 #endif /* SUPPORT_UCP */
3496 else
3497 #endif /* SUPPORT_UTF */
3498 foc = TABLE_GET(fc, md->fcc, fc);
3499 #endif /* COMPILE_PCRE8 */
3500
3501 for (i = 1; i <= min; i++)
3502 {
3503 pcre_uint32 cc; /* Faster than pcre_uchar */
3504 if (eptr >= md->end_subject)
3505 {
3506 SCHECK_PARTIAL();
3507 RRETURN(MATCH_NOMATCH);
3508 }
3509 cc = RAWUCHARTEST(eptr);
3510 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3511 eptr++;
3512 }
3513 if (min == max) continue;
3514 if (minimize)
3515 {
3516 for (fi = min;; fi++)
3517 {
3518 pcre_uint32 cc; /* Faster than pcre_uchar */
3519 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3520 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3521 if (fi >= max) RRETURN(MATCH_NOMATCH);
3522 if (eptr >= md->end_subject)
3523 {
3524 SCHECK_PARTIAL();
3525 RRETURN(MATCH_NOMATCH);
3526 }
3527 cc = RAWUCHARTEST(eptr);
3528 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3529 eptr++;
3530 }
3531 /* Control never gets here */
3532 }
3533 else /* Maximize */
3534 {
3535 pp = eptr;
3536 for (i = min; i < max; i++)
3537 {
3538 pcre_uint32 cc; /* Faster than pcre_uchar */
3539 if (eptr >= md->end_subject)
3540 {
3541 SCHECK_PARTIAL();
3542 break;
3543 }
3544 cc = RAWUCHARTEST(eptr);
3545 if (fc != cc && foc != cc) break;
3546 eptr++;
3547 }
3548
3549 if (possessive) continue;
3550
3551 while (eptr >= pp)
3552 {
3553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3554 eptr--;
3555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3556 }
3557 RRETURN(MATCH_NOMATCH);
3558 }
3559 /* Control never gets here */
3560 }
3561
3562 /* Caseful comparisons (includes all multi-byte characters) */
3563
3564 else
3565 {
3566 for (i = 1; i <= min; i++)
3567 {
3568 if (eptr >= md->end_subject)
3569 {
3570 SCHECK_PARTIAL();
3571 RRETURN(MATCH_NOMATCH);
3572 }
3573 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3574 }
3575
3576 if (min == max) continue;
3577
3578 if (minimize)
3579 {
3580 for (fi = min;; fi++)
3581 {
3582 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3583 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3584 if (fi >= max) RRETURN(MATCH_NOMATCH);
3585 if (eptr >= md->end_subject)
3586 {
3587 SCHECK_PARTIAL();
3588 RRETURN(MATCH_NOMATCH);
3589 }
3590 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3591 }
3592 /* Control never gets here */
3593 }
3594 else /* Maximize */
3595 {
3596 pp = eptr;
3597 for (i = min; i < max; i++)
3598 {
3599 if (eptr >= md->end_subject)
3600 {
3601 SCHECK_PARTIAL();
3602 break;
3603 }
3604 if (fc != RAWUCHARTEST(eptr)) break;
3605 eptr++;
3606 }
3607 if (possessive) continue;
3608
3609 while (eptr >= pp)
3610 {
3611 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3612 eptr--;
3613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3614 }
3615 RRETURN(MATCH_NOMATCH);
3616 }
3617 }
3618 /* Control never gets here */
3619
3620 /* Match a negated single character. In UTF mode both the pattern
3621 character and the subject character we are checking can be multibyte. */
3622
3623 case OP_NOT:
3624 case OP_NOTI:
3625 if (eptr >= md->end_subject)
3626 {
3627 SCHECK_PARTIAL();
3628 RRETURN(MATCH_NOMATCH);
3629 }
3630 #ifdef SUPPORT_UTF
3631 if (utf)
3632 {
3633 register pcre_uint32 ch, och;
3634
3635 ecode++;
3636 GETCHARINC(ch, ecode);
3637 GETCHARINC(c, eptr);
3638
3639 if (op == OP_NOT)
3640 {
3641 if (ch == c) RRETURN(MATCH_NOMATCH);
3642 }
3643 else
3644 {
3645 #ifdef SUPPORT_UCP
3646 if (ch > 127)
3647 och = UCD_OTHERCASE(ch);
3648 #else
3649 if (ch > 127)
3650 och = ch;
3651 #endif /* SUPPORT_UCP */
3652 else
3653 och = TABLE_GET(ch, md->fcc, ch);
3654 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3655 }
3656 }
3657 else
3658 #endif
3659 {
3660 register pcre_uint32 ch = ecode[1];
3661 c = *eptr++;
3662 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3663 RRETURN(MATCH_NOMATCH);
3664 ecode += 2;
3665 }
3666 break;
3667
3668 /* Match a negated single one-byte character repeatedly. This is almost a
3669 repeat of the code for a repeated single character, but I haven't found a
3670 nice way of commoning these up that doesn't require a test of the
3671 positive/negative option for each character match. Maybe that wouldn't add
3672 very much to the time taken, but character matching *is* what this is all
3673 about... */
3674
3675 case OP_NOTEXACT:
3676 case OP_NOTEXACTI:
3677 min = max = GET2(ecode, 1);
3678 ecode += 1 + IMM2_SIZE;
3679 goto REPEATNOTCHAR;
3680
3681 case OP_NOTUPTO:
3682 case OP_NOTUPTOI:
3683 case OP_NOTMINUPTO:
3684 case OP_NOTMINUPTOI:
3685 min = 0;
3686 max = GET2(ecode, 1);
3687 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3688 ecode += 1 + IMM2_SIZE;
3689 goto REPEATNOTCHAR;
3690
3691 case OP_NOTPOSSTAR:
3692 case OP_NOTPOSSTARI:
3693 possessive = TRUE;
3694 min = 0;
3695 max = INT_MAX;
3696 ecode++;
3697 goto REPEATNOTCHAR;
3698
3699 case OP_NOTPOSPLUS:
3700 case OP_NOTPOSPLUSI:
3701 possessive = TRUE;
3702 min = 1;
3703 max = INT_MAX;
3704 ecode++;
3705 goto REPEATNOTCHAR;
3706
3707 case OP_NOTPOSQUERY:
3708 case OP_NOTPOSQUERYI:
3709 possessive = TRUE;
3710 min = 0;
3711 max = 1;
3712 ecode++;
3713 goto REPEATNOTCHAR;
3714
3715 case OP_NOTPOSUPTO:
3716 case OP_NOTPOSUPTOI:
3717 possessive = TRUE;
3718 min = 0;
3719 max = GET2(ecode, 1);
3720 ecode += 1 + IMM2_SIZE;
3721 goto REPEATNOTCHAR;
3722
3723 case OP_NOTSTAR:
3724 case OP_NOTSTARI:
3725 case OP_NOTMINSTAR:
3726 case OP_NOTMINSTARI:
3727 case OP_NOTPLUS:
3728 case OP_NOTPLUSI:
3729 case OP_NOTMINPLUS:
3730 case OP_NOTMINPLUSI:
3731 case OP_NOTQUERY:
3732 case OP_NOTQUERYI:
3733 case OP_NOTMINQUERY:
3734 case OP_NOTMINQUERYI:
3735 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3736 minimize = (c & 1) != 0;
3737 min = rep_min[c]; /* Pick up values from tables; */
3738 max = rep_max[c]; /* zero for max => infinity */
3739 if (max == 0) max = INT_MAX;
3740
3741 /* Common code for all repeated single-byte matches. */
3742
3743 REPEATNOTCHAR:
3744 GETCHARINCTEST(fc, ecode);
3745
3746 /* The code is duplicated for the caseless and caseful cases, for speed,
3747 since matching characters is likely to be quite common. First, ensure the
3748 minimum number of matches are present. If min = max, continue at the same
3749 level without recursing. Otherwise, if minimizing, keep trying the rest of
3750 the expression and advancing one matching character if failing, up to the
3751 maximum. Alternatively, if maximizing, find the maximum number of
3752 characters and work backwards. */
3753
3754 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3755 max, (char *)eptr));
3756
3757 if (op >= OP_NOTSTARI) /* Caseless */
3758 {
3759 #ifdef SUPPORT_UTF
3760 #ifdef SUPPORT_UCP
3761 if (utf && fc > 127)
3762 foc = UCD_OTHERCASE(fc);
3763 #else
3764 if (utf && fc > 127)
3765 foc = fc;
3766 #endif /* SUPPORT_UCP */
3767 else
3768 #endif /* SUPPORT_UTF */
3769 foc = TABLE_GET(fc, md->fcc, fc);
3770
3771 #ifdef SUPPORT_UTF
3772 if (utf)
3773 {
3774 register pcre_uint32 d;
3775 for (i = 1; i <= min; i++)
3776 {
3777 if (eptr >= md->end_subject)
3778 {
3779 SCHECK_PARTIAL();
3780 RRETURN(MATCH_NOMATCH);
3781 }
3782 GETCHARINC(d, eptr);
3783 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3784 }
3785 }
3786 else
3787 #endif
3788 /* Not UTF mode */
3789 {
3790 for (i = 1; i <= min; i++)
3791 {
3792 if (eptr >= md->end_subject)
3793 {
3794 SCHECK_PARTIAL();
3795 RRETURN(MATCH_NOMATCH);
3796 }
3797 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3798 eptr++;
3799 }
3800 }
3801
3802 if (min == max) continue;
3803
3804 if (minimize)
3805 {
3806 #ifdef SUPPORT_UTF
3807 if (utf)
3808 {
3809 register pcre_uint32 d;
3810 for (fi = min;; fi++)
3811 {
3812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3814 if (fi >= max) RRETURN(MATCH_NOMATCH);
3815 if (eptr >= md->end_subject)
3816 {
3817 SCHECK_PARTIAL();
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 GETCHARINC(d, eptr);
3821 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3822 }
3823 }
3824 else
3825 #endif
3826 /* Not UTF mode */
3827 {
3828 for (fi = min;; fi++)
3829 {
3830 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3832 if (fi >= max) RRETURN(MATCH_NOMATCH);
3833 if (eptr >= md->end_subject)
3834 {
3835 SCHECK_PARTIAL();
3836 RRETURN(MATCH_NOMATCH);
3837 }
3838 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3839 eptr++;
3840 }
3841 }
3842 /* Control never gets here */
3843 }
3844
3845 /* Maximize case */
3846
3847 else
3848 {
3849 pp = eptr;
3850
3851 #ifdef SUPPORT_UTF
3852 if (utf)
3853 {
3854 register pcre_uint32 d;
3855 for (i = min; i < max; i++)
3856 {
3857 int len = 1;
3858 if (eptr >= md->end_subject)
3859 {
3860 SCHECK_PARTIAL();
3861 break;
3862 }
3863 GETCHARLEN(d, eptr, len);
3864 if (fc == d || (unsigned int)foc == d) break;
3865 eptr += len;
3866 }
3867 if (possessive) continue;
3868 for(;;)
3869 {
3870 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3872 if (eptr-- == pp) break; /* Stop if tried at original pos */
3873 BACKCHAR(eptr);
3874 }
3875 }
3876 else
3877 #endif
3878 /* Not UTF mode */
3879 {
3880 for (i = min; i < max; i++)
3881 {
3882 if (eptr >= md->end_subject)
3883 {
3884 SCHECK_PARTIAL();
3885 break;
3886 }
3887 if (fc == *eptr || foc == *eptr) break;
3888 eptr++;
3889 }
3890 if (possessive) continue;
3891 while (eptr >= pp)
3892 {
3893 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3895 eptr--;
3896 }
3897 }
3898
3899 RRETURN(MATCH_NOMATCH);
3900 }
3901 /* Control never gets here */
3902 }
3903
3904 /* Caseful comparisons */
3905
3906 else
3907 {
3908 #ifdef SUPPORT_UTF
3909 if (utf)
3910 {
3911 register pcre_uint32 d;
3912 for (i = 1; i <= min; i++)
3913 {
3914 if (eptr >= md->end_subject)
3915 {
3916 SCHECK_PARTIAL();
3917 RRETURN(MATCH_NOMATCH);
3918 }
3919 GETCHARINC(d, eptr);
3920 if (fc == d) RRETURN(MATCH_NOMATCH);
3921 }
3922 }
3923 else
3924 #endif
3925 /* Not UTF mode */
3926 {
3927 for (i = 1; i <= min; i++)
3928 {
3929 if (eptr >= md->end_subject)
3930 {
3931 SCHECK_PARTIAL();
3932 RRETURN(MATCH_NOMATCH);
3933 }
3934 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3935 }
3936 }
3937
3938 if (min == max) continue;
3939
3940 if (minimize)
3941 {
3942 #ifdef SUPPORT_UTF
3943 if (utf)
3944 {
3945 register pcre_uint32 d;
3946 for (fi = min;; fi++)
3947 {
3948 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3950 if (fi >= max) RRETURN(MATCH_NOMATCH);
3951 if (eptr >= md->end_subject)
3952 {
3953 SCHECK_PARTIAL();
3954 RRETURN(MATCH_NOMATCH);
3955 }
3956 GETCHARINC(d, eptr);
3957 if (fc == d) RRETURN(MATCH_NOMATCH);
3958 }
3959 }
3960 else
3961 #endif
3962 /* Not UTF mode */
3963 {
3964 for (fi = min;; fi++)
3965 {
3966 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3968 if (fi >= max) RRETURN(MATCH_NOMATCH);
3969 if (eptr >= md->end_subject)
3970 {
3971 SCHECK_PARTIAL();
3972 RRETURN(MATCH_NOMATCH);
3973 }
3974 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3975 }
3976 }
3977 /* Control never gets here */
3978 }
3979
3980 /* Maximize case */
3981
3982 else
3983 {
3984 pp = eptr;
3985
3986 #ifdef SUPPORT_UTF
3987 if (utf)
3988 {
3989 register pcre_uint32 d;
3990 for (i = min; i < max; i++)
3991 {
3992 int len = 1;
3993 if (eptr >= md->end_subject)
3994 {
3995 SCHECK_PARTIAL();
3996 break;
3997 }
3998 GETCHARLEN(d, eptr, len);
3999 if (fc == d) break;
4000 eptr += len;
4001 }
4002 if (possessive) continue;
4003 for(;;)
4004 {
4005 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4007 if (eptr-- == pp) break; /* Stop if tried at original pos */
4008 BACKCHAR(eptr);
4009 }
4010 }
4011 else
4012 #endif
4013 /* Not UTF mode */
4014 {
4015 for (i = min; i < max; i++)
4016 {
4017 if (eptr >= md->end_subject)
4018 {
4019 SCHECK_PARTIAL();
4020 break;
4021 }
4022 if (fc == *eptr) break;
4023 eptr++;
4024 }
4025 if (possessive) continue;
4026 while (eptr >= pp)
4027 {
4028 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4030 eptr--;
4031 }
4032 }
4033
4034 RRETURN(MATCH_NOMATCH);
4035 }
4036 }
4037 /* Control never gets here */
4038
4039 /* Match a single character type repeatedly; several different opcodes
4040 share code. This is very similar to the code for single characters, but we
4041 repeat it in the interests of efficiency. */
4042
4043 case OP_TYPEEXACT:
4044 min = max = GET2(ecode, 1);
4045 minimize = TRUE;
4046 ecode += 1 + IMM2_SIZE;
4047 goto REPEATTYPE;
4048
4049 case OP_TYPEUPTO:
4050 case OP_TYPEMINUPTO:
4051 min = 0;
4052 max = GET2(ecode, 1);
4053 minimize = *ecode == OP_TYPEMINUPTO;
4054 ecode += 1 + IMM2_SIZE;
4055 goto REPEATTYPE;
4056
4057 case OP_TYPEPOSSTAR:
4058 possessive = TRUE;
4059 min = 0;
4060 max = INT_MAX;
4061 ecode++;
4062 goto REPEATTYPE;
4063
4064 case OP_TYPEPOSPLUS:
4065 possessive = TRUE;
4066 min = 1;
4067 max = INT_MAX;
4068 ecode++;
4069 goto REPEATTYPE;
4070
4071 case OP_TYPEPOSQUERY:
4072 possessive = TRUE;
4073 min = 0;
4074 max = 1;
4075 ecode++;
4076 goto REPEATTYPE;
4077
4078 case OP_TYPEPOSUPTO:
4079 possessive = TRUE;
4080 min = 0;
4081 max = GET2(ecode, 1);
4082 ecode += 1 + IMM2_SIZE;
4083 goto REPEATTYPE;
4084
4085 case OP_TYPESTAR:
4086 case OP_TYPEMINSTAR:
4087 case OP_TYPEPLUS:
4088 case OP_TYPEMINPLUS:
4089 case OP_TYPEQUERY:
4090 case OP_TYPEMINQUERY:
4091 c = *ecode++ - OP_TYPESTAR;
4092 minimize = (c & 1) != 0;
4093 min = rep_min[c]; /* Pick up values from tables; */
4094 max = rep_max[c]; /* zero for max => infinity */
4095 if (max == 0) max = INT_MAX;
4096
4097 /* Common code for all repeated single character type matches. Note that
4098 in UTF-8 mode, '.' matches a character of any length, but for the other
4099 character types, the valid characters are all one-byte long. */
4100
4101 REPEATTYPE:
4102 ctype = *ecode++; /* Code for the character type */
4103
4104 #ifdef SUPPORT_UCP
4105 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4106 {
4107 prop_fail_result = ctype == OP_NOTPROP;
4108 prop_type = *ecode++;
4109 prop_value = *ecode++;
4110 }
4111 else prop_type = -1;
4112 #endif
4113
4114 /* First, ensure the minimum number of matches are present. Use inline
4115 code for maximizing the speed, and do the type test once at the start
4116 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4117 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4118 and single-bytes. */
4119
4120 if (min > 0)
4121 {
4122 #ifdef SUPPORT_UCP
4123 if (prop_type >= 0)
4124 {
4125 switch(prop_type)
4126 {
4127 case PT_ANY:
4128 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4129 for (i = 1; i <= min; i++)
4130 {
4131 if (eptr >= md->end_subject)
4132 {
4133 SCHECK_PARTIAL();
4134 RRETURN(MATCH_NOMATCH);
4135 }
4136 GETCHARINCTEST(c, eptr);
4137 }
4138 break;
4139
4140 case PT_LAMP:
4141 for (i = 1; i <= min; i++)
4142 {
4143 int chartype;
4144 if (eptr >= md->end_subject)
4145 {
4146 SCHECK_PARTIAL();
4147 RRETURN(MATCH_NOMATCH);
4148 }
4149 GETCHARINCTEST(c, eptr);
4150 chartype = UCD_CHARTYPE(c);
4151 if ((chartype == ucp_Lu ||
4152 chartype == ucp_Ll ||
4153 chartype == ucp_Lt) == prop_fail_result)
4154 RRETURN(MATCH_NOMATCH);
4155 }
4156 break;
4157
4158 case PT_GC:
4159 for (i = 1; i <= min; i++)
4160 {
4161 if (eptr >= md->end_subject)
4162 {
4163 SCHECK_PARTIAL();
4164 RRETURN(MATCH_NOMATCH);
4165 }
4166 GETCHARINCTEST(c, eptr);
4167 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case PT_PC:
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 GETCHARINCTEST(c, eptr);
4181 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4182 RRETURN(MATCH_NOMATCH);
4183 }
4184 break;
4185
4186 case PT_SC:
4187 for (i = 1; i <= min; i++)
4188 {
4189 if (eptr >= md->end_subject)
4190 {
4191 SCHECK_PARTIAL();
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 GETCHARINCTEST(c, eptr);
4195 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4196 RRETURN(MATCH_NOMATCH);
4197 }
4198 break;
4199
4200 case PT_ALNUM:
4201 for (i = 1; i <= min; i++)
4202 {
4203 int category;
4204 if (eptr >= md->end_subject)
4205 {
4206 SCHECK_PARTIAL();
4207 RRETURN(MATCH_NOMATCH);
4208 }
4209 GETCHARINCTEST(c, eptr);
4210 category = UCD_CATEGORY(c);
4211 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4212 RRETURN(MATCH_NOMATCH);
4213 }
4214 break;
4215
4216 case PT_SPACE: /* Perl space */
4217 for (i = 1; i <= min; i++)
4218 {
4219 if (eptr >= md->end_subject)
4220 {
4221 SCHECK_PARTIAL();
4222 RRETURN(MATCH_NOMATCH);
4223 }
4224 GETCHARINCTEST(c, eptr);
4225 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4226 c == CHAR_FF || c == CHAR_CR)
4227 == prop_fail_result)
4228 RRETURN(MATCH_NOMATCH);
4229 }
4230 break;
4231
4232 case PT_PXSPACE: /* POSIX space */
4233 for (i = 1; i <= min; i++)
4234 {
4235 if (eptr >= md->end_subject)
4236 {
4237 SCHECK_PARTIAL();
4238 RRETURN(MATCH_NOMATCH);
4239 }
4240 GETCHARINCTEST(c, eptr);
4241 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4242 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4243 == prop_fail_result)
4244 RRETURN(MATCH_NOMATCH);
4245 }
4246 break;
4247
4248 case PT_WORD:
4249 for (i = 1; i <= min; i++)
4250 {
4251 int category;
4252 if (eptr >= md->end_subject)
4253 {
4254 SCHECK_PARTIAL();
4255 RRETURN(MATCH_NOMATCH);
4256 }
4257 GETCHARINCTEST(c, eptr);
4258 category = UCD_CATEGORY(c);
4259 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4260 == prop_fail_result)
4261 RRETURN(MATCH_NOMATCH);
4262 }
4263 break;
4264
4265 case PT_CLIST:
4266 for (i = 1; i <= min; i++)
4267 {
4268 const pcre_uint32 *cp;
4269 if (eptr >= md->end_subject)
4270 {
4271 SCHECK_PARTIAL();
4272 RRETURN(MATCH_NOMATCH);
4273 }
4274 GETCHARINCTEST(c, eptr);
4275 cp = PRIV(ucd_caseless_sets) + prop_value;
4276 for (;;)
4277 {
4278 if (c < *cp)
4279 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4280 if (c == *cp++)
4281 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4282 }
4283 }
4284 break;
4285
4286 case PT_UCNC:
4287 for (i = 1; i <= min; i++)
4288 {
4289 if (eptr >= md->end_subject)
4290 {
4291 SCHECK_PARTIAL();
4292 RRETURN(MATCH_NOMATCH);
4293 }
4294 GETCHARINCTEST(c, eptr);
4295 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4296 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4297 c >= 0xe000) == prop_fail_result)
4298 RRETURN(MATCH_NOMATCH);
4299 }
4300 break;
4301
4302 /* This should not occur */
4303
4304 default:
4305 RRETURN(PCRE_ERROR_INTERNAL);
4306 }
4307 }
4308
4309 /* Match extended Unicode sequences. We will get here only if the
4310 support is in the binary; otherwise a compile-time error occurs. */
4311
4312 else if (ctype == OP_EXTUNI)
4313 {
4314 for (i = 1; i <= min; i++)
4315 {
4316 if (eptr >= md->end_subject)
4317 {
4318 SCHECK_PARTIAL();
4319 RRETURN(MATCH_NOMATCH);
4320 }
4321 else
4322 {
4323 int lgb, rgb;
4324 GETCHARINCTEST(c, eptr);
4325 lgb = UCD_GRAPHBREAK(c);
4326 while (eptr < md->end_subject)
4327 {
4328 int len = 1;
4329 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4330 rgb = UCD_GRAPHBREAK(c);
4331 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4332 lgb = rgb;
4333 eptr += len;
4334 }
4335 }
4336 CHECK_PARTIAL();
4337 }
4338 }
4339
4340 else
4341 #endif /* SUPPORT_UCP */
4342
4343 /* Handle all other cases when the coding is UTF-8 */
4344
4345 #ifdef SUPPORT_UTF
4346 if (utf) switch(ctype)
4347 {
4348 case OP_ANY:
4349 for (i = 1; i <= min; i++)
4350 {
4351 if (eptr >= md->end_subject)
4352 {
4353 SCHECK_PARTIAL();
4354 RRETURN(MATCH_NOMATCH);
4355 }
4356 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4357 if (md->partial != 0 &&
4358 eptr + 1 >= md->end_subject &&
4359 NLBLOCK->nltype == NLTYPE_FIXED &&
4360 NLBLOCK->nllen == 2 &&
4361 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4362 {
4363 md->hitend = TRUE;
4364 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4365 }
4366 eptr++;
4367 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4368 }
4369 break;
4370
4371 case OP_ALLANY:
4372 for (i = 1; i <= min; i++)
4373 {
4374 if (eptr >= md->end_subject)
4375 {
4376 SCHECK_PARTIAL();
4377 RRETURN(MATCH_NOMATCH);
4378 }
4379 eptr++;
4380 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4381 }
4382 break;
4383
4384 case OP_ANYBYTE:
4385 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4386 eptr += min;
4387 break;
4388
4389 case OP_ANYNL:
4390 for (i = 1; i <= min; i++)
4391 {
4392 if (eptr >= md->end_subject)
4393 {
4394 SCHECK_PARTIAL();
4395 RRETURN(MATCH_NOMATCH);
4396 }
4397 GETCHARINC(c, eptr);
4398 switch(c)
4399 {
4400 default: RRETURN(MATCH_NOMATCH);
4401
4402 case CHAR_CR:
4403 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4404 break;
4405
4406 case CHAR_LF:
4407 break;
4408
4409 case CHAR_VT:
4410 case CHAR_FF:
4411 case CHAR_NEL:
4412 #ifndef EBCDIC
4413 case 0x2028:
4414 case 0x2029:
4415 #endif /* Not EBCDIC */
4416 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4417 break;
4418 }
4419 }
4420 break;
4421
4422 case OP_NOT_HSPACE:
4423 for (i = 1; i <= min; i++)
4424 {
4425 if (eptr >= md->end_subject)
4426 {
4427 SCHECK_PARTIAL();
4428 RRETURN(MATCH_NOMATCH);
4429 }
4430 GETCHARINC(c, eptr);
4431 switch(c)
4432 {
4433 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4434 default: break;
4435 }
4436 }
4437 break;
4438
4439 case OP_HSPACE:
4440 for (i = 1; i <= min; i++)
4441 {
4442 if (eptr >= md->end_subject)
4443 {
4444 SCHECK_PARTIAL();
4445 RRETURN(MATCH_NOMATCH);
4446 }
4447 GETCHARINC(c, eptr);
4448 switch(c)
4449 {
4450 HSPACE_CASES: break; /* Byte and multibyte cases */
4451 default: RRETURN(MATCH_NOMATCH);
4452 }
4453 }
4454 break;
4455
4456 case OP_NOT_VSPACE:
4457 for (i = 1; i <= min; i++)
4458 {
4459 if (eptr >= md->end_subject)
4460 {
4461 SCHECK_PARTIAL();
4462 RRETURN(MATCH_NOMATCH);
4463 }
4464 GETCHARINC(c, eptr);
4465 switch(c)
4466 {
4467 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4468 default: break;
4469 }
4470 }
4471 break;
4472
4473 case OP_VSPACE:
4474 for (i = 1; i <= min; i++)
4475 {
4476 if (eptr >= md->end_subject)
4477 {
4478 SCHECK_PARTIAL();
4479 RRETURN(MATCH_NOMATCH);
4480 }
4481 GETCHARINC(c, eptr);
4482 switch(c)
4483 {
4484 VSPACE_CASES: break;
4485 default: RRETURN(MATCH_NOMATCH);
4486 }
4487 }
4488 break;
4489
4490 case OP_NOT_DIGIT:
4491 for (i = 1; i <= min; i++)
4492 {
4493 if (eptr >= md->end_subject)
4494 {
4495 SCHECK_PARTIAL();
4496 RRETURN(MATCH_NOMATCH);
4497 }
4498 GETCHARINC(c, eptr);
4499 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4500 RRETURN(MATCH_NOMATCH);
4501 }
4502 break;
4503
4504 case OP_DIGIT:
4505 for (i = 1; i <= min; i++)
4506 {
4507 pcre_uint32 cc;
4508 if (eptr >= md->end_subject)
4509 {
4510 SCHECK_PARTIAL();
4511 RRETURN(MATCH_NOMATCH);
4512 }
4513 cc = RAWUCHAR(eptr);
4514 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4515 RRETURN(MATCH_NOMATCH);
4516 eptr++;
4517 /* No need to skip more bytes - we know it's a 1-byte character */
4518 }
4519 break;
4520
4521 case OP_NOT_WHITESPACE:
4522 for (i = 1; i <= min; i++)
4523 {
4524 pcre_uint32 cc;
4525 if (eptr >= md->end_subject)
4526 {
4527 SCHECK_PARTIAL();
4528 RRETURN(MATCH_NOMATCH);
4529 }
4530 cc = RAWUCHAR(eptr);
4531 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4532 RRETURN(MATCH_NOMATCH);
4533 eptr++;
4534 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4535 }
4536 break;
4537
4538 case OP_WHITESPACE:
4539 for (i = 1; i <= min; i++)
4540 {
4541 pcre_uint32 cc;
4542 if (eptr >= md->end_subject)
4543 {
4544 SCHECK_PARTIAL();
4545 RRETURN(MATCH_NOMATCH);
4546 }
4547 cc = RAWUCHAR(eptr);
4548 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4549 RRETURN(MATCH_NOMATCH);
4550 eptr++;
4551 /* No need to skip more bytes - we know it's a 1-byte character */
4552 }
4553 break;
4554
4555 case OP_NOT_WORDCHAR:
4556 for (i = 1; i <= min; i++)
4557 {
4558 pcre_uint32 cc;
4559 if (eptr >= md->end_subject)
4560 {
4561 SCHECK_PARTIAL();
4562 RRETURN(MATCH_NOMATCH);
4563 }
4564 cc = RAWUCHAR(eptr);
4565 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4566 RRETURN(MATCH_NOMATCH);
4567 eptr++;
4568 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4569 }
4570 break;
4571
4572 case OP_WORDCHAR:
4573 for (i = 1; i <= min; i++)
4574 {
4575 pcre_uint32 cc;
4576 if (eptr >= md->end_subject)
4577 {
4578 SCHECK_PARTIAL();
4579 RRETURN(MATCH_NOMATCH);
4580 }
4581 cc = RAWUCHAR(eptr);
4582 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4583 RRETURN(MATCH_NOMATCH);
4584 eptr++;
4585 /* No need to skip more bytes - we know it's a 1-byte character */
4586 }
4587 break;
4588
4589 default:
4590 RRETURN(PCRE_ERROR_INTERNAL);
4591 } /* End switch(ctype) */
4592
4593 else
4594 #endif /* SUPPORT_UTF */
4595
4596 /* Code for the non-UTF-8 case for minimum matching of operators other
4597 than OP_PROP and OP_NOTPROP. */
4598
4599 switch(ctype)
4600 {
4601 case OP_ANY:
4602 for (i = 1; i <= min; i++)
4603 {
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 RRETURN(MATCH_NOMATCH);
4608 }
4609 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4610 if (md->partial != 0 &&
4611 eptr + 1 >= md->end_subject &&
4612 NLBLOCK->nltype == NLTYPE_FIXED &&
4613 NLBLOCK->nllen == 2 &&
4614 *eptr == NLBLOCK->nl[0])
4615 {
4616 md->hitend = TRUE;
4617 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4618 }
4619 eptr++;
4620 }
4621 break;
4622
4623 case OP_ALLANY:
4624 if (eptr > md->end_subject - min)
4625 {
4626 SCHECK_PARTIAL();
4627 RRETURN(MATCH_NOMATCH);
4628 }
4629 eptr += min;
4630 break;
4631
4632 case OP_ANYBYTE:
4633 if (eptr > md->end_subject - min)
4634 {
4635 SCHECK_PARTIAL();
4636 RRETURN(MATCH_NOMATCH);
4637 }
4638 eptr += min;
4639 break;
4640
4641 case OP_ANYNL:
4642 for (i = 1; i <= min; i++)
4643 {
4644 if (eptr >= md->end_subject)
4645 {
4646 SCHECK_PARTIAL();
4647 RRETURN(MATCH_NOMATCH);
4648 }
4649 switch(*eptr++)
4650 {
4651 default: RRETURN(MATCH_NOMATCH);
4652
4653 case CHAR_CR:
4654 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4655 break;
4656
4657 case CHAR_LF:
4658 break;
4659
4660 case CHAR_VT:
4661 case CHAR_FF:
4662 case CHAR_NEL:
4663 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4664 case 0x2028:
4665 case 0x2029:
4666 #endif
4667 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4668 break;
4669 }
4670 }
4671 break;
4672
4673 case OP_NOT_HSPACE:
4674 for (i = 1; i <= min; i++)
4675 {
4676 if (eptr >= md->end_subject)
4677 {
4678 SCHECK_PARTIAL();
4679 RRETURN(MATCH_NOMATCH);
4680 }
4681 switch(*eptr++)
4682 {
4683 default: break;
4684 HSPACE_BYTE_CASES:
4685 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4686 HSPACE_MULTIBYTE_CASES:
4687 #endif
4688 RRETURN(MATCH_NOMATCH);
4689 }
4690 }
4691 break;
4692
4693 case OP_HSPACE:
4694 for (i = 1; i <= min; i++)
4695 {
4696 if (eptr >= md->end_subject)
4697 {
4698 SCHECK_PARTIAL();
4699 RRETURN(MATCH_NOMATCH);
4700 }
4701 switch(*eptr++)
4702 {
4703 default: RRETURN(MATCH_NOMATCH);
4704 HSPACE_BYTE_CASES:
4705 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4706 HSPACE_MULTIBYTE_CASES:
4707 #endif
4708 break;
4709 }
4710 }
4711 break;
4712
4713 case OP_NOT_VSPACE:
4714 for (i = 1; i <= min; i++)
4715 {
4716 if (eptr >= md->end_subject)
4717 {
4718 SCHECK_PARTIAL();
4719 RRETURN(MATCH_NOMATCH);
4720 }
4721 switch(*eptr++)
4722 {
4723 VSPACE_BYTE_CASES:
4724 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4725 VSPACE_MULTIBYTE_CASES:
4726 #endif
4727 RRETURN(MATCH_NOMATCH);
4728 default: break;
4729 }
4730 }
4731 break;
4732
4733 case OP_VSPACE:
4734 for (i = 1; i <= min; i++)
4735 {
4736 if (eptr >= md->end_subject)
4737 {
4738 SCHECK_PARTIAL();
4739 RRETURN(MATCH_NOMATCH);
4740 }
4741 switch(*eptr++)
4742 {
4743 default: RRETURN(MATCH_NOMATCH);
4744 VSPACE_BYTE_CASES:
4745 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4746 VSPACE_MULTIBYTE_CASES:
4747 #endif
4748 break;
4749 }
4750 }
4751 break;
4752
4753 case OP_NOT_DIGIT:
4754 for (i = 1; i <= min; i++)
4755 {
4756 if (eptr >= md->end_subject)
4757 {
4758 SCHECK_PARTIAL();
4759 RRETURN(MATCH_NOMATCH);
4760 }
4761 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4762 RRETURN(MATCH_NOMATCH);
4763 eptr++;
4764 }
4765 break;
4766
4767 case OP_DIGIT:
4768 for (i = 1; i <= min; i++)
4769 {
4770 if (eptr >= md->end_subject)
4771 {
4772 SCHECK_PARTIAL();
4773 RRETURN(MATCH_NOMATCH);
4774 }
4775 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4776 RRETURN(MATCH_NOMATCH);
4777 eptr++;
4778 }
4779 break;
4780
4781 case OP_NOT_WHITESPACE:
4782 for (i = 1; i <= min; i++)
4783 {
4784 if (eptr >= md->end_subject)
4785 {
4786 SCHECK_PARTIAL();
4787 RRETURN(MATCH_NOMATCH);
4788 }
4789 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4790 RRETURN(MATCH_NOMATCH);
4791 eptr++;
4792 }
4793 break;
4794
4795 case OP_WHITESPACE:
4796 for (i = 1; i <= min; i++)
4797 {
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 RRETURN(MATCH_NOMATCH);
4802 }
4803 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4804 RRETURN(MATCH_NOMATCH);
4805 eptr++;
4806 }
4807 break;
4808
4809 case OP_NOT_WORDCHAR:
4810 for (i = 1; i <= min; i++)
4811 {
4812 if (eptr >= md->end_subject)
4813 {
4814 SCHECK_PARTIAL();
4815 RRETURN(MATCH_NOMATCH);
4816 }
4817 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4818 RRETURN(MATCH_NOMATCH);
4819 eptr++;
4820 }
4821 break;
4822
4823 case OP_WORDCHAR:
4824 for (i = 1; i <= min; i++)
4825 {
4826 if (eptr >= md->end_subject)
4827 {
4828 SCHECK_PARTIAL();
4829 RRETURN(MATCH_NOMATCH);
4830 }
4831 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4832 RRETURN(MATCH_NOMATCH);
4833 eptr++;
4834 }
4835 break;
4836
4837 default:
4838 RRETURN(PCRE_ERROR_INTERNAL);
4839 }
4840 }
4841
4842 /* If min = max, continue at the same level without recursing */
4843
4844 if (min == max) continue;
4845
4846 /* If minimizing, we have to test the rest of the pattern before each
4847 subsequent match. Again, separate the UTF-8 case for speed, and also
4848 separate the UCP cases. */
4849
4850 if (minimize)
4851 {
4852 #ifdef SUPPORT_UCP
4853 if (prop_type >= 0)
4854 {
4855 switch(prop_type)
4856 {
4857 case PT_ANY:
4858 for (fi = min;; fi++)
4859 {
4860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4862 if (fi >= max) RRETURN(MATCH_NOMATCH);
4863 if (eptr >= md->end_subject)
4864 {
4865 SCHECK_PARTIAL();
4866 RRETURN(MATCH_NOMATCH);
4867 }
4868 GETCHARINCTEST(c, eptr);
4869 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4870 }
4871 /* Control never gets here */
4872
4873 case PT_LAMP:
4874 for (fi = min;; fi++)
4875 {
4876 int chartype;
4877 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4879 if (fi >= max) RRETURN(MATCH_NOMATCH);
4880 if (eptr >= md->end_subject)
4881 {
4882 SCHECK_PARTIAL();
4883 RRETURN(MATCH_NOMATCH);
4884 }
4885 GETCHARINCTEST(c, eptr);
4886 chartype = UCD_CHARTYPE(c);
4887 if ((chartype == ucp_Lu ||
4888 chartype == ucp_Ll ||
4889 chartype == ucp_Lt) == prop_fail_result)
4890 RRETURN(MATCH_NOMATCH);
4891 }
4892 /* Control never gets here */
4893
4894 case PT_GC:
4895 for (fi = min;; fi++)
4896 {
4897 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4898 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4899 if (fi >= max) RRETURN(MATCH_NOMATCH);
4900 if (eptr >= md->end_subject)
4901 {
4902 SCHECK_PARTIAL();
4903 RRETURN(MATCH_NOMATCH);
4904 }
4905 GETCHARINCTEST(c, eptr);
4906 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4907 RRETURN(MATCH_NOMATCH);
4908 }
4909 /* Control never gets here */
4910
4911 case PT_PC:
4912 for (fi = min;; fi++)
4913 {
4914 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4916 if (fi >= max) RRETURN(MATCH_NOMATCH);
4917 if (eptr >= md->end_subject)
4918 {
4919 SCHECK_PARTIAL();
4920 RRETURN(MATCH_NOMATCH);
4921 }
4922 GETCHARINCTEST(c, eptr);
4923 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4924 RRETURN(MATCH_NOMATCH);
4925 }
4926 /* Control never gets here */
4927
4928 case PT_SC:
4929 for (fi = min;; fi++)
4930 {
4931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4933 if (fi >= max) RRETURN(MATCH_NOMATCH);
4934 if (eptr >= md->end_subject)
4935 {
4936 SCHECK_PARTIAL();
4937 RRETURN(MATCH_NOMATCH);
4938 }
4939 GETCHARINCTEST(c, eptr);
4940 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4941 RRETURN(MATCH_NOMATCH);
4942 }
4943 /* Control never gets here */
4944
4945 case PT_ALNUM:
4946 for (fi = min;; fi++)
4947 {
4948 int category;
4949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4951 if (fi >= max) RRETURN(MATCH_NOMATCH);
4952 if (eptr >= md->end_subject)
4953 {
4954 SCHECK_PARTIAL();
4955 RRETURN(MATCH_NOMATCH);
4956 }
4957 GETCHARINCTEST(c, eptr);
4958 category = UCD_CATEGORY(c);
4959 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4960 RRETURN(MATCH_NOMATCH);
4961 }
4962 /* Control never gets here */
4963
4964 case PT_SPACE: /* Perl space */
4965 for (fi = min;; fi++)
4966 {
4967 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4969 if (fi >= max) RRETURN(MATCH_NOMATCH);
4970 if (eptr >= md->end_subject)
4971 {
4972 SCHECK_PARTIAL();
4973 RRETURN(MATCH_NOMATCH);
4974 }
4975 GETCHARINCTEST(c, eptr);
4976 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4977 c == CHAR_FF || c == CHAR_CR)
4978 == prop_fail_result)
4979 RRETURN(MATCH_NOMATCH);
4980 }
4981 /* Control never gets here */
4982
4983 case PT_PXSPACE: /* POSIX space */
4984 for (fi = min;; fi++)
4985 {
4986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4988 if (fi >= max) RRETURN(MATCH_NOMATCH);
4989 if (eptr >= md->end_subject)
4990 {
4991 SCHECK_PARTIAL();
4992 RRETURN(MATCH_NOMATCH);
4993 }
4994 GETCHARINCTEST(c, eptr);
4995 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4996 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4997 == prop_fail_result)
4998 RRETURN(MATCH_NOMATCH);
4999 }
5000 /* Control never gets here */
5001
5002 case PT_WORD:
5003 for (fi = min;; fi++)
5004 {
5005 int category;
5006 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5008 if (fi >= max) RRETURN(MATCH_NOMATCH);
5009 if (eptr >= md->end_subject)
5010 {
5011 SCHECK_PARTIAL();
5012 RRETURN(MATCH_NOMATCH);
5013 }
5014 GETCHARINCTEST(c, eptr);
5015 category = UCD_CATEGORY(c);
5016 if ((category == ucp_L ||
5017 category == ucp_N ||
5018 c == CHAR_UNDERSCORE)
5019 == prop_fail_result)
5020 RRETURN(MATCH_NOMATCH);
5021 }
5022 /* Control never gets here */
5023
5024 case PT_CLIST:
5025 for (fi = min;; fi++)
5026 {
5027 const pcre_uint32 *cp;
5028 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5030 if (fi >= max) RRETURN(MATCH_NOMATCH);
5031 if (eptr >= md->end_subject)
5032 {
5033 SCHECK_PARTIAL();
5034 RRETURN(MATCH_NOMATCH);
5035 }
5036 GETCHARINCTEST(c, eptr);
5037 cp = PRIV(ucd_caseless_sets) + prop_value;
5038 for (;;)
5039 {
5040 if (c < *cp)
5041 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5042 if (c == *cp++)
5043 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5044 }
5045 }
5046 /* Control never gets here */
5047
5048 case PT_UCNC:
5049 for (fi = min;; fi++)
5050 {
5051 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5053 if (fi >= max) RRETURN(MATCH_NOMATCH);
5054 if (eptr >= md->end_subject)
5055 {
5056 SCHECK_PARTIAL();
5057 RRETURN(MATCH_NOMATCH);
5058 }
5059 GETCHARINCTEST(c, eptr);
5060 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5061 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5062 c >= 0xe000) == prop_fail_result)
5063 RRETURN(MATCH_NOMATCH);
5064 }
5065 /* Control never gets here */
5066
5067 /* This should never occur */
5068 default:
5069 RRETURN(PCRE_ERROR_INTERNAL);
5070 }
5071 }
5072
5073 /* Match extended Unicode sequences. We will get here only if the
5074 support is in the binary; otherwise a compile-time error occurs. */
5075
5076 else if (ctype == OP_EXTUNI)
5077 {
5078 for (fi = min;; fi++)
5079 {
5080 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5081 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5082 if (fi >= max) RRETURN(MATCH_NOMATCH);
5083 if (eptr >= md->end_subject)
5084 {
5085 SCHECK_PARTIAL();
5086 RRETURN(MATCH_NOMATCH);
5087 }
5088 else
5089 {
5090 int lgb, rgb;
5091 GETCHARINCTEST(c, eptr);
5092 lgb = UCD_GRAPHBREAK(c);
5093 while (eptr < md->end_subject)
5094 {
5095 int len = 1;
5096 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5097 rgb = UCD_GRAPHBREAK(c);
5098 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5099 lgb = rgb;
5100 eptr += len;
5101 }
5102 }
5103 CHECK_PARTIAL();
5104 }
5105 }
5106 else
5107 #endif /* SUPPORT_UCP */
5108
5109 #ifdef SUPPORT_UTF
5110 if (utf)
5111 {
5112 for (fi = min;; fi++)
5113 {
5114 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5115 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5116 if (fi >= max) RRETURN(MATCH_NOMATCH);
5117 if (eptr >= md->end_subject)
5118 {
5119 SCHECK_PARTIAL();
5120 RRETURN(MATCH_NOMATCH);
5121 }
5122 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5123 RRETURN(MATCH_NOMATCH);
5124 GETCHARINC(c, eptr);
5125 switch(ctype)
5126 {
5127 case OP_ANY: /* This is the non-NL case */
5128 if (md->partial != 0 && /* Take care with CRLF partial */
5129 eptr >= md->end_subject &&
5130 NLBLOCK->nltype == NLTYPE_FIXED &&
5131 NLBLOCK->nllen == 2 &&
5132 c == NLBLOCK->nl[0])
5133 {
5134 md->hitend = TRUE;
5135 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5136 }
5137 break;
5138
5139 case OP_ALLANY:
5140 case OP_ANYBYTE:
5141 break;
5142
5143 case OP_ANYNL:
5144 switch(c)
5145 {
5146 default: RRETURN(MATCH_NOMATCH);
5147 case CHAR_CR:
5148 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5149 break;
5150
5151 case CHAR_LF:
5152 break;
5153
5154 case CHAR_VT:
5155 case CHAR_FF:
5156 case CHAR_NEL:
5157 #ifndef EBCDIC
5158 case 0x2028:
5159 case 0x2029:
5160 #endif /* Not EBCDIC */
5161 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5162 break;
5163 }
5164 break;
5165
5166 case OP_NOT_HSPACE:
5167 switch(c)
5168 {
5169 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5170 default: break;
5171 }
5172 break;
5173
5174 case OP_HSPACE:
5175 switch(c)
5176 {
5177 HSPACE_CASES: break;
5178 default: RRETURN(MATCH_NOMATCH);
5179 }
5180 break;
5181
5182 case OP_NOT_VSPACE:
5183 switch(c)
5184 {
5185 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5186 default: break;
5187 }
5188 break;
5189
5190 case OP_VSPACE:
5191 switch(c)
5192 {
5193 VSPACE_CASES: break;
5194 default: RRETURN(MATCH_NOMATCH);
5195 }
5196 break;
5197
5198 case OP_NOT_DIGIT:
5199 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5200 RRETURN(MATCH_NOMATCH);
5201 break;
5202
5203 case OP_DIGIT:
5204 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5205 RRETURN(MATCH_NOMATCH);
5206 break;
5207
5208 case OP_NOT_WHITESPACE:
5209 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5210 RRETURN(MATCH_NOMATCH);
5211 break;
5212
5213 case OP_WHITESPACE:
5214 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5215 RRETURN(MATCH_NOMATCH);
5216 break;
5217
5218 case OP_NOT_WORDCHAR:
5219 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5220 RRETURN(MATCH_NOMATCH);
5221 break;
5222
5223 case OP_WORDCHAR:
5224 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5225 RRETURN(MATCH_NOMATCH);
5226 break;
5227
5228 default:
5229 RRETURN(PCRE_ERROR_INTERNAL);
5230 }
5231 }
5232 }
5233 else
5234 #endif
5235 /* Not UTF mode */
5236 {
5237 for (fi = min;; fi++)
5238 {
5239 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5240 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5241 if (fi >= max) RRETURN(MATCH_NOMATCH);
5242 if (eptr >= md->end_subject)
5243 {
5244 SCHECK_PARTIAL();
5245 RRETURN(MATCH_NOMATCH);
5246 }
5247 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5248 RRETURN(MATCH_NOMATCH);
5249 c = *eptr++;
5250 switch(ctype)
5251 {
5252 case OP_ANY: /* This is the non-NL case */
5253 if (md->partial != 0 && /* Take care with CRLF partial */
5254 eptr >= md->end_subject &&
5255 NLBLOCK->nltype == NLTYPE_FIXED &&
5256 NLBLOCK->nllen == 2 &&
5257 c == NLBLOCK->nl[0])
5258 {
5259 md->hitend = TRUE;
5260 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5261 }
5262 break;
5263
5264 case OP_ALLANY:
5265 case OP_ANYBYTE:
5266 break;
5267
5268 case OP_ANYNL:
5269 switch(c)
5270 {
5271 default: RRETURN(MATCH_NOMATCH);
5272 case CHAR_CR:
5273 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5274 break;
5275
5276 case CHAR_LF:
5277 break;
5278
5279 case CHAR_VT:
5280 case CHAR_FF:
5281 case CHAR_NEL:
5282 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5283 case 0x2028:
5284 case 0x2029:
5285 #endif
5286 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5287 break;
5288 }
5289 break;
5290
5291 case OP_NOT_HSPACE:
5292 switch(c)
5293 {
5294 default: break;
5295 HSPACE_BYTE_CASES:
5296 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5297 HSPACE_MULTIBYTE_CASES:
5298 #endif
5299 RRETURN(MATCH_NOMATCH);
5300 }
5301 break;
5302
5303 case OP_HSPACE:
5304 switch(c)
5305 {
5306 default: RRETURN(MATCH_NOMATCH);
5307 HSPACE_BYTE_CASES:
5308 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5309 HSPACE_MULTIBYTE_CASES:
5310 #endif
5311 break;
5312 }
5313 break;
5314
5315 case OP_NOT_VSPACE:
5316 switch(c)
5317 {
5318 default: break;
5319 VSPACE_BYTE_CASES:
5320 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5321 VSPACE_MULTIBYTE_CASES:
5322 #endif
5323 RRETURN(MATCH_NOMATCH);
5324 }
5325 break;
5326
5327 case OP_VSPACE:
5328 switch(c)
5329 {
5330 default: RRETURN(MATCH_NOMATCH);
5331 VSPACE_BYTE_CASES:
5332 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5333 VSPACE_MULTIBYTE_CASES:
5334 #endif
5335 break;
5336 }
5337 break;
5338
5339 case OP_NOT_DIGIT:
5340 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5341 break;
5342
5343 case OP_DIGIT:
5344 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5345 break;
5346
5347 case OP_NOT_WHITESPACE:
5348 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5349 break;
5350
5351 case OP_WHITESPACE:
5352 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5353 break;
5354
5355 case OP_NOT_WORDCHAR:
5356 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5357 break;
5358
5359 case OP_WORDCHAR:
5360 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5361 break;
5362
5363 default:
5364 RRETURN(PCRE_ERROR_INTERNAL);
5365 }
5366 }
5367 }
5368 /* Control never gets here */
5369 }
5370
5371 /* If maximizing, it is worth using inline code for speed, doing the type
5372 test once at the start (i.e. keep it out of the loop). Again, keep the
5373 UTF-8 and UCP stuff separate. */
5374
5375 else
5376 {
5377 pp = eptr; /* Remember where we started */
5378
5379 #ifdef SUPPORT_UCP
5380 if (prop_type >= 0)
5381 {
5382 switch(prop_type)
5383 {
5384 case PT_ANY:
5385 for (i = min; i < max; i++)
5386 {
5387 int len = 1;
5388 if (eptr >= md->end_subject)
5389 {
5390 SCHECK_PARTIAL();
5391 break;
5392 }
5393 GETCHARLENTEST(c, eptr, len);
5394 if (prop_fail_result) break;
5395 eptr+= len;
5396 }
5397 break;
5398
5399 case PT_LAMP:
5400 for (i = min; i < max; i++)
5401 {
5402 int chartype;
5403 int len = 1;
5404 if (eptr >= md->end_subject)
5405 {
5406 SCHECK_PARTIAL();
5407 break;
5408 }
5409 GETCHARLENTEST(c, eptr, len);
5410 chartype = UCD_CHARTYPE(c);
5411 if ((chartype == ucp_Lu ||
5412 chartype == ucp_Ll ||
5413 chartype == ucp_Lt) == prop_fail_result)
5414 break;
5415 eptr+= len;
5416 }
5417 break;
5418
5419 case PT_GC:
5420 for (i = min; i < max; i++)
5421 {
5422 int len = 1;
5423 if (eptr >= md->end_subject)
5424 {
5425 SCHECK_PARTIAL();
5426 break;
5427 }
5428 GETCHARLENTEST(c, eptr, len);
5429 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5430 eptr+= len;
5431 }
5432 break;
5433
5434 case PT_PC:
5435 for (i = min; i < max; i++)
5436 {
5437 int len = 1;
5438 if (eptr >= md->end_subject)
5439 {
5440 SCHECK_PARTIAL();
5441 break;
5442 }
5443 GETCHARLENTEST(c, eptr, len);
5444 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5445 eptr+= len;
5446 }
5447 break;
5448
5449 case PT_SC:
5450 for (i = min; i < max; i++)
5451 {
5452 int len = 1;
5453 if (eptr >= md->end_subject)
5454 {
5455 SCHECK_PARTIAL();
5456 break;
5457 }
5458 GETCHARLENTEST(c, eptr, len);
5459 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5460 eptr+= len;
5461 }
5462 break;
5463
5464 case PT_ALNUM:
5465 for (i = min; i < max; i++)
5466 {
5467 int category;
5468 int len = 1;
5469 if (eptr >= md->end_subject)
5470 {
5471 SCHECK_PARTIAL();
5472 break;
5473 }
5474 GETCHARLENTEST(c, eptr, len);
5475 category = UCD_CATEGORY(c);
5476 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5477 break;
5478 eptr+= len;
5479 }
5480 break;
5481
5482 case PT_SPACE: /* Perl space */
5483 for (i = min; i < max; i++)
5484 {
5485 int len = 1;
5486 if (eptr >= md->end_subject)
5487 {
5488 SCHECK_PARTIAL();
5489 break;
5490 }
5491 GETCHARLENTEST(c, eptr, len);
5492 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5493 c == CHAR_FF || c == CHAR_CR)
5494 == prop_fail_result)
5495 break;
5496 eptr+= len;
5497 }
5498 break;
5499
5500 case PT_PXSPACE: /* POSIX space */
5501 for (i = min; i < max; i++)
5502 {
5503 int len = 1;
5504 if (eptr >= md->end_subject)
5505 {
5506 SCHECK_PARTIAL();
5507 break;
5508 }
5509 GETCHARLENTEST(c, eptr, len);
5510 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5511 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5512 == prop_fail_result)
5513 break;
5514 eptr+= len;
5515 }
5516 break;
5517
5518 case PT_WORD:
5519 for (i = min; i < max; i++)
5520 {
5521 int category;
5522 int len = 1;
5523 if (eptr >= md->end_subject)
5524 {
5525 SCHECK_PARTIAL();
5526 break;
5527 }
5528 GETCHARLENTEST(c, eptr, len);
5529 category = UCD_CATEGORY(c);
5530 if ((category == ucp_L || category == ucp_N ||
5531 c == CHAR_UNDERSCORE) == prop_fail_result)
5532 break;
5533 eptr+= len;
5534 }
5535 break;
5536
5537 case PT_CLIST:
5538 for (i = min; i < max; i++)
5539 {
5540 const pcre_uint32 *cp;
5541 int len = 1;
5542 if (eptr >= md->end_subject)
5543 {
5544 SCHECK_PARTIAL();
5545 break;
5546 }
5547 GETCHARLENTEST(c, eptr, len);
5548 cp = PRIV(ucd_caseless_sets) + prop_value;
5549 for (;;)
5550 {
5551 if (c < *cp)
5552 { if (prop_fail_result) break; else goto GOT_MAX; }
5553 if (c == *cp++)
5554 { if (prop_fail_result) goto GOT_MAX; else break; }
5555 }
5556 eptr += len;
5557 }
5558 GOT_MAX:
5559 break;
5560
5561 case PT_UCNC:
5562 for (i = min; i < max; i++)
5563 {
5564 int len = 1;
5565 if (eptr >= md->end_subject)
5566 {
5567 SCHECK_PARTIAL();
5568 break;
5569 }
5570 GETCHARLENTEST(c, eptr, len);
5571 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5572 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5573 c >= 0xe000) == prop_fail_result)
5574 break;
5575 eptr += len;
5576 }
5577 break;
5578
5579 default:
5580 RRETURN(PCRE_ERROR_INTERNAL);
5581 }
5582
5583 /* eptr is now past the end of the maximum run */
5584
5585 if (possessive) continue;
5586 for(;;)
5587 {
5588 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5590 if (eptr-- == pp) break; /* Stop if tried at original pos */
5591 if (utf) BACKCHAR(eptr);
5592 }
5593 }
5594
5595 /* Match extended Unicode sequences. We will get here only if the
5596 support is in the binary; otherwise a compile-time error occurs. */
5597
5598 else if (ctype == OP_EXTUNI)
5599 {
5600 for (i = min; i < max; i++)
5601 {
5602 if (eptr >= md->end_subject)
5603 {
5604 SCHECK_PARTIAL();
5605 break;
5606 }
5607 else
5608 {
5609 int lgb, rgb;
5610 GETCHARINCTEST(c, eptr);
5611 lgb = UCD_GRAPHBREAK(c);
5612 while (eptr < md->end_subject)
5613 {
5614 int len = 1;
5615 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5616 rgb = UCD_GRAPHBREAK(c);
5617 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5618 lgb = rgb;
5619 eptr += len;
5620 }
5621 }
5622 CHECK_PARTIAL();
5623 }
5624
5625 /* eptr is now past the end of the maximum run */
5626
5627 if (possessive) continue;
5628
5629 for(;;)
5630 {
5631 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5633 if (eptr-- == pp) break; /* Stop if tried at original pos */
5634 for (;;) /* Move back over one extended */
5635 {
5636 if (!utf) c = *eptr; else
5637 {
5638 BACKCHAR(eptr);
5639 GETCHAR(c, eptr);
5640 }
5641 if (UCD_CATEGORY(c) != ucp_M) break;
5642 eptr--;
5643 }
5644 }
5645 }
5646
5647 else
5648 #endif /* SUPPORT_UCP */
5649
5650 #ifdef SUPPORT_UTF
5651 if (utf)
5652 {
5653 switch(ctype)
5654 {
5655 case OP_ANY:
5656 if (max < INT_MAX)
5657 {
5658 for (i = min; i < max; i++)
5659 {
5660 if (eptr >= md->end_subject)
5661 {
5662 SCHECK_PARTIAL();
5663 break;
5664 }
5665 if (IS_NEWLINE(eptr)) break;
5666 if (md->partial != 0 && /* Take care with CRLF partial */
5667 eptr + 1 >= md->end_subject &&
5668 NLBLOCK->nltype == NLTYPE_FIXED &&
5669 NLBLOCK->nllen == 2 &&
5670 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5671 {
5672 md->hitend = TRUE;
5673 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5674 }
5675 eptr++;
5676 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5677 }
5678 }
5679
5680 /* Handle unlimited UTF-8 repeat */
5681
5682 else
5683 {
5684 for (i = min; i < max; i++)
5685 {
5686 if (eptr >= md->end_subject)
5687 {
5688 SCHECK_PARTIAL();
5689 break;
5690 }
5691 if (IS_NEWLINE(eptr)) break;
5692 if (md->partial != 0 && /* Take care with CRLF partial */
5693 eptr + 1 >= md->end_subject &&
5694 NLBLOCK->nltype == NLTYPE_FIXED &&
5695 NLBLOCK->nllen == 2 &&
5696 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5697 {
5698 md->hitend = TRUE;
5699 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5700 }
5701 eptr++;
5702 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5703 }
5704 }
5705 break;
5706
5707 case OP_ALLANY:
5708 if (max < INT_MAX)
5709 {
5710 for (i = min; i < max; i++)
5711 {
5712 if (eptr >= md->end_subject)
5713 {
5714 SCHECK_PARTIAL();
5715 break;
5716 }
5717 eptr++;
5718 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5719 }
5720 }
5721 else
5722 {
5723 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5724 SCHECK_PARTIAL();
5725 }
5726 break;
5727
5728 /* The byte case is the same as non-UTF8 */
5729
5730 case OP_ANYBYTE:
5731 c = max - min;
5732 if (c > (unsigned int)(md->end_subject - eptr))
5733 {
5734 eptr = md->end_subject;
5735 SCHECK_PARTIAL();
5736 }
5737 else eptr += c;
5738 break;
5739
5740 case OP_ANYNL:
5741 for (i = min; i < max; i++)
5742 {
5743 int len = 1;
5744 if (eptr >= md->end_subject)
5745 {
5746 SCHECK_PARTIAL();
5747 break;
5748 }
5749 GETCHARLEN(c, eptr, len);
5750 if (c == CHAR_CR)
5751 {
5752 if (++eptr >= md->end_subject) break;
5753 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5754 }
5755 else
5756 {
5757 if (c != CHAR_LF &&
5758 (md->bsr_anycrlf ||
5759 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5760 #ifndef EBCDIC
5761 && c != 0x2028 && c != 0x2029
5762 #endif /* Not EBCDIC */
5763 )))
5764 break;
5765 eptr += len;
5766 }
5767 }
5768 break;
5769
5770 case OP_NOT_HSPACE:
5771 case OP_HSPACE:
5772 for (i = min; i < max; i++)
5773 {
5774 BOOL gotspace;
5775 int len = 1;
5776 if (eptr >= md->end_subject)
5777 {
5778 SCHECK_PARTIAL();
5779 break;
5780 }
5781 GETCHARLEN(c, eptr, len);
5782 switch(c)
5783 {
5784 HSPACE_CASES: gotspace = TRUE; break;
5785 default: gotspace = FALSE; break;
5786 }
5787 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5788 eptr += len;
5789 }
5790 break;
5791
5792 case OP_NOT_VSPACE:
5793 case OP_VSPACE:
5794 for (i = min; i < max; i++)
5795 {
5796 BOOL gotspace;
5797 int len = 1;
5798 if (eptr >= md->end_subject)
5799 {
5800 SCHECK_PARTIAL();
5801 break;
5802 }
5803 GETCHARLEN(c, eptr, len);
5804 switch(c)
5805 {
5806 VSPACE_CASES: gotspace = TRUE; break;
5807 default: gotspace = FALSE; break;
5808 }
5809 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5810 eptr += len;
5811 }
5812 break;
5813
5814 case OP_NOT_DIGIT:
5815 for (i = min; i < max; i++)
5816 {
5817 int len = 1;
5818 if (eptr >= md->end_subject)
5819 {
5820 SCHECK_PARTIAL();
5821 break;
5822 }
5823 GETCHARLEN(c, eptr, len);
5824 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5825 eptr+= len;
5826 }
5827 break;
5828
5829 case OP_DIGIT:
5830 for (i = min; i < max; i++)
5831 {
5832 int len = 1;
5833 if (eptr >= md->end_subject)
5834 {
5835 SCHECK_PARTIAL();
5836 break;
5837 }
5838 GETCHARLEN(c, eptr, len);
5839 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5840 eptr+= len;
5841 }
5842 break;
5843
5844 case OP_NOT_WHITESPACE:
5845 for (i = min; i < max; i++)
5846 {
5847 int len = 1;
5848 if (eptr >= md->end_subject)
5849 {
5850 SCHECK_PARTIAL();
5851 break;
5852 }
5853 GETCHARLEN(c, eptr, len);
5854 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5855 eptr+= len;
5856 }
5857 break;
5858
5859 case OP_WHITESPACE:
5860 for (i = min; i < max; i++)
5861 {
5862 int len = 1;
5863 if (eptr >= md->end_subject)
5864 {
5865 SCHECK_PARTIAL();
5866 break;
5867 }
5868 GETCHARLEN(c, eptr, len);
5869 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5870 eptr+= len;
5871 }
5872 break;
5873
5874 case OP_NOT_WORDCHAR:
5875 for (i = min; i < max; i++)
5876 {
5877 int len = 1;
5878 if (eptr >= md->end_subject)
5879 {
5880 SCHECK_PARTIAL();
5881 break;
5882 }
5883 GETCHARLEN(c, eptr, len);
5884 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5885 eptr+= len;
5886 }
5887 break;
5888
5889 case OP_WORDCHAR:
5890 for (i = min; i < max; i++)
5891 {
5892 int len = 1;
5893 if (eptr >= md->end_subject)
5894 {
5895 SCHECK_PARTIAL();
5896 break;
5897 }
5898 GETCHARLEN(c, eptr, len);
5899 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5900 eptr+= len;
5901 }
5902 break;
5903
5904 default:
5905 RRETURN(PCRE_ERROR_INTERNAL);
5906 }
5907
5908 /* eptr is now past the end of the maximum run. If possessive, we are
5909 done (no backing up). Otherwise, match at this position; anything other
5910 than no match is immediately returned. For nomatch, back up one
5911 character, unless we are matching \R and the last thing matched was
5912 \r\n, in which case, back up two bytes. */
5913
5914 if (possessive) continue;
5915 for(;;)
5916 {
5917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5919 if (eptr-- == pp) break; /* Stop if tried at original pos */
5920 BACKCHAR(eptr);
5921 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5922 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5923 }
5924 }
5925 else
5926 #endif /* SUPPORT_UTF */
5927 /* Not UTF mode */
5928 {
5929 switch(ctype)
5930 {
5931 case OP_ANY:
5932 for (i = min; i < max; i++)
5933 {
5934 if (eptr >= md->end_subject)
5935 {
5936 SCHECK_PARTIAL();
5937 break;
5938 }
5939 if (IS_NEWLINE(eptr)) break;
5940 if (md->partial != 0 && /* Take care with CRLF partial */
5941 eptr + 1 >= md->end_subject &&
5942 NLBLOCK->nltype == NLTYPE_FIXED &&
5943 NLBLOCK->nllen == 2 &&
5944 *eptr == NLBLOCK->nl[0])
5945 {
5946 md->hitend = TRUE;
5947 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5948 }
5949 eptr++;
5950 }
5951 break;
5952
5953 case OP_ALLANY:
5954 case OP_ANYBYTE:
5955 c = max - min;
5956 if (c > (unsigned int)(md->end_subject - eptr))
5957 {
5958 eptr = md->end_subject;
5959 SCHECK_PARTIAL();
5960 }
5961 else eptr += c;
5962 break;
5963
5964 case OP_ANYNL:
5965 for (i = min; i < max; i++)
5966 {
5967 if (eptr >= md->end_subject)
5968 {
5969 SCHECK_PARTIAL();
5970 break;
5971 }
5972 c = *eptr;
5973 if (c == CHAR_CR)
5974 {
5975 if (++eptr >= md->end_subject) break;
5976 if (*eptr == CHAR_LF) eptr++;
5977 }
5978 else
5979 {
5980 if (c != CHAR_LF && (md->bsr_anycrlf ||
5981 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5982 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5983 && c != 0x2028 && c != 0x2029
5984 #endif
5985 ))) break;
5986 eptr++;
5987 }
5988 }
5989 break;
5990
5991 case OP_NOT_HSPACE:
5992 for (i = min; i < max; i++)
5993 {
5994 if (eptr >= md->end_subject)
5995 {
5996 SCHECK_PARTIAL();
5997 break;
5998 }
5999 switch(*eptr)
6000 {
6001 default: eptr++; break;
6002 HSPACE_BYTE_CASES:
6003 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6004 HSPACE_MULTIBYTE_CASES:
6005 #endif
6006 goto ENDLOOP00;
6007 }
6008 }
6009 ENDLOOP00:
6010 break;
6011
6012 case OP_HSPACE:
6013 for (i = min; i < max; i++)
6014 {
6015 if (eptr >= md->end_subject)
6016 {
6017 SCHECK_PARTIAL();
6018 break;
6019 }
6020 switch(*eptr)
6021 {
6022 default: goto ENDLOOP01;
6023 HSPACE_BYTE_CASES:
6024 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6025 HSPACE_MULTIBYTE_CASES:
6026 #endif
6027 eptr++; break;
6028 }
6029 }
6030 ENDLOOP01:
6031 break;
6032
6033 case OP_NOT_VSPACE:
6034 for (i = min; i < max; i++)
6035 {
6036 if (eptr >= md->end_subject)
6037 {
6038 SCHECK_PARTIAL();
6039 break;
6040 }
6041 switch(*eptr)
6042 {
6043 default: eptr++; break;
6044 VSPACE_BYTE_CASES:
6045 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6046 VSPACE_MULTIBYTE_CASES:
6047 #endif
6048 goto ENDLOOP02;
6049 }
6050 }
6051 ENDLOOP02:
6052 break;
6053
6054 case OP_VSPACE:
6055 for (i = min; i < max; i++)
6056 {
6057 if (eptr >= md->end_subject)
6058 {
6059 SCHECK_PARTIAL();
6060 break;
6061 }
6062 switch(*eptr)
6063 {
6064 default: goto ENDLOOP03;
6065 VSPACE_BYTE_CASES:
6066 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6067 VSPACE_MULTIBYTE_CASES:
6068 #endif
6069 eptr++; break;
6070 }
6071 }
6072 ENDLOOP03:
6073 break;
6074
6075 case OP_NOT_DIGIT:
6076 for (i = min; i < max; i++)
6077 {
6078 if (eptr >= md->end_subject)
6079 {
6080 SCHECK_PARTIAL();
6081 break;
6082 }
6083 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6084 eptr++;
6085 }
6086 break;
6087
6088 case OP_DIGIT:
6089 for (i = min; i < max; i++)
6090 {
6091 if (eptr >= md->end_subject)
6092 {
6093 SCHECK_PARTIAL();
6094 break;
6095 }
6096 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6097 eptr++;
6098 }
6099 break;
6100
6101 case OP_NOT_WHITESPACE:
6102 for (i = min; i < max; i++)
6103 {
6104 if (eptr >= md->end_subject)
6105 {
6106 SCHECK_PARTIAL();
6107 break;
6108 }
6109 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6110 eptr++;
6111 }
6112 break;
6113
6114 case OP_WHITESPACE:
6115 for (i = min; i < max; i++)
6116 {
6117 if (eptr >= md->end_subject)
6118 {
6119 SCHECK_PARTIAL();
6120 break;
6121 }
6122 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6123 eptr++;
6124 }
6125 break;
6126
6127 case OP_NOT_WORDCHAR:
6128 for (i = min; i < max; i++)
6129 {
6130 if (eptr >= md->end_subject)
6131 {
6132 SCHECK_PARTIAL();
6133 break;
6134 }
6135 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6136 eptr++;
6137 }
6138 break;
6139
6140 case OP_WORDCHAR:
6141 for (i = min; i < max; i++)
6142 {
6143 if (eptr >= md->end_subject)
6144 {
6145 SCHECK_PARTIAL();
6146 break;
6147 }
6148 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6149 eptr++;
6150 }
6151 break;
6152
6153 default:
6154 RRETURN(PCRE_ERROR_INTERNAL);
6155 }
6156
6157 /* eptr is now past the end of the maximum run. If possessive, we are
6158 done (no backing up). Otherwise, match at this position; anything other
6159 than no match is immediately returned. For nomatch, back up one
6160 character (byte), unless we are matching \R and the last thing matched
6161 was \r\n, in which case, back up two bytes. */
6162
6163 if (possessive) continue;
6164 while (eptr >= pp)
6165 {
6166 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6167 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6168 eptr--;
6169 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6170 eptr[-1] == CHAR_CR) eptr--;
6171 }
6172 }
6173
6174 /* Get here if we can't make it match with any permitted repetitions */
6175
6176 RRETURN(MATCH_NOMATCH);
6177 }
6178 /* Control never gets here */
6179
6180 /* There's been some horrible disaster. Arrival here can only mean there is
6181 something seriously wrong in the code above or the OP_xxx definitions. */
6182
6183 default:
6184 DPRINTF(("Unknown opcode %d\n", *ecode));
6185 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6186 }
6187
6188 /* Do not stick any code in here without much thought; it is assumed
6189 that "continue" in the code above comes out to here to repeat the main
6190 loop. */
6191
6192 } /* End of main loop */
6193 /* Control never reaches here */
6194
6195
6196 /* When compiling to use the heap rather than the stack for recursive calls to
6197 match(), the RRETURN() macro jumps here. The number that is saved in
6198 frame->Xwhere indicates which label we actually want to return to. */
6199
6200 #ifdef NO_RECURSE
6201 #define LBL(val) case val: goto L_RM##val;
6202 HEAP_RETURN:
6203 switch (frame->Xwhere)
6204 {
6205 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6206 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6207 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6208 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6209 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6210 LBL(65) LBL(66)
6211 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6212 LBL(21)
6213 #endif
6214 #ifdef SUPPORT_UTF
6215 LBL(16) LBL(18) LBL(20)
6216 LBL(22) LBL(23) LBL(28) LBL(30)
6217 LBL(32) LBL(34) LBL(42) LBL(46)
6218 #ifdef SUPPORT_UCP
6219 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6220 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6221 #endif /* SUPPORT_UCP */
6222 #endif /* SUPPORT_UTF */
6223 default:
6224 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6225 return PCRE_ERROR_INTERNAL;
6226 }
6227 #undef LBL
6228 #endif /* NO_RECURSE */
6229 }
6230
6231
6232 /***************************************************************************
6233 ****************************************************************************
6234 RECURSION IN THE match() FUNCTION
6235
6236 Undefine all the macros that were defined above to handle this. */
6237
6238 #ifdef NO_RECURSE
6239 #undef eptr
6240 #undef ecode
6241 #undef mstart
6242 #undef offset_top
6243 #undef eptrb
6244 #undef flags
6245
6246 #undef callpat
6247 #undef charptr
6248 #undef data
6249 #undef next
6250 #undef pp
6251 #undef prev
6252 #undef saved_eptr
6253
6254 #undef new_recursive
6255
6256 #undef cur_is_word
6257 #undef condition
6258 #undef prev_is_word
6259
6260 #undef ctype
6261 #undef length
6262 #undef max
6263 #undef min
6264 #undef number
6265 #undef offset
6266 #undef op
6267 #undef save_capture_last
6268 #undef save_offset1
6269 #undef save_offset2
6270 #undef save_offset3
6271 #undef stacksave
6272
6273 #undef newptrb
6274
6275 #endif
6276
6277 /* These two are defined as macros in both cases */
6278
6279 #undef fc
6280 #undef fi
6281
6282 /***************************************************************************
6283 ***************************************************************************/
6284
6285
6286 #ifdef NO_RECURSE
6287 /*************************************************
6288 * Release allocated heap frames *
6289 *************************************************/
6290
6291 /* This function releases all the allocated frames. The base frame is on the
6292 machine stack, and so must not be freed.
6293
6294 Argument: the address of the base frame
6295 Returns: nothing
6296 */
6297
6298 static void
6299 release_match_heapframes (heapframe *frame_base)
6300 {
6301 heapframe *nextframe = frame_base->Xnextframe;
6302 while (nextframe != NULL)
6303 {
6304 heapframe *oldframe = nextframe;
6305 nextframe = nextframe->Xnextframe;
6306 (PUBL(stack_free))(oldframe);
6307 }
6308 }
6309 #endif
6310
6311
6312 /*************************************************
6313 * Execute a Regular Expression *
6314 *************************************************/
6315
6316 /* This function applies a compiled re to a subject string and picks out
6317 portions of the string if it matches. Two elements in the vector are set for
6318 each substring: the offsets to the start and end of the substring.
6319
6320 Arguments:
6321 argument_re points to the compiled expression
6322 extra_data points to extra data or is NULL
6323 subject points to the subject string
6324 length length of subject string (may contain binary zeros)
6325 start_offset where to start in the subject string
6326 options option bits
6327 offsets points to a vector of ints to be filled in with offsets
6328 offsetcount the number of elements in the vector
6329
6330 Returns: > 0 => success; value is the number of elements filled in
6331 = 0 => success, but offsets is not big enough
6332 -1 => failed to match
6333 < -1 => some kind of unexpected problem
6334 */
6335
6336 #if defined COMPILE_PCRE8
6337 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6338 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6339 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6340 int offsetcount)
6341 #elif defined COMPILE_PCRE16
6342 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6343 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6344 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6345 int offsetcount)
6346 #elif defined COMPILE_PCRE32
6347 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6348 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6349 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6350 int offsetcount)
6351 #endif
6352 {
6353 int rc, ocount, arg_offset_max;
6354 int newline;
6355 BOOL using_temporary_offsets = FALSE;
6356 BOOL anchored;
6357 BOOL startline;
6358 BOOL firstline;
6359 BOOL utf;
6360 BOOL has_first_char = FALSE;
6361 BOOL has_req_char = FALSE;
6362 pcre_uchar first_char = 0;
6363 pcre_uchar first_char2 = 0;
6364 pcre_uchar req_char = 0;
6365 pcre_uchar req_char2 = 0;
6366 match_data match_block;
6367 match_data *md = &match_block;
6368 const pcre_uint8 *tables;
6369 const pcre_uint8 *start_bits = NULL;
6370 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6371 PCRE_PUCHAR end_subject;
6372 PCRE_PUCHAR start_partial = NULL;
6373 PCRE_PUCHAR match_partial;
6374 PCRE_PUCHAR req_char_ptr = start_match - 1;
6375
6376 const pcre_study_data *study;
6377 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6378
6379 #ifdef NO_RECURSE
6380 heapframe frame_zero;
6381 frame_zero.Xprevframe = NULL; /* Marks the top level */
6382 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6383 md->match_frames_base = &frame_zero;
6384 #endif
6385
6386 /* Check for the special magic call that measures the size of the stack used
6387 per recursive call of match(). Without the funny casting for sizeof, a Windows
6388 compiler gave this error: "unary minus operator applied to unsigned type,
6389 result still unsigned". Hopefully the cast fixes that. */
6390
6391 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6392 start_offset == -999)
6393 #ifdef NO_RECURSE
6394 return -((int)sizeof(heapframe));
6395 #else
6396 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6397 #endif
6398
6399 /* Plausibility checks */
6400
6401 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6402 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6403 return PCRE_ERROR_NULL;
6404 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6405 if (length < 0) return PCRE_ERROR_BADLENGTH;
6406 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6407
6408 /* Check that the first field in the block is the magic number. If it is not,
6409 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6410 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6411 means that the pattern is likely compiled with different endianness. */
6412
6413 if (re->magic_number != MAGIC_NUMBER)
6414 return re->magic_number == REVERSED_MAGIC_NUMBER?
6415 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6416 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6417
6418 /* These two settings are used in the code for checking a UTF-8 string that
6419 follows immediately afterwards. Other values in the md block are used only
6420 during "normal" pcre_exec() processing, not when the JIT support is in use,
6421 so they are set up later. */
6422
6423 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6424 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6425 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6426 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6427
6428 /* Check a UTF-8 string if required. Pass back the character offset and error
6429 code for an invalid string if a results vector is available. */
6430
6431 #ifdef SUPPORT_UTF
6432 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6433 {
6434 int erroroffset;
6435 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6436 if (errorcode != 0)
6437 {
6438 if (offsetcount >= 2)
6439 {
6440 offsets[0] = erroroffset;
6441 offsets[1] = errorcode;
6442 }
6443 #if defined COMPILE_PCRE8
6444 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6445 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6446 #elif defined COMPILE_PCRE16
6447 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6448 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6449 #elif defined COMPILE_PCRE32
6450 return PCRE_ERROR_BADUTF32;
6451 #endif
6452 }
6453 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6454 /* Check that a start_offset points to the start of a UTF character. */
6455 if (start_offset > 0 && start_offset < length &&
6456 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6457 return PCRE_ERROR_BADUTF8_OFFSET;
6458 #endif
6459 }
6460 #endif
6461
6462 /* If the pattern was successfully studied with JIT support, run the JIT
6463 executable instead of the rest of this function. Most options must be set at
6464 compile time for the JIT code to be usable. Fallback to the normal code path if
6465 an unsupported flag is set. */
6466
6467 #ifdef SUPPORT_JIT
6468 if (extra_data != NULL
6469 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6470 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6471 && extra_data->executable_jit != NULL
6472 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6473 {
6474 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6475 start_offset, options, offsets, offsetcount);
6476
6477 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6478 mode is not compiled. In this case we simply fall back to the interpreter. */
6479
6480 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6481 }
6482 #endif
6483
6484 /* Carry on with non-JIT matching. This information is for finding all the
6485 numbers associated with a given name, for condition testing. */
6486
6487 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6488 md->name_count = re->name_count;
6489 md->name_entry_size = re->name_entry_size;
6490
6491 /* Fish out the optional data from the extra_data structure, first setting
6492 the default values. */
6493
6494 study = NULL;
6495 md->match_limit = MATCH_LIMIT;
6496 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6497 md->callout_data = NULL;
6498
6499 /* The table pointer is always in native byte order. */
6500
6501 tables = re->tables;
6502
6503 if (extra_data != NULL)
6504 {
6505 register unsigned int flags = extra_data->flags;
6506 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6507 study = (const pcre_study_data *)extra_data->study_data;
6508 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6509 md->match_limit = extra_data->match_limit;
6510 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6511 md->match_limit_recursion = extra_data->match_limit_recursion;
6512 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6513 md->callout_data = extra_data->callout_data;
6514 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6515 }
6516
6517 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6518 is a feature that makes it possible to save compiled regex and re-use them
6519 in other programs later. */
6520
6521 if (tables == NULL) tables = PRIV(default_tables);
6522
6523 /* Set up other data */
6524
6525 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6526 startline = (re->flags & PCRE_STARTLINE) != 0;
6527 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6528
6529 /* The code starts after the real_pcre block and the capture name table. */
6530
6531 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6532 re->name_count * re->name_entry_size;
6533
6534 md->start_subject = (PCRE_PUCHAR)subject;
6535 md->start_offset = start_offset;
6536 md->end_subject = md->start_subject + length;
6537 end_subject = md->end_subject;
6538
6539 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6540 md->use_ucp = (re->options & PCRE_UCP) != 0;
6541 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6542 md->ignore_skip_arg = 0;
6543
6544 /* Some options are unpacked into BOOL variables in the hope that testing
6545 them will be faster than individual option bits. */
6546
6547 md->notbol = (options & PCRE_NOTBOL) != 0;
6548 md->noteol = (options & PCRE_NOTEOL) != 0;
6549 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6550 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6551
6552 md->hitend = FALSE;
6553 md->mark = md->nomatch_mark = NULL; /* In case never set */
6554
6555 md->recursive = NULL; /* No recursion at top level */
6556 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6557
6558 md->lcc = tables + lcc_offset;
6559 md->fcc = tables + fcc_offset;
6560 md->ctypes = tables + ctypes_offset;
6561
6562 /* Handle different \R options. */
6563
6564 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6565 {
6566 case 0:
6567 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6568 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6569 else
6570 #ifdef BSR_ANYCRLF
6571 md->bsr_anycrlf = TRUE;
6572 #else
6573 md->bsr_anycrlf = FALSE;
6574 #endif
6575 break;
6576
6577 case PCRE_BSR_ANYCRLF:
6578 md->bsr_anycrlf = TRUE;
6579 break;
6580
6581 case PCRE_BSR_UNICODE:
6582 md->bsr_anycrlf = FALSE;
6583 break;
6584
6585 default: return PCRE_ERROR_BADNEWLINE;
6586 }
6587
6588 /* Handle different types of newline. The three bits give eight cases. If
6589 nothing is set at run time, whatever was used at compile time applies. */
6590
6591 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6592 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6593 {
6594 case 0: newline = NEWLINE; break; /* Compile-time default */
6595 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6596 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6597 case PCRE_NEWLINE_CR+
6598 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6599 case PCRE_NEWLINE_ANY: newline = -1; break;
6600 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6601 default: return PCRE_ERROR_BADNEWLINE;
6602 }
6603
6604 if (newline == -2)
6605 {
6606 md->nltype = NLTYPE_ANYCRLF;
6607 }
6608 else if (newline < 0)
6609 {
6610 md->nltype = NLTYPE_ANY;
6611 }
6612 else
6613 {
6614 md->nltype = NLTYPE_FIXED;
6615 if (newline > 255)
6616 {
6617 md->nllen = 2;
6618 md->nl[0] = (newline >> 8) & 255;
6619 md->nl[1] = newline & 255;
6620 }
6621 else
6622 {
6623 md->nllen = 1;
6624 md->nl[0] = newline;
6625 }
6626 }
6627
6628 /* Partial matching was originally supported only for a restricted set of
6629 regexes; from release 8.00 there are no restrictions, but the bits are still
6630 defined (though never set). So there's no harm in leaving this code. */
6631
6632 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6633 return PCRE_ERROR_BADPARTIAL;
6634
6635 /* If the expression has got more back references than the offsets supplied can
6636 hold, we get a temporary chunk of working store to use during the matching.
6637 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6638 of 3. */
6639
6640 ocount = offsetcount - (offsetcount % 3);
6641 arg_offset_max = (2*ocount)/3;
6642
6643 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6644 {
6645 ocount = re->top_backref * 3 + 3;
6646 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6647 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6648 using_temporary_offsets = TRUE;
6649 DPRINTF(("Got memory to hold back references\n"));
6650 }
6651 else md->offset_vector = offsets;
6652 md->offset_end = ocount;
6653 md->offset_max = (2*ocount)/3;
6654 md->capture_last = 0;
6655
6656 /* Reset the working variable associated with each extraction. These should
6657 never be used unless previously set, but they get saved and restored, and so we
6658 initialize them to avoid reading uninitialized locations. Also, unset the
6659 offsets for the matched string. This is really just for tidiness with callouts,
6660 in case they inspect these fields. */
6661
6662 if (md->offset_vector != NULL)
6663 {
6664 register int *iptr = md->offset_vector + ocount;
6665 register int *iend = iptr - re->top_bracket;
6666 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6667 while (--iptr >= iend) *iptr = -1;
6668 md->offset_vector[0] = md->offset_vector[1] = -1;
6669 }
6670
6671 /* Set up the first character to match, if available. The first_char value is
6672 never set for an anchored regular expression, but the anchoring may be forced
6673 at run time, so we have to test for anchoring. The first char may be unset for
6674 an unanchored pattern, of course. If there's no first char and the pattern was
6675 studied, there may be a bitmap of possible first characters. */
6676
6677 if (!anchored)
6678 {
6679 if ((re->flags & PCRE_FIRSTSET) != 0)
6680 {
6681 has_first_char = TRUE;
6682 first_char = first_char2 = (pcre_uchar)(re->first_char);
6683 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6684 {
6685 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6686 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6687 if (utf && first_char > 127)
6688 first_char2 = UCD_OTHERCASE(first_char);
6689 #endif
6690 }
6691 }
6692 else
6693 if (!startline && study != NULL &&
6694 (study->flags & PCRE_STUDY_MAPPED) != 0)
6695 start_bits = study->start_bits;
6696 }
6697
6698 /* For anchored or unanchored matches, there may be a "last known required
6699 character" set. */
6700
6701 if ((re->flags & PCRE_REQCHSET) != 0)
6702 {
6703 has_req_char = TRUE;
6704 req_char = req_char2 = (pcre_uchar)(re->req_char);
6705 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6706 {
6707 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6708 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6709 if (utf && req_char > 127)
6710 req_char2 = UCD_OTHERCASE(req_char);
6711 #endif
6712 }
6713 }
6714
6715
6716 /* ==========================================================================*/
6717
6718 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6719 the loop runs just once. */
6720
6721 for(;;)
6722 {
6723 PCRE_PUCHAR save_end_subject = end_subject;
6724 PCRE_PUCHAR new_start_match;
6725
6726 /* If firstline is TRUE, the start of the match is constrained to the first
6727 line of a multiline string. That is, the match must be before or at the first
6728 newline. Implement this by temporarily adjusting end_subject so that we stop
6729 scanning at a newline. If the match fails at the newline, later code breaks
6730 this loop. */
6731
6732 if (firstline)
6733 {
6734 PCRE_PUCHAR t = start_match;
6735 #ifdef SUPPORT_UTF
6736 if (utf)
6737 {
6738 while (t < md->end_subject && !IS_NEWLINE(t))
6739 {
6740 t++;
6741 ACROSSCHAR(t < end_subject, *t, t++);
6742 }
6743 }
6744 else
6745 #endif
6746 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6747 end_subject = t;
6748 }
6749
6750 /* There are some optimizations that avoid running the match if a known
6751 starting point is not found, or if a known later character is not present.
6752 However, there is an option that disables these, for testing and for ensuring
6753 that all callouts do actually occur. The option can be set in the regex by
6754 (*NO_START_OPT) or passed in match-time options. */
6755
6756 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6757 { </