/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1257 - (show annotations)
Fri Feb 22 20:20:30 2013 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 213789 byte(s)
Change some pcre_uchar variables to pcre_uint32 in pcre_exec.c, for better 
performance.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_COMMIT (-998)
91 #define MATCH_KETRPOS (-997)
92 #define MATCH_ONCE (-996)
93 #define MATCH_PRUNE (-995)
94 #define MATCH_SKIP (-994)
95 #define MATCH_SKIP_ARG (-993)
96 #define MATCH_THEN (-992)
97
98 /* Maximum number of ints of offset to save on the stack for recursive calls.
99 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
100 because the offset vector is always a multiple of 3 long. */
101
102 #define REC_STACK_SAVE_MAX 30
103
104 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
105
106 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
107 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
108
109 #ifdef PCRE_DEBUG
110 /*************************************************
111 * Debugging function to print chars *
112 *************************************************/
113
114 /* Print a sequence of chars in printable format, stopping at the end of the
115 subject if the requested.
116
117 Arguments:
118 p points to characters
119 length number to print
120 is_subject TRUE if printing from within md->start_subject
121 md pointer to matching data block, if is_subject is TRUE
122
123 Returns: nothing
124 */
125
126 static void
127 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
128 {
129 pcre_uint32 c;
130 BOOL utf = md->utf;
131 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
132 while (length-- > 0)
133 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
134 }
135 #endif
136
137
138
139 /*************************************************
140 * Match a back-reference *
141 *************************************************/
142
143 /* Normally, if a back reference hasn't been set, the length that is passed is
144 negative, so the match always fails. However, in JavaScript compatibility mode,
145 the length passed is zero. Note that in caseless UTF-8 mode, the number of
146 subject bytes matched may be different to the number of reference bytes.
147
148 Arguments:
149 offset index into the offset vector
150 eptr pointer into the subject
151 length length of reference to be matched (number of bytes)
152 md points to match data block
153 caseless TRUE if caseless
154
155 Returns: >= 0 the number of subject bytes matched
156 -1 no match
157 -2 partial match; always given if at end subject
158 */
159
160 static int
161 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
162 BOOL caseless)
163 {
164 PCRE_PUCHAR eptr_start = eptr;
165 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
166 #ifdef SUPPORT_UTF
167 BOOL utf = md->utf;
168 #endif
169
170 #ifdef PCRE_DEBUG
171 if (eptr >= md->end_subject)
172 printf("matching subject <null>");
173 else
174 {
175 printf("matching subject ");
176 pchars(eptr, length, TRUE, md);
177 }
178 printf(" against backref ");
179 pchars(p, length, FALSE, md);
180 printf("\n");
181 #endif
182
183 /* Always fail if reference not set (and not JavaScript compatible - in that
184 case the length is passed as zero). */
185
186 if (length < 0) return -1;
187
188 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
189 properly if Unicode properties are supported. Otherwise, we can check only
190 ASCII characters. */
191
192 if (caseless)
193 {
194 #ifdef SUPPORT_UTF
195 #ifdef SUPPORT_UCP
196 if (utf)
197 {
198 /* Match characters up to the end of the reference. NOTE: the number of
199 data units matched may differ, because in UTF-8 there are some characters
200 whose upper and lower case versions code have different numbers of bytes.
201 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
202 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
203 sequence of two of the latter. It is important, therefore, to check the
204 length along the reference, not along the subject (earlier code did this
205 wrong). */
206
207 PCRE_PUCHAR endptr = p + length;
208 while (p < endptr)
209 {
210 pcre_uint32 c, d;
211 const ucd_record *ur;
212 if (eptr >= md->end_subject) return -2; /* Partial match */
213 GETCHARINC(c, eptr);
214 GETCHARINC(d, p);
215 ur = GET_UCD(d);
216 if (c != d && c != d + ur->other_case)
217 {
218 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
219 for (;;)
220 {
221 if (c < *pp) return -1;
222 if (c == *pp++) break;
223 }
224 }
225 }
226 }
227 else
228 #endif
229 #endif
230
231 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
232 is no UCP support. */
233 {
234 while (length-- > 0)
235 {
236 pcre_uint32 cc, cp;
237 if (eptr >= md->end_subject) return -2; /* Partial match */
238 cc = RAWUCHARTEST(eptr);
239 cp = RAWUCHARTEST(p);
240 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
241 p++;
242 eptr++;
243 }
244 }
245 }
246
247 /* In the caseful case, we can just compare the bytes, whether or not we
248 are in UTF-8 mode. */
249
250 else
251 {
252 while (length-- > 0)
253 {
254 if (eptr >= md->end_subject) return -2; /* Partial match */
255 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
256 }
257 }
258
259 return (int)(eptr - eptr_start);
260 }
261
262
263
264 /***************************************************************************
265 ****************************************************************************
266 RECURSION IN THE match() FUNCTION
267
268 The match() function is highly recursive, though not every recursive call
269 increases the recursive depth. Nevertheless, some regular expressions can cause
270 it to recurse to a great depth. I was writing for Unix, so I just let it call
271 itself recursively. This uses the stack for saving everything that has to be
272 saved for a recursive call. On Unix, the stack can be large, and this works
273 fine.
274
275 It turns out that on some non-Unix-like systems there are problems with
276 programs that use a lot of stack. (This despite the fact that every last chip
277 has oodles of memory these days, and techniques for extending the stack have
278 been known for decades.) So....
279
280 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
281 calls by keeping local variables that need to be preserved in blocks of memory
282 obtained from malloc() instead of on the stack. Macros are used to
283 achieve this so that the actual code doesn't look very different to what it
284 always used to.
285
286 The original heap-recursive code used longjmp(). However, it seems that this
287 can be very slow on some operating systems. Following a suggestion from Stan
288 Switzer, the use of longjmp() has been abolished, at the cost of having to
289 provide a unique number for each call to RMATCH. There is no way of generating
290 a sequence of numbers at compile time in C. I have given them names, to make
291 them stand out more clearly.
292
293 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
294 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
295 tests. Furthermore, not using longjmp() means that local dynamic variables
296 don't have indeterminate values; this has meant that the frame size can be
297 reduced because the result can be "passed back" by straight setting of the
298 variable instead of being passed in the frame.
299 ****************************************************************************
300 ***************************************************************************/
301
302 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
303 below must be updated in sync. Each value is passed as the "rw" argument of
RMATCH. The heap-frame version of that macro stores it in frame->Xwhere and
also pastes it into a label name (L_##rw) placed immediately after the jump to
HEAP_RECURSE, so every RMATCH call site must use a distinct value. */
304
305 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
306 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
307 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
308 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
309 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
310 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
311 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
312
313 /* These versions of the macros use the stack, as normal. There are debugging
314 versions and production versions. Note that the "rw" argument of RMATCH isn't
315 actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rw) calls match() with ra, rb, rc, rd and re as the eptr,
ecode, offset_top, md and eptrb arguments, passing the caller's current mstart
and rdepth+1 for the remaining two, and leaves the result in rrc. RRETURN(ra)
returns ra from the enclosing function. Both therefore rely on rrc, mstart and
rdepth being in scope where they are used. The PCRE_DEBUG versions additionally
print the source line number before and after the call, and on return. */
316
317 #ifndef NO_RECURSE
318 #define REGISTER register
319
320 #ifdef PCRE_DEBUG
321 #define RMATCH(ra,rb,rc,rd,re,rw) \
322 { \
323 printf("match() called in line %d\n", __LINE__); \
324 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
325 printf("to line %d\n", __LINE__); \
326 }
327 #define RRETURN(ra) \
328 { \
329 printf("match() returned %d from line %d\n", ra, __LINE__); \
330 return ra; \
331 }
332 #else
333 #define RMATCH(ra,rb,rc,rd,re,rw) \
334 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
335 #define RRETURN(ra) return ra
336 #endif
337
338 #else
339
340
341 /* These versions of the macros manage a private stack on the heap. Note that
342 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
343 argument of match(), which never changes.

RMATCH(ra,rb,rc,rd,re,rw) "calls" match() without recursing:
  - it reuses frame->Xnextframe if one is already chained on, otherwise it gets
    a new heapframe from PUBL(stack_malloc) and links it in; if that allocation
    fails it leaves via RRETURN(PCRE_ERROR_NOMEMORY);
  - it records rw in the current frame's Xwhere, fills the new frame with the
    "arguments" (ra, rb, the current mstart, rc, re and Xrdepth+1), makes the
    new frame current, and jumps to HEAP_RECURSE;
  - the label L_<rw> that follows marks where execution continues afterwards.
    NOTE(review): the HEAP_RETURN code that jumps to these labels is not in
    this part of the file; presumably it dispatches on Xwhere - confirm there.

RRETURN(ra) steps back to the previous frame. If there is one, ra is placed in
rrc and control goes to HEAP_RETURN; if there is none (Xprevframe is NULL), ra
is returned from the function. Neither macro frees a frame. */
344
345 #define REGISTER
346
347 #define RMATCH(ra,rb,rc,rd,re,rw)\
348 {\
349 heapframe *newframe = frame->Xnextframe;\
350 if (newframe == NULL)\
351 {\
352 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
353 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
354 newframe->Xnextframe = NULL;\
355 frame->Xnextframe = newframe;\
356 }\
357 frame->Xwhere = rw;\
358 newframe->Xeptr = ra;\
359 newframe->Xecode = rb;\
360 newframe->Xmstart = mstart;\
361 newframe->Xoffset_top = rc;\
362 newframe->Xeptrb = re;\
363 newframe->Xrdepth = frame->Xrdepth + 1;\
364 newframe->Xprevframe = frame;\
365 frame = newframe;\
366 DPRINTF(("restarting from line %d\n", __LINE__));\
367 goto HEAP_RECURSE;\
368 L_##rw:\
369 DPRINTF(("jumped back to line %d\n", __LINE__));\
370 }
371
372 #define RRETURN(ra)\
373 {\
374 heapframe *oldframe = frame;\
375 frame = oldframe->Xprevframe;\
376 if (frame != NULL)\
377 {\
378 rrc = ra;\
379 goto HEAP_RETURN;\
380 }\
381 return ra;\
382 }
383
384
385 /* Structure for remembering the local variables in a private frame. It is
used only when NO_RECURSE is defined. Each X-prefixed member stands in for the
like-named argument or local variable of match(), which reaches it through a
#define (for example, eptr is defined as frame->Xeptr). The members therefore
have to be kept in step with the ordinary declarations used in match() when
NO_RECURSE is not defined. */
386
387 typedef struct heapframe {
388 struct heapframe *Xprevframe; /* Frame to go back to; when NULL, RRETURN returns from match() */
389 struct heapframe *Xnextframe; /* Next frame; reused by RMATCH, or allocated there when NULL */
390
391 /* Function arguments that may change */
392
393 PCRE_PUCHAR Xeptr;
394 const pcre_uchar *Xecode;
395 PCRE_PUCHAR Xmstart;
396 int Xoffset_top;
397 eptrblock *Xeptrb;
398 unsigned int Xrdepth;
399
400 /* Function local variables */
401
402 PCRE_PUCHAR Xcallpat;
403 #ifdef SUPPORT_UTF
404 PCRE_PUCHAR Xcharptr;
405 #endif
406 PCRE_PUCHAR Xdata;
407 PCRE_PUCHAR Xnext;
408 PCRE_PUCHAR Xpp;
409 PCRE_PUCHAR Xprev;
410 PCRE_PUCHAR Xsaved_eptr;
411
412 recursion_info Xnew_recursive;
413
414 BOOL Xcur_is_word;
415 BOOL Xcondition;
416 BOOL Xprev_is_word;
417
418 #ifdef SUPPORT_UCP
419 int Xprop_type;
420 unsigned int Xprop_value;
421 int Xprop_fail_result;
422 int Xoclength;
423 pcre_uchar Xocchars[6];
424 #endif
425
426 int Xcodelink;
427 int Xctype;
428 unsigned int Xfc;
429 int Xfi;
430 int Xlength;
431 int Xmax;
432 int Xmin;
433 unsigned int Xnumber;
434 int Xoffset;
435 unsigned int Xop;
436 pcre_int32 Xsave_capture_last;
437 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
438 int Xstacksave[REC_STACK_SAVE_MAX];
439
440 eptrblock Xnewptrb;
441
442 /* Where to jump back to */
443
444 int Xwhere; /* The RMATCH call number (its "rw" argument), set just before "recursing" */
445
446 } heapframe;
447
448 #endif
449
450
451 /***************************************************************************
452 ***************************************************************************/
453
454
455
456 /*************************************************
457 * Match from current position *
458 *************************************************/
459
460 /* This function is called recursively in many circumstances. Whenever it
461 returns a negative (error) response, the outer incarnation must also return the
462 same response. */
463
464 /* These macros pack up tests that are used for partial matching, and which
465 appear several times in the code. We set the "hit end" flag if the pointer is
466 at the end of the subject and also past the start of the subject (i.e.
467 something has been matched). For hard partial matching, we then return
468 immediately. The second one is used when we already know we are past the end of
469 the subject.

In the code, "past the start" is tested as eptr > md->start_used_ptr, partial
matching is enabled when md->partial is non-zero, and the immediate return of
PCRE_ERROR_PARTIAL is taken when md->partial is greater than 1. Each macro
expands to a bare "if" statement (not wrapped in do ... while), uses eptr and
md from the enclosing scope, and may leave the function through RRETURN, so it
is usable only inside match() and should not be followed directly by "else". */
470
471 #define CHECK_PARTIAL()\
472 if (md->partial != 0 && eptr >= md->end_subject && \
473 eptr > md->start_used_ptr) \
474 { \
475 md->hitend = TRUE; \
476 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
477 }
478
479 #define SCHECK_PARTIAL()\
480 if (md->partial != 0 && eptr > md->start_used_ptr) \
481 { \
482 md->hitend = TRUE; \
483 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
484 }
485
486
487 /* Performance note: It might be tempting to extract commonly used fields from
488 the md structure (e.g. utf, end_subject) into individual variables to improve
489 performance. Tests using gcc on a SPARC disproved this; in the first case, it
490 made performance worse.
491
492 Arguments:
493 eptr pointer to current character in subject
494 ecode pointer to current position in compiled code
495 mstart pointer to the current match start position (can be modified
496 by encountering \K)
497 offset_top current top pointer
498 md pointer to "static" info for the match
499 eptrb pointer to chain of blocks containing eptr at start of
500 brackets - for testing for empty matches
501 rdepth the recursion depth
502
503 Returns: MATCH_MATCH if matched ) these values are >= 0
504 MATCH_NOMATCH if failed to match )
505 a negative MATCH_xxx value for PRUNE, SKIP, etc
506 a negative PCRE_ERROR_xxx value if aborted by an error condition
507 (e.g. stopped by repeated call or recursion limit)
508 */
509
510 static int
511 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
512 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
513 unsigned int rdepth)
514 {
515 /* These variables do not need to be preserved over recursion in this function,
516 so they can be ordinary variables in all cases. Mark some of them with
517 "register" because they are used a lot in loops. */
518
519 register int rrc; /* Returns from recursive calls */
520 register int i; /* Used for loops not involving calls to RMATCH() */
521 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
522 register BOOL utf; /* Local copy of UTF flag for speed */
523
524 BOOL minimize, possessive; /* Quantifier options */
525 BOOL caseless;
526 int condcode;
527
528 /* When recursion is not being used, all "local" variables that have to be
529 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
530 frame on the stack here; subsequent instantiations are obtained from the heap
531 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
532 the top-level on the stack rather than malloc-ing them all gives a performance
533 boost in many cases where there is not much "recursion". */
534
535 #ifdef NO_RECURSE
536 heapframe *frame = (heapframe *)md->match_frames_base;
537
538 /* Copy in the original argument variables */
539
540 frame->Xeptr = eptr;
541 frame->Xecode = ecode;
542 frame->Xmstart = mstart;
543 frame->Xoffset_top = offset_top;
544 frame->Xeptrb = eptrb;
545 frame->Xrdepth = rdepth;
546
547 /* This is where control jumps back to to effect "recursion" */
548
549 HEAP_RECURSE:
550
551 /* Macros make the argument variables come from the current frame */
552
553 #define eptr frame->Xeptr
554 #define ecode frame->Xecode
555 #define mstart frame->Xmstart
556 #define offset_top frame->Xoffset_top
557 #define eptrb frame->Xeptrb
558 #define rdepth frame->Xrdepth
559
560 /* Ditto for the local variables */
561
562 #ifdef SUPPORT_UTF
563 #define charptr frame->Xcharptr
564 #endif
565 #define callpat frame->Xcallpat
566 #define codelink frame->Xcodelink
567 #define data frame->Xdata
568 #define next frame->Xnext
569 #define pp frame->Xpp
570 #define prev frame->Xprev
571 #define saved_eptr frame->Xsaved_eptr
572
573 #define new_recursive frame->Xnew_recursive
574
575 #define cur_is_word frame->Xcur_is_word
576 #define condition frame->Xcondition
577 #define prev_is_word frame->Xprev_is_word
578
579 #ifdef SUPPORT_UCP
580 #define prop_type frame->Xprop_type
581 #define prop_value frame->Xprop_value
582 #define prop_fail_result frame->Xprop_fail_result
583 #define oclength frame->Xoclength
584 #define occhars frame->Xocchars
585 #endif
586
587 #define ctype frame->Xctype
588 #define fc frame->Xfc
589 #define fi frame->Xfi
590 #define length frame->Xlength
591 #define max frame->Xmax
592 #define min frame->Xmin
593 #define number frame->Xnumber
594 #define offset frame->Xoffset
595 #define op frame->Xop
596 #define save_capture_last frame->Xsave_capture_last
597 #define save_offset1 frame->Xsave_offset1
598 #define save_offset2 frame->Xsave_offset2
599 #define save_offset3 frame->Xsave_offset3
600 #define stacksave frame->Xstacksave
601
602 #define newptrb frame->Xnewptrb
603
604 /* When recursion is being used, local variables are allocated on the stack and
605 get preserved during recursion in the normal way. In this environment, fi and
606 i, and fc and c, can be the same variables. */
607
608 #else /* NO_RECURSE not defined */
609 #define fi i
610 #define fc c
611
612 /* Many of the following variables are used only in small blocks of the code.
613 My normal style of coding would have declared them within each of those blocks.
614 However, in order to accommodate the version of this code that uses an external
615 "stack" implemented on the heap, it is easier to declare them all here, so the
616 declarations can be cut out in a block. The only declarations within blocks
617 below are for variables that do not have to be preserved over a recursive call
618 to RMATCH(). */
619
620 #ifdef SUPPORT_UTF
621 const pcre_uchar *charptr;
622 #endif
623 const pcre_uchar *callpat;
624 const pcre_uchar *data;
625 const pcre_uchar *next;
626 PCRE_PUCHAR pp;
627 const pcre_uchar *prev;
628 PCRE_PUCHAR saved_eptr;
629
630 recursion_info new_recursive;
631
632 BOOL cur_is_word;
633 BOOL condition;
634 BOOL prev_is_word;
635
636 #ifdef SUPPORT_UCP
637 int prop_type;
638 unsigned int prop_value;
639 int prop_fail_result;
640 int oclength;
641 pcre_uchar occhars[6];
642 #endif
643
644 int codelink;
645 int ctype;
646 int length;
647 int max;
648 int min;
649 unsigned int number;
650 int offset;
651 unsigned int op;
652 pcre_int32 save_capture_last;
653 int save_offset1, save_offset2, save_offset3;
654 int stacksave[REC_STACK_SAVE_MAX];
655
656 eptrblock newptrb;
657
658 /* There is a special fudge for calling match() in a way that causes it to
659 measure the size of its basic stack frame when the stack is being used for
660 recursion. The second argument (ecode) being NULL triggers this behaviour. It
661 cannot normally ever be NULL. The return is the negated value of the frame
662 size. */
663
664 if (ecode == NULL)
665 {
666 if (rdepth == 0)
667 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
668 else
669 {
670 int len = (char *)&rdepth - (char *)eptr;
671 return (len > 0)? -len : len;
672 }
673 }
674 #endif /* NO_RECURSE */
675
676 /* To save space on the stack and in the heap frame, I have doubled up on some
677 of the local variables that are used only in localised parts of the code, but
678 still need to be preserved over recursive calls of match(). These macros define
679 the alternative names that are used. */
680
681 #define allow_zero cur_is_word
682 #define cbegroup condition
683 #define code_offset codelink
684 #define condassert condition
685 #define matched_once prev_is_word
686 #define foc number
687 #define save_mark data
688
689 /* These statements are here to stop the compiler complaining about uninitialized
690 variables. */
691
692 #ifdef SUPPORT_UCP
693 prop_value = 0;
694 prop_fail_result = 0;
695 #endif
696
697
698 /* This label is used for tail recursion, which is used in a few cases even
699 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
700 used. Thanks to Ian Taylor for noticing this possibility and sending the
701 original patch. */
702
703 TAIL_RECURSE:
704
705 /* OK, now we can get on with the real code of the function. Recursive calls
706 are specified by the macro RMATCH and RRETURN is used to return. When
707 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
708 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
709 defined). However, RMATCH isn't like a function call because it's quite a
710 complicated macro. It has to be used in one particular way. This shouldn't,
711 however, impact performance when true recursion is being used. */
712
713 #ifdef SUPPORT_UTF
714 utf = md->utf; /* Local copy of the flag */
715 #else
716 utf = FALSE;
717 #endif
718
719 /* First check that we haven't called match() too many times, or that we
720 haven't exceeded the recursive call limit. */
721
722 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
723 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
724
725 /* At the start of a group with an unlimited repeat that may match an empty
726 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
727 done this way to save having to use another function argument, which would take
728 up space on the stack. See also MATCH_CONDASSERT below.
729
730 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
731 such remembered pointers, to be checked when we hit the closing ket, in order
732 to break infinite loops that match no characters. When match() is called in
733 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
734 NOT be used with tail recursion, because the memory block that is used is on
735 the stack, so a new one may be required for each match(). */
736
737 if (md->match_function_type == MATCH_CBEGROUP)
738 {
739 newptrb.epb_saved_eptr = eptr;
740 newptrb.epb_prev = eptrb;
741 eptrb = &newptrb;
742 md->match_function_type = 0;
743 }
744
745 /* Now start processing the opcodes. */
746
747 for (;;)
748 {
749 minimize = possessive = FALSE;
750 op = *ecode;
751
752 switch(op)
753 {
754 case OP_MARK:
755 md->nomatch_mark = ecode + 2;
756 md->mark = NULL; /* In case previously set by assertion */
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
758 eptrb, RM55);
759 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
760 md->mark == NULL) md->mark = ecode + 2;
761
762 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
763 argument, and we must check whether that argument matches this MARK's
764 argument. It is passed back in md->start_match_ptr (an overloading of that
765 variable). If it does match, we reset that variable to the current subject
766 position and return MATCH_SKIP. Otherwise, pass back the return code
767 unaltered. */
768
769 else if (rrc == MATCH_SKIP_ARG &&
770 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
771 {
772 md->start_match_ptr = eptr;
773 RRETURN(MATCH_SKIP);
774 }
775 RRETURN(rrc);
776
777 case OP_FAIL:
778 RRETURN(MATCH_NOMATCH);
779
780 /* COMMIT overrides PRUNE, SKIP, and THEN */
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
786 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
787 rrc != MATCH_THEN)
788 RRETURN(rrc);
789 RRETURN(MATCH_COMMIT);
790
791 /* PRUNE overrides THEN */
792
793 case OP_PRUNE:
794 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
795 eptrb, RM51);
796 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
797 RRETURN(MATCH_PRUNE);
798
799 case OP_PRUNE_ARG:
800 md->nomatch_mark = ecode + 2;
801 md->mark = NULL; /* In case previously set by assertion */
802 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
803 eptrb, RM56);
804 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
805 md->mark == NULL) md->mark = ecode + 2;
806 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
807 RRETURN(MATCH_PRUNE);
808
809 /* SKIP overrides PRUNE and THEN */
810
811 case OP_SKIP:
812 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
813 eptrb, RM53);
814 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
815 RRETURN(rrc);
816 md->start_match_ptr = eptr; /* Pass back current position */
817 RRETURN(MATCH_SKIP);
818
819 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
820 nomatch_mark. There is a flag that disables this opcode when re-matching a
821 pattern that ended with a SKIP for which there was not a matching MARK. */
822
823 case OP_SKIP_ARG:
824 if (md->ignore_skip_arg)
825 {
826 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
827 break;
828 }
829 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
830 eptrb, RM57);
831 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
832 RRETURN(rrc);
833
834 /* Pass back the current skip name by overloading md->start_match_ptr and
835 returning the special MATCH_SKIP_ARG return code. This will either be
836 caught by a matching MARK, or get to the top, where it causes a rematch
837 with the md->ignore_skip_arg flag set. */
838
839 md->start_match_ptr = ecode + 2;
840 RRETURN(MATCH_SKIP_ARG);
841
842 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
843 the branch in which it occurs can be determined. Overload the start of
844 match pointer to do this. */
845
846 case OP_THEN:
847 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
848 eptrb, RM54);
849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
850 md->start_match_ptr = ecode;
851 RRETURN(MATCH_THEN);
852
853 case OP_THEN_ARG:
854 md->nomatch_mark = ecode + 2;
855 md->mark = NULL; /* In case previously set by assertion */
856 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
857 md, eptrb, RM58);
858 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
859 md->mark == NULL) md->mark = ecode + 2;
860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
861 md->start_match_ptr = ecode;
862 RRETURN(MATCH_THEN);
863
864 /* Handle an atomic group that does not contain any capturing parentheses.
865 This can be handled like an assertion. Prior to 8.13, all atomic groups
866 were handled this way. In 8.13, the code was changed as below for ONCE, so
867 that backups pass through the group and thereby reset captured values.
868 However, this uses a lot more stack, so in 8.20, atomic groups that do not
869 contain any captures generate OP_ONCE_NC, which can be handled in the old,
870 less stack intensive way.
871
872 Check the alternative branches in turn - the matching won't pass the KET
873 for this kind of subpattern. If any one branch matches, we carry on as at
874 the end of a normal bracket, leaving the subject pointer, but resetting
875 the start-of-match value in case it was changed by \K. */
876
877 case OP_ONCE_NC:
878 prev = ecode;
879 saved_eptr = eptr;
880 save_mark = md->mark;
881 do
882 {
883 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
884 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
885 {
886 mstart = md->start_match_ptr;
887 break;
888 }
889 if (rrc == MATCH_THEN)
890 {
891 next = ecode + GET(ecode,1);
892 if (md->start_match_ptr < next &&
893 (*ecode == OP_ALT || *next == OP_ALT))
894 rrc = MATCH_NOMATCH;
895 }
896
897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
898 ecode += GET(ecode,1);
899 md->mark = save_mark;
900 }
901 while (*ecode == OP_ALT);
902
903 /* If we hit the end of the group (which could be repeated), fail */
904
905 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
906
907 /* Continue as from after the group, updating the offsets high water
908 mark, since extracts may have been taken. */
909
910 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
911
912 offset_top = md->end_offset_top;
913 eptr = md->end_match_ptr;
914
915 /* For a non-repeating ket, just continue at this level. This also
916 happens for a repeating ket if no characters were matched in the group.
917 This is the forcible breaking of infinite loops as implemented in Perl
918 5.005. */
919
920 if (*ecode == OP_KET || eptr == saved_eptr)
921 {
922 ecode += 1+LINK_SIZE;
923 break;
924 }
925
926 /* The repeating kets try the rest of the pattern or restart from the
927 preceding bracket, in the appropriate order. The second "call" of match()
928 uses tail recursion, to avoid using another stack frame. */
929
930 if (*ecode == OP_KETRMIN)
931 {
932 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
934 ecode = prev;
935 goto TAIL_RECURSE;
936 }
937 else /* OP_KETRMAX */
938 {
939 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
941 ecode += 1 + LINK_SIZE;
942 goto TAIL_RECURSE;
943 }
944 /* Control never gets here */
945
946 /* Handle a capturing bracket, other than those that are possessive with an
947 unlimited repeat. If there is space in the offset vector, save the current
948 subject position in the working slot at the top of the vector. We mustn't
949 change the current values of the data slot, because they may be set from a
950 previous iteration of this group, and be referred to by a reference inside
951 the group. A failure to match might occur after the group has succeeded,
952 if something later on doesn't match. For this reason, we need to restore
953 the working value and also the values of the final offsets, in case they
954 were set by a previous iteration of the same bracket.
955
956 If there isn't enough space in the offset vector, treat this as if it were
957 a non-capturing bracket. Don't worry about setting the flag for the error
958 case here; that is handled in the code for KET. */
959
960 case OP_CBRA:
961 case OP_SCBRA:
962 number = GET2(ecode, 1+LINK_SIZE);
963 offset = number << 1;
964
965 #ifdef PCRE_DEBUG
966 printf("start bracket %d\n", number);
967 printf("subject=");
968 pchars(eptr, 16, TRUE, md);
969 printf("\n");
970 #endif
971
972 if (offset < md->offset_max)
973 {
974 save_offset1 = md->offset_vector[offset];
975 save_offset2 = md->offset_vector[offset+1];
976 save_offset3 = md->offset_vector[md->offset_end - number];
977 save_capture_last = md->capture_last;
978 save_mark = md->mark;
979
980 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
981 md->offset_vector[md->offset_end - number] =
982 (int)(eptr - md->start_subject);
983
984 for (;;)
985 {
986 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
987 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
988 eptrb, RM1);
989 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
990
991 /* If we backed up to a THEN, check whether it is within the current
992 branch by comparing the address of the THEN that is passed back with
993 the end of the branch. If it is within the current branch, and the
994 branch is one of two or more alternatives (it either starts or ends
995 with OP_ALT), we have reached the limit of THEN's action, so convert
996 the return code to NOMATCH, which will cause normal backtracking to
997 happen from now on. Otherwise, THEN is passed back to an outer
998 alternative. This implements Perl's treatment of parenthesized groups,
999 where a group not containing | does not affect the current alternative,
1000 that is, (X) is NOT the same as (X|(*F)). */
1001
1002 if (rrc == MATCH_THEN)
1003 {
1004 next = ecode + GET(ecode,1);
1005 if (md->start_match_ptr < next &&
1006 (*ecode == OP_ALT || *next == OP_ALT))
1007 rrc = MATCH_NOMATCH;
1008 }
1009
1010 /* Anything other than NOMATCH is passed back. */
1011
1012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1013 md->capture_last = save_capture_last;
1014 ecode += GET(ecode, 1);
1015 md->mark = save_mark;
1016 if (*ecode != OP_ALT) break;
1017 }
1018
1019 DPRINTF(("bracket %d failed\n", number));
1020 md->offset_vector[offset] = save_offset1;
1021 md->offset_vector[offset+1] = save_offset2;
1022 md->offset_vector[md->offset_end - number] = save_offset3;
1023
1024 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1025
1026 RRETURN(rrc);
1027 }
1028
1029 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1030 as a non-capturing bracket. */
1031
1032 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034
1035 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1036
1037 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1038 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1039
1040 /* Non-capturing or atomic group, except for possessive with unlimited
1041 repeat and ONCE group with no captures. Loop for all the alternatives.
1042
1043 When we get to the final alternative within the brackets, we used to return
1044 the result of a recursive call to match() whatever happened so it was
1045 possible to reduce stack usage by turning this into a tail recursion,
1046 except in the case of a possibly empty group. However, now that there is
1047 the possibility of (*THEN) occurring in the final alternative, this
1048 optimization is no longer always possible.
1049
1050 We can optimize if we know there are no (*THEN)s in the pattern; at present
1051 this is the best that can be done.
1052
1053 MATCH_ONCE is returned when the end of an atomic group is successfully
1054 reached, but subsequent matching fails. It passes back up the tree (causing
1055 captured values to be reset) until the original atomic group level is
1056 reached. This is tested by comparing md->once_target with the start of the
1057 group. At this point, the return is converted into MATCH_NOMATCH so that
1058 previous backup points can be taken. */
1059
1060 case OP_ONCE:
1061 case OP_BRA:
1062 case OP_SBRA:
1063 DPRINTF(("start non-capturing bracket\n"));
1064
1065 for (;;)
1066 {
1067 if (op >= OP_SBRA || op == OP_ONCE)
1068 md->match_function_type = MATCH_CBEGROUP;
1069
1070 /* If this is not a possibly empty group, and there are no (*THEN)s in
1071 the pattern, and this is the final alternative, optimize as described
1072 above. */
1073
1074 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1075 {
1076 ecode += PRIV(OP_lengths)[*ecode];
1077 goto TAIL_RECURSE;
1078 }
1079
1080 /* In all other cases, we have to make another call to match(). */
1081
1082 save_mark = md->mark;
1083 save_capture_last = md->capture_last;
1084 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1085 RM2);
1086
1087 /* See comment in the code for capturing groups above about handling
1088 THEN. */
1089
1090 if (rrc == MATCH_THEN)
1091 {
1092 next = ecode + GET(ecode,1);
1093 if (md->start_match_ptr < next &&
1094 (*ecode == OP_ALT || *next == OP_ALT))
1095 rrc = MATCH_NOMATCH;
1096 }
1097
1098 if (rrc != MATCH_NOMATCH)
1099 {
1100 if (rrc == MATCH_ONCE)
1101 {
1102 const pcre_uchar *scode = ecode;
1103 if (*scode != OP_ONCE) /* If not at start, find it */
1104 {
1105 while (*scode == OP_ALT) scode += GET(scode, 1);
1106 scode -= GET(scode, 1);
1107 }
1108 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1109 }
1110 RRETURN(rrc);
1111 }
1112 ecode += GET(ecode, 1);
1113 md->mark = save_mark;
1114 if (*ecode != OP_ALT) break;
1115 md->capture_last = save_capture_last;
1116 }
1117
1118 RRETURN(MATCH_NOMATCH);
1119
1120 /* Handle possessive capturing brackets with an unlimited repeat. We come
1121 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1122 handled similarly to the normal case above. However, the matching is
1123 different. The end of these brackets will always be OP_KETRPOS, which
1124 returns MATCH_KETRPOS without going further in the pattern. By this means
1125 we can handle the group by iteration rather than recursion, thereby
1126 reducing the amount of stack needed. */
1127
1128 case OP_CBRAPOS:
1129 case OP_SCBRAPOS:
1130 allow_zero = FALSE;
1131
1132 POSSESSIVE_CAPTURE:
1133 number = GET2(ecode, 1+LINK_SIZE);
1134 offset = number << 1;
1135
1136 #ifdef PCRE_DEBUG
1137 printf("start possessive bracket %d\n", number);
1138 printf("subject=");
1139 pchars(eptr, 16, TRUE, md);
1140 printf("\n");
1141 #endif
1142
1143 if (offset < md->offset_max)
1144 {
1145 matched_once = FALSE;
1146 code_offset = (int)(ecode - md->start_code);
1147
1148 save_offset1 = md->offset_vector[offset];
1149 save_offset2 = md->offset_vector[offset+1];
1150 save_offset3 = md->offset_vector[md->offset_end - number];
1151 save_capture_last = md->capture_last;
1152
1153 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1154
1155 /* Each time round the loop, save the current subject position for use
1156 when the group matches. For MATCH_MATCH, the group has matched, so we
1157 restart it with a new subject starting position, remembering that we had
1158 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1159 usual. If we haven't matched any alternatives in any iteration, check to
1160 see if a previous iteration matched. If so, the group has matched;
1161 continue from afterwards. Otherwise it has failed; restore the previous
1162 capture values before returning NOMATCH. */
1163
1164 for (;;)
1165 {
1166 md->offset_vector[md->offset_end - number] =
1167 (int)(eptr - md->start_subject);
1168 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1169 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1170 eptrb, RM63);
1171 if (rrc == MATCH_KETRPOS)
1172 {
1173 offset_top = md->end_offset_top;
1174 eptr = md->end_match_ptr;
1175 ecode = md->start_code + code_offset;
1176 save_capture_last = md->capture_last;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 md->capture_last = save_capture_last;
1194 ecode += GET(ecode, 1);
1195 if (*ecode != OP_ALT) break;
1196 }
1197
1198 if (!matched_once)
1199 {
1200 md->offset_vector[offset] = save_offset1;
1201 md->offset_vector[offset+1] = save_offset2;
1202 md->offset_vector[md->offset_end - number] = save_offset3;
1203 }
1204
1205 if (allow_zero || matched_once)
1206 {
1207 ecode += 1 + LINK_SIZE;
1208 break;
1209 }
1210
1211 RRETURN(MATCH_NOMATCH);
1212 }
1213
1214 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1215 as a non-capturing bracket. */
1216
1217 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1218 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1219
1220 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1221
1222 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1223 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1224
1225 /* Non-capturing possessive bracket with unlimited repeat. We come here
1226 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1227 without the capturing complication. It is written out separately for speed
1228 and cleanliness. */
1229
1230 case OP_BRAPOS:
1231 case OP_SBRAPOS:
1232 allow_zero = FALSE;
1233
1234 POSSESSIVE_NON_CAPTURE:
1235 matched_once = FALSE;
1236 code_offset = (int)(ecode - md->start_code);
1237 save_capture_last = md->capture_last;
1238
1239 for (;;)
1240 {
1241 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1242 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1243 eptrb, RM48);
1244 if (rrc == MATCH_KETRPOS)
1245 {
1246 offset_top = md->end_offset_top;
1247 eptr = md->end_match_ptr;
1248 ecode = md->start_code + code_offset;
1249 matched_once = TRUE;
1250 continue;
1251 }
1252
1253 /* See comment in the code for capturing groups above about handling
1254 THEN. */
1255
1256 if (rrc == MATCH_THEN)
1257 {
1258 next = ecode + GET(ecode,1);
1259 if (md->start_match_ptr < next &&
1260 (*ecode == OP_ALT || *next == OP_ALT))
1261 rrc = MATCH_NOMATCH;
1262 }
1263
1264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1265 ecode += GET(ecode, 1);
1266 if (*ecode != OP_ALT) break;
1267 md->capture_last = save_capture_last;
1268 }
1269
1270 if (matched_once || allow_zero)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275 RRETURN(MATCH_NOMATCH);
1276
1277 /* Control never reaches here. */
1278
1279 /* Conditional group: compilation checked that there are no more than
1280 two branches. If the condition is false, skipping the first branch takes us
1281 past the end if there is only one branch, but that's OK because that is
1282 exactly what going to the ket would do. */
1283
1284 case OP_COND:
1285 case OP_SCOND:
1286 codelink = GET(ecode, 1);
1287
1288 /* Because of the way auto-callout works during compile, a callout item is
1289 inserted between OP_COND and an assertion condition. */
1290
1291 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1292 {
1293 if (PUBL(callout) != NULL)
1294 {
1295 PUBL(callout_block) cb;
1296 cb.version = 2; /* Version 2 of the callout block */
1297 cb.callout_number = ecode[LINK_SIZE+2];
1298 cb.offset_vector = md->offset_vector;
1299 #if defined COMPILE_PCRE8
1300 cb.subject = (PCRE_SPTR)md->start_subject;
1301 #elif defined COMPILE_PCRE16
1302 cb.subject = (PCRE_SPTR16)md->start_subject;
1303 #elif defined COMPILE_PCRE32
1304 cb.subject = (PCRE_SPTR32)md->start_subject;
1305 #endif
1306 cb.subject_length = (int)(md->end_subject - md->start_subject);
1307 cb.start_match = (int)(mstart - md->start_subject);
1308 cb.current_position = (int)(eptr - md->start_subject);
1309 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1310 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1311 cb.capture_top = offset_top/2;
1312 cb.capture_last = md->capture_last & CAPLMASK;
1313 /* Internal change requires this for API compatibility. */
1314 if (cb.capture_last == 0) cb.capture_last = -1;
1315 cb.callout_data = md->callout_data;
1316 cb.mark = md->nomatch_mark;
1317 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1318 if (rrc < 0) RRETURN(rrc);
1319 }
1320 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1321 }
1322
1323 condcode = ecode[LINK_SIZE+1];
1324
1325 /* Now see what the actual condition is */
1326
1327 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1328 {
1329 if (md->recursive == NULL) /* Not recursing => FALSE */
1330 {
1331 condition = FALSE;
1332 ecode += GET(ecode, 1);
1333 }
1334 else
1335 {
1336 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1337 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1338
1339 /* If the test is for recursion into a specific subpattern, and it is
1340 false, but the test was set up by name, scan the table to see if the
1341 name refers to any other numbers, and test them. The condition is true
1342 if any one is set. */
1343
1344 if (!condition && condcode == OP_NRREF)
1345 {
1346 pcre_uchar *slotA = md->name_table;
1347 for (i = 0; i < md->name_count; i++)
1348 {
1349 if (GET2(slotA, 0) == recno) break;
1350 slotA += md->name_entry_size;
1351 }
1352
1353 /* Found a name for the number - there can be only one; duplicate
1354 names for different numbers are allowed, but not vice versa. First
1355 scan down for duplicates. */
1356
1357 if (i < md->name_count)
1358 {
1359 pcre_uchar *slotB = slotA;
1360 while (slotB > md->name_table)
1361 {
1362 slotB -= md->name_entry_size;
1363 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1364 {
1365 condition = GET2(slotB, 0) == md->recursive->group_num;
1366 if (condition) break;
1367 }
1368 else break;
1369 }
1370
1371 /* Scan up for duplicates */
1372
1373 if (!condition)
1374 {
1375 slotB = slotA;
1376 for (i++; i < md->name_count; i++)
1377 {
1378 slotB += md->name_entry_size;
1379 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1380 {
1381 condition = GET2(slotB, 0) == md->recursive->group_num;
1382 if (condition) break;
1383 }
1384 else break;
1385 }
1386 }
1387 }
1388 }
1389
1390 /* Choose branch according to the condition */
1391
1392 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1393 }
1394 }
1395
1396 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1397 {
1398 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1399 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1400
1401 /* If the numbered capture is unset, but the reference was by name,
1402 scan the table to see if the name refers to any other numbers, and test
1403 them. The condition is true if any one is set. This is tediously similar
1404 to the code above, but not close enough to try to amalgamate. */
1405
1406 if (!condition && condcode == OP_NCREF)
1407 {
1408 unsigned int refno = offset >> 1;
1409 pcre_uchar *slotA = md->name_table;
1410
1411 for (i = 0; i < md->name_count; i++)
1412 {
1413 if (GET2(slotA, 0) == refno) break;
1414 slotA += md->name_entry_size;
1415 }
1416
1417 /* Found a name for the number - there can be only one; duplicate names
1418 for different numbers are allowed, but not vice versa. First scan down
1419 for duplicates. */
1420
1421 if (i < md->name_count)
1422 {
1423 pcre_uchar *slotB = slotA;
1424 while (slotB > md->name_table)
1425 {
1426 slotB -= md->name_entry_size;
1427 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1428 {
1429 offset = GET2(slotB, 0) << 1;
1430 condition = offset < offset_top &&
1431 md->offset_vector[offset] >= 0;
1432 if (condition) break;
1433 }
1434 else break;
1435 }
1436
1437 /* Scan up for duplicates */
1438
1439 if (!condition)
1440 {
1441 slotB = slotA;
1442 for (i++; i < md->name_count; i++)
1443 {
1444 slotB += md->name_entry_size;
1445 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1446 {
1447 offset = GET2(slotB, 0) << 1;
1448 condition = offset < offset_top &&
1449 md->offset_vector[offset] >= 0;
1450 if (condition) break;
1451 }
1452 else break;
1453 }
1454 }
1455 }
1456 }
1457
1458 /* Choose branch according to the condition */
1459
1460 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1461 }
1462
1463 else if (condcode == OP_DEF) /* DEFINE - always false */
1464 {
1465 condition = FALSE;
1466 ecode += GET(ecode, 1);
1467 }
1468
1469 /* The condition is an assertion. Call match() to evaluate it - setting
1470 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1471 an assertion. */
1472
1473 else
1474 {
1475 md->match_function_type = MATCH_CONDASSERT;
1476 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1477 if (rrc == MATCH_MATCH)
1478 {
1479 if (md->end_offset_top > offset_top)
1480 offset_top = md->end_offset_top; /* Captures may have happened */
1481 condition = TRUE;
1482 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1483 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1484 }
1485
1486 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1487 assertion; it is therefore treated as NOMATCH. */
1488
1489 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1490 {
1491 RRETURN(rrc); /* Need braces because of following else */
1492 }
1493 else
1494 {
1495 condition = FALSE;
1496 ecode += codelink;
1497 }
1498 }
1499
1500 /* We are now at the branch that is to be obeyed. As there is only one, can
1501 use tail recursion to avoid using another stack frame, except when there is
1502 unlimited repeat of a possibly empty group. In the latter case, a recursive
1503 call to match() is always required, unless the second alternative doesn't
1504 exist, in which case we can just plough on. Note that, for compatibility
1505 with Perl, the | in a conditional group is NOT treated as creating two
1506 alternatives. If a THEN is encountered in the branch, it propagates out to
1507 the enclosing alternative (unless nested in a deeper set of alternatives,
1508 of course). */
1509
1510 if (condition || *ecode == OP_ALT)
1511 {
1512 if (op != OP_SCOND)
1513 {
1514 ecode += 1 + LINK_SIZE;
1515 goto TAIL_RECURSE;
1516 }
1517
1518 md->match_function_type = MATCH_CBEGROUP;
1519 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1520 RRETURN(rrc);
1521 }
1522
1523 /* Condition false & no alternative; continue after the group. */
1524
1525 else
1526 {
1527 ecode += 1 + LINK_SIZE;
1528 }
1529 break;
1530
1531
1532 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1533 to close any currently open capturing brackets. */
1534
1535 case OP_CLOSE:
1536 number = GET2(ecode, 1); /* Must be less than 65536 */
1537 offset = number << 1;
1538
1539 #ifdef PCRE_DEBUG
1540 printf("end bracket %d at *ACCEPT", number);
1541 printf("\n");
1542 #endif
1543
1544 md->capture_last = (md->capture_last & OVFLMASK) | number;
1545 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1546 {
1547 md->offset_vector[offset] =
1548 md->offset_vector[md->offset_end - number];
1549 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1550 if (offset_top <= offset) offset_top = offset + 2;
1551 }
1552 ecode += 1 + IMM2_SIZE;
1553 break;
1554
1555
1556 /* End of the pattern, either real or forced. */
1557
1558 case OP_END:
1559 case OP_ACCEPT:
1560 case OP_ASSERT_ACCEPT:
1561
1562 /* If we have matched an empty string, fail if not in an assertion and not
1563 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1564 is set and we have matched at the start of the subject. In both cases,
1565 backtracking will then try other alternatives, if any. */
1566
1567 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1568 md->recursive == NULL &&
1569 (md->notempty ||
1570 (md->notempty_atstart &&
1571 mstart == md->start_subject + md->start_offset)))
1572 RRETURN(MATCH_NOMATCH);
1573
1574 /* Otherwise, we have a match. */
1575
1576 md->end_match_ptr = eptr; /* Record where we ended */
1577 md->end_offset_top = offset_top; /* and how many extracts were taken */
1578 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1579
1580 /* For some reason, the macros don't work properly if an expression is
1581 given as the argument to RRETURN when the heap is in use. */
1582
1583 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1584 RRETURN(rrc);
1585
1586 /* Assertion brackets. Check the alternative branches in turn - the
1587 matching won't pass the KET for an assertion. If any one branch matches,
1588 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1589 start of each branch to move the current point backwards, so the code at
1590 this level is identical to the lookahead case. When the assertion is part
1591 of a condition, we want to return immediately afterwards. The caller of
1592 this incarnation of the match() function will have set MATCH_CONDASSERT in
1593 md->match_function_type, and one of these opcodes will be the first opcode
1594 that is processed. We use a local variable that is preserved over calls to
1595 match() to remember this case. */
1596
1597 case OP_ASSERT:
1598 case OP_ASSERTBACK:
1599 save_mark = md->mark;
1600 if (md->match_function_type == MATCH_CONDASSERT)
1601 {
1602 condassert = TRUE;
1603 md->match_function_type = 0;
1604 }
1605 else condassert = FALSE;
1606
1607 do
1608 {
1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1610 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1611 {
1612 mstart = md->start_match_ptr; /* In case \K reset it */
1613 break;
1614 }
1615 md->mark = save_mark;
1616
1617 /* A COMMIT failure must fail the entire assertion, without trying any
1618 subsequent branches. */
1619
1620 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1621
1622 /* PCRE does not allow THEN to escape beyond an assertion; it
1623 is treated as NOMATCH. */
1624
1625 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1626 ecode += GET(ecode, 1);
1627 }
1628 while (*ecode == OP_ALT);
1629
1630 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1631
1632 /* If checking an assertion for a condition, return MATCH_MATCH. */
1633
1634 if (condassert) RRETURN(MATCH_MATCH);
1635
1636 /* Continue from after the assertion, updating the offsets high water
1637 mark, since extracts may have been taken during the assertion. */
1638
1639 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1640 ecode += 1 + LINK_SIZE;
1641 offset_top = md->end_offset_top;
1642 continue;
1643
1644 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1645 PRUNE, or COMMIT means we must assume failure without checking subsequent
1646 branches. */
1647
1648 case OP_ASSERT_NOT:
1649 case OP_ASSERTBACK_NOT:
1650 save_mark = md->mark;
1651 if (md->match_function_type == MATCH_CONDASSERT)
1652 {
1653 condassert = TRUE;
1654 md->match_function_type = 0;
1655 }
1656 else condassert = FALSE;
1657
1658 do
1659 {
1660 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1661 md->mark = save_mark;
1662 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1663 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1664 {
1665 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1666 break;
1667 }
1668
1669 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1670 as NOMATCH. */
1671
1672 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1673 ecode += GET(ecode,1);
1674 }
1675 while (*ecode == OP_ALT);
1676
1677 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1678
1679 ecode += 1 + LINK_SIZE;
1680 continue;
1681
1682 /* Move the subject pointer back. This occurs only at the start of
1683 each branch of a lookbehind assertion. If we are too close to the start to
1684 move back, this match function fails. When working with UTF-8 we move
1685 back a number of characters, not bytes. */
1686
1687 case OP_REVERSE:
1688 #ifdef SUPPORT_UTF
1689 if (utf)
1690 {
1691 i = GET(ecode, 1);
1692 while (i-- > 0)
1693 {
1694 eptr--;
1695 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1696 BACKCHAR(eptr);
1697 }
1698 }
1699 else
1700 #endif
1701
1702 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1703
1704 {
1705 eptr -= GET(ecode, 1);
1706 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1707 }
1708
1709 /* Save the earliest consulted character, then skip to next op code */
1710
1711 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1712 ecode += 1 + LINK_SIZE;
1713 break;
1714
1715 /* The callout item calls an external function, if one is provided, passing
1716 details of the match so far. This is mainly for debugging, though the
1717 function is able to force a failure. */
1718
1719 case OP_CALLOUT:
1720 if (PUBL(callout) != NULL)
1721 {
1722 PUBL(callout_block) cb;
1723 cb.version = 2; /* Version 2 of the callout block */
1724 cb.callout_number = ecode[1];
1725 cb.offset_vector = md->offset_vector;
1726 #if defined COMPILE_PCRE8
1727 cb.subject = (PCRE_SPTR)md->start_subject;
1728 #elif defined COMPILE_PCRE16
1729 cb.subject = (PCRE_SPTR16)md->start_subject;
1730 #elif defined COMPILE_PCRE32
1731 cb.subject = (PCRE_SPTR32)md->start_subject;
1732 #endif
1733 cb.subject_length = (int)(md->end_subject - md->start_subject);
1734 cb.start_match = (int)(mstart - md->start_subject);
1735 cb.current_position = (int)(eptr - md->start_subject);
1736 cb.pattern_position = GET(ecode, 2);
1737 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1738 cb.capture_top = offset_top/2;
1739 cb.capture_last = md->capture_last & CAPLMASK;
1740 /* Internal change requires this for API compatibility. */
1741 if (cb.capture_last == 0) cb.capture_last = -1;
1742 cb.callout_data = md->callout_data;
1743 cb.mark = md->nomatch_mark;
1744 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1745 if (rrc < 0) RRETURN(rrc);
1746 }
1747 ecode += 2 + 2*LINK_SIZE;
1748 break;
1749
1750 /* Recursion either matches the current regex, or some subexpression. The
1751 offset data is the offset to the starting bracket from the start of the
1752 whole pattern. (This is so that it works from duplicated subpatterns.)
1753
1754 The state of the capturing groups is preserved over recursion, and
1755 re-instated afterwards. We don't know how many are started and not yet
1756 finished (offset_top records the completed total) so we just have to save
1757 all the potential data. There may be up to 65535 such values, which is too
1758 large to put on the stack, but using malloc for small numbers seems
1759 expensive. As a compromise, the stack is used when there are no more than
1760 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1761
1762 There are also other values that have to be saved. We use a chained
1763 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1764 for the original version of this logic. It has, however, been hacked around
1765 a lot, so he is not to blame for the current way it works. */
1766
1767 case OP_RECURSE:
1768 {
1769 recursion_info *ri;
1770 unsigned int recno;
1771
1772 callpat = md->start_code + GET(ecode, 1);
1773 recno = (callpat == md->start_code)? 0 :
1774 GET2(callpat, 1 + LINK_SIZE);
1775
1776 /* Check for repeating a recursion without advancing the subject pointer.
1777 This should catch convoluted mutual recursions. (Some simple cases are
1778 caught at compile time.) */
1779
1780 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1781 if (recno == ri->group_num && eptr == ri->subject_position)
1782 RRETURN(PCRE_ERROR_RECURSELOOP);
1783
1784 /* Add to "recursing stack" */
1785
1786 new_recursive.group_num = recno;
1787 new_recursive.saved_capture_last = md->capture_last;
1788 new_recursive.subject_position = eptr;
1789 new_recursive.prevrec = md->recursive;
1790 md->recursive = &new_recursive;
1791
1792 /* Where to continue from afterwards */
1793
1794 ecode += 1 + LINK_SIZE;
1795
1796 /* Now save the offset data */
1797
1798 new_recursive.saved_max = md->offset_end;
1799 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1800 new_recursive.offset_save = stacksave;
1801 else
1802 {
1803 new_recursive.offset_save =
1804 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1805 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1806 }
1807 memcpy(new_recursive.offset_save, md->offset_vector,
1808 new_recursive.saved_max * sizeof(int));
1809
1810 /* OK, now we can do the recursion. After processing each alternative,
1811 restore the offset data and the last captured value. If there were nested
1812 recursions, md->recursive might be changed, so reset it before looping.
1813 */
1814
1815 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1816 cbegroup = (*callpat >= OP_SBRA);
1817 do
1818 {
1819 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1820 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1821 md, eptrb, RM6);
1822 memcpy(md->offset_vector, new_recursive.offset_save,
1823 new_recursive.saved_max * sizeof(int));
1824 md->capture_last = new_recursive.saved_capture_last;
1825 md->recursive = new_recursive.prevrec;
1826 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1827 {
1828 DPRINTF(("Recursion matched\n"));
1829 if (new_recursive.offset_save != stacksave)
1830 (PUBL(free))(new_recursive.offset_save);
1831
1832 /* Set where we got to in the subject, and reset the start in case
1833 it was changed by \K. This *is* propagated back out of a recursion,
1834 for Perl compatibility. */
1835
1836 eptr = md->end_match_ptr;
1837 mstart = md->start_match_ptr;
1838 goto RECURSION_MATCHED; /* Exit loop; end processing */
1839 }
1840
1841 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1842 is treated as NOMATCH. */
1843
1844 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1845 rrc != MATCH_COMMIT)
1846 {
1847 DPRINTF(("Recursion gave error %d\n", rrc));
1848 if (new_recursive.offset_save != stacksave)
1849 (PUBL(free))(new_recursive.offset_save);
1850 RRETURN(rrc);
1851 }
1852
1853 md->recursive = &new_recursive;
1854 callpat += GET(callpat, 1);
1855 }
1856 while (*callpat == OP_ALT);
1857
1858 DPRINTF(("Recursion didn't match\n"));
1859 md->recursive = new_recursive.prevrec;
1860 if (new_recursive.offset_save != stacksave)
1861 (PUBL(free))(new_recursive.offset_save);
1862 RRETURN(MATCH_NOMATCH);
1863 }
1864
1865 RECURSION_MATCHED:
1866 break;
1867
1868 /* An alternation is the end of a branch; scan along to find the end of the
1869 bracketed group and go to there. */
1870
1871 case OP_ALT:
1872 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1873 break;
1874
1875 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1876 indicating that it may occur zero times. It may repeat infinitely, or not
1877 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1878 with fixed upper repeat limits are compiled as a number of copies, with the
1879 optional ones preceded by BRAZERO or BRAMINZERO. */
1880
1881 case OP_BRAZERO:
1882 next = ecode + 1;
1883 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1885 do next += GET(next, 1); while (*next == OP_ALT);
1886 ecode = next + 1 + LINK_SIZE;
1887 break;
1888
1889 case OP_BRAMINZERO:
1890 next = ecode + 1;
1891 do next += GET(next, 1); while (*next == OP_ALT);
1892 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1894 ecode++;
1895 break;
1896
1897 case OP_SKIPZERO:
1898 next = ecode+1;
1899 do next += GET(next,1); while (*next == OP_ALT);
1900 ecode = next + 1 + LINK_SIZE;
1901 break;
1902
1903 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1904 here; just jump to the group, with allow_zero set TRUE. */
1905
1906 case OP_BRAPOSZERO:
1907 op = *(++ecode);
1908 allow_zero = TRUE;
1909 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1910 goto POSSESSIVE_NON_CAPTURE;
1911
1912 /* End of a group, repeated or non-repeating. */
1913
1914 case OP_KET:
1915 case OP_KETRMIN:
1916 case OP_KETRMAX:
1917 case OP_KETRPOS:
1918 prev = ecode - GET(ecode, 1);
1919
1920 /* If this was a group that remembered the subject start, in order to break
1921 infinite repeats of empty string matches, retrieve the subject start from
1922 the chain. Otherwise, set it NULL. */
1923
1924 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1925 {
1926 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1927 eptrb = eptrb->epb_prev; /* Backup to previous group */
1928 }
1929 else saved_eptr = NULL;
1930
1931 /* If we are at the end of an assertion group or a non-capturing atomic
1932 group, stop matching and return MATCH_MATCH, but record the current high
1933 water mark for use by positive assertions. We also need to record the match
1934 start in case it was changed by \K. */
1935
1936 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1937 *prev == OP_ONCE_NC)
1938 {
1939 md->end_match_ptr = eptr; /* For ONCE_NC */
1940 md->end_offset_top = offset_top;
1941 md->start_match_ptr = mstart;
1942 RRETURN(MATCH_MATCH); /* Sets md->mark */
1943 }
1944
1945 /* For capturing groups we have to check the group number back at the start
1946 and if necessary complete handling an extraction by setting the offsets and
1947 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1948 into group 0, so it won't be picked up here. Instead, we catch it when the
1949 OP_END is reached. Other recursion is handled here. We just have to record
1950 the current subject position and start match pointer and give a MATCH
1951 return. */
1952
1953 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1954 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1955 {
1956 number = GET2(prev, 1+LINK_SIZE);
1957 offset = number << 1;
1958
1959 #ifdef PCRE_DEBUG
1960 printf("end bracket %d", number);
1961 printf("\n");
1962 #endif
1963
1964 /* Handle a recursively called group. */
1965
1966 if (md->recursive != NULL && md->recursive->group_num == number)
1967 {
1968 md->end_match_ptr = eptr;
1969 md->start_match_ptr = mstart;
1970 RRETURN(MATCH_MATCH);
1971 }
1972
1973 /* Deal with capturing */
1974
1975 md->capture_last = (md->capture_last & OVFLMASK) | number;
1976 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1977 {
1978 /* If offset is greater than offset_top, it means that we are
1979 "skipping" a capturing group, and that group's offsets must be marked
1980 unset. In earlier versions of PCRE, all the offsets were unset at the
1981 start of matching, but this doesn't work because atomic groups and
1982 assertions can cause a value to be set that should later be unset.
1983 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1984 part of the atomic group, but this is not on the final matching path,
1985 so must be unset when 2 is set. (If there is no group 2, there is no
1986 problem, because offset_top will then be 2, indicating no capture.) */
1987
1988 if (offset > offset_top)
1989 {
1990 register int *iptr = md->offset_vector + offset_top;
1991 register int *iend = md->offset_vector + offset;
1992 while (iptr < iend) *iptr++ = -1;
1993 }
1994
1995 /* Now make the extraction */
1996
1997 md->offset_vector[offset] =
1998 md->offset_vector[md->offset_end - number];
1999 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2000 if (offset_top <= offset) offset_top = offset + 2;
2001 }
2002 }
2003
2004 /* For an ordinary non-repeating ket, just continue at this level. This
2005 also happens for a repeating ket if no characters were matched in the
2006 group. This is the forcible breaking of infinite loops as implemented in
2007 Perl 5.005. For a non-repeating atomic group that includes captures,
2008 establish a backup point by processing the rest of the pattern at a lower
2009 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2010 original OP_ONCE level, thereby bypassing intermediate backup points, but
2011 resetting any captures that happened along the way. */
2012
2013 if (*ecode == OP_KET || eptr == saved_eptr)
2014 {
2015 if (*prev == OP_ONCE)
2016 {
2017 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2020 RRETURN(MATCH_ONCE);
2021 }
2022 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2023 break;
2024 }
2025
2026 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2027 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2028 at a time from the outer level, thus saving stack. */
2029
2030 if (*ecode == OP_KETRPOS)
2031 {
2032 md->end_match_ptr = eptr;
2033 md->end_offset_top = offset_top;
2034 RRETURN(MATCH_KETRPOS);
2035 }
2036
2037 /* The normal repeating kets try the rest of the pattern or restart from
2038 the preceding bracket, in the appropriate order. In the second case, we can
2039 use tail recursion to avoid using another stack frame, unless we have
2040 an atomic group or an unlimited repeat of a group that can match an empty
2041 string. */
2042
2043 if (*ecode == OP_KETRMIN)
2044 {
2045 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2046 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2047 if (*prev == OP_ONCE)
2048 {
2049 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2051 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2052 RRETURN(MATCH_ONCE);
2053 }
2054 if (*prev >= OP_SBRA) /* Could match an empty string */
2055 {
2056 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2057 RRETURN(rrc);
2058 }
2059 ecode = prev;
2060 goto TAIL_RECURSE;
2061 }
2062 else /* OP_KETRMAX */
2063 {
2064 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2065 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 if (*prev == OP_ONCE)
2068 {
2069 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2071 md->once_target = prev;
2072 RRETURN(MATCH_ONCE);
2073 }
2074 ecode += 1 + LINK_SIZE;
2075 goto TAIL_RECURSE;
2076 }
2077 /* Control never gets here */
2078
2079 /* Not multiline mode: start of subject assertion, unless notbol. */
2080
2081 case OP_CIRC:
2082 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2083
2084 /* Start of subject assertion */
2085
2086 case OP_SOD:
2087 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2088 ecode++;
2089 break;
2090
2091 /* Multiline mode: start of subject unless notbol, or after any newline. */
2092
2093 case OP_CIRCM:
2094 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2095 if (eptr != md->start_subject &&
2096 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2097 RRETURN(MATCH_NOMATCH);
2098 ecode++;
2099 break;
2100
2101 /* Start of match assertion */
2102
2103 case OP_SOM:
2104 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2105 ecode++;
2106 break;
2107
2108 /* Reset the start of match point */
2109
2110 case OP_SET_SOM:
2111 mstart = eptr;
2112 ecode++;
2113 break;
2114
2115 /* Multiline mode: assert before any newline, or before end of subject
2116 unless noteol is set. */
2117
2118 case OP_DOLLM:
2119 if (eptr < md->end_subject)
2120 {
2121 if (!IS_NEWLINE(eptr))
2122 {
2123 if (md->partial != 0 &&
2124 eptr + 1 >= md->end_subject &&
2125 NLBLOCK->nltype == NLTYPE_FIXED &&
2126 NLBLOCK->nllen == 2 &&
2127 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2128 {
2129 md->hitend = TRUE;
2130 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2131 }
2132 RRETURN(MATCH_NOMATCH);
2133 }
2134 }
2135 else
2136 {
2137 if (md->noteol) RRETURN(MATCH_NOMATCH);
2138 SCHECK_PARTIAL();
2139 }
2140 ecode++;
2141 break;
2142
2143 /* Not multiline mode: assert before a terminating newline or before end of
2144 subject unless noteol is set. */
2145
2146 case OP_DOLL:
2147 if (md->noteol) RRETURN(MATCH_NOMATCH);
2148 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2149
2150 /* ... else fall through for endonly */
2151
2152 /* End of subject assertion (\z) */
2153
2154 case OP_EOD:
2155 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2156 SCHECK_PARTIAL();
2157 ecode++;
2158 break;
2159
2160 /* End of subject or ending \n assertion (\Z) */
2161
2162 case OP_EODN:
2163 ASSERT_NL_OR_EOS:
2164 if (eptr < md->end_subject &&
2165 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2166 {
2167 if (md->partial != 0 &&
2168 eptr + 1 >= md->end_subject &&
2169 NLBLOCK->nltype == NLTYPE_FIXED &&
2170 NLBLOCK->nllen == 2 &&
2171 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2172 {
2173 md->hitend = TRUE;
2174 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2175 }
2176 RRETURN(MATCH_NOMATCH);
2177 }
2178
2179 /* Either at end of string or \n before end. */
2180
2181 SCHECK_PARTIAL();
2182 ecode++;
2183 break;
2184
2185 /* Word boundary assertions */
2186
2187 case OP_NOT_WORD_BOUNDARY:
2188 case OP_WORD_BOUNDARY:
2189 {
2190
2191 /* Find out if the previous and current characters are "word" characters.
2192 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2193 be "non-word" characters. Remember the earliest consulted character for
2194 partial matching. */
2195
2196 #ifdef SUPPORT_UTF
2197 if (utf)
2198 {
2199 /* Get status of previous character */
2200
2201 if (eptr == md->start_subject) prev_is_word = FALSE; else
2202 {
2203 PCRE_PUCHAR lastptr = eptr - 1;
2204 BACKCHAR(lastptr);
2205 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2206 GETCHAR(c, lastptr);
2207 #ifdef SUPPORT_UCP
2208 if (md->use_ucp)
2209 {
2210 if (c == '_') prev_is_word = TRUE; else
2211 {
2212 int cat = UCD_CATEGORY(c);
2213 prev_is_word = (cat == ucp_L || cat == ucp_N);
2214 }
2215 }
2216 else
2217 #endif
2218 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2219 }
2220
2221 /* Get status of next character */
2222
2223 if (eptr >= md->end_subject)
2224 {
2225 SCHECK_PARTIAL();
2226 cur_is_word = FALSE;
2227 }
2228 else
2229 {
2230 GETCHAR(c, eptr);
2231 #ifdef SUPPORT_UCP
2232 if (md->use_ucp)
2233 {
2234 if (c == '_') cur_is_word = TRUE; else
2235 {
2236 int cat = UCD_CATEGORY(c);
2237 cur_is_word = (cat == ucp_L || cat == ucp_N);
2238 }
2239 }
2240 else
2241 #endif
2242 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2243 }
2244 }
2245 else
2246 #endif
2247
2248 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2249 consistency with the behaviour of \w we do use it in this case. */
2250
2251 {
2252 /* Get status of previous character */
2253
2254 if (eptr == md->start_subject) prev_is_word = FALSE; else
2255 {
2256 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2257 #ifdef SUPPORT_UCP
2258 if (md->use_ucp)
2259 {
2260 c = eptr[-1];
2261 if (c == '_') prev_is_word = TRUE; else
2262 {
2263 int cat = UCD_CATEGORY(c);
2264 prev_is_word = (cat == ucp_L || cat == ucp_N);
2265 }
2266 }
2267 else
2268 #endif
2269 prev_is_word = MAX_255(eptr[-1])
2270 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2271 }
2272
2273 /* Get status of next character */
2274
2275 if (eptr >= md->end_subject)
2276 {
2277 SCHECK_PARTIAL();
2278 cur_is_word = FALSE;
2279 }
2280 else
2281 #ifdef SUPPORT_UCP
2282 if (md->use_ucp)
2283 {
2284 c = *eptr;
2285 if (c == '_') cur_is_word = TRUE; else
2286 {
2287 int cat = UCD_CATEGORY(c);
2288 cur_is_word = (cat == ucp_L || cat == ucp_N);
2289 }
2290 }
2291 else
2292 #endif
2293 cur_is_word = MAX_255(*eptr)
2294 && ((md->ctypes[*eptr] & ctype_word) != 0);
2295 }
2296
2297 /* Now see if the situation is what we want */
2298
2299 if ((*ecode++ == OP_WORD_BOUNDARY)?
2300 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2301 RRETURN(MATCH_NOMATCH);
2302 }
2303 break;
2304
2305 /* Match any single character type except newline; have to take care with
2306 CRLF newlines and partial matching. */
2307
2308 case OP_ANY:
2309 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2310 if (md->partial != 0 &&
2311 eptr + 1 >= md->end_subject &&
2312 NLBLOCK->nltype == NLTYPE_FIXED &&
2313 NLBLOCK->nllen == 2 &&
2314 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2315 {
2316 md->hitend = TRUE;
2317 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2318 }
2319
2320 /* Fall through */
2321
2322 /* Match any single character whatsoever. */
2323
2324 case OP_ALLANY:
2325 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2326 { /* not be updated before SCHECK_PARTIAL. */
2327 SCHECK_PARTIAL();
2328 RRETURN(MATCH_NOMATCH);
2329 }
2330 eptr++;
2331 #ifdef SUPPORT_UTF
2332 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2333 #endif
2334 ecode++;
2335 break;
2336
2337 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2338 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2339
2340 case OP_ANYBYTE:
2341 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2342 { /* not be updated before SCHECK_PARTIAL. */
2343 SCHECK_PARTIAL();
2344 RRETURN(MATCH_NOMATCH);
2345 }
2346 eptr++;
2347 ecode++;
2348 break;
2349
2350 case OP_NOT_DIGIT:
2351 if (eptr >= md->end_subject)
2352 {
2353 SCHECK_PARTIAL();
2354 RRETURN(MATCH_NOMATCH);
2355 }
2356 GETCHARINCTEST(c, eptr);
2357 if (
2358 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2359 c < 256 &&
2360 #endif
2361 (md->ctypes[c] & ctype_digit) != 0
2362 )
2363 RRETURN(MATCH_NOMATCH);
2364 ecode++;
2365 break;
2366
2367 case OP_DIGIT:
2368 if (eptr >= md->end_subject)
2369 {
2370 SCHECK_PARTIAL();
2371 RRETURN(MATCH_NOMATCH);
2372 }
2373 GETCHARINCTEST(c, eptr);
2374 if (
2375 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2376 c > 255 ||
2377 #endif
2378 (md->ctypes[c] & ctype_digit) == 0
2379 )
2380 RRETURN(MATCH_NOMATCH);
2381 ecode++;
2382 break;
2383
2384 case OP_NOT_WHITESPACE:
2385 if (eptr >= md->end_subject)
2386 {
2387 SCHECK_PARTIAL();
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 GETCHARINCTEST(c, eptr);
2391 if (
2392 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2393 c < 256 &&
2394 #endif
2395 (md->ctypes[c] & ctype_space) != 0
2396 )
2397 RRETURN(MATCH_NOMATCH);
2398 ecode++;
2399 break;
2400
2401 case OP_WHITESPACE:
2402 if (eptr >= md->end_subject)
2403 {
2404 SCHECK_PARTIAL();
2405 RRETURN(MATCH_NOMATCH);
2406 }
2407 GETCHARINCTEST(c, eptr);
2408 if (
2409 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2410 c > 255 ||
2411 #endif
2412 (md->ctypes[c] & ctype_space) == 0
2413 )
2414 RRETURN(MATCH_NOMATCH);
2415 ecode++;
2416 break;
2417
2418 case OP_NOT_WORDCHAR:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 RRETURN(MATCH_NOMATCH);
2423 }
2424 GETCHARINCTEST(c, eptr);
2425 if (
2426 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2427 c < 256 &&
2428 #endif
2429 (md->ctypes[c] & ctype_word) != 0
2430 )
2431 RRETURN(MATCH_NOMATCH);
2432 ecode++;
2433 break;
2434
2435 case OP_WORDCHAR:
2436 if (eptr >= md->end_subject)
2437 {
2438 SCHECK_PARTIAL();
2439 RRETURN(MATCH_NOMATCH);
2440 }
2441 GETCHARINCTEST(c, eptr);
2442 if (
2443 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2444 c > 255 ||
2445 #endif
2446 (md->ctypes[c] & ctype_word) == 0
2447 )
2448 RRETURN(MATCH_NOMATCH);
2449 ecode++;
2450 break;
2451
2452 case OP_ANYNL:
2453 if (eptr >= md->end_subject)
2454 {
2455 SCHECK_PARTIAL();
2456 RRETURN(MATCH_NOMATCH);
2457 }
2458 GETCHARINCTEST(c, eptr);
2459 switch(c)
2460 {
2461 default: RRETURN(MATCH_NOMATCH);
2462
2463 case CHAR_CR:
2464 if (eptr >= md->end_subject)
2465 {
2466 SCHECK_PARTIAL();
2467 }
2468 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2469 break;
2470
2471 case CHAR_LF:
2472 break;
2473
2474 case CHAR_VT:
2475 case CHAR_FF:
2476 case CHAR_NEL:
2477 #ifndef EBCDIC
2478 case 0x2028:
2479 case 0x2029:
2480 #endif /* Not EBCDIC */
2481 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2482 break;
2483 }
2484 ecode++;
2485 break;
2486
2487 case OP_NOT_HSPACE:
2488 if (eptr >= md->end_subject)
2489 {
2490 SCHECK_PARTIAL();
2491 RRETURN(MATCH_NOMATCH);
2492 }
2493 GETCHARINCTEST(c, eptr);
2494 switch(c)
2495 {
2496 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2497 default: break;
2498 }
2499 ecode++;
2500 break;
2501
2502 case OP_HSPACE:
2503 if (eptr >= md->end_subject)
2504 {
2505 SCHECK_PARTIAL();
2506 RRETURN(MATCH_NOMATCH);
2507 }
2508 GETCHARINCTEST(c, eptr);
2509 switch(c)
2510 {
2511 HSPACE_CASES: break; /* Byte and multibyte cases */
2512 default: RRETURN(MATCH_NOMATCH);
2513 }
2514 ecode++;
2515 break;
2516
2517 case OP_NOT_VSPACE:
2518 if (eptr >= md->end_subject)
2519 {
2520 SCHECK_PARTIAL();
2521 RRETURN(MATCH_NOMATCH);
2522 }
2523 GETCHARINCTEST(c, eptr);
2524 switch(c)
2525 {
2526 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2527 default: break;
2528 }
2529 ecode++;
2530 break;
2531
2532 case OP_VSPACE:
2533 if (eptr >= md->end_subject)
2534 {
2535 SCHECK_PARTIAL();
2536 RRETURN(MATCH_NOMATCH);
2537 }
2538 GETCHARINCTEST(c, eptr);
2539 switch(c)
2540 {
2541 VSPACE_CASES: break;
2542 default: RRETURN(MATCH_NOMATCH);
2543 }
2544 ecode++;
2545 break;
2546
2547 #ifdef SUPPORT_UCP
2548 /* Check the next character by Unicode property. We will get here only
2549 if the support is in the binary; otherwise a compile-time error occurs. */
2550
2551 case OP_PROP:
2552 case OP_NOTPROP:
2553 if (eptr >= md->end_subject)
2554 {
2555 SCHECK_PARTIAL();
2556 RRETURN(MATCH_NOMATCH);
2557 }
2558 GETCHARINCTEST(c, eptr);
2559 {
2560 const pcre_uint32 *cp;
2561 const ucd_record *prop = GET_UCD(c);
2562
2563 switch(ecode[1])
2564 {
2565 case PT_ANY:
2566 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2567 break;
2568
2569 case PT_LAMP:
2570 if ((prop->chartype == ucp_Lu ||
2571 prop->chartype == ucp_Ll ||
2572 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2573 RRETURN(MATCH_NOMATCH);
2574 break;
2575
2576 case PT_GC:
2577 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2580
2581 case PT_PC:
2582 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2583 RRETURN(MATCH_NOMATCH);
2584 break;
2585
2586 case PT_SC:
2587 if ((ecode[2] != prop->script) == (op == OP_PROP))
2588 RRETURN(MATCH_NOMATCH);
2589 break;
2590
2591 /* These are specials */
2592
2593 case PT_ALNUM:
2594 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2595 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2596 RRETURN(MATCH_NOMATCH);
2597 break;
2598
2599 case PT_SPACE: /* Perl space */
2600 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2601 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2602 == (op == OP_NOTPROP))
2603 RRETURN(MATCH_NOMATCH);
2604 break;
2605
2606 case PT_PXSPACE: /* POSIX space */
2607 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2608 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2609 c == CHAR_FF || c == CHAR_CR)
2610 == (op == OP_NOTPROP))
2611 RRETURN(MATCH_NOMATCH);
2612 break;
2613
2614 case PT_WORD:
2615 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2616 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2617 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2618 RRETURN(MATCH_NOMATCH);
2619 break;
2620
2621 case PT_CLIST:
2622 cp = PRIV(ucd_caseless_sets) + ecode[2];
2623 for (;;)
2624 {
2625 if (c < *cp)
2626 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2627 if (c == *cp++)
2628 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2629 }
2630 break;
2631
2632 /* This should never occur */
2633
2634 default:
2635 RRETURN(PCRE_ERROR_INTERNAL);
2636 }
2637
2638 ecode += 3;
2639 }
2640 break;
2641
2642 /* Match an extended Unicode sequence. We will get here only if the support
2643 is in the binary; otherwise a compile-time error occurs. */
2644
2645 case OP_EXTUNI:
2646 if (eptr >= md->end_subject)
2647 {
2648 SCHECK_PARTIAL();
2649 RRETURN(MATCH_NOMATCH);
2650 }
2651 else
2652 {
2653 int lgb, rgb;
2654 GETCHARINCTEST(c, eptr);
2655 lgb = UCD_GRAPHBREAK(c);
2656 while (eptr < md->end_subject)
2657 {
2658 int len = 1;
2659 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2660 rgb = UCD_GRAPHBREAK(c);
2661 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2662 lgb = rgb;
2663 eptr += len;
2664 }
2665 }
2666 CHECK_PARTIAL();
2667 ecode++;
2668 break;
2669 #endif /* SUPPORT_UCP */
2670
2671
2672 /* Match a back reference, possibly repeatedly. Look past the end of the
2673 item to see if there is repeat information following. The code is similar
2674 to that for character classes, but repeated for efficiency. Then obey
2675 similar code to character type repeats - written out again for speed.
2676 However, if the referenced string is the empty string, always treat
2677 it as matched, any number of times (otherwise there could be infinite
2678 loops). */
2679
2680 case OP_REF:
2681 case OP_REFI:
2682 caseless = op == OP_REFI;
2683 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2684 ecode += 1 + IMM2_SIZE;
2685
2686 /* If the reference is unset, there are two possibilities:
2687
2688 (a) In the default, Perl-compatible state, set the length negative;
2689 this ensures that every attempt at a match fails. We can't just fail
2690 here, because of the possibility of quantifiers with zero minima.
2691
2692 (b) If the JavaScript compatibility flag is set, set the length to zero
2693 so that the back reference matches an empty string.
2694
2695 Otherwise, set the length to the length of what was matched by the
2696 referenced subpattern. */
2697
2698 if (offset >= offset_top || md->offset_vector[offset] < 0)
2699 length = (md->jscript_compat)? 0 : -1;
2700 else
2701 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2702
2703 /* Set up for repetition, or handle the non-repeated case */
2704
2705 switch (*ecode)
2706 {
2707 case OP_CRSTAR:
2708 case OP_CRMINSTAR:
2709 case OP_CRPLUS:
2710 case OP_CRMINPLUS:
2711 case OP_CRQUERY:
2712 case OP_CRMINQUERY:
2713 c = *ecode++ - OP_CRSTAR;
2714 minimize = (c & 1) != 0;
2715 min = rep_min[c]; /* Pick up values from tables; */
2716 max = rep_max[c]; /* zero for max => infinity */
2717 if (max == 0) max = INT_MAX;
2718 break;
2719
2720 case OP_CRRANGE:
2721 case OP_CRMINRANGE:
2722 minimize = (*ecode == OP_CRMINRANGE);
2723 min = GET2(ecode, 1);
2724 max = GET2(ecode, 1 + IMM2_SIZE);
2725 if (max == 0) max = INT_MAX;
2726 ecode += 1 + 2 * IMM2_SIZE;
2727 break;
2728
2729 default: /* No repeat follows */
2730 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2731 {
2732 if (length == -2) eptr = md->end_subject; /* Partial match */
2733 CHECK_PARTIAL();
2734 RRETURN(MATCH_NOMATCH);
2735 }
2736 eptr += length;
2737 continue; /* With the main loop */
2738 }
2739
2740 /* Handle repeated back references. If the length of the reference is
2741 zero, just continue with the main loop. If the length is negative, it
2742 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2743 zero, we can continue at the same level without recursion. For any other
2744 minimum, carrying on will result in NOMATCH. */
2745
2746 if (length == 0) continue;
2747 if (length < 0 && min == 0) continue;
2748
2749 /* First, ensure the minimum number of matches are present. We get back
2750 the length of the reference string explicitly rather than passing the
2751 address of eptr, so that eptr can be a register variable. */
2752
2753 for (i = 1; i <= min; i++)
2754 {
2755 int slength;
2756 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2757 {
2758 if (slength == -2) eptr = md->end_subject; /* Partial match */
2759 CHECK_PARTIAL();
2760 RRETURN(MATCH_NOMATCH);
2761 }
2762 eptr += slength;
2763 }
2764
2765 /* If min = max, continue at the same level without recursion.
2766 They are not both allowed to be zero. */
2767
2768 if (min == max) continue;
2769
2770 /* If minimizing, keep trying and advancing the pointer */
2771
2772 if (minimize)
2773 {
2774 for (fi = min;; fi++)
2775 {
2776 int slength;
2777 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2779 if (fi >= max) RRETURN(MATCH_NOMATCH);
2780 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2781 {
2782 if (slength == -2) eptr = md->end_subject; /* Partial match */
2783 CHECK_PARTIAL();
2784 RRETURN(MATCH_NOMATCH);
2785 }
2786 eptr += slength;
2787 }
2788 /* Control never gets here */
2789 }
2790
2791 /* If maximizing, find the longest string and work backwards */
2792
2793 else
2794 {
2795 pp = eptr;
2796 for (i = min; i < max; i++)
2797 {
2798 int slength;
2799 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2800 {
2801 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2802 the soft partial matching case. */
2803
2804 if (slength == -2 && md->partial != 0 &&
2805 md->end_subject > md->start_used_ptr)
2806 {
2807 md->hitend = TRUE;
2808 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2809 }
2810 break;
2811 }
2812 eptr += slength;
2813 }
2814
2815 while (eptr >= pp)
2816 {
2817 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2818 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2819 eptr -= length;
2820 }
2821 RRETURN(MATCH_NOMATCH);
2822 }
2823 /* Control never gets here */
2824
2825 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2826 used when all the characters in the class have values in the range 0-255,
2827 and either the matching is caseful, or the characters are in the range
2828 0-127 when UTF-8 processing is enabled. The only difference between
2829 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2830 encountered.
2831
2832 First, look past the end of the item to see if there is repeat information
2833 following. Then obey similar code to character type repeats - written out
2834 again for speed. */
2835
2836 case OP_NCLASS:
2837 case OP_CLASS:
2838 {
2839 /* The data variable is saved across frames, so the byte map needs to
2840 be stored there. */
2841 #define BYTE_MAP ((pcre_uint8 *)data)
2842 data = ecode + 1; /* Save for matching */
2843 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2844
2845 switch (*ecode)
2846 {
2847 case OP_CRSTAR:
2848 case OP_CRMINSTAR:
2849 case OP_CRPLUS:
2850 case OP_CRMINPLUS:
2851 case OP_CRQUERY:
2852 case OP_CRMINQUERY:
2853 c = *ecode++ - OP_CRSTAR;
2854 minimize = (c & 1) != 0;
2855 min = rep_min[c]; /* Pick up values from tables; */
2856 max = rep_max[c]; /* zero for max => infinity */
2857 if (max == 0) max = INT_MAX;
2858 break;
2859
2860 case OP_CRRANGE:
2861 case OP_CRMINRANGE:
2862 minimize = (*ecode == OP_CRMINRANGE);
2863 min = GET2(ecode, 1);
2864 max = GET2(ecode, 1 + IMM2_SIZE);
2865 if (max == 0) max = INT_MAX;
2866 ecode += 1 + 2 * IMM2_SIZE;
2867 break;
2868
2869 default: /* No repeat follows */
2870 min = max = 1;
2871 break;
2872 }
2873
2874 /* First, ensure the minimum number of matches are present. */
2875
2876 #ifdef SUPPORT_UTF
2877 if (utf)
2878 {
2879 for (i = 1; i <= min; i++)
2880 {
2881 if (eptr >= md->end_subject)
2882 {
2883 SCHECK_PARTIAL();
2884 RRETURN(MATCH_NOMATCH);
2885 }
2886 GETCHARINC(c, eptr);
2887 if (c > 255)
2888 {
2889 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2890 }
2891 else
2892 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2893 }
2894 }
2895 else
2896 #endif
2897 /* Not UTF mode */
2898 {
2899 for (i = 1; i <= min; i++)
2900 {
2901 if (eptr >= md->end_subject)
2902 {
2903 SCHECK_PARTIAL();
2904 RRETURN(MATCH_NOMATCH);
2905 }
2906 c = *eptr++;
2907 #ifndef COMPILE_PCRE8
2908 if (c > 255)
2909 {
2910 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2911 }
2912 else
2913 #endif
2914 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2915 }
2916 }
2917
2918 /* If max == min we can continue with the main loop without the
2919 need to recurse. */
2920
2921 if (min == max) continue;
2922
2923 /* If minimizing, keep testing the rest of the expression and advancing
2924 the pointer while it matches the class. */
2925
2926 if (minimize)
2927 {
2928 #ifdef SUPPORT_UTF
2929 if (utf)
2930 {
2931 for (fi = min;; fi++)
2932 {
2933 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2935 if (fi >= max) RRETURN(MATCH_NOMATCH);
2936 if (eptr >= md->end_subject)
2937 {
2938 SCHECK_PARTIAL();
2939 RRETURN(MATCH_NOMATCH);
2940 }
2941 GETCHARINC(c, eptr);
2942 if (c > 255)
2943 {
2944 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2945 }
2946 else
2947 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2948 }
2949 }
2950 else
2951 #endif
2952 /* Not UTF mode */
2953 {
2954 for (fi = min;; fi++)
2955 {
2956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2958 if (fi >= max) RRETURN(MATCH_NOMATCH);
2959 if (eptr >= md->end_subject)
2960 {
2961 SCHECK_PARTIAL();
2962 RRETURN(MATCH_NOMATCH);
2963 }
2964 c = *eptr++;
2965 #ifndef COMPILE_PCRE8
2966 if (c > 255)
2967 {
2968 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2969 }
2970 else
2971 #endif
2972 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2973 }
2974 }
2975 /* Control never gets here */
2976 }
2977
2978 /* If maximizing, find the longest possible run, then work backwards. */
2979
2980 else
2981 {
2982 pp = eptr;
2983
2984 #ifdef SUPPORT_UTF
2985 if (utf)
2986 {
2987 for (i = min; i < max; i++)
2988 {
2989 int len = 1;
2990 if (eptr >= md->end_subject)
2991 {
2992 SCHECK_PARTIAL();
2993 break;
2994 }
2995 GETCHARLEN(c, eptr, len);
2996 if (c > 255)
2997 {
2998 if (op == OP_CLASS) break;
2999 }
3000 else
3001 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3002 eptr += len;
3003 }
3004 for (;;)
3005 {
3006 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3008 if (eptr-- == pp) break; /* Stop if tried at original pos */
3009 BACKCHAR(eptr);
3010 }
3011 }
3012 else
3013 #endif
3014 /* Not UTF mode */
3015 {
3016 for (i = min; i < max; i++)
3017 {
3018 if (eptr >= md->end_subject)
3019 {
3020 SCHECK_PARTIAL();
3021 break;
3022 }
3023 c = *eptr;
3024 #ifndef COMPILE_PCRE8
3025 if (c > 255)
3026 {
3027 if (op == OP_CLASS) break;
3028 }
3029 else
3030 #endif
3031 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3032 eptr++;
3033 }
3034 while (eptr >= pp)
3035 {
3036 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3037 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3038 eptr--;
3039 }
3040 }
3041
3042 RRETURN(MATCH_NOMATCH);
3043 }
3044 #undef BYTE_MAP
3045 }
3046 /* Control never gets here */
3047
3048
3049 /* Match an extended character class. This opcode is encountered only
3050 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3051 mode, because Unicode properties are supported in non-UTF-8 mode. */
3052
3053 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3054 case OP_XCLASS:
3055 {
3056 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3057 ecode += GET(ecode, 1); /* Advance past the item */
3058
3059 switch (*ecode)
3060 {
3061 case OP_CRSTAR:
3062 case OP_CRMINSTAR:
3063 case OP_CRPLUS:
3064 case OP_CRMINPLUS:
3065 case OP_CRQUERY:
3066 case OP_CRMINQUERY:
3067 c = *ecode++ - OP_CRSTAR;
3068 minimize = (c & 1) != 0;
3069 min = rep_min[c]; /* Pick up values from tables; */
3070 max = rep_max[c]; /* zero for max => infinity */
3071 if (max == 0) max = INT_MAX;
3072 break;
3073
3074 case OP_CRRANGE:
3075 case OP_CRMINRANGE:
3076 minimize = (*ecode == OP_CRMINRANGE);
3077 min = GET2(ecode, 1);
3078 max = GET2(ecode, 1 + IMM2_SIZE);
3079 if (max == 0) max = INT_MAX;
3080 ecode += 1 + 2 * IMM2_SIZE;
3081 break;
3082
3083 default: /* No repeat follows */
3084 min = max = 1;
3085 break;
3086 }
3087
3088 /* First, ensure the minimum number of matches are present. */
3089
3090 for (i = 1; i <= min; i++)
3091 {
3092 if (eptr >= md->end_subject)
3093 {
3094 SCHECK_PARTIAL();
3095 RRETURN(MATCH_NOMATCH);
3096 }
3097 GETCHARINCTEST(c, eptr);
3098 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3099 }
3100
3101 /* If max == min we can continue with the main loop without the
3102 need to recurse. */
3103
3104 if (min == max) continue;
3105
3106 /* If minimizing, keep testing the rest of the expression and advancing
3107 the pointer while it matches the class. */
3108
3109 if (minimize)
3110 {
3111 for (fi = min;; fi++)
3112 {
3113 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3114 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3115 if (fi >= max) RRETURN(MATCH_NOMATCH);
3116 if (eptr >= md->end_subject)
3117 {
3118 SCHECK_PARTIAL();
3119 RRETURN(MATCH_NOMATCH);
3120 }
3121 GETCHARINCTEST(c, eptr);
3122 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3123 }
3124 /* Control never gets here */
3125 }
3126
3127 /* If maximizing, find the longest possible run, then work backwards. */
3128
3129 else
3130 {
3131 pp = eptr;
3132 for (i = min; i < max; i++)
3133 {
3134 int len = 1;
3135 if (eptr >= md->end_subject)
3136 {
3137 SCHECK_PARTIAL();
3138 break;
3139 }
3140 #ifdef SUPPORT_UTF
3141 GETCHARLENTEST(c, eptr, len);
3142 #else
3143 c = *eptr;
3144 #endif
3145 if (!PRIV(xclass)(c, data, utf)) break;
3146 eptr += len;
3147 }
3148 for(;;)
3149 {
3150 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3151 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3152 if (eptr-- == pp) break; /* Stop if tried at original pos */
3153 #ifdef SUPPORT_UTF
3154 if (utf) BACKCHAR(eptr);
3155 #endif
3156 }
3157 RRETURN(MATCH_NOMATCH);
3158 }
3159
3160 /* Control never gets here */
3161 }
3162 #endif /* End of XCLASS */
3163
3164 /* Match a single character, casefully */
3165
3166 case OP_CHAR:
3167 #ifdef SUPPORT_UTF
3168 if (utf)
3169 {
3170 length = 1;
3171 ecode++;
3172 GETCHARLEN(fc, ecode, length);
3173 if (length > md->end_subject - eptr)
3174 {
3175 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3179 }
3180 else
3181 #endif
3182 /* Not UTF mode */
3183 {
3184 if (md->end_subject - eptr < 1)
3185 {
3186 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3187 RRETURN(MATCH_NOMATCH);
3188 }
3189 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3190 ecode += 2;
3191 }
3192 break;
3193
3194 /* Match a single character, caselessly. If we are at the end of the
3195 subject, give up immediately. */
3196
3197 case OP_CHARI:
3198 if (eptr >= md->end_subject)
3199 {
3200 SCHECK_PARTIAL();
3201 RRETURN(MATCH_NOMATCH);
3202 }
3203
3204 #ifdef SUPPORT_UTF
3205 if (utf)
3206 {
3207 length = 1;
3208 ecode++;
3209 GETCHARLEN(fc, ecode, length);
3210
3211 /* If the pattern character's value is < 128, we have only one byte, and
3212 we know that its other case must also be one byte long, so we can use the
3213 fast lookup table. We know that there is at least one byte left in the
3214 subject. */
3215
3216 if (fc < 128)
3217 {
3218 pcre_uint32 cc = RAWUCHAR(eptr);
3219 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3220 ecode++;
3221 eptr++;
3222 }
3223
3224 /* Otherwise we must pick up the subject character. Note that we cannot
3225 use the value of "length" to check for sufficient bytes left, because the
3226 other case of the character may have more or fewer bytes. */
3227
3228 else
3229 {
3230 pcre_uint32 dc;
3231 GETCHARINC(dc, eptr);
3232 ecode += length;
3233
3234 /* If we have Unicode property support, we can use it to test the other
3235 case of the character, if there is one. */
3236
3237 if (fc != dc)
3238 {
3239 #ifdef SUPPORT_UCP
3240 if (dc != UCD_OTHERCASE(fc))
3241 #endif
3242 RRETURN(MATCH_NOMATCH);
3243 }
3244 }
3245 }
3246 else
3247 #endif /* SUPPORT_UTF */
3248
3249 /* Not UTF mode */
3250 {
3251 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3252 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3253 eptr++;
3254 ecode += 2;
3255 }
3256 break;
3257
3258 /* Match a single character repeatedly. */
3259
3260 case OP_EXACT:
3261 case OP_EXACTI:
3262 min = max = GET2(ecode, 1);
3263 ecode += 1 + IMM2_SIZE;
3264 goto REPEATCHAR;
3265
3266 case OP_POSUPTO:
3267 case OP_POSUPTOI:
3268 possessive = TRUE;
3269 /* Fall through */
3270
3271 case OP_UPTO:
3272 case OP_UPTOI:
3273 case OP_MINUPTO:
3274 case OP_MINUPTOI:
3275 min = 0;
3276 max = GET2(ecode, 1);
3277 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3278 ecode += 1 + IMM2_SIZE;
3279 goto REPEATCHAR;
3280
3281 case OP_POSSTAR:
3282 case OP_POSSTARI:
3283 possessive = TRUE;
3284 min = 0;
3285 max = INT_MAX;
3286 ecode++;
3287 goto REPEATCHAR;
3288
3289 case OP_POSPLUS:
3290 case OP_POSPLUSI:
3291 possessive = TRUE;
3292 min = 1;
3293 max = INT_MAX;
3294 ecode++;
3295 goto REPEATCHAR;
3296
3297 case OP_POSQUERY:
3298 case OP_POSQUERYI:
3299 possessive = TRUE;
3300 min = 0;
3301 max = 1;
3302 ecode++;
3303 goto REPEATCHAR;
3304
3305 case OP_STAR:
3306 case OP_STARI:
3307 case OP_MINSTAR:
3308 case OP_MINSTARI:
3309 case OP_PLUS:
3310 case OP_PLUSI:
3311 case OP_MINPLUS:
3312 case OP_MINPLUSI:
3313 case OP_QUERY:
3314 case OP_QUERYI:
3315 case OP_MINQUERY:
3316 case OP_MINQUERYI:
3317 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3318 minimize = (c & 1) != 0;
3319 min = rep_min[c]; /* Pick up values from tables; */
3320 max = rep_max[c]; /* zero for max => infinity */
3321 if (max == 0) max = INT_MAX;
3322
3323 /* Common code for all repeated single-character matches. */
3324
3325 REPEATCHAR:
3326 #ifdef SUPPORT_UTF
3327 if (utf)
3328 {
3329 length = 1;
3330 charptr = ecode;
3331 GETCHARLEN(fc, ecode, length);
3332 ecode += length;
3333
3334 /* Handle multibyte character matching specially here. There is
3335 support for caseless matching if UCP support is present. */
3336
3337 if (length > 1)
3338 {
3339 #ifdef SUPPORT_UCP
3340 pcre_uint32 othercase;
3341 if (op >= OP_STARI && /* Caseless */
3342 (othercase = UCD_OTHERCASE(fc)) != fc)
3343 oclength = PRIV(ord2utf)(othercase, occhars);
3344 else oclength = 0;
3345 #endif /* SUPPORT_UCP */
3346
3347 for (i = 1; i <= min; i++)
3348 {
3349 if (eptr <= md->end_subject - length &&
3350 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3351 #ifdef SUPPORT_UCP
3352 else if (oclength > 0 &&
3353 eptr <= md->end_subject - oclength &&
3354 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3355 #endif /* SUPPORT_UCP */
3356 else
3357 {
3358 CHECK_PARTIAL();
3359 RRETURN(MATCH_NOMATCH);
3360 }
3361 }
3362
3363 if (min == max) continue;
3364
3365 if (minimize)
3366 {
3367 for (fi = min;; fi++)
3368 {
3369 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3370 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3371 if (fi >= max) RRETURN(MATCH_NOMATCH);
3372 if (eptr <= md->end_subject - length &&
3373 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3374 #ifdef SUPPORT_UCP
3375 else if (oclength > 0 &&
3376 eptr <= md->end_subject - oclength &&
3377 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3378 #endif /* SUPPORT_UCP */
3379 else
3380 {
3381 CHECK_PARTIAL();
3382 RRETURN(MATCH_NOMATCH);
3383 }
3384 }
3385 /* Control never gets here */
3386 }
3387
3388 else /* Maximize */
3389 {
3390 pp = eptr;
3391 for (i = min; i < max; i++)
3392 {
3393 if (eptr <= md->end_subject - length &&
3394 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3395 #ifdef SUPPORT_UCP
3396 else if (oclength > 0 &&
3397 eptr <= md->end_subject - oclength &&
3398 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3399 #endif /* SUPPORT_UCP */
3400 else
3401 {
3402 CHECK_PARTIAL();
3403 break;
3404 }
3405 }
3406
3407 if (possessive) continue;
3408
3409 for(;;)
3410 {
3411 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3412 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3413 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3414 #ifdef SUPPORT_UCP
3415 eptr--;
3416 BACKCHAR(eptr);
3417 #else /* without SUPPORT_UCP */
3418 eptr -= length;
3419 #endif /* SUPPORT_UCP */
3420 }
3421 }
3422 /* Control never gets here */
3423 }
3424
3425 /* If the length of a UTF-8 character is 1, we fall through here, and
3426 obey the code as for non-UTF-8 characters below, though in this case the
3427 value of fc will always be < 128. */
3428 }
3429 else
3430 #endif /* SUPPORT_UTF */
3431 /* When not in UTF-8 mode, load a single-byte character. */
3432 fc = *ecode++;
3433
3434 /* The value of fc at this point is always one character, though we may
3435 or may not be in UTF mode. The code is duplicated for the caseless and
3436 caseful cases, for speed, since matching characters is likely to be quite
3437 common. First, ensure the minimum number of matches are present. If min =
3438 max, continue at the same level without recursing. Otherwise, if
3439 minimizing, keep trying the rest of the expression and advancing one
3440 matching character if failing, up to the maximum. Alternatively, if
3441 maximizing, find the maximum number of characters and work backwards. */
3442
3443 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3444 max, (char *)eptr));
3445
3446 if (op >= OP_STARI) /* Caseless */
3447 {
3448 #ifdef COMPILE_PCRE8
3449 /* fc must be < 128 if UTF is enabled. */
3450 foc = md->fcc[fc];
3451 #else
3452 #ifdef SUPPORT_UTF
3453 #ifdef SUPPORT_UCP
3454 if (utf && fc > 127)
3455 foc = UCD_OTHERCASE(fc);
3456 #else
3457 if (utf && fc > 127)
3458 foc = fc;
3459 #endif /* SUPPORT_UCP */
3460 else
3461 #endif /* SUPPORT_UTF */
3462 foc = TABLE_GET(fc, md->fcc, fc);
3463 #endif /* COMPILE_PCRE8 */
3464
3465 for (i = 1; i <= min; i++)
3466 {
3467 pcre_uint32 cc; /* Faster than pcre_uchar */
3468 if (eptr >= md->end_subject)
3469 {
3470 SCHECK_PARTIAL();
3471 RRETURN(MATCH_NOMATCH);
3472 }
3473 cc = RAWUCHARTEST(eptr);
3474 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3475 eptr++;
3476 }
3477 if (min == max) continue;
3478 if (minimize)
3479 {
3480 for (fi = min;; fi++)
3481 {
3482 pcre_uint32 cc; /* Faster than pcre_uchar */
3483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3485 if (fi >= max) RRETURN(MATCH_NOMATCH);
3486 if (eptr >= md->end_subject)
3487 {
3488 SCHECK_PARTIAL();
3489 RRETURN(MATCH_NOMATCH);
3490 }
3491 cc = RAWUCHARTEST(eptr);
3492 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3493 eptr++;
3494 }
3495 /* Control never gets here */
3496 }
3497 else /* Maximize */
3498 {
3499 pp = eptr;
3500 for (i = min; i < max; i++)
3501 {
3502 pcre_uint32 cc; /* Faster than pcre_uchar */
3503 if (eptr >= md->end_subject)
3504 {
3505 SCHECK_PARTIAL();
3506 break;
3507 }
3508 cc = RAWUCHARTEST(eptr);
3509 if (fc != cc && foc != cc) break;
3510 eptr++;
3511 }
3512
3513 if (possessive) continue;
3514
3515 while (eptr >= pp)
3516 {
3517 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3518 eptr--;
3519 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3520 }
3521 RRETURN(MATCH_NOMATCH);
3522 }
3523 /* Control never gets here */
3524 }
3525
3526 /* Caseful comparisons (includes all multi-byte characters) */
3527
3528 else
3529 {
3530 for (i = 1; i <= min; i++)
3531 {
3532 if (eptr >= md->end_subject)
3533 {
3534 SCHECK_PARTIAL();
3535 RRETURN(MATCH_NOMATCH);
3536 }
3537 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3538 }
3539
3540 if (min == max) continue;
3541
3542 if (minimize)
3543 {
3544 for (fi = min;; fi++)
3545 {
3546 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3547 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3548 if (fi >= max) RRETURN(MATCH_NOMATCH);
3549 if (eptr >= md->end_subject)
3550 {
3551 SCHECK_PARTIAL();
3552 RRETURN(MATCH_NOMATCH);
3553 }
3554 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3555 }
3556 /* Control never gets here */
3557 }
3558 else /* Maximize */
3559 {
3560 pp = eptr;
3561 for (i = min; i < max; i++)
3562 {
3563 if (eptr >= md->end_subject)
3564 {
3565 SCHECK_PARTIAL();
3566 break;
3567 }
3568 if (fc != RAWUCHARTEST(eptr)) break;
3569 eptr++;
3570 }
3571 if (possessive) continue;
3572
3573 while (eptr >= pp)
3574 {
3575 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3576 eptr--;
3577 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3578 }
3579 RRETURN(MATCH_NOMATCH);
3580 }
3581 }
3582 /* Control never gets here */
3583
3584 /* Match a negated single one-byte character. The character we are
3585 checking can be multibyte. */
3586
3587 case OP_NOT:
3588 case OP_NOTI:
3589 if (eptr >= md->end_subject)
3590 {
3591 SCHECK_PARTIAL();
3592 RRETURN(MATCH_NOMATCH);
3593 }
3594 #ifdef SUPPORT_UTF
3595 if (utf)
3596 {
3597 register pcre_uint32 ch, och;
3598
3599 ecode++;
3600 GETCHARINC(ch, ecode);
3601 GETCHARINC(c, eptr);
3602
3603 if (op == OP_NOT)
3604 {
3605 if (ch == c) RRETURN(MATCH_NOMATCH);
3606 }
3607 else
3608 {
3609 #ifdef SUPPORT_UCP
3610 if (ch > 127)
3611 och = UCD_OTHERCASE(ch);
3612 #else
3613 if (ch > 127)
3614 och = ch;
3615 #endif /* SUPPORT_UCP */
3616 else
3617 och = TABLE_GET(ch, md->fcc, ch);
3618 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3619 }
3620 }
3621 else
3622 #endif
3623 {
3624 register pcre_uint32 ch = ecode[1];
3625 c = *eptr++;
3626 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3627 RRETURN(MATCH_NOMATCH);
3628 ecode += 2;
3629 }
3630 break;
3631
3632 /* Match a negated single one-byte character repeatedly. This is almost a
3633 repeat of the code for a repeated single character, but I haven't found a
3634 nice way of commoning these up that doesn't require a test of the
3635 positive/negative option for each character match. Maybe that wouldn't add
3636 very much to the time taken, but character matching *is* what this is all
3637 about... */
3638
3639 case OP_NOTEXACT:
3640 case OP_NOTEXACTI:
3641 min = max = GET2(ecode, 1);
3642 ecode += 1 + IMM2_SIZE;
3643 goto REPEATNOTCHAR;
3644
3645 case OP_NOTUPTO:
3646 case OP_NOTUPTOI:
3647 case OP_NOTMINUPTO:
3648 case OP_NOTMINUPTOI:
3649 min = 0;
3650 max = GET2(ecode, 1);
3651 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3652 ecode += 1 + IMM2_SIZE;
3653 goto REPEATNOTCHAR;
3654
3655 case OP_NOTPOSSTAR:
3656 case OP_NOTPOSSTARI:
3657 possessive = TRUE;
3658 min = 0;
3659 max = INT_MAX;
3660 ecode++;
3661 goto REPEATNOTCHAR;
3662
3663 case OP_NOTPOSPLUS:
3664 case OP_NOTPOSPLUSI:
3665 possessive = TRUE;
3666 min = 1;
3667 max = INT_MAX;
3668 ecode++;
3669 goto REPEATNOTCHAR;
3670
3671 case OP_NOTPOSQUERY:
3672 case OP_NOTPOSQUERYI:
3673 possessive = TRUE;
3674 min = 0;
3675 max = 1;
3676 ecode++;
3677 goto REPEATNOTCHAR;
3678
3679 case OP_NOTPOSUPTO:
3680 case OP_NOTPOSUPTOI:
3681 possessive = TRUE;
3682 min = 0;
3683 max = GET2(ecode, 1);
3684 ecode += 1 + IMM2_SIZE;
3685 goto REPEATNOTCHAR;
3686
3687 case OP_NOTSTAR:
3688 case OP_NOTSTARI:
3689 case OP_NOTMINSTAR:
3690 case OP_NOTMINSTARI:
3691 case OP_NOTPLUS:
3692 case OP_NOTPLUSI:
3693 case OP_NOTMINPLUS:
3694 case OP_NOTMINPLUSI:
3695 case OP_NOTQUERY:
3696 case OP_NOTQUERYI:
3697 case OP_NOTMINQUERY:
3698 case OP_NOTMINQUERYI:
3699 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3700 minimize = (c & 1) != 0;
3701 min = rep_min[c]; /* Pick up values from tables; */
3702 max = rep_max[c]; /* zero for max => infinity */
3703 if (max == 0) max = INT_MAX;
3704
3705 /* Common code for all repeated single-byte matches. */
3706
3707 REPEATNOTCHAR:
3708 GETCHARINCTEST(fc, ecode);
3709
3710 /* The code is duplicated for the caseless and caseful cases, for speed,
3711 since matching characters is likely to be quite common. First, ensure the
3712 minimum number of matches are present. If min = max, continue at the same
3713 level without recursing. Otherwise, if minimizing, keep trying the rest of
3714 the expression and advancing one matching character if failing, up to the
3715 maximum. Alternatively, if maximizing, find the maximum number of
3716 characters and work backwards. */
3717
3718 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3719 max, (char *)eptr));
3720
3721 if (op >= OP_NOTSTARI) /* Caseless */
3722 {
3723 #ifdef SUPPORT_UTF
3724 #ifdef SUPPORT_UCP
3725 if (utf && fc > 127)
3726 foc = UCD_OTHERCASE(fc);
3727 #else
3728 if (utf && fc > 127)
3729 foc = fc;
3730 #endif /* SUPPORT_UCP */
3731 else
3732 #endif /* SUPPORT_UTF */
3733 foc = TABLE_GET(fc, md->fcc, fc);
3734
3735 #ifdef SUPPORT_UTF
3736 if (utf)
3737 {
3738 register pcre_uint32 d;
3739 for (i = 1; i <= min; i++)
3740 {
3741 if (eptr >= md->end_subject)
3742 {
3743 SCHECK_PARTIAL();
3744 RRETURN(MATCH_NOMATCH);
3745 }
3746 GETCHARINC(d, eptr);
3747 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3748 }
3749 }
3750 else
3751 #endif
3752 /* Not UTF mode */
3753 {
3754 for (i = 1; i <= min; i++)
3755 {
3756 if (eptr >= md->end_subject)
3757 {
3758 SCHECK_PARTIAL();
3759 RRETURN(MATCH_NOMATCH);
3760 }
3761 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3762 eptr++;
3763 }
3764 }
3765
3766 if (min == max) continue;
3767
3768 if (minimize)
3769 {
3770 #ifdef SUPPORT_UTF
3771 if (utf)
3772 {
3773 register pcre_uint32 d;
3774 for (fi = min;; fi++)
3775 {
3776 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3777 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3778 if (fi >= max) RRETURN(MATCH_NOMATCH);
3779 if (eptr >= md->end_subject)
3780 {
3781 SCHECK_PARTIAL();
3782 RRETURN(MATCH_NOMATCH);
3783 }
3784 GETCHARINC(d, eptr);
3785 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3786 }
3787 }
3788 else
3789 #endif
3790 /* Not UTF mode */
3791 {
3792 for (fi = min;; fi++)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (fi >= max) RRETURN(MATCH_NOMATCH);
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 RRETURN(MATCH_NOMATCH);
3801 }
3802 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3803 eptr++;
3804 }
3805 }
3806 /* Control never gets here */
3807 }
3808
3809 /* Maximize case */
3810
3811 else
3812 {
3813 pp = eptr;
3814
3815 #ifdef SUPPORT_UTF
3816 if (utf)
3817 {
3818 register pcre_uint32 d;
3819 for (i = min; i < max; i++)
3820 {
3821 int len = 1;
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 break;
3826 }
3827 GETCHARLEN(d, eptr, len);
3828 if (fc == d || (unsigned int)foc == d) break;
3829 eptr += len;
3830 }
3831 if (possessive) continue;
3832 for(;;)
3833 {
3834 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3836 if (eptr-- == pp) break; /* Stop if tried at original pos */
3837 BACKCHAR(eptr);
3838 }
3839 }
3840 else
3841 #endif
3842 /* Not UTF mode */
3843 {
3844 for (i = min; i < max; i++)
3845 {
3846 if (eptr >= md->end_subject)
3847 {
3848 SCHECK_PARTIAL();
3849 break;
3850 }
3851 if (fc == *eptr || foc == *eptr) break;
3852 eptr++;
3853 }
3854 if (possessive) continue;
3855 while (eptr >= pp)
3856 {
3857 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3859 eptr--;
3860 }
3861 }
3862
3863 RRETURN(MATCH_NOMATCH);
3864 }
3865 /* Control never gets here */
3866 }
3867
3868 /* Caseful comparisons */
3869
3870 else
3871 {
3872 #ifdef SUPPORT_UTF
3873 if (utf)
3874 {
3875 register pcre_uint32 d;
3876 for (i = 1; i <= min; i++)
3877 {
3878 if (eptr >= md->end_subject)
3879 {
3880 SCHECK_PARTIAL();
3881 RRETURN(MATCH_NOMATCH);
3882 }
3883 GETCHARINC(d, eptr);
3884 if (fc == d) RRETURN(MATCH_NOMATCH);
3885 }
3886 }
3887 else
3888 #endif
3889 /* Not UTF mode */
3890 {
3891 for (i = 1; i <= min; i++)
3892 {
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 RRETURN(MATCH_NOMATCH);
3897 }
3898 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3899 }
3900 }
3901
3902 if (min == max) continue;
3903
3904 if (minimize)
3905 {
3906 #ifdef SUPPORT_UTF
3907 if (utf)
3908 {
3909 register pcre_uint32 d;
3910 for (fi = min;; fi++)
3911 {
3912 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3914 if (fi >= max) RRETURN(MATCH_NOMATCH);
3915 if (eptr >= md->end_subject)
3916 {
3917 SCHECK_PARTIAL();
3918 RRETURN(MATCH_NOMATCH);
3919 }
3920 GETCHARINC(d, eptr);
3921 if (fc == d) RRETURN(MATCH_NOMATCH);
3922 }
3923 }
3924 else
3925 #endif
3926 /* Not UTF mode */
3927 {
3928 for (fi = min;; fi++)
3929 {
3930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3932 if (fi >= max) RRETURN(MATCH_NOMATCH);
3933 if (eptr >= md->end_subject)
3934 {
3935 SCHECK_PARTIAL();
3936 RRETURN(MATCH_NOMATCH);
3937 }
3938 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3939 }
3940 }
3941 /* Control never gets here */
3942 }
3943
3944 /* Maximize case */
3945
3946 else
3947 {
3948 pp = eptr;
3949
3950 #ifdef SUPPORT_UTF
3951 if (utf)
3952 {
3953 register pcre_uint32 d;
3954 for (i = min; i < max; i++)
3955 {
3956 int len = 1;
3957 if (eptr >= md->end_subject)
3958 {
3959 SCHECK_PARTIAL();
3960 break;
3961 }
3962 GETCHARLEN(d, eptr, len);
3963 if (fc == d) break;
3964 eptr += len;
3965 }
3966 if (possessive) continue;
3967 for(;;)
3968 {
3969 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3971 if (eptr-- == pp) break; /* Stop if tried at original pos */
3972 BACKCHAR(eptr);
3973 }
3974 }
3975 else
3976 #endif
3977 /* Not UTF mode */
3978 {
3979 for (i = min; i < max; i++)
3980 {
3981 if (eptr >= md->end_subject)
3982 {
3983 SCHECK_PARTIAL();
3984 break;
3985 }
3986 if (fc == *eptr) break;
3987 eptr++;
3988 }
3989 if (possessive) continue;
3990 while (eptr >= pp)
3991 {
3992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3994 eptr--;
3995 }
3996 }
3997
3998 RRETURN(MATCH_NOMATCH);
3999 }
4000 }
4001 /* Control never gets here */
4002
4003 /* Match a single character type repeatedly; several different opcodes
4004 share code. This is very similar to the code for single characters, but we
4005 repeat it in the interests of efficiency. */
4006
4007 case OP_TYPEEXACT:
4008 min = max = GET2(ecode, 1);
4009 minimize = TRUE;
4010 ecode += 1 + IMM2_SIZE;
4011 goto REPEATTYPE;
4012
4013 case OP_TYPEUPTO:
4014 case OP_TYPEMINUPTO:
4015 min = 0;
4016 max = GET2(ecode, 1);
4017 minimize = *ecode == OP_TYPEMINUPTO;
4018 ecode += 1 + IMM2_SIZE;
4019 goto REPEATTYPE;
4020
4021 case OP_TYPEPOSSTAR:
4022 possessive = TRUE;
4023 min = 0;
4024 max = INT_MAX;
4025 ecode++;
4026 goto REPEATTYPE;
4027
4028 case OP_TYPEPOSPLUS:
4029 possessive = TRUE;
4030 min = 1;
4031 max = INT_MAX;
4032 ecode++;
4033 goto REPEATTYPE;
4034
4035 case OP_TYPEPOSQUERY:
4036 possessive = TRUE;
4037 min = 0;
4038 max = 1;
4039 ecode++;
4040 goto REPEATTYPE;
4041
4042 case OP_TYPEPOSUPTO:
4043 possessive = TRUE;
4044 min = 0;
4045 max = GET2(ecode, 1);
4046 ecode += 1 + IMM2_SIZE;
4047 goto REPEATTYPE;
4048
4049 case OP_TYPESTAR:
4050 case OP_TYPEMINSTAR:
4051 case OP_TYPEPLUS:
4052 case OP_TYPEMINPLUS:
4053 case OP_TYPEQUERY:
4054 case OP_TYPEMINQUERY:
4055 c = *ecode++ - OP_TYPESTAR;
4056 minimize = (c & 1) != 0;
4057 min = rep_min[c]; /* Pick up values from tables; */
4058 max = rep_max[c]; /* zero for max => infinity */
4059 if (max == 0) max = INT_MAX;
4060
4061 /* Common code for all repeated single character type matches. Note that
4062 in UTF-8 mode, '.' matches a character of any length, but for the other
4063 character types, the valid characters are all one-byte long. */
4064
4065 REPEATTYPE:
4066 ctype = *ecode++; /* Code for the character type */
4067
4068 #ifdef SUPPORT_UCP
4069 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4070 {
4071 prop_fail_result = ctype == OP_NOTPROP;
4072 prop_type = *ecode++;
4073 prop_value = *ecode++;
4074 }
4075 else prop_type = -1;
4076 #endif
4077
4078 /* First, ensure the minimum number of matches are present. Use inline
4079 code for maximizing the speed, and do the type test once at the start
4080 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4081 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4082 and single-bytes. */
4083
4084 if (min > 0)
4085 {
4086 #ifdef SUPPORT_UCP
4087 if (prop_type >= 0)
4088 {
4089 switch(prop_type)
4090 {
4091 case PT_ANY:
4092 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4093 for (i = 1; i <= min; i++)
4094 {
4095 if (eptr >= md->end_subject)
4096 {
4097 SCHECK_PARTIAL();
4098 RRETURN(MATCH_NOMATCH);
4099 }
4100 GETCHARINCTEST(c, eptr);
4101 }
4102 break;
4103
4104 case PT_LAMP:
4105 for (i = 1; i <= min; i++)
4106 {
4107 int chartype;
4108 if (eptr >= md->end_subject)
4109 {
4110 SCHECK_PARTIAL();
4111 RRETURN(MATCH_NOMATCH);
4112 }
4113 GETCHARINCTEST(c, eptr);
4114 chartype = UCD_CHARTYPE(c);
4115 if ((chartype == ucp_Lu ||
4116 chartype == ucp_Ll ||
4117 chartype == ucp_Lt) == prop_fail_result)
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 break;
4121
4122 case PT_GC:
4123 for (i = 1; i <= min; i++)
4124 {
4125 if (eptr >= md->end_subject)
4126 {
4127 SCHECK_PARTIAL();
4128 RRETURN(MATCH_NOMATCH);
4129 }
4130 GETCHARINCTEST(c, eptr);
4131 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4132 RRETURN(MATCH_NOMATCH);
4133 }
4134 break;
4135
4136 case PT_PC:
4137 for (i = 1; i <= min; i++)
4138 {
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 RRETURN(MATCH_NOMATCH);
4143 }
4144 GETCHARINCTEST(c, eptr);
4145 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4146 RRETURN(MATCH_NOMATCH);
4147 }
4148 break;
4149
4150 case PT_SC:
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 RRETURN(MATCH_NOMATCH);
4157 }
4158 GETCHARINCTEST(c, eptr);
4159 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4160 RRETURN(MATCH_NOMATCH);
4161 }
4162 break;
4163
4164 case PT_ALNUM:
4165 for (i = 1; i <= min; i++)
4166 {
4167 int category;
4168 if (eptr >= md->end_subject)
4169 {
4170 SCHECK_PARTIAL();
4171 RRETURN(MATCH_NOMATCH);
4172 }
4173 GETCHARINCTEST(c, eptr);
4174 category = UCD_CATEGORY(c);
4175 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4176 RRETURN(MATCH_NOMATCH);
4177 }
4178 break;
4179
4180 case PT_SPACE: /* Perl space */
4181 for (i = 1; i <= min; i++)
4182 {
4183 if (eptr >= md->end_subject)
4184 {
4185 SCHECK_PARTIAL();
4186 RRETURN(MATCH_NOMATCH);
4187 }
4188 GETCHARINCTEST(c, eptr);
4189 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4190 c == CHAR_FF || c == CHAR_CR)
4191 == prop_fail_result)
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 break;
4195
4196 case PT_PXSPACE: /* POSIX space */
4197 for (i = 1; i <= min; i++)
4198 {
4199 if (eptr >= md->end_subject)
4200 {
4201 SCHECK_PARTIAL();
4202 RRETURN(MATCH_NOMATCH);
4203 }
4204 GETCHARINCTEST(c, eptr);
4205 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4206 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4207 == prop_fail_result)
4208 RRETURN(MATCH_NOMATCH);
4209 }
4210 break;
4211
4212 case PT_WORD:
4213 for (i = 1; i <= min; i++)
4214 {
4215 int category;
4216 if (eptr >= md->end_subject)
4217 {
4218 SCHECK_PARTIAL();
4219 RRETURN(MATCH_NOMATCH);
4220 }
4221 GETCHARINCTEST(c, eptr);
4222 category = UCD_CATEGORY(c);
4223 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4224 == prop_fail_result)
4225 RRETURN(MATCH_NOMATCH);
4226 }
4227 break;
4228
4229 case PT_CLIST:
4230 for (i = 1; i <= min; i++)
4231 {
4232 const pcre_uint32 *cp;
4233 if (eptr >= md->end_subject)
4234 {
4235 SCHECK_PARTIAL();
4236 RRETURN(MATCH_NOMATCH);
4237 }
4238 GETCHARINCTEST(c, eptr);
4239 cp = PRIV(ucd_caseless_sets) + prop_value;
4240 for (;;)
4241 {
4242 if (c < *cp)
4243 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4244 if (c == *cp++)
4245 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4246 }
4247 }
4248 break;
4249
4250 /* This should not occur */
4251
4252 default:
4253 RRETURN(PCRE_ERROR_INTERNAL);
4254 }
4255 }
4256
4257 /* Match extended Unicode sequences. We will get here only if the
4258 support is in the binary; otherwise a compile-time error occurs. */
4259
4260 else if (ctype == OP_EXTUNI)
4261 {
4262 for (i = 1; i <= min; i++)
4263 {
4264 if (eptr >= md->end_subject)
4265 {
4266 SCHECK_PARTIAL();
4267 RRETURN(MATCH_NOMATCH);
4268 }
4269 else
4270 {
4271 int lgb, rgb;
4272 GETCHARINCTEST(c, eptr);
4273 lgb = UCD_GRAPHBREAK(c);
4274 while (eptr < md->end_subject)
4275 {
4276 int len = 1;
4277 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4278 rgb = UCD_GRAPHBREAK(c);
4279 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4280 lgb = rgb;
4281 eptr += len;
4282 }
4283 }
4284 CHECK_PARTIAL();
4285 }
4286 }
4287
4288 else
4289 #endif /* SUPPORT_UCP */
4290
4291 /* Handle all other cases when the coding is UTF-8 */
4292
4293 #ifdef SUPPORT_UTF
4294 if (utf) switch(ctype)
4295 {
4296 case OP_ANY:
4297 for (i = 1; i <= min; i++)
4298 {
4299 if (eptr >= md->end_subject)
4300 {
4301 SCHECK_PARTIAL();
4302 RRETURN(MATCH_NOMATCH);
4303 }
4304 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4305 if (md->partial != 0 &&
4306 eptr + 1 >= md->end_subject &&
4307 NLBLOCK->nltype == NLTYPE_FIXED &&
4308 NLBLOCK->nllen == 2 &&
4309 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4310 {
4311 md->hitend = TRUE;
4312 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4313 }
4314 eptr++;
4315 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4316 }
4317 break;
4318
4319 case OP_ALLANY:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 RRETURN(MATCH_NOMATCH);
4326 }
4327 eptr++;
4328 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4329 }
4330 break;
4331
4332 case OP_ANYBYTE:
4333 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4334 eptr += min;
4335 break;
4336
4337 case OP_ANYNL:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 RRETURN(MATCH_NOMATCH);
4344 }
4345 GETCHARINC(c, eptr);
4346 switch(c)
4347 {
4348 default: RRETURN(MATCH_NOMATCH);
4349
4350 case CHAR_CR:
4351 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4352 break;
4353
4354 case CHAR_LF:
4355 break;
4356
4357 case CHAR_VT:
4358 case CHAR_FF:
4359 case CHAR_NEL:
4360 #ifndef EBCDIC
4361 case 0x2028:
4362 case 0x2029:
4363 #endif /* Not EBCDIC */
4364 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4365 break;
4366 }
4367 }
4368 break;
4369
4370 case OP_NOT_HSPACE:
4371 for (i = 1; i <= min; i++)
4372 {
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 RRETURN(MATCH_NOMATCH);
4377 }
4378 GETCHARINC(c, eptr);
4379 switch(c)
4380 {
4381 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4382 default: break;
4383 }
4384 }
4385 break;
4386
4387 case OP_HSPACE:
4388 for (i = 1; i <= min; i++)
4389 {
4390 if (eptr >= md->end_subject)
4391 {
4392 SCHECK_PARTIAL();
4393 RRETURN(MATCH_NOMATCH);
4394 }
4395 GETCHARINC(c, eptr);
4396 switch(c)
4397 {
4398 HSPACE_CASES: break; /* Byte and multibyte cases */
4399 default: RRETURN(MATCH_NOMATCH);
4400 }
4401 }
4402 break;
4403
4404 case OP_NOT_VSPACE:
4405 for (i = 1; i <= min; i++)
4406 {
4407 if (eptr >= md->end_subject)
4408 {
4409 SCHECK_PARTIAL();
4410 RRETURN(MATCH_NOMATCH);
4411 }
4412 GETCHARINC(c, eptr);
4413 switch(c)
4414 {
4415 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4416 default: break;
4417 }
4418 }
4419 break;
4420
4421 case OP_VSPACE:
4422 for (i = 1; i <= min; i++)
4423 {
4424 if (eptr >= md->end_subject)
4425 {
4426 SCHECK_PARTIAL();
4427 RRETURN(MATCH_NOMATCH);
4428 }
4429 GETCHARINC(c, eptr);
4430 switch(c)
4431 {
4432 VSPACE_CASES: break;
4433 default: RRETURN(MATCH_NOMATCH);
4434 }
4435 }
4436 break;
4437
4438 case OP_NOT_DIGIT:
4439 for (i = 1; i <= min; i++)
4440 {
4441 if (eptr >= md->end_subject)
4442 {
4443 SCHECK_PARTIAL();
4444 RRETURN(MATCH_NOMATCH);
4445 }
4446 GETCHARINC(c, eptr);
4447 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4448 RRETURN(MATCH_NOMATCH);
4449 }
4450 break;
4451
4452 case OP_DIGIT:
4453 for (i = 1; i <= min; i++)
4454 {
4455 pcre_uint32 cc;
4456 if (eptr >= md->end_subject)
4457 {
4458 SCHECK_PARTIAL();
4459 RRETURN(MATCH_NOMATCH);
4460 }
4461 cc = RAWUCHAR(eptr);
4462 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4463 RRETURN(MATCH_NOMATCH);
4464 eptr++;
4465 /* No need to skip more bytes - we know it's a 1-byte character */
4466 }
4467 break;
4468
4469 case OP_NOT_WHITESPACE:
4470 for (i = 1; i <= min; i++)
4471 {
4472 pcre_uint32 cc;
4473 if (eptr >= md->end_subject)
4474 {
4475 SCHECK_PARTIAL();
4476 RRETURN(MATCH_NOMATCH);
4477 }
4478 cc = RAWUCHAR(eptr);
4479 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4480 RRETURN(MATCH_NOMATCH);
4481 eptr++;
4482 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4483 }
4484 break;
4485
4486 case OP_WHITESPACE:
4487 for (i = 1; i <= min; i++)
4488 {
4489 pcre_uint32 cc;
4490 if (eptr >= md->end_subject)
4491 {
4492 SCHECK_PARTIAL();
4493 RRETURN(MATCH_NOMATCH);
4494 }
4495 cc = RAWUCHAR(eptr);
4496 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4497 RRETURN(MATCH_NOMATCH);
4498 eptr++;
4499 /* No need to skip more bytes - we know it's a 1-byte character */
4500 }
4501 break;
4502
4503 case OP_NOT_WORDCHAR:
4504 for (i = 1; i <= min; i++)
4505 {
4506 pcre_uint32 cc;
4507 if (eptr >= md->end_subject)
4508 {
4509 SCHECK_PARTIAL();
4510 RRETURN(MATCH_NOMATCH);
4511 }
4512 cc = RAWUCHAR(eptr);
4513 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4514 RRETURN(MATCH_NOMATCH);
4515 eptr++;
4516 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4517 }
4518 break;
4519
4520 case OP_WORDCHAR:
4521 for (i = 1; i <= min; i++)
4522 {
4523 pcre_uint32 cc;
4524 if (eptr >= md->end_subject)
4525 {
4526 SCHECK_PARTIAL();
4527 RRETURN(MATCH_NOMATCH);
4528 }
4529 cc = RAWUCHAR(eptr);
4530 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4531 RRETURN(MATCH_NOMATCH);
4532 eptr++;
4533 /* No need to skip more bytes - we know it's a 1-byte character */
4534 }
4535 break;
4536
4537 default:
4538 RRETURN(PCRE_ERROR_INTERNAL);
4539 } /* End switch(ctype) */
4540
4541 else
4542 #endif /* SUPPORT_UTF */
4543
4544 /* Code for the non-UTF-8 case for minimum matching of operators other
4545 than OP_PROP and OP_NOTPROP. */
4546
4547 switch(ctype)
4548 {
4549 case OP_ANY:
4550 for (i = 1; i <= min; i++)
4551 {
4552 if (eptr >= md->end_subject)
4553 {
4554 SCHECK_PARTIAL();
4555 RRETURN(MATCH_NOMATCH);
4556 }
4557 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4558 if (md->partial != 0 &&
4559 eptr + 1 >= md->end_subject &&
4560 NLBLOCK->nltype == NLTYPE_FIXED &&
4561 NLBLOCK->nllen == 2 &&
4562 *eptr == NLBLOCK->nl[0])
4563 {
4564 md->hitend = TRUE;
4565 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4566 }
4567 eptr++;
4568 }
4569 break;
4570
4571 case OP_ALLANY:
4572 if (eptr > md->end_subject - min)
4573 {
4574 SCHECK_PARTIAL();
4575 RRETURN(MATCH_NOMATCH);
4576 }
4577 eptr += min;
4578 break;
4579
4580 case OP_ANYBYTE:
4581 if (eptr > md->end_subject - min)
4582 {
4583 SCHECK_PARTIAL();
4584 RRETURN(MATCH_NOMATCH);
4585 }
4586 eptr += min;
4587 break;
4588
4589 case OP_ANYNL:
4590 for (i = 1; i <= min; i++)
4591 {
4592 if (eptr >= md->end_subject)
4593 {
4594 SCHECK_PARTIAL();
4595 RRETURN(MATCH_NOMATCH);
4596 }
4597 switch(*eptr++)
4598 {
4599 default: RRETURN(MATCH_NOMATCH);
4600
4601 case CHAR_CR:
4602 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4603 break;
4604
4605 case CHAR_LF:
4606 break;
4607
4608 case CHAR_VT:
4609 case CHAR_FF:
4610 case CHAR_NEL:
4611 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4612 case 0x2028:
4613 case 0x2029:
4614 #endif
4615 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4616 break;
4617 }
4618 }
4619 break;
4620
4621 case OP_NOT_HSPACE:
4622 for (i = 1; i <= min; i++)
4623 {
4624 if (eptr >= md->end_subject)
4625 {
4626 SCHECK_PARTIAL();
4627 RRETURN(MATCH_NOMATCH);
4628 }
4629 switch(*eptr++)
4630 {
4631 default: break;
4632 HSPACE_BYTE_CASES:
4633 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4634 HSPACE_MULTIBYTE_CASES:
4635 #endif
4636 RRETURN(MATCH_NOMATCH);
4637 }
4638 }
4639 break;
4640
4641 case OP_HSPACE:
4642 for (i = 1; i <= min; i++)
4643 {
4644 if (eptr >= md->end_subject)
4645 {
4646 SCHECK_PARTIAL();
4647 RRETURN(MATCH_NOMATCH);
4648 }
4649 switch(*eptr++)
4650 {
4651 default: RRETURN(MATCH_NOMATCH);
4652 HSPACE_BYTE_CASES:
4653 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4654 HSPACE_MULTIBYTE_CASES:
4655 #endif
4656 break;
4657 }
4658 }
4659 break;
4660
4661 case OP_NOT_VSPACE:
4662 for (i = 1; i <= min; i++)
4663 {
4664 if (eptr >= md->end_subject)
4665 {
4666 SCHECK_PARTIAL();
4667 RRETURN(MATCH_NOMATCH);
4668 }
4669 switch(*eptr++)
4670 {
4671 VSPACE_BYTE_CASES:
4672 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4673 VSPACE_MULTIBYTE_CASES:
4674 #endif
4675 RRETURN(MATCH_NOMATCH);
4676 default: break;
4677 }
4678 }
4679 break;
4680
4681 case OP_VSPACE:
4682 for (i = 1; i <= min; i++)
4683 {
4684 if (eptr >= md->end_subject)
4685 {
4686 SCHECK_PARTIAL();
4687 RRETURN(MATCH_NOMATCH);
4688 }
4689 switch(*eptr++)
4690 {
4691 default: RRETURN(MATCH_NOMATCH);
4692 VSPACE_BYTE_CASES:
4693 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4694 VSPACE_MULTIBYTE_CASES:
4695 #endif
4696 break;
4697 }
4698 }
4699 break;
4700
4701 case OP_NOT_DIGIT:
4702 for (i = 1; i <= min; i++)
4703 {
4704 if (eptr >= md->end_subject)
4705 {
4706 SCHECK_PARTIAL();
4707 RRETURN(MATCH_NOMATCH);
4708 }
4709 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4710 RRETURN(MATCH_NOMATCH);
4711 eptr++;
4712 }
4713 break;
4714
4715 case OP_DIGIT:
4716 for (i = 1; i <= min; i++)
4717 {
4718 if (eptr >= md->end_subject)
4719 {
4720 SCHECK_PARTIAL();
4721 RRETURN(MATCH_NOMATCH);
4722 }
4723 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4724 RRETURN(MATCH_NOMATCH);
4725 eptr++;
4726 }
4727 break;
4728
4729 case OP_NOT_WHITESPACE:
4730 for (i = 1; i <= min; i++)
4731 {
4732 if (eptr >= md->end_subject)
4733 {
4734 SCHECK_PARTIAL();
4735 RRETURN(MATCH_NOMATCH);
4736 }
4737 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4738 RRETURN(MATCH_NOMATCH);
4739 eptr++;
4740 }
4741 break;
4742
4743 case OP_WHITESPACE:
4744 for (i = 1; i <= min; i++)
4745 {
4746 if (eptr >= md->end_subject)
4747 {
4748 SCHECK_PARTIAL();
4749 RRETURN(MATCH_NOMATCH);
4750 }
4751 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4752 RRETURN(MATCH_NOMATCH);
4753 eptr++;
4754 }
4755 break;
4756
4757 case OP_NOT_WORDCHAR:
4758 for (i = 1; i <= min; i++)
4759 {
4760 if (eptr >= md->end_subject)
4761 {
4762 SCHECK_PARTIAL();
4763 RRETURN(MATCH_NOMATCH);
4764 }
4765 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4766 RRETURN(MATCH_NOMATCH);
4767 eptr++;
4768 }
4769 break;
4770
4771 case OP_WORDCHAR:
4772 for (i = 1; i <= min; i++)
4773 {
4774 if (eptr >= md->end_subject)
4775 {
4776 SCHECK_PARTIAL();
4777 RRETURN(MATCH_NOMATCH);
4778 }
4779 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4780 RRETURN(MATCH_NOMATCH);
4781 eptr++;
4782 }
4783 break;
4784
4785 default:
4786 RRETURN(PCRE_ERROR_INTERNAL);
4787 }
4788 }
4789
4790 /* If min = max, continue at the same level without recursing */
4791
4792 if (min == max) continue;
4793
4794 /* If minimizing, we have to test the rest of the pattern before each
4795 subsequent match. Again, separate the UTF-8 case for speed, and also
4796 separate the UCP cases. */
4797
4798 if (minimize)
4799 {
4800 #ifdef SUPPORT_UCP
4801 if (prop_type >= 0)
4802 {
4803 switch(prop_type)
4804 {
4805 case PT_ANY:
4806 for (fi = min;; fi++)
4807 {
4808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4810 if (fi >= max) RRETURN(MATCH_NOMATCH);
4811 if (eptr >= md->end_subject)
4812 {
4813 SCHECK_PARTIAL();
4814 RRETURN(MATCH_NOMATCH);
4815 }
4816 GETCHARINCTEST(c, eptr);
4817 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4818 }
4819 /* Control never gets here */
4820
4821 case PT_LAMP:
4822 for (fi = min;; fi++)
4823 {
4824 int chartype;
4825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4827 if (fi >= max) RRETURN(MATCH_NOMATCH);
4828 if (eptr >= md->end_subject)
4829 {
4830 SCHECK_PARTIAL();
4831 RRETURN(MATCH_NOMATCH);
4832 }
4833 GETCHARINCTEST(c, eptr);
4834 chartype = UCD_CHARTYPE(c);
4835 if ((chartype == ucp_Lu ||
4836 chartype == ucp_Ll ||
4837 chartype == ucp_Lt) == prop_fail_result)
4838 RRETURN(MATCH_NOMATCH);
4839 }
4840 /* Control never gets here */
4841
4842 case PT_GC:
4843 for (fi = min;; fi++)
4844 {
4845 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4847 if (fi >= max) RRETURN(MATCH_NOMATCH);
4848 if (eptr >= md->end_subject)
4849 {
4850 SCHECK_PARTIAL();
4851 RRETURN(MATCH_NOMATCH);
4852 }
4853 GETCHARINCTEST(c, eptr);
4854 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4855 RRETURN(MATCH_NOMATCH);
4856 }
4857 /* Control never gets here */
4858
4859 case PT_PC:
4860 for (fi = min;; fi++)
4861 {
4862 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4864 if (fi >= max) RRETURN(MATCH_NOMATCH);
4865 if (eptr >= md->end_subject)
4866 {
4867 SCHECK_PARTIAL();
4868 RRETURN(MATCH_NOMATCH);
4869 }
4870 GETCHARINCTEST(c, eptr);
4871 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4872 RRETURN(MATCH_NOMATCH);
4873 }
4874 /* Control never gets here */
4875
4876 case PT_SC:
4877 for (fi = min;; fi++)
4878 {
4879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4881 if (fi >= max) RRETURN(MATCH_NOMATCH);
4882 if (eptr >= md->end_subject)
4883 {
4884 SCHECK_PARTIAL();
4885 RRETURN(MATCH_NOMATCH);
4886 }
4887 GETCHARINCTEST(c, eptr);
4888 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4889 RRETURN(MATCH_NOMATCH);
4890 }
4891 /* Control never gets here */
4892
4893 case PT_ALNUM:
4894 for (fi = min;; fi++)
4895 {
4896 int category;
4897 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4898 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4899 if (fi >= max) RRETURN(MATCH_NOMATCH);
4900 if (eptr >= md->end_subject)
4901 {
4902 SCHECK_PARTIAL();
4903 RRETURN(MATCH_NOMATCH);
4904 }
4905 GETCHARINCTEST(c, eptr);
4906 category = UCD_CATEGORY(c);
4907 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 /* Control never gets here */
4911
4912 case PT_SPACE: /* Perl space */
4913 for (fi = min;; fi++)
4914 {
4915 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4917 if (fi >= max) RRETURN(MATCH_NOMATCH);
4918 if (eptr >= md->end_subject)
4919 {
4920 SCHECK_PARTIAL();
4921 RRETURN(MATCH_NOMATCH);
4922 }
4923 GETCHARINCTEST(c, eptr);
4924 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4925 c == CHAR_FF || c == CHAR_CR)
4926 == prop_fail_result)
4927 RRETURN(MATCH_NOMATCH);
4928 }
4929 /* Control never gets here */
4930
4931 case PT_PXSPACE: /* POSIX space */
4932 for (fi = min;; fi++)
4933 {
4934 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4936 if (fi >= max) RRETURN(MATCH_NOMATCH);
4937 if (eptr >= md->end_subject)
4938 {
4939 SCHECK_PARTIAL();
4940 RRETURN(MATCH_NOMATCH);
4941 }
4942 GETCHARINCTEST(c, eptr);
4943 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4944 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4945 == prop_fail_result)
4946 RRETURN(MATCH_NOMATCH);
4947 }
4948 /* Control never gets here */
4949
4950 case PT_WORD:
4951 for (fi = min;; fi++)
4952 {
4953 int category;
4954 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4955 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4956 if (fi >= max) RRETURN(MATCH_NOMATCH);
4957 if (eptr >= md->end_subject)
4958 {
4959 SCHECK_PARTIAL();
4960 RRETURN(MATCH_NOMATCH);
4961 }
4962 GETCHARINCTEST(c, eptr);
4963 category = UCD_CATEGORY(c);
4964 if ((category == ucp_L ||
4965 category == ucp_N ||
4966 c == CHAR_UNDERSCORE)
4967 == prop_fail_result)
4968 RRETURN(MATCH_NOMATCH);
4969 }
4970 /* Control never gets here */
4971
4972 case PT_CLIST:
4973 for (fi = min;; fi++)
4974 {
4975 const pcre_uint32 *cp;
4976 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
4977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4978 if (fi >= max) RRETURN(MATCH_NOMATCH);
4979 if (eptr >= md->end_subject)
4980 {
4981 SCHECK_PARTIAL();
4982 RRETURN(MATCH_NOMATCH);
4983 }
4984 GETCHARINCTEST(c, eptr);
4985 cp = PRIV(ucd_caseless_sets) + prop_value;
4986 for (;;)
4987 {
4988 if (c < *cp)
4989 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4990 if (c == *cp++)
4991 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4992 }
4993 }
4994 /* Control never gets here */
4995
4996 /* This should never occur */
4997 default:
4998 RRETURN(PCRE_ERROR_INTERNAL);
4999 }
5000 }
5001
5002 /* Match extended Unicode sequences. We will get here only if the
5003 support is in the binary; otherwise a compile-time error occurs. */
5004
5005 else if (ctype == OP_EXTUNI)
5006 {
5007 for (fi = min;; fi++)
5008 {
5009 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5011 if (fi >= max) RRETURN(MATCH_NOMATCH);
5012 if (eptr >= md->end_subject)
5013 {
5014 SCHECK_PARTIAL();
5015 RRETURN(MATCH_NOMATCH);
5016 }
5017 else
5018 {
5019 int lgb, rgb;
5020 GETCHARINCTEST(c, eptr);
5021 lgb = UCD_GRAPHBREAK(c);
5022 while (eptr < md->end_subject)
5023 {
5024 int len = 1;
5025 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5026 rgb = UCD_GRAPHBREAK(c);
5027 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5028 lgb = rgb;
5029 eptr += len;
5030 }
5031 }
5032 CHECK_PARTIAL();
5033 }
5034 }
5035 else
5036 #endif /* SUPPORT_UCP */
5037
5038 #ifdef SUPPORT_UTF
5039 if (utf)
5040 {
5041 for (fi = min;; fi++)
5042 {
5043 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5045 if (fi >= max) RRETURN(MATCH_NOMATCH);
5046 if (eptr >= md->end_subject)
5047 {
5048 SCHECK_PARTIAL();
5049 RRETURN(MATCH_NOMATCH);
5050 }
5051 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5052 RRETURN(MATCH_NOMATCH);
5053 GETCHARINC(c, eptr);
5054 switch(ctype)
5055 {
5056 case OP_ANY: /* This is the non-NL case */
5057 if (md->partial != 0 && /* Take care with CRLF partial */
5058 eptr >= md->end_subject &&
5059 NLBLOCK->nltype == NLTYPE_FIXED &&
5060 NLBLOCK->nllen == 2 &&
5061 c == NLBLOCK->nl[0])
5062 {
5063 md->hitend = TRUE;
5064 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5065 }
5066 break;
5067
5068 case OP_ALLANY:
5069 case OP_ANYBYTE:
5070 break;
5071
5072 case OP_ANYNL:
5073 switch(c)
5074 {
5075 default: RRETURN(MATCH_NOMATCH);
5076 case CHAR_CR:
5077 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5078 break;
5079
5080 case CHAR_LF:
5081 break;
5082
5083 case CHAR_VT:
5084 case CHAR_FF:
5085 case CHAR_NEL:
5086 #ifndef EBCDIC
5087 case 0x2028:
5088 case 0x2029:
5089 #endif /* Not EBCDIC */
5090 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5091 break;
5092 }
5093 break;
5094
5095 case OP_NOT_HSPACE:
5096 switch(c)
5097 {
5098 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5099 default: break;
5100 }
5101 break;
5102
5103 case OP_HSPACE:
5104 switch(c)
5105 {
5106 HSPACE_CASES: break;
5107 default: RRETURN(MATCH_NOMATCH);
5108 }
5109 break;
5110
5111 case OP_NOT_VSPACE:
5112 switch(c)
5113 {
5114 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5115 default: break;
5116 }
5117 break;
5118
5119 case OP_VSPACE:
5120 switch(c)
5121 {
5122 VSPACE_CASES: break;
5123 default: RRETURN(MATCH_NOMATCH);
5124 }
5125 break;
5126
5127 case OP_NOT_DIGIT:
5128 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5129 RRETURN(MATCH_NOMATCH);
5130 break;
5131
5132 case OP_DIGIT:
5133 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5134 RRETURN(MATCH_NOMATCH);
5135 break;
5136
5137 case OP_NOT_WHITESPACE:
5138 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5139 RRETURN(MATCH_NOMATCH);
5140 break;
5141
5142 case OP_WHITESPACE:
5143 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5144 RRETURN(MATCH_NOMATCH);
5145 break;
5146
5147 case OP_NOT_WORDCHAR:
5148 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5149 RRETURN(MATCH_NOMATCH);
5150 break;
5151
5152 case OP_WORDCHAR:
5153 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5154 RRETURN(MATCH_NOMATCH);
5155 break;
5156
5157 default:
5158 RRETURN(PCRE_ERROR_INTERNAL);
5159 }
5160 }
5161 }
5162 else
5163 #endif
5164 /* Not UTF mode */
5165 {
5166 for (fi = min;; fi++)
5167 {
5168 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5169 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5170 if (fi >= max) RRETURN(MATCH_NOMATCH);
5171 if (eptr >= md->end_subject)
5172 {
5173 SCHECK_PARTIAL();
5174 RRETURN(MATCH_NOMATCH);
5175 }
5176 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5177 RRETURN(MATCH_NOMATCH);
5178 c = *eptr++;
5179 switch(ctype)
5180 {
5181 case OP_ANY: /* This is the non-NL case */
5182 if (md->partial != 0 && /* Take care with CRLF partial */
5183 eptr >= md->end_subject &&
5184 NLBLOCK->nltype == NLTYPE_FIXED &&
5185 NLBLOCK->nllen == 2 &&
5186 c == NLBLOCK->nl[0])
5187 {
5188 md->hitend = TRUE;
5189 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5190 }
5191 break;
5192
5193 case OP_ALLANY:
5194 case OP_ANYBYTE:
5195 break;
5196
5197 case OP_ANYNL:
5198 switch(c)
5199 {
5200 default: RRETURN(MATCH_NOMATCH);
5201 case CHAR_CR:
5202 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5203 break;
5204
5205 case CHAR_LF:
5206 break;
5207
5208 case CHAR_VT:
5209 case CHAR_FF:
5210 case CHAR_NEL:
5211 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5212 case 0x2028:
5213 case 0x2029:
5214 #endif
5215 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5216 break;
5217 }
5218 break;
5219
5220 case OP_NOT_HSPACE:
5221 switch(c)
5222 {
5223 default: break;
5224 HSPACE_BYTE_CASES:
5225 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5226 HSPACE_MULTIBYTE_CASES:
5227 #endif
5228 RRETURN(MATCH_NOMATCH);
5229 }
5230 break;
5231
5232 case OP_HSPACE:
5233 switch(c)
5234 {
5235 default: RRETURN(MATCH_NOMATCH);
5236 HSPACE_BYTE_CASES:
5237 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5238 HSPACE_MULTIBYTE_CASES:
5239 #endif
5240 break;
5241 }
5242 break;
5243
5244 case OP_NOT_VSPACE:
5245 switch(c)
5246 {
5247 default: break;
5248 VSPACE_BYTE_CASES:
5249 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5250 VSPACE_MULTIBYTE_CASES:
5251 #endif
5252 RRETURN(MATCH_NOMATCH);
5253 }
5254 break;
5255
5256 case OP_VSPACE:
5257 switch(c)
5258 {
5259 default: RRETURN(MATCH_NOMATCH);
5260 VSPACE_BYTE_CASES:
5261 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5262 VSPACE_MULTIBYTE_CASES:
5263 #endif
5264 break;
5265 }
5266 break;
5267
5268 case OP_NOT_DIGIT:
5269 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5270 break;
5271
5272 case OP_DIGIT:
5273 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5274 break;
5275
5276 case OP_NOT_WHITESPACE:
5277 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5278 break;
5279
5280 case OP_WHITESPACE:
5281 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5282 break;
5283
5284 case OP_NOT_WORDCHAR:
5285 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5286 break;
5287
5288 case OP_WORDCHAR:
5289 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5290 break;
5291
5292 default:
5293 RRETURN(PCRE_ERROR_INTERNAL);
5294 }
5295 }
5296 }
5297 /* Control never gets here */
5298 }
5299
5300 /* If maximizing, it is worth using inline code for speed, doing the type
5301 test once at the start (i.e. keep it out of the loop). Again, keep the
5302 UTF-8 and UCP stuff separate. */
5303
5304 else
5305 {
5306 pp = eptr; /* Remember where we started */
5307
5308 #ifdef SUPPORT_UCP
5309 if (prop_type >= 0)
5310 {
5311 switch(prop_type)
5312 {
5313 case PT_ANY:
5314 for (i = min; i < max; i++)
5315 {
5316 int len = 1;
5317 if (eptr >= md->end_subject)
5318 {
5319 SCHECK_PARTIAL();
5320 break;
5321 }
5322 GETCHARLENTEST(c, eptr, len);
5323 if (prop_fail_result) break;
5324 eptr+= len;
5325 }
5326 break;
5327
5328 case PT_LAMP:
5329 for (i = min; i < max; i++)
5330 {
5331 int chartype;
5332 int len = 1;
5333 if (eptr >= md->end_subject)
5334 {
5335 SCHECK_PARTIAL();
5336 break;
5337 }
5338 GETCHARLENTEST(c, eptr, len);
5339 chartype = UCD_CHARTYPE(c);
5340 if ((chartype == ucp_Lu ||
5341 chartype == ucp_Ll ||
5342 chartype == ucp_Lt) == prop_fail_result)
5343 break;
5344 eptr+= len;
5345 }
5346 break;
5347
5348 case PT_GC:
5349 for (i = min; i < max; i++)
5350 {
5351 int len = 1;
5352 if (eptr >= md->end_subject)
5353 {
5354 SCHECK_PARTIAL();
5355 break;
5356 }
5357 GETCHARLENTEST(c, eptr, len);
5358 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5359 eptr+= len;
5360 }
5361 break;
5362
5363 case PT_PC:
5364 for (i = min; i < max; i++)
5365 {
5366 int len = 1;
5367 if (eptr >= md->end_subject)
5368 {
5369 SCHECK_PARTIAL();
5370 break;
5371 }
5372 GETCHARLENTEST(c, eptr, len);
5373 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5374 eptr+= len;
5375 }
5376 break;
5377
5378 case PT_SC:
5379 for (i = min; i < max; i++)
5380 {
5381 int len = 1;
5382 if (eptr >= md->end_subject)
5383 {
5384 SCHECK_PARTIAL();
5385 break;
5386 }
5387 GETCHARLENTEST(c, eptr, len);
5388 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5389 eptr+= len;
5390 }
5391 break;
5392
5393 case PT_ALNUM:
5394 for (i = min; i < max; i++)
5395 {
5396 int category;
5397 int len = 1;
5398 if (eptr >= md->end_subject)
5399 {
5400 SCHECK_PARTIAL();
5401 break;
5402 }
5403 GETCHARLENTEST(c, eptr, len);
5404 category = UCD_CATEGORY(c);
5405 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5406 break;
5407 eptr+= len;
5408 }
5409 break;
5410
5411 case PT_SPACE: /* Perl space */
5412 for (i = min; i < max; i++)
5413 {
5414 int len = 1;
5415 if (eptr >= md->end_subject)
5416 {
5417 SCHECK_PARTIAL();
5418 break;
5419 }
5420 GETCHARLENTEST(c, eptr, len);
5421 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5422 c == CHAR_FF || c == CHAR_CR)
5423 == prop_fail_result)
5424 break;
5425 eptr+= len;
5426 }
5427 break;
5428
5429 case PT_PXSPACE: /* POSIX space */
5430 for (i = min; i < max; i++)
5431 {
5432 int len = 1;
5433 if (eptr >= md->end_subject)
5434 {
5435 SCHECK_PARTIAL();
5436 break;
5437 }
5438 GETCHARLENTEST(c, eptr, len);
5439 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5440 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5441 == prop_fail_result)
5442 break;
5443 eptr+= len;
5444 }
5445 break;
5446
5447 case PT_WORD:
5448 for (i = min; i < max; i++)
5449 {
5450 int category;
5451 int len = 1;
5452 if (eptr >= md->end_subject)
5453 {
5454 SCHECK_PARTIAL();
5455 break;
5456 }
5457 GETCHARLENTEST(c, eptr, len);
5458 category = UCD_CATEGORY(c);
5459 if ((category == ucp_L || category == ucp_N ||
5460 c == CHAR_UNDERSCORE) == prop_fail_result)
5461 break;
5462 eptr+= len;
5463 }
5464 break;
5465
5466 case PT_CLIST:
5467 for (i = min; i < max; i++)
5468 {
5469 const pcre_uint32 *cp;
5470 int len = 1;
5471 if (eptr >= md->end_subject)
5472 {
5473 SCHECK_PARTIAL();
5474 break;
5475 }
5476 GETCHARLENTEST(c, eptr, len);
5477 cp = PRIV(ucd_caseless_sets) + prop_value;
5478 for (;;)
5479 {
5480 if (c < *cp)
5481 { if (prop_fail_result) break; else goto GOT_MAX; }
5482 if (c == *cp++)
5483 { if (prop_fail_result) goto GOT_MAX; else break; }
5484 }
5485 eptr += len;
5486 }
5487 GOT_MAX:
5488 break;
5489
5490 default:
5491 RRETURN(PCRE_ERROR_INTERNAL);
5492 }
5493
5494 /* eptr is now past the end of the maximum run */
5495
5496 if (possessive) continue;
5497 for(;;)
5498 {
5499 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5500 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5501 if (eptr-- == pp) break; /* Stop if tried at original pos */
5502 if (utf) BACKCHAR(eptr);
5503 }
5504 }
5505
5506 /* Match extended Unicode sequences. We will get here only if the
5507 support is in the binary; otherwise a compile-time error occurs. */
5508
5509 else if (ctype == OP_EXTUNI)
5510 {
5511 for (i = min; i < max; i++)
5512 {
5513 if (eptr >= md->end_subject)
5514 {
5515 SCHECK_PARTIAL();
5516 break;
5517 }
5518 else
5519 {
5520 int lgb, rgb;
5521 GETCHARINCTEST(c, eptr);
5522 lgb = UCD_GRAPHBREAK(c);
5523 while (eptr < md->end_subject)
5524 {
5525 int len = 1;
5526 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5527 rgb = UCD_GRAPHBREAK(c);
5528 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5529 lgb = rgb;
5530 eptr += len;
5531 }
5532 }
5533 CHECK_PARTIAL();
5534 }
5535
5536 /* eptr is now past the end of the maximum run */
5537
5538 if (possessive) continue;
5539
5540 for(;;)
5541 {
5542 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5543 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5544 if (eptr-- == pp) break; /* Stop if tried at original pos */
5545 for (;;) /* Move back over one extended */
5546 {
5547 if (!utf) c = *eptr; else
5548 {
5549 BACKCHAR(eptr);
5550 GETCHAR(c, eptr);
5551 }
5552 if (UCD_CATEGORY(c) != ucp_M) break;
5553 eptr--;
5554 }
5555 }
5556 }
5557
5558 else
5559 #endif /* SUPPORT_UCP */
5560
5561 #ifdef SUPPORT_UTF
5562 if (utf)
5563 {
5564 switch(ctype)
5565 {
5566 case OP_ANY:
5567 if (max < INT_MAX)
5568 {
5569 for (i = min; i < max; i++)
5570 {
5571 if (eptr >= md->end_subject)
5572 {
5573 SCHECK_PARTIAL();
5574 break;
5575 }
5576 if (IS_NEWLINE(eptr)) break;
5577 if (md->partial != 0 && /* Take care with CRLF partial */
5578 eptr + 1 >= md->end_subject &&
5579 NLBLOCK->nltype == NLTYPE_FIXED &&
5580 NLBLOCK->nllen == 2 &&
5581 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5582 {
5583 md->hitend = TRUE;
5584 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5585 }
5586 eptr++;
5587 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5588 }
5589 }
5590
5591 /* Handle unlimited UTF-8 repeat */
5592
5593 else
5594 {
5595 for (i = min; i < max; i++)
5596 {
5597 if (eptr >= md->end_subject)
5598 {
5599 SCHECK_PARTIAL();
5600 break;
5601 }
5602 if (IS_NEWLINE(eptr)) break;
5603 if (md->partial != 0 && /* Take care with CRLF partial */
5604 eptr + 1 >= md->end_subject &&
5605 NLBLOCK->nltype == NLTYPE_FIXED &&
5606 NLBLOCK->nllen == 2 &&
5607 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5608 {
5609 md->hitend = TRUE;
5610 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5611 }
5612 eptr++;
5613 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5614 }
5615 }
5616 break;
5617
5618 case OP_ALLANY:
5619 if (max < INT_MAX)
5620 {
5621 for (i = min; i < max; i++)
5622 {
5623 if (eptr >= md->end_subject)
5624 {
5625 SCHECK_PARTIAL();
5626 break;
5627 }
5628 eptr++;
5629 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5630 }
5631 }
5632 else
5633 {
5634 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5635 SCHECK_PARTIAL();
5636 }
5637 break;
5638
5639 /* The byte case is the same as non-UTF8 */
5640
5641 case OP_ANYBYTE:
5642 c = max - min;
5643 if (c > (unsigned int)(md->end_subject - eptr))
5644 {
5645 eptr = md->end_subject;
5646 SCHECK_PARTIAL();
5647 }
5648 else eptr += c;
5649 break;
5650
5651 case OP_ANYNL:
5652 for (i = min; i < max; i++)
5653 {
5654 int len = 1;
5655 if (eptr >= md->end_subject)
5656 {
5657 SCHECK_PARTIAL();
5658 break;
5659 }
5660 GETCHARLEN(c, eptr, len);
5661 if (c == CHAR_CR)
5662 {
5663 if (++eptr >= md->end_subject) break;
5664 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5665 }
5666 else
5667 {
5668 if (c != CHAR_LF &&
5669 (md->bsr_anycrlf ||
5670 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5671 #ifndef EBCDIC
5672 && c != 0x2028 && c != 0x2029
5673 #endif /* Not EBCDIC */
5674 )))
5675 break;
5676 eptr += len;
5677 }
5678 }
5679 break;
5680
5681 case OP_NOT_HSPACE:
5682 case OP_HSPACE:
5683 for (i = min; i < max; i++)
5684 {
5685 BOOL gotspace;
5686 int len = 1;
5687 if (eptr >= md->end_subject)
5688 {
5689 SCHECK_PARTIAL();
5690 break;
5691 }
5692 GETCHARLEN(c, eptr, len);
5693 switch(c)
5694 {
5695 HSPACE_CASES: gotspace = TRUE; break;
5696 default: gotspace = FALSE; break;
5697 }
5698 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5699 eptr += len;
5700 }
5701 break;
5702
5703 case OP_NOT_VSPACE:
5704 case OP_VSPACE:
5705 for (i = min; i < max; i++)
5706 {
5707 BOOL gotspace;
5708 int len = 1;
5709 if (eptr >= md->end_subject)
5710 {
5711 SCHECK_PARTIAL();
5712 break;
5713 }
5714 GETCHARLEN(c, eptr, len);
5715 switch(c)
5716 {
5717 VSPACE_CASES: gotspace = TRUE; break;
5718 default: gotspace = FALSE; break;
5719 }
5720 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5721 eptr += len;
5722 }
5723 break;
5724
5725 case OP_NOT_DIGIT:
5726 for (i = min; i < max; i++)
5727 {
5728 int len = 1;
5729 if (eptr >= md->end_subject)
5730 {
5731 SCHECK_PARTIAL();
5732 break;
5733 }
5734 GETCHARLEN(c, eptr, len);
5735 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5736 eptr+= len;
5737 }
5738 break;
5739
5740 case OP_DIGIT:
5741 for (i = min; i < max; i++)
5742 {
5743 int len = 1;
5744 if (eptr >= md->end_subject)
5745 {
5746 SCHECK_PARTIAL();
5747 break;
5748 }
5749 GETCHARLEN(c, eptr, len);
5750 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5751 eptr+= len;
5752 }
5753 break;
5754
5755 case OP_NOT_WHITESPACE:
5756 for (i = min; i < max; i++)
5757 {
5758 int len = 1;
5759 if (eptr >= md->end_subject)
5760 {
5761 SCHECK_PARTIAL();
5762 break;
5763 }
5764 GETCHARLEN(c, eptr, len);
5765 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5766 eptr+= len;
5767 }
5768 break;
5769
5770 case OP_WHITESPACE:
5771 for (i = min; i < max; i++)
5772 {
5773 int len = 1;
5774 if (eptr >= md->end_subject)
5775 {
5776 SCHECK_PARTIAL();
5777 break;
5778 }
5779 GETCHARLEN(c, eptr, len);
5780 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5781 eptr+= len;
5782 }
5783 break;
5784
5785 case OP_NOT_WORDCHAR:
5786 for (i = min; i < max; i++)
5787 {
5788 int len = 1;
5789 if (eptr >= md->end_subject)
5790 {
5791 SCHECK_PARTIAL();
5792 break;
5793 }
5794 GETCHARLEN(c, eptr, len);
5795 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5796 eptr+= len;
5797 }
5798 break;
5799
5800 case OP_WORDCHAR:
5801 for (i = min; i < max; i++)
5802 {
5803 int len = 1;
5804 if (eptr >= md->end_subject)
5805 {
5806 SCHECK_PARTIAL();
5807 break;
5808 }
5809 GETCHARLEN(c, eptr, len);
5810 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5811 eptr+= len;
5812 }
5813 break;
5814
5815 default:
5816 RRETURN(PCRE_ERROR_INTERNAL);
5817 }
5818
5819 /* eptr is now past the end of the maximum run. If possessive, we are
5820 done (no backing up). Otherwise, match at this position; anything other
5821 than no match is immediately returned. For nomatch, back up one
5822 character, unless we are matching \R and the last thing matched was
5823 \r\n, in which case, back up two bytes. */
5824
5825 if (possessive) continue;
5826 for(;;)
5827 {
5828 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5830 if (eptr-- == pp) break; /* Stop if tried at original pos */
5831 BACKCHAR(eptr);
5832 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5833 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5834 }
5835 }
5836 else
5837 #endif /* SUPPORT_UTF */
5838 /* Not UTF mode */
5839 {
5840 switch(ctype)
5841 {
5842 case OP_ANY:
5843 for (i = min; i < max; i++)
5844 {
5845 if (eptr >= md->end_subject)
5846 {
5847 SCHECK_PARTIAL();
5848 break;
5849 }
5850 if (IS_NEWLINE(eptr)) break;
5851 if (md->partial != 0 && /* Take care with CRLF partial */
5852 eptr + 1 >= md->end_subject &&
5853 NLBLOCK->nltype == NLTYPE_FIXED &&
5854 NLBLOCK->nllen == 2 &&
5855 *eptr == NLBLOCK->nl[0])
5856 {
5857 md->hitend = TRUE;
5858 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5859 }
5860 eptr++;
5861 }
5862 break;
5863
5864 case OP_ALLANY:
5865 case OP_ANYBYTE:
5866 c = max - min;
5867 if (c > (unsigned int)(md->end_subject - eptr))
5868 {
5869 eptr = md->end_subject;
5870 SCHECK_PARTIAL();
5871 }
5872 else eptr += c;
5873 break;
5874
5875 case OP_ANYNL:
5876 for (i = min; i < max; i++)
5877 {
5878 if (eptr >= md->end_subject)
5879 {
5880 SCHECK_PARTIAL();
5881 break;
5882 }
5883 c = *eptr;
5884 if (c == CHAR_CR)
5885 {
5886 if (++eptr >= md->end_subject) break;
5887 if (*eptr == CHAR_LF) eptr++;
5888 }
5889 else
5890 {
5891 if (c != CHAR_LF && (md->bsr_anycrlf ||
5892 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5893 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5894 && c != 0x2028 && c != 0x2029
5895 #endif
5896 ))) break;
5897 eptr++;
5898 }
5899 }
5900 break;
5901
5902 case OP_NOT_HSPACE:
5903 for (i = min; i < max; i++)
5904 {
5905 if (eptr >= md->end_subject)
5906 {
5907 SCHECK_PARTIAL();
5908 break;
5909 }
5910 switch(*eptr)
5911 {
5912 default: eptr++; break;
5913 HSPACE_BYTE_CASES:
5914 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5915 HSPACE_MULTIBYTE_CASES:
5916 #endif
5917 goto ENDLOOP00;
5918 }
5919 }
5920 ENDLOOP00:
5921 break;
5922
5923 case OP_HSPACE:
5924 for (i = min; i < max; i++)
5925 {
5926 if (eptr >= md->end_subject)
5927 {
5928 SCHECK_PARTIAL();
5929 break;
5930 }
5931 switch(*eptr)
5932 {
5933 default: goto ENDLOOP01;
5934 HSPACE_BYTE_CASES:
5935 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5936 HSPACE_MULTIBYTE_CASES:
5937 #endif
5938 eptr++; break;
5939 }
5940 }
5941 ENDLOOP01:
5942 break;
5943
5944 case OP_NOT_VSPACE:
5945 for (i = min; i < max; i++)
5946 {
5947 if (eptr >= md->end_subject)
5948 {
5949 SCHECK_PARTIAL();
5950 break;
5951 }
5952 switch(*eptr)
5953 {
5954 default: eptr++; break;
5955 VSPACE_BYTE_CASES:
5956 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5957 VSPACE_MULTIBYTE_CASES:
5958 #endif
5959 goto ENDLOOP02;
5960 }
5961 }
5962 ENDLOOP02:
5963 break;
5964
5965 case OP_VSPACE:
5966 for (i = min; i < max; i++)
5967 {
5968 if (eptr >= md->end_subject)
5969 {
5970 SCHECK_PARTIAL();
5971 break;
5972 }
5973 switch(*eptr)
5974 {
5975 default: goto ENDLOOP03;
5976 VSPACE_BYTE_CASES:
5977 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5978 VSPACE_MULTIBYTE_CASES:
5979 #endif
5980 eptr++; break;
5981 }
5982 }
5983 ENDLOOP03:
5984 break;
5985
5986 case OP_NOT_DIGIT:
5987 for (i = min; i < max; i++)
5988 {
5989 if (eptr >= md->end_subject)
5990 {
5991 SCHECK_PARTIAL();
5992 break;
5993 }
5994 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5995 eptr++;
5996 }
5997 break;
5998
5999 case OP_DIGIT:
6000 for (i = min; i < max; i++)
6001 {
6002 if (eptr >= md->end_subject)
6003 {
6004 SCHECK_PARTIAL();
6005 break;
6006 }
6007 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6008 eptr++;
6009 }
6010 break;
6011
6012 case OP_NOT_WHITESPACE:
6013 for (i = min; i < max; i++)
6014 {
6015 if (eptr >= md->end_subject)
6016 {
6017 SCHECK_PARTIAL();
6018 break;
6019 }
6020 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6021 eptr++;
6022 }
6023 break;
6024
6025 case OP_WHITESPACE:
6026 for (i = min; i < max; i++)
6027 {
6028 if (eptr >= md->end_subject)
6029 {
6030 SCHECK_PARTIAL();
6031 break;
6032 }
6033 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6034 eptr++;
6035 }
6036 break;
6037
6038 case OP_NOT_WORDCHAR:
6039 for (i = min; i < max; i++)
6040 {
6041 if (eptr >= md->end_subject)
6042 {
6043 SCHECK_PARTIAL();
6044 break;
6045 }
6046 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6047 eptr++;
6048 }
6049 break;
6050
6051 case OP_WORDCHAR:
6052 for (i = min; i < max; i++)
6053 {
6054 if (eptr >= md->end_subject)
6055 {
6056 SCHECK_PARTIAL();
6057 break;
6058 }
6059 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6060 eptr++;
6061 }
6062 break;
6063
6064 default:
6065 RRETURN(PCRE_ERROR_INTERNAL);
6066 }
6067
6068 /* eptr is now past the end of the maximum run. If possessive, we are
6069 done (no backing up). Otherwise, match at this position; anything other
6070 than no match is immediately returned. For nomatch, back up one
6071 character (byte), unless we are matching \R and the last thing matched
6072 was \r\n, in which case, back up two bytes. */
6073
6074 if (possessive) continue;
6075 while (eptr >= pp)
6076 {
6077 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6079 eptr--;
6080 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6081 eptr[-1] == CHAR_CR) eptr--;
6082 }
6083 }
6084
6085 /* Get here if we can't make it match with any permitted repetitions */
6086
6087 RRETURN(MATCH_NOMATCH);
6088 }
6089 /* Control never gets here */
6090
6091 /* There's been some horrible disaster. Arrival here can only mean there is
6092 something seriously wrong in the code above or the OP_xxx definitions. */
6093
6094 default:
6095 DPRINTF(("Unknown opcode %d\n", *ecode));
6096 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6097 }
6098
6099 /* Do not stick any code in here without much thought; it is assumed
6100 that "continue" in the code above comes out to here to repeat the main
6101 loop. */
6102
6103 } /* End of main loop */
6104 /* Control never reaches here */
6105
6106
6107 /* When compiling to use the heap rather than the stack for recursive calls to
6108 match(), the RRETURN() macro jumps here. The number that is saved in
6109 frame->Xwhere indicates which label we actually want to return to. */
6110
6111 #ifdef NO_RECURSE
6112 #define LBL(val) case val: goto L_RM##val;
6113 HEAP_RETURN:
6114 switch (frame->Xwhere)
6115 {
6116 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6117 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6118 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6119 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6120 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6121 LBL(65) LBL(66)
6122 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6123 LBL(21)
6124 #endif
6125 #ifdef SUPPORT_UTF
6126 LBL(16) LBL(18) LBL(20)
6127 LBL(22) LBL(23) LBL(28) LBL(30)
6128 LBL(32) LBL(34) LBL(42) LBL(46)
6129 #ifdef SUPPORT_UCP
6130 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6131 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6132 #endif /* SUPPORT_UCP */
6133 #endif /* SUPPORT_UTF */
6134 default:
6135 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6136 return PCRE_ERROR_INTERNAL;
6137 }
6138 #undef LBL
6139 #endif /* NO_RECURSE */
6140 }
6141
6142
6143 /***************************************************************************
6144 ****************************************************************************
6145 RECURSION IN THE match() FUNCTION
6146
6147 Undefine all the macros that were defined above to handle this. */
6148
6149 #ifdef NO_RECURSE
6150 #undef eptr
6151 #undef ecode
6152 #undef mstart
6153 #undef offset_top
6154 #undef eptrb
6155 #undef flags
6156
6157 #undef callpat
6158 #undef charptr
6159 #undef data
6160 #undef next
6161 #undef pp
6162 #undef prev
6163 #undef saved_eptr
6164
6165 #undef new_recursive
6166
6167 #undef cur_is_word
6168 #undef condition
6169 #undef prev_is_word
6170
6171 #undef ctype
6172 #undef length
6173 #undef max
6174 #undef min
6175 #undef number
6176 #undef offset
6177 #undef op
6178 #undef save_capture_last
6179 #undef save_offset1
6180 #undef save_offset2
6181 #undef save_offset3
6182 #undef stacksave
6183
6184 #undef newptrb
6185
6186 #endif
6187
6188 /* These two are defined as macros in both cases */
6189
6190 #undef fc
6191 #undef fi
6192
6193 /***************************************************************************
6194 ***************************************************************************/
6195
6196
6197 #ifdef NO_RECURSE
6198 /*************************************************
6199 * Release allocated heap frames *
6200 *************************************************/
6201
6202 /* This function releases all the allocated frames. The base frame is on the
6203 machine stack, and so must not be freed.
6204
6205 Argument: the address of the base frame
6206 Returns: nothing
6207 */
6208
6209 static void
6210 release_match_heapframes (heapframe *frame_base)
6211 {
6212 heapframe *nextframe = frame_base->Xnextframe;
6213 while (nextframe != NULL)
6214 {
6215 heapframe *oldframe = nextframe;
6216 nextframe = nextframe->Xnextframe;
6217 (PUBL(stack_free))(oldframe);
6218 }
6219 }
6220 #endif
6221
6222
6223 /*************************************************
6224 * Execute a Regular Expression *
6225 *************************************************/
6226
6227 /* This function applies a compiled re to a subject string and picks out
6228 portions of the string if it matches. Two elements in the vector are set for
6229 each substring: the offsets to the start and end of the substring.
6230
6231 Arguments:
6232 argument_re points to the compiled expression
6233 extra_data points to extra data or is NULL
6234 subject points to the subject string
6235 length length of subject string (may contain binary zeros)
6236 start_offset where to start in the subject string
6237 options option bits
6238 offsets points to a vector of ints to be filled in with offsets
6239 offsetcount the number of elements in the vector
6240
6241 Returns: > 0 => success; value is the number of elements filled in
6242 = 0 => success, but offsets is not big enough
6243 -1 => failed to match
6244 < -1 => some kind of unexpected problem
6245 */
6246
6247 #if defined COMPILE_PCRE8
6248 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6249 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6250 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6251 int offsetcount)
6252 #elif defined COMPILE_PCRE16
6253 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6254 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6255 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6256 int offsetcount)
6257 #elif defined COMPILE_PCRE32
6258 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6259 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6260 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6261 int offsetcount)
6262 #endif
6263 {
6264 int rc, ocount, arg_offset_max;
6265 int newline;
6266 BOOL using_temporary_offsets = FALSE;
6267 BOOL anchored;
6268 BOOL startline;
6269 BOOL firstline;
6270 BOOL utf;
6271 BOOL has_first_char = FALSE;
6272 BOOL has_req_char = FALSE;
6273 pcre_uchar first_char = 0;
6274 pcre_uchar first_char2 = 0;
6275 pcre_uchar req_char = 0;
6276 pcre_uchar req_char2 = 0;
6277 match_data match_block;
6278 match_data *md = &match_block;
6279 const pcre_uint8 *tables;
6280 const pcre_uint8 *start_bits = NULL;
6281 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6282 PCRE_PUCHAR end_subject;
6283 PCRE_PUCHAR start_partial = NULL;
6284 PCRE_PUCHAR match_partial;
6285 PCRE_PUCHAR req_char_ptr = start_match - 1;
6286
6287 const pcre_study_data *study;
6288 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6289
6290 #ifdef NO_RECURSE
6291 heapframe frame_zero;
6292 frame_zero.Xprevframe = NULL; /* Marks the top level */
6293 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6294 md->match_frames_base = &frame_zero;
6295 #endif
6296
6297 /* Check for the special magic call that measures the size of the stack used
6298 per recursive call of match(). Without the funny casting for sizeof, a Windows
6299 compiler gave this error: "unary minus operator applied to unsigned type,
6300 result still unsigned". Hopefully the cast fixes that. */
6301
6302 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6303 start_offset == -999)
6304 #ifdef NO_RECURSE
6305 return -((int)sizeof(heapframe));
6306 #else
6307 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6308 #endif
6309
6310 /* Plausibility checks */
6311
6312 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6313 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6314 return PCRE_ERROR_NULL;
6315 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6316 if (length < 0) return PCRE_ERROR_BADLENGTH;
6317 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6318
6319 /* Check that the first field in the block is the magic number. If it is not,
6320 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6321 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6322 means that the pattern is likely compiled with different endianness. */
6323
6324 if (re->magic_number != MAGIC_NUMBER)
6325 return re->magic_number == REVERSED_MAGIC_NUMBER?
6326 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6327 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6328
6329 /* These two settings are used in the code for checking a UTF-8 string that
6330 follows immediately afterwards. Other values in the md block are used only
6331 during "normal" pcre_exec() processing, not when the JIT support is in use,
6332 so they are set up later. */
6333
6334 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6335 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6336 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6337 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6338
6339 /* Check a UTF-8 string if required. Pass back the character offset and error
6340 code for an invalid string if a results vector is available. */
6341
6342 #ifdef SUPPORT_UTF
6343 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6344 {
6345 int erroroffset;
6346 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6347 if (errorcode != 0)
6348 {
6349 if (offsetcount >= 2)
6350 {
6351 offsets[0] = erroroffset;
6352 offsets[1] = errorcode;
6353 }
6354 #if defined COMPILE_PCRE8
6355 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6356 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6357 #elif defined COMPILE_PCRE16
6358 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6359 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6360 #elif defined COMPILE_PCRE32
6361 return PCRE_ERROR_BADUTF32;
6362 #endif
6363 }
6364 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6365 /* Check that a start_offset points to the start of a UTF character. */
6366 if (start_offset > 0 && start_offset < length &&
6367 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6368 return PCRE_ERROR_BADUTF8_OFFSET;
6369 #endif
6370 }
6371 #endif
6372
6373 /* If the pattern was successfully studied with JIT support, run the JIT
6374 executable instead of the rest of this function. Most options must be set at
6375 compile time for the JIT code to be usable. Fall back to the normal code path if
6376 an unsupported flag is set. */
6377
6378 #ifdef SUPPORT_JIT
6379 if (extra_data != NULL
6380 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6381 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6382 && extra_data->executable_jit != NULL
6383 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6384 {
6385 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6386 start_offset, options, offsets, offsetcount);
6387
6388 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial
6389 matching mode is not compiled; in that case fall back to the interpreter. */
6390
6391 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6392 }
6393 #endif
6394
6395 /* Carry on with non-JIT matching. This information is for finding all the
6396 numbers associated with a given name, for condition testing. */
6397
6398 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6399 md->name_count = re->name_count;
6400 md->name_entry_size = re->name_entry_size;
6401
6402 /* Fish out the optional data from the extra_data structure, first setting
6403 the default values. */
6404
6405 study = NULL;
6406 md->match_limit = MATCH_LIMIT;
6407 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6408 md->callout_data = NULL;
6409
6410 /* The table pointer is always in native byte order. */
6411
6412 tables = re->tables;
6413
6414 if (extra_data != NULL)
6415 {
6416 register unsigned int flags = extra_data->flags;
6417 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6418 study = (const pcre_study_data *)extra_data->study_data;
6419 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6420 md->match_limit = extra_data->match_limit;
6421 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6422 md->match_limit_recursion = extra_data->match_limit_recursion;
6423 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6424 md->callout_data = extra_data->callout_data;
6425 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6426 }
6427
6428 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6429 is a feature that makes it possible to save compiled regex and re-use them
6430 in other programs later. */
6431
6432 if (tables == NULL) tables = PRIV(default_tables);
6433
6434 /* Set up other data */
6435
6436 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6437 startline = (re->flags & PCRE_STARTLINE) != 0;
6438 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6439
6440 /* The code starts after the real_pcre block and the capture name table. */
6441
6442 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6443 re->name_count * re->name_entry_size;
6444
6445 md->start_subject = (PCRE_PUCHAR)subject;
6446 md->start_offset = start_offset;
6447 md->end_subject = md->start_subject + length;
6448 end_subject = md->end_subject;
6449
6450 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6451 md->use_ucp = (re->options & PCRE_UCP) != 0;
6452 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6453 md->ignore_skip_arg = FALSE;
6454
6455 /* Some options are unpacked into BOOL variables in the hope that testing
6456 them will be faster than individual option bits. */
6457
6458 md->notbol = (options & PCRE_NOTBOL) != 0;
6459 md->noteol = (options & PCRE_NOTEOL) != 0;
6460 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6461 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6462
6463 md->hitend = FALSE;
6464 md->mark = md->nomatch_mark = NULL; /* In case never set */
6465
6466 md->recursive = NULL; /* No recursion at top level */
6467 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6468
6469 md->lcc = tables + lcc_offset;
6470 md->fcc = tables + fcc_offset;
6471 md->ctypes = tables + ctypes_offset;
6472
6473 /* Handle different \R options. */
6474
6475 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6476 {
6477 case 0:
6478 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6479 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6480 else
6481 #ifdef BSR_ANYCRLF
6482 md->bsr_anycrlf = TRUE;
6483 #else
6484 md->bsr_anycrlf = FALSE;
6485 #endif
6486 break;
6487
6488 case PCRE_BSR_ANYCRLF:
6489 md->bsr_anycrlf = TRUE;
6490 break;
6491
6492 case PCRE_BSR_UNICODE:
6493 md->bsr_anycrlf = FALSE;
6494 break;
6495
6496 default: return PCRE_ERROR_BADNEWLINE;
6497 }
6498
6499 /* Handle different types of newline. The three bits give eight cases. If
6500 nothing is set at run time, whatever was used at compile time applies. */
6501
6502 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6503 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6504 {
6505 case 0: newline = NEWLINE; break; /* Compile-time default */
6506 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6507 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6508 case PCRE_NEWLINE_CR+
6509 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6510 case PCRE_NEWLINE_ANY: newline = -1; break;
6511 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6512 default: return PCRE_ERROR_BADNEWLINE;
6513 }
6514
6515 if (newline == -2)
6516 {
6517 md->nltype = NLTYPE_ANYCRLF;
6518 }
6519 else if (newline < 0)
6520 {
6521 md->nltype = NLTYPE_ANY;
6522 }
6523 else
6524 {
6525 md->nltype = NLTYPE_FIXED;
6526 if (newline > 255)
6527 {
6528 md->nllen = 2;
6529 md->nl[0] = (newline >> 8) & 255;
6530 md->nl[1] = newline & 255;
6531 }
6532 else
6533 {
6534 md->nllen = 1;
6535 md->nl[0] = newline;
6536 }
6537 }
6538
6539 /* Partial matching was originally supported only for a restricted set of
6540 regexes; from release 8.00 there are no restrictions, but the bits are still
6541 defined (though never set). So there's no harm in leaving this code. */
6542
6543 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6544 return PCRE_ERROR_BADPARTIAL;
6545
6546 /* If the expression has got more back references than the offsets supplied can
6547 hold, we get a temporary chunk of working store to use during the matching.
6548 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6549 of 3. */
6550
6551 ocount = offsetcount - (offsetcount % 3);
6552 arg_offset_max = (2*ocount)/3;
6553
6554 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6555 {
6556 ocount = re->top_backref * 3 + 3;
6557 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6558 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6559 using_temporary_offsets = TRUE;
6560 DPRINTF(("Got memory to hold back references\n"));
6561 }
6562 else md->offset_vector = offsets;
6563 md->offset_end = ocount;
6564 md->offset_max = (2*ocount)/3;
6565 md->capture_last = 0;
6566
6567 /* Reset the working variable associated with each extraction. These should
6568 never be used unless previously set, but they get saved and restored, and so we
6569 initialize them to avoid reading uninitialized locations. Also, unset the
6570 offsets for the matched string. This is really just for tidiness with callouts,
6571 in case they inspect these fields. */
6572
6573 if (md->offset_vector != NULL)
6574 {
6575 register int *iptr = md->offset_vector + ocount;
6576 register int *iend = iptr - re->top_bracket;
6577 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6578 while (--iptr >= iend) *iptr = -1;
6579 md->offset_vector[0] = md->offset_vector[1] = -1;
6580 }
6581
6582 /* Set up the first character to match, if available. The first_char value is
6583 never set for an anchored regular expression, but the anchoring may be forced
6584 at run time, so we have to test for anchoring. The first char may be unset for
6585 an unanchored pattern, of course. If there's no first char and the pattern was
6586 studied, there may be a bitmap of possible first characters. */
6587
6588 if (!anchored)
6589 {
6590 if ((re->flags & PCRE_FIRSTSET) != 0)
6591 {
6592 has_first_char = TRUE;
6593 first_char = first_char2 = (pcre_uchar)(re->first_char);
6594 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6595 {
6596 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6597 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6598 if (utf && first_char > 127)
6599 first_char2 = UCD_OTHERCASE(first_char);
6600 #endif
6601 }
6602 }
6603 else
6604 if (!startline && study != NULL &&
6605 (study->flags & PCRE_STUDY_MAPPED) != 0)
6606 start_bits = study->start_bits;
6607 }
6608
6609 /* For anchored or unanchored matches, there may be a "last known required
6610 character" set. */
6611
6612 if ((re->flags & PCRE_REQCHSET) != 0)
6613 {
6614 has_req_char = TRUE;
6615 req_char = req_char2 = (pcre_uchar)(re->req_char);
6616 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6617 {
6618 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6619 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6620 if (utf && req_char > 127)
6621 req_char2 = UCD_OTHERCASE(req_char);
6622 #endif
6623 }
6624 }
6625
6626
6627 /* ==========================================================================*/
6628
6629 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6630 the loop runs just once. */
6631
6632 for(;;)
6633 {
6634 PCRE_PUCHAR save_end_subject = end_subject;
6635 PCRE_PUCHAR new_start_match;
6636
6637 /* If firstline is TRUE, the start of the match is constrained to the first
6638 line of a multiline string. That is, the match must be before or at the first
6639 newline. Implement this by temporarily adjusting end_subject so that we stop
6640 scanning at a newline. If the match fails at the newline, later code breaks
6641 this loop. */
6642
6643 if (firstline)
6644 {
6645 PCRE_PUCHAR t = start_match;
6646 #ifdef SUPPORT_UTF
6647 if (utf)
6648 {
6649 while (t < md->end_subject && !IS_NEWLINE(t))
6650 {
6651 t++;
6652 ACROSSCHAR(t < end_subject, *t, t++);
6653 }
6654 }
6655 else
6656 #endif
6657 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6658 end_subject = t;
6659 }
6660
6661 /* There are some optimizations that avoid running the match if a known
6662 starting point is not found, or if a known later character is not present.
6663 However, there is an option that disables these, for testing and for ensuring
6664 that all callouts do actually occur. The option can be set in the regex by
6665 (*NO_START_OPT) or passed in match-time options. */
6666
6667 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6668 {
6669 /* Advance to a unique first char if there is one. */
6670
6671 if (has_first_char)
6672 {
6673 pcre_uchar smc;
6674
6675 if (first_char != first_char2)
6676 while (start_match < end_subject &&
6677 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6678 start_match++;
6679 else
6680 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6681 start_match++;
6682 }
6683
6684 /* Or to just after a linebreak for a multiline match */
6685
6686 else if (startline)
6687 {
6688 if (start_match > md->start_subject + start_offset)
6689 {
6690 #ifdef SUPPORT_UTF
6691 if (utf)
6692 {
6693 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6694 {
6695 start_match++;
6696 ACROSSCHAR(start_match < end_subject, *start_match,
6697 start_match++);
6698 }
6699 }
6700 else
6701 #endif
6702 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6703 start_match++;
6704
6705 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6706 and we are now at a LF, advance the match position by one more character.
6707 */
6708
6709 if (start_match[-1] == CHAR_CR &&
6710 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6711 start_match < end_subject &&
6712 RAWUCHARTEST(start_match) == CHAR_NL)
6713 start_match++;
6714 }
6715 }
6716
6717 /* Or to a non-unique first byte after study */
6718
6719 else if (start_bits != NULL)
6720 {
6721 while (start_match < end_subject)
6722 {
6723 register pcre_uint32 c = RAWUCHARTEST(start_match);
6724 #ifndef COMPILE_PCRE8
6725 if (c > 255) c = 255;
6726 #endif
6727 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6728 {
6729 start_match++;
6730 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6731 /* In non 8-bit mode, the iteration will stop for
6732 characters > 255 at the beginning or not stop at all. */
6733 if (utf)
6734 ACROSSCHAR(start_match < end_subject, *start_match,
6735 start_match++);
6736 #endif
6737 }
6738 else break;
6739 }
6740 }
6741 } /* Starting optimizations */
6742
6743 /* Restore fudged end_subject */
6744
6745 end_subject = save_end_subject;
6746
6747 /* The following two optimizations are disabled for partial matching or if
6748 disabling is explicitly requested. */
6749
6750 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6751 {
6752 /* If the pattern was studied, a minimum subject length may be set. This is
6753 a lower bound; no actual string of that length may actually match the
6754 pattern. Although the value is, strictly, in characters, we treat it as
6755 bytes to avoid spending too much time in this optimization. */
6756
6757 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6758 (pcre_uint32)(end_subject - start_match) < study->minlength)
6759 {