/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1171 - (show annotations)
Wed Oct 24 15:22:42 2012 UTC (6 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 212061 byte(s)
Error occurred while calculating annotation data.
Previous patch for no stack recursion was incomplete.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
62
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group; match() consumes this on entry by pushing eptr onto the eptrb chain and resetting match_function_type to 0 */
65
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
68
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
71
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
74
75 #define MATCH_ACCEPT (-999) /* Treated like MATCH_MATCH when MARK/PRUNE_ARG/THEN_ARG record md->mark, but not by OP_ONCE_NC */
76 #define MATCH_COMMIT (-998) /* Returned by OP_COMMIT when the rest of the pattern fails */
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995) /* Returned by OP_PRUNE and OP_PRUNE_ARG */
80 #define MATCH_SKIP (-994) /* Returned by OP_SKIP, and by OP_MARK when a SKIP_ARG name matches; md->start_match_ptr holds the subject position */
81 #define MATCH_SKIP_ARG (-993) /* Returned by OP_SKIP_ARG; md->start_match_ptr is overloaded to carry the skip name */
82 #define MATCH_THEN (-992) /* Returned by OP_THEN and OP_THEN_ARG; md->start_match_ptr is overloaded to carry the opcode address */
83
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
87
/* This value sizes the stacksave[] array in match() and Xstacksave[] in the
heap frame. */
88 #define REC_STACK_SAVE_MAX 30
89
90 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
91
/* Parallel six-entry tables: entry i of rep_min and entry i of rep_max together
give the bounds (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1), where a maximum of
0 stands for "no limit". NOTE(review): the entries presumably correspond to the
repeat forms *, *?, +, +?, ?, ?? in opcode order; the indexing code is not in
this part of the file, so confirm at the point of use. */
92 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
93 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
94
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of code units in printable form: a printable value is
written as itself, anything else as a \x{hh} escape. When is_subject is TRUE,
printing stops at the end of the subject even if "length" asks for more.

Arguments:
  p           points to characters
  length      number to print; zero or negative prints nothing
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block; it is always dereferenced (for
                the UTF flag), so it must be valid even if is_subject is FALSE

Returns:     nothing
*/

static void
pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
{
pcre_uint32 c;

/* NOTE(review): "utf" is not referenced by name below; it is presumably
consumed by the RAWUCHARINCTEST() macro, so it must keep this name - confirm
against pcre_internal.h. */

BOOL utf = md->utf;

/* Clamp the request so that we never read beyond the end of the subject. */

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = RAWUCHARINCTEST(p);

  /* The <ctype.h> functions are defined only for arguments representable as
  unsigned char (or EOF). A code unit wider than 8 bits can exceed that range,
  so such values are escaped without ever being passed to isprint(). */

  if (c <= 255 && isprint((int)c))
    printf("%c", (char)c);
  else
    printf("\\x{%02x}", c);
  }
}
#endif
122
123
124
125 /*************************************************
126 * Match a back-reference *
127 *************************************************/
128
129 /* Normally, if a back reference hasn't been set, the length that is passed is
130 negative, so the match always fails. However, in JavaScript compatibility mode,
131 the length passed is zero. Note that in caseless UTF-8 mode, the number of
132 subject bytes matched may be different to the number of reference bytes.
133
134 Arguments:
135 offset index into the offset vector
136 eptr pointer into the subject
137 length length of reference to be matched (number of bytes)
138 md points to match data block
139 caseless TRUE if caseless
140
141 Returns: >= 0 the number of subject bytes matched
142 -1 no match
143 -2 partial match; always given if at end subject
144 */
145
/* Compare the subject text at eptr with the previously captured text that
starts at md->start_subject + md->offset_vector[offset] and is "length" units
long; the result is the number of subject units consumed, or -1 / -2. */
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr; /* Remembered so the consumed length can be computed on success */
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; /* Start of the reference text */
152 #ifdef SUPPORT_UTF
153 BOOL utf = md->utf;
154 #endif
155
/* Debug trace only. It runs before the "length < 0" test below, so length may
be negative here; pchars() prints nothing in that case. */
156 #ifdef PCRE_DEBUG
157 if (eptr >= md->end_subject)
158 printf("matching subject <null>");
159 else
160 {
161 printf("matching subject ");
162 pchars(eptr, length, TRUE, md);
163 }
164 printf(" against backref ");
165 pchars(p, length, FALSE, md);
166 printf("\n");
167 #endif
168
169 /* Always fail if reference not set (and not JavaScript compatible - in that
170 case the length is passed as zero). */
171
172 if (length < 0) return -1;
173
174 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
175 properly if Unicode properties are supported. Otherwise, we can check only
176 ASCII characters. */
177
178 if (caseless)
179 {
180 #ifdef SUPPORT_UTF
181 #ifdef SUPPORT_UCP
182 if (utf)
183 {
184 /* Match characters up to the end of the reference. NOTE: the number of
185 data units matched may differ, because in UTF-8 there are some characters
186 whose upper and lower case versions have different numbers of bytes.
187 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
188 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
189 sequence of two of the latter. It is important, therefore, to check the
190 length along the reference, not along the subject (earlier code did this
191 wrong). */
192
193 PCRE_PUCHAR endptr = p + length; /* The loop is bounded by the reference, not the subject */
194 while (p < endptr)
195 {
196 pcre_uint32 c, d; /* c: next subject character, d: next reference character */
197 const ucd_record *ur;
198 if (eptr >= md->end_subject) return -2; /* Partial match */
199 GETCHARINC(c, eptr);
200 GETCHARINC(d, p);
201 ur = GET_UCD(d);
202 if (c != d && c != d + ur->other_case) /* Neither identical nor the simple other-case form of d */
203 {
/* Scan the caseless set selected by ur->caseset. The scan fails as soon as an
entry greater than c is seen. NOTE(review): this presumably relies on each set
being in ascending order and ending with a value larger than any character -
confirm against the table definition. */
204 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
205 for (;;)
206 {
207 if (c < *pp) return -1;
208 if (c == *pp++) break;
209 }
210 }
211 }
212 }
213 else /* Pairs with "if (utf)"; when UTF/UCP support is compiled out the block below stands alone */
214 #endif
215 #endif
216
217 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
218 is no UCP support. */
219 {
220 while (length-- > 0)
221 {
222 pcre_uchar cc, cp; /* cc: subject unit, cp: reference unit */
223 if (eptr >= md->end_subject) return -2; /* Partial match */
224 cc = RAWUCHARTEST(eptr);
225 cp = RAWUCHARTEST(p);
226 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; /* Compare after mapping both through md->lcc (presumably the lower-casing table - confirm) */
227 p++;
228 eptr++;
229 }
230 }
231 }
232
233 /* In the caseful case, we can just compare the bytes, whether or not we
234 are in UTF-8 mode. */
235
236 else
237 {
238 while (length-- > 0)
239 {
240 if (eptr >= md->end_subject) return -2; /* Partial match */
241 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1; /* Both pointers are advanced by the macros */
242 }
243 }
244
245 return (int)(eptr - eptr_start); /* Number of subject units consumed */
246 }
247
248
249
250 /***************************************************************************
251 ****************************************************************************
252 RECURSION IN THE match() FUNCTION
253
254 The match() function is highly recursive, though not every recursive call
255 increases the recursive depth. Nevertheless, some regular expressions can cause
256 it to recurse to a great depth. I was writing for Unix, so I just let it call
257 itself recursively. This uses the stack for saving everything that has to be
258 saved for a recursive call. On Unix, the stack can be large, and this works
259 fine.
260
261 It turns out that on some non-Unix-like systems there are problems with
262 programs that use a lot of stack. (This despite the fact that every last chip
263 has oodles of memory these days, and techniques for extending the stack have
264 been known for decades.) So....
265
266 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
267 calls by keeping local variables that need to be preserved in blocks of memory
268 obtained from malloc() instead of on the stack. Macros are used to
269 achieve this so that the actual code doesn't look very different to what it
270 always used to.
271
272 The original heap-recursive code used longjmp(). However, it seems that this
273 can be very slow on some operating systems. Following a suggestion from Stan
274 Switzer, the use of longjmp() has been abolished, at the cost of having to
275 provide a unique number for each call to RMATCH. There is no way of generating
276 a sequence of numbers at compile time in C. I have given them names, to make
277 them stand out more clearly.
278
279 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
280 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
281 tests. Furthermore, not using longjmp() means that local dynamic variables
282 don't have indeterminate values; this has meant that the frame size can be
283 reduced because the result can be "passed back" by straight setting of the
284 variable instead of being passed in the frame.
285 ****************************************************************************
286 ***************************************************************************/
287
288 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
289 below must be updated in sync. */
290
/* One distinct tag per RMATCH() call site, numbered from 1. In the NO_RECURSE
build RMATCH() stores the tag in the calling frame's Xwhere field and defines
the resume label L_<tag>; in the recursive build the tag is ignored. */
291 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
292 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
293 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
294 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
295 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
296 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
297 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
298
299 /* These versions of the macros use the stack, as normal. There are debugging
300 versions and production versions. Note that the "rw" argument of RMATCH isn't
301 actually used in this definition. */
302
303 #ifndef NO_RECURSE
304 #define REGISTER register
305
/* True-recursion versions of RMATCH/RRETURN. The macro arguments map onto the
parameters of match() as ra=eptr, rb=ecode, rc=offset_top, rd=md, re=eptrb;
mstart and rdepth are taken from the caller's own variables, and the result is
left in the caller's rrc. rw is unused here. */
306 #ifdef PCRE_DEBUG
/* Debugging version: trace the call and the return around the recursion. */
307 #define RMATCH(ra,rb,rc,rd,re,rw) \
308 { \
309 printf("match() called in line %d\n", __LINE__); \
310 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
311 printf("to line %d\n", __LINE__); \
312 }
/* Debugging version: note that ra is evaluated twice (once for the trace, once
for the return), so it must not have side effects. */
313 #define RRETURN(ra) \
314 { \
315 printf("match() returned %d from line %d\n", ra, __LINE__); \
316 return ra; \
317 }
318 #else
/* Production version: a plain recursive call and a plain return. */
319 #define RMATCH(ra,rb,rc,rd,re,rw) \
320 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
321 #define RRETURN(ra) return ra
322 #endif
323
324 #else
325
326
327 /* These versions of the macros manage a private stack on the heap. Note that
328 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
329 argument of match(), which never changes. */
330
331 #define REGISTER
332
/* Heap-frame version of RMATCH: simulate a recursive call to match().
  1. Take the frame already linked at frame->Xnextframe, or obtain a new one
     via PUBL(stack_malloc) and link it in; on allocation failure the current
     level returns PCRE_ERROR_NOMEMORY through RRETURN.
  2. Record rw in the calling frame's Xwhere so the return can find its way
     back, then copy the "arguments" (ra=eptr, rb=ecode, mstart, rc=offset_top,
     re=eptrb, depth+1) into the new frame and make it current.
  3. Jump to HEAP_RECURSE to run the body again on the new frame.
The label L_<rw> pasted at the end is the resume point for this call site.
NOTE(review): the jump back to it is presumably made by the HEAP_RETURN code,
which is not in this part of the file - confirm. */
333 #define RMATCH(ra,rb,rc,rd,re,rw)\
334 {\
335 heapframe *newframe = frame->Xnextframe;\
336 if (newframe == NULL)\
337 {\
338 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
339 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
340 newframe->Xnextframe = NULL;\
341 frame->Xnextframe = newframe;\
342 }\
343 frame->Xwhere = rw;\
344 newframe->Xeptr = ra;\
345 newframe->Xecode = rb;\
346 newframe->Xmstart = mstart;\
347 newframe->Xoffset_top = rc;\
348 newframe->Xeptrb = re;\
349 newframe->Xrdepth = frame->Xrdepth + 1;\
350 newframe->Xprevframe = frame;\
351 frame = newframe;\
352 DPRINTF(("restarting from line %d\n", __LINE__));\
353 goto HEAP_RECURSE;\
354 L_##rw:\
355 DPRINTF(("jumped back to line %d\n", __LINE__));\
356 }
357
/* Heap-frame version of RRETURN: pop back to the previous frame. If there is
one, pass the result back in rrc and jump to HEAP_RETURN; otherwise this is the
outermost level and a real return from match() is made. The popped frame is
not freed here; it stays linked through the previous frame's Xnextframe, where
RMATCH can pick it up again. NOTE(review): this relies on the outermost frame
having Xprevframe == NULL, which is set up outside this part of the file. */
358 #define RRETURN(ra)\
359 {\
360 heapframe *oldframe = frame;\
361 frame = oldframe->Xprevframe;\
362 if (frame != NULL)\
363 {\
364 rrc = ra;\
365 goto HEAP_RETURN;\
366 }\
367 return ra;\
368 }
369
370
371 /* Structure for remembering the local variables in a private frame */
372
/* One simulated activation record of match() for the NO_RECURSE build. Each
X-prefixed field stands in for the match() argument or local variable of the
same name; match() reaches them through #define aliases such as
"#define eptr frame->Xeptr". */
373 typedef struct heapframe {
374 struct heapframe *Xprevframe; /* The "calling" frame; RRETURN pops back to it */
375 struct heapframe *Xnextframe; /* Frame already obtained for the next level, or NULL; reused by RMATCH */
376
377 /* Function arguments that may change */
378
379 PCRE_PUCHAR Xeptr;
380 const pcre_uchar *Xecode;
381 PCRE_PUCHAR Xmstart;
382 int Xoffset_top;
383 eptrblock *Xeptrb;
384 unsigned int Xrdepth; /* RMATCH sets this to the caller's depth + 1 */
385
386 /* Function local variables */
387
388 PCRE_PUCHAR Xcallpat;
389 #ifdef SUPPORT_UTF
390 PCRE_PUCHAR Xcharptr;
391 #endif
392 PCRE_PUCHAR Xdata; /* Also used under the alias save_mark */
393 PCRE_PUCHAR Xnext;
394 PCRE_PUCHAR Xpp;
395 PCRE_PUCHAR Xprev;
396 PCRE_PUCHAR Xsaved_eptr;
397
398 recursion_info Xnew_recursive;
399
400 BOOL Xcur_is_word; /* Also used under the alias allow_zero */
401 BOOL Xcondition; /* Also used under the aliases cbegroup and condassert */
402 BOOL Xprev_is_word; /* Also used under the alias matched_once */
403
404 #ifdef SUPPORT_UCP
405 int Xprop_type;
406 unsigned int Xprop_value;
407 int Xprop_fail_result;
408 int Xoclength;
409 pcre_uchar Xocchars[6];
410 #endif
411
412 int Xcodelink; /* Also used under the alias code_offset */
413 int Xctype;
414 unsigned int Xfc;
415 int Xfi;
416 int Xlength;
417 int Xmax;
418 int Xmin;
419 int Xnumber; /* Also used under the alias foc. NOTE(review): declared int here, but "number" is unsigned int in the recursive build */
420 int Xoffset;
421 int Xop; /* NOTE(review): declared int here, but "op" is pcre_uchar in the recursive build */
422 int Xsave_capture_last;
423 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
424 int Xstacksave[REC_STACK_SAVE_MAX];
425
426 eptrblock Xnewptrb;
427
428 /* Where to jump back to */
429
430 int Xwhere; /* RMn tag of the RMATCH call site that is waiting on the next frame */
431
432 } heapframe;
433
434 #endif
435
436
437 /***************************************************************************
438 ***************************************************************************/
439
440
441
442 /*************************************************
443 * Match from current position *
444 *************************************************/
445
446 /* This function is called recursively in many circumstances. Whenever it
447 returns a negative (error) response, the outer incarnation must also return the
448 same response. */
449
450 /* These macros pack up tests that are used for partial matching, and which
451 appear several times in the code. We set the "hit end" flag if the pointer is
452 at the end of the subject and also past the start of the subject (i.e.
453 something has been matched). For hard partial matching, we then return
454 immediately. The second one is used when we already know we are past the end of
455 the subject. */
456
/* Use when eptr may or may not be at the end of the subject: if partial
matching is enabled, eptr has reached md->end_subject and eptr is beyond
md->start_used_ptr, set md->hitend, and for md->partial > 1 return
PCRE_ERROR_PARTIAL at once. The macro expands to a bare "if" statement that
uses RRETURN, so it must appear as a statement of its own inside match() and
must not be followed by "else". */
457 #define CHECK_PARTIAL()\
458 if (md->partial != 0 && eptr >= md->end_subject && \
459 eptr > md->start_used_ptr) \
460 { \
461 md->hitend = TRUE; \
462 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
463 }
464
/* Same as CHECK_PARTIAL() but without the end-of-subject test, for places
where the caller already knows that the end has been passed. */
465 #define SCHECK_PARTIAL()\
466 if (md->partial != 0 && eptr > md->start_used_ptr) \
467 { \
468 md->hitend = TRUE; \
469 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
470 }
471
472
473 /* Performance note: It might be tempting to extract commonly used fields from
474 the md structure (e.g. utf, end_subject) into individual variables to improve
475 performance. Tests using gcc on a SPARC disproved this; in the first case, it
476 made performance worse.
477
478 Arguments:
479 eptr pointer to current character in subject
480 ecode pointer to current position in compiled code
481 mstart pointer to the current match start position (can be modified
482 by encountering \K)
483 offset_top current top pointer
484 md pointer to "static" info for the match
485 eptrb pointer to chain of blocks containing eptr at start of
486 brackets - for testing for empty matches
487 rdepth the recursion depth
488
489 Returns: MATCH_MATCH if matched ) these values are >= 0
490 MATCH_NOMATCH if failed to match )
491 a negative MATCH_xxx value for PRUNE, SKIP, etc
492 a negative PCRE_ERROR_xxx value if aborted by an error condition
493 (e.g. stopped by repeated call or recursion limit)
494 */
495
496 static int
497 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
498 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
499 unsigned int rdepth)
500 {
501 /* These variables do not need to be preserved over recursion in this function,
502 so they can be ordinary variables in all cases. Mark some of them with
503 "register" because they are used a lot in loops. */
504
505 register int rrc; /* Returns from recursive calls */
506 register int i; /* Used for loops not involving calls to RMATCH() */
507 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
508 register BOOL utf; /* Local copy of UTF flag for speed */
509
510 BOOL minimize, possessive; /* Quantifier options */
511 BOOL caseless;
512 int condcode;
513
514 /* When recursion is not being used, all "local" variables that have to be
515 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
516 frame on the stack here; subsequent instantiations are obtained from the heap
517 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
518 the top-level on the stack rather than malloc-ing them all gives a performance
519 boost in many cases where there is not much "recursion". */
520
521 #ifdef NO_RECURSE
522 heapframe *frame = (heapframe *)md->match_frames_base;
523
524 /* Copy in the original argument variables */
525
526 frame->Xeptr = eptr;
527 frame->Xecode = ecode;
528 frame->Xmstart = mstart;
529 frame->Xoffset_top = offset_top;
530 frame->Xeptrb = eptrb;
531 frame->Xrdepth = rdepth;
532
533 /* This is where control jumps back to in order to effect "recursion" */
534
535 HEAP_RECURSE:
536
537 /* Macros make the argument variables come from the current frame */
538
539 #define eptr frame->Xeptr
540 #define ecode frame->Xecode
541 #define mstart frame->Xmstart
542 #define offset_top frame->Xoffset_top
543 #define eptrb frame->Xeptrb
544 #define rdepth frame->Xrdepth
545
546 /* Ditto for the local variables */
547
548 #ifdef SUPPORT_UTF
549 #define charptr frame->Xcharptr
550 #endif
551 #define callpat frame->Xcallpat
552 #define codelink frame->Xcodelink
553 #define data frame->Xdata
554 #define next frame->Xnext
555 #define pp frame->Xpp
556 #define prev frame->Xprev
557 #define saved_eptr frame->Xsaved_eptr
558
559 #define new_recursive frame->Xnew_recursive
560
561 #define cur_is_word frame->Xcur_is_word
562 #define condition frame->Xcondition
563 #define prev_is_word frame->Xprev_is_word
564
565 #ifdef SUPPORT_UCP
566 #define prop_type frame->Xprop_type
567 #define prop_value frame->Xprop_value
568 #define prop_fail_result frame->Xprop_fail_result
569 #define oclength frame->Xoclength
570 #define occhars frame->Xocchars
571 #endif
572
573 #define ctype frame->Xctype
574 #define fc frame->Xfc
575 #define fi frame->Xfi
576 #define length frame->Xlength
577 #define max frame->Xmax
578 #define min frame->Xmin
579 #define number frame->Xnumber
580 #define offset frame->Xoffset
581 #define op frame->Xop
582 #define save_capture_last frame->Xsave_capture_last
583 #define save_offset1 frame->Xsave_offset1
584 #define save_offset2 frame->Xsave_offset2
585 #define save_offset3 frame->Xsave_offset3
586 #define stacksave frame->Xstacksave
587
588 #define newptrb frame->Xnewptrb
589
590 /* When recursion is being used, local variables are allocated on the stack and
591 get preserved during recursion in the normal way. In this environment, fi and
592 i, and fc and c, can be the same variables. */
593
594 #else /* NO_RECURSE not defined */
595 #define fi i
596 #define fc c
597
598 /* Many of the following variables are used only in small blocks of the code.
599 My normal style of coding would have declared them within each of those blocks.
600 However, in order to accommodate the version of this code that uses an external
601 "stack" implemented on the heap, it is easier to declare them all here, so the
602 declarations can be cut out in a block. The only declarations within blocks
603 below are for variables that do not have to be preserved over a recursive call
604 to RMATCH(). */
605
606 #ifdef SUPPORT_UTF
607 const pcre_uchar *charptr;
608 #endif
609 const pcre_uchar *callpat;
610 const pcre_uchar *data;
611 const pcre_uchar *next;
612 PCRE_PUCHAR pp;
613 const pcre_uchar *prev;
614 PCRE_PUCHAR saved_eptr;
615
616 recursion_info new_recursive;
617
618 BOOL cur_is_word;
619 BOOL condition;
620 BOOL prev_is_word;
621
622 #ifdef SUPPORT_UCP
623 int prop_type;
624 unsigned int prop_value;
625 int prop_fail_result;
626 int oclength;
627 pcre_uchar occhars[6];
628 #endif
629
630 int codelink;
631 int ctype;
632 int length;
633 int max;
634 int min;
635 unsigned int number;
636 int offset;
637 pcre_uchar op;
638 int save_capture_last;
639 int save_offset1, save_offset2, save_offset3;
640 int stacksave[REC_STACK_SAVE_MAX];
641
642 eptrblock newptrb;
643
644 /* There is a special fudge for calling match() in a way that causes it to
645 measure the size of its basic stack frame when the stack is being used for
646 recursion. The second argument (ecode) being NULL triggers this behaviour. It
647 cannot normally ever be NULL. The return is the negated value of the frame
648 size. */
649
650 if (ecode == NULL)
651 {
652 if (rdepth == 0)
653 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
654 else
655 {
656 int len = (char *)&rdepth - (char *)eptr;
657 return (len > 0)? -len : len;
658 }
659 }
660 #endif /* NO_RECURSE */
661
662 /* To save space on the stack and in the heap frame, I have doubled up on some
663 of the local variables that are used only in localised parts of the code, but
664 still need to be preserved over recursive calls of match(). These macros define
665 the alternative names that are used. */
666
667 #define allow_zero cur_is_word
668 #define cbegroup condition
669 #define code_offset codelink
670 #define condassert condition
671 #define matched_once prev_is_word
672 #define foc number
673 #define save_mark data
674
675 /* These statements are here to stop the compiler complaining about uninitialized
676 variables. */
677
678 #ifdef SUPPORT_UCP
679 prop_value = 0;
680 prop_fail_result = 0;
681 #endif
682
683
684 /* This label is used for tail recursion, which is used in a few cases even
685 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
686 used. Thanks to Ian Taylor for noticing this possibility and sending the
687 original patch. */
688
689 TAIL_RECURSE:
690
691 /* OK, now we can get on with the real code of the function. Recursive calls
692 are specified by the macro RMATCH and RRETURN is used to return. When
693 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
694 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
695 defined). However, RMATCH isn't like a function call because it's quite a
696 complicated macro. It has to be used in one particular way. This shouldn't,
697 however, impact performance when true recursion is being used. */
698
699 #ifdef SUPPORT_UTF
700 utf = md->utf; /* Local copy of the flag */
701 #else
702 utf = FALSE;
703 #endif
704
705 /* First check that we haven't called match() too many times, or that we
706 haven't exceeded the recursive call limit. */
707
708 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
709 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
710
711 /* At the start of a group with an unlimited repeat that may match an empty
712 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
713 done this way to save having to use another function argument, which would take
714 up space on the stack. See also MATCH_CONDASSERT below.
715
716 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
717 such remembered pointers, to be checked when we hit the closing ket, in order
718 to break infinite loops that match no characters. When match() is called in
719 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
720 NOT be used with tail recursion, because the memory block that is used is on
721 the stack, so a new one may be required for each match(). */
722
723 if (md->match_function_type == MATCH_CBEGROUP)
724 {
725 newptrb.epb_saved_eptr = eptr;
726 newptrb.epb_prev = eptrb;
727 eptrb = &newptrb;
728 md->match_function_type = 0;
729 }
730
731 /* Now start processing the opcodes. */
732
733 for (;;)
734 {
735 minimize = possessive = FALSE;
736 op = *ecode;
737
738 switch(op)
739 {
740 case OP_MARK:
741 md->nomatch_mark = ecode + 2;
742 md->mark = NULL; /* In case previously set by assertion */
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 eptrb, RM55);
745 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746 md->mark == NULL) md->mark = ecode + 2;
747
748 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
749 argument, and we must check whether that argument matches this MARK's
750 argument. It is passed back in md->start_match_ptr (an overloading of that
751 variable). If it does match, we reset that variable to the current subject
752 position and return MATCH_SKIP. Otherwise, pass back the return code
753 unaltered. */
754
755 else if (rrc == MATCH_SKIP_ARG &&
756 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
757 {
758 md->start_match_ptr = eptr;
759 RRETURN(MATCH_SKIP);
760 }
761 RRETURN(rrc);
762
763 case OP_FAIL:
764 RRETURN(MATCH_NOMATCH);
765
766 /* COMMIT overrides PRUNE, SKIP, and THEN */
767
768 case OP_COMMIT:
769 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
770 eptrb, RM52);
771 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
772 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
773 rrc != MATCH_THEN)
774 RRETURN(rrc);
775 RRETURN(MATCH_COMMIT);
776
777 /* PRUNE overrides THEN */
778
779 case OP_PRUNE:
780 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
781 eptrb, RM51);
782 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
783 RRETURN(MATCH_PRUNE);
784
785 case OP_PRUNE_ARG:
786 md->nomatch_mark = ecode + 2;
787 md->mark = NULL; /* In case previously set by assertion */
788 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
789 eptrb, RM56);
790 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
791 md->mark == NULL) md->mark = ecode + 2;
792 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
793 RRETURN(MATCH_PRUNE);
794
795 /* SKIP overrides PRUNE and THEN */
796
797 case OP_SKIP:
798 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
799 eptrb, RM53);
800 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
801 RRETURN(rrc);
802 md->start_match_ptr = eptr; /* Pass back current position */
803 RRETURN(MATCH_SKIP);
804
805 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
806 nomatch_mark. There is a flag that disables this opcode when re-matching a
807 pattern that ended with a SKIP for which there was not a matching MARK. */
808
809 case OP_SKIP_ARG:
810 if (md->ignore_skip_arg)
811 {
812 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
813 break;
814 }
815 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
816 eptrb, RM57);
817 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
818 RRETURN(rrc);
819
820 /* Pass back the current skip name by overloading md->start_match_ptr and
821 returning the special MATCH_SKIP_ARG return code. This will either be
822 caught by a matching MARK, or get to the top, where it causes a rematch
823 with the md->ignore_skip_arg flag set. */
824
825 md->start_match_ptr = ecode + 2;
826 RRETURN(MATCH_SKIP_ARG);
827
828 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
829 the branch in which it occurs can be determined. Overload the start of
830 match pointer to do this. */
831
832 case OP_THEN:
833 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
834 eptrb, RM54);
835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
836 md->start_match_ptr = ecode;
837 RRETURN(MATCH_THEN);
838
839 case OP_THEN_ARG:
840 md->nomatch_mark = ecode + 2;
841 md->mark = NULL; /* In case previously set by assertion */
842 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
843 md, eptrb, RM58);
844 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
845 md->mark == NULL) md->mark = ecode + 2;
846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
847 md->start_match_ptr = ecode;
848 RRETURN(MATCH_THEN);
849
850 /* Handle an atomic group that does not contain any capturing parentheses.
851 This can be handled like an assertion. Prior to 8.13, all atomic groups
852 were handled this way. In 8.13, the code was changed as below for ONCE, so
853 that backups pass through the group and thereby reset captured values.
854 However, this uses a lot more stack, so in 8.20, atomic groups that do not
855 contain any captures generate OP_ONCE_NC, which can be handled in the old,
856 less stack intensive way.
857
858 Check the alternative branches in turn - the matching won't pass the KET
859 for this kind of subpattern. If any one branch matches, we carry on as at
860 the end of a normal bracket, leaving the subject pointer, but resetting
861 the start-of-match value in case it was changed by \K. */
862
863 case OP_ONCE_NC:
864 prev = ecode;
865 saved_eptr = eptr;
866 save_mark = md->mark;
867 do
868 {
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
870 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
871 {
872 mstart = md->start_match_ptr;
873 break;
874 }
875 if (rrc == MATCH_THEN)
876 {
877 next = ecode + GET(ecode,1);
878 if (md->start_match_ptr < next &&
879 (*ecode == OP_ALT || *next == OP_ALT))
880 rrc = MATCH_NOMATCH;
881 }
882
883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
884 ecode += GET(ecode,1);
885 md->mark = save_mark;
886 }
887 while (*ecode == OP_ALT);
888
889 /* If hit the end of the group (which could be repeated), fail */
890
891 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
892
893 /* Continue as from after the group, updating the offsets high water
894 mark, since extracts may have been taken. */
895
896 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
897
898 offset_top = md->end_offset_top;
899 eptr = md->end_match_ptr;
900
901 /* For a non-repeating ket, just continue at this level. This also
902 happens for a repeating ket if no characters were matched in the group.
903 This is the forcible breaking of infinite loops as implemented in Perl
904 5.005. */
905
906 if (*ecode == OP_KET || eptr == saved_eptr)
907 {
908 ecode += 1+LINK_SIZE;
909 break;
910 }
911
912 /* The repeating kets try the rest of the pattern or restart from the
913 preceding bracket, in the appropriate order. The second "call" of match()
914 uses tail recursion, to avoid using another stack frame. */
915
916 if (*ecode == OP_KETRMIN)
917 {
918 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
920 ecode = prev;
921 goto TAIL_RECURSE;
922 }
923 else /* OP_KETRMAX */
924 {
925 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
927 ecode += 1 + LINK_SIZE;
928 goto TAIL_RECURSE;
929 }
930 /* Control never gets here */
931
932 /* Handle a capturing bracket, other than those that are possessive with an
933 unlimited repeat. If there is space in the offset vector, save the current
934 subject position in the working slot at the top of the vector. We mustn't
935 change the current values of the data slot, because they may be set from a
936 previous iteration of this group, and be referred to by a reference inside
937 the group. A failure to match might occur after the group has succeeded,
938 if something later on doesn't match. For this reason, we need to restore
939 the working value and also the values of the final offsets, in case they
940 were set by a previous iteration of the same bracket.
941
942 If there isn't enough space in the offset vector, treat this as if it were
943 a non-capturing bracket. Don't worry about setting the flag for the error
944 case here; that is handled in the code for KET. */
945
946 case OP_CBRA:
947 case OP_SCBRA:
948 number = GET2(ecode, 1+LINK_SIZE);
949 offset = number << 1;
950
951 #ifdef PCRE_DEBUG
952 printf("start bracket %d\n", number);
953 printf("subject=");
954 pchars(eptr, 16, TRUE, md);
955 printf("\n");
956 #endif
957
958 if (offset < md->offset_max)
959 {
960 save_offset1 = md->offset_vector[offset];
961 save_offset2 = md->offset_vector[offset+1];
962 save_offset3 = md->offset_vector[md->offset_end - number];
963 save_capture_last = md->capture_last;
964 save_mark = md->mark;
965
966 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
967 md->offset_vector[md->offset_end - number] =
968 (int)(eptr - md->start_subject);
969
970 for (;;)
971 {
972 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
973 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
974 eptrb, RM1);
975 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
976
977 /* If we backed up to a THEN, check whether it is within the current
978 branch by comparing the address of the THEN that is passed back with
979 the end of the branch. If it is within the current branch, and the
980 branch is one of two or more alternatives (it either starts or ends
981 with OP_ALT), we have reached the limit of THEN's action, so convert
982 the return code to NOMATCH, which will cause normal backtracking to
983 happen from now on. Otherwise, THEN is passed back to an outer
984 alternative. This implements Perl's treatment of parenthesized groups,
985 where a group not containing | does not affect the current alternative,
986 that is, (X) is NOT the same as (X|(*F)). */
987
988 if (rrc == MATCH_THEN)
989 {
990 next = ecode + GET(ecode,1);
991 if (md->start_match_ptr < next &&
992 (*ecode == OP_ALT || *next == OP_ALT))
993 rrc = MATCH_NOMATCH;
994 }
995
996 /* Anything other than NOMATCH is passed back. */
997
998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
999 md->capture_last = save_capture_last;
1000 ecode += GET(ecode, 1);
1001 md->mark = save_mark;
1002 if (*ecode != OP_ALT) break;
1003 }
1004
1005 DPRINTF(("bracket %d failed\n", number));
1006 md->offset_vector[offset] = save_offset1;
1007 md->offset_vector[offset+1] = save_offset2;
1008 md->offset_vector[md->offset_end - number] = save_offset3;
1009
1010 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1011
1012 RRETURN(rrc);
1013 }
1014
1015 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1016 as a non-capturing bracket. */
1017
1018 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1019 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1020
1021 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1022
1023 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1024 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1025
1026 /* Non-capturing or atomic group, except for possessive with unlimited
1027 repeat and ONCE group with no captures. Loop for all the alternatives.
1028
1029 When we get to the final alternative within the brackets, we used to return
1030 the result of a recursive call to match() whatever happened so it was
1031 possible to reduce stack usage by turning this into a tail recursion,
1032 except in the case of a possibly empty group. However, now that there is
1033 the possibility of (*THEN) occurring in the final alternative, this
1034 optimization is no longer always possible.
1035
1036 We can optimize if we know there are no (*THEN)s in the pattern; at present
1037 this is the best that can be done.
1038
1039 MATCH_ONCE is returned when the end of an atomic group is successfully
1040 reached, but subsequent matching fails. It passes back up the tree (causing
1041 captured values to be reset) until the original atomic group level is
1042 reached. This is tested by comparing md->once_target with the start of the
1043 group. At this point, the return is converted into MATCH_NOMATCH so that
1044 previous backup points can be taken. */
1045
1046 case OP_ONCE:
1047 case OP_BRA:
1048 case OP_SBRA:
1049 DPRINTF(("start non-capturing bracket\n"));
1050
1051 for (;;)
1052 {
1053 if (op >= OP_SBRA || op == OP_ONCE)
1054 md->match_function_type = MATCH_CBEGROUP;
1055
1056 /* If this is not a possibly empty group, and there are no (*THEN)s in
1057 the pattern, and this is the final alternative, optimize as described
1058 above. */
1059
1060 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1061 {
1062 ecode += PRIV(OP_lengths)[*ecode];
1063 goto TAIL_RECURSE;
1064 }
1065
1066 /* In all other cases, we have to make another call to match(). */
1067
1068 save_mark = md->mark;
1069 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1070 RM2);
1071
1072 /* See comment in the code for capturing groups above about handling
1073 THEN. */
1074
1075 if (rrc == MATCH_THEN)
1076 {
1077 next = ecode + GET(ecode,1);
1078 if (md->start_match_ptr < next &&
1079 (*ecode == OP_ALT || *next == OP_ALT))
1080 rrc = MATCH_NOMATCH;
1081 }
1082
1083 if (rrc != MATCH_NOMATCH)
1084 {
1085 if (rrc == MATCH_ONCE)
1086 {
1087 const pcre_uchar *scode = ecode;
1088 if (*scode != OP_ONCE) /* If not at start, find it */
1089 {
1090 while (*scode == OP_ALT) scode += GET(scode, 1);
1091 scode -= GET(scode, 1);
1092 }
1093 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1094 }
1095 RRETURN(rrc);
1096 }
1097 ecode += GET(ecode, 1);
1098 md->mark = save_mark;
1099 if (*ecode != OP_ALT) break;
1100 }
1101
1102 RRETURN(MATCH_NOMATCH);
1103
1104 /* Handle possessive capturing brackets with an unlimited repeat. We come
1105 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1106 handled similarly to the normal case above. However, the matching is
1107 different. The end of these brackets will always be OP_KETRPOS, which
1108 returns MATCH_KETRPOS without going further in the pattern. By this means
1109 we can handle the group by iteration rather than recursion, thereby
1110 reducing the amount of stack needed. */
1111
1112 case OP_CBRAPOS:
1113 case OP_SCBRAPOS:
1114 allow_zero = FALSE;
1115
1116 POSSESSIVE_CAPTURE:
1117 number = GET2(ecode, 1+LINK_SIZE);
1118 offset = number << 1;
1119
1120 #ifdef PCRE_DEBUG
1121 printf("start possessive bracket %d\n", number);
1122 printf("subject=");
1123 pchars(eptr, 16, TRUE, md);
1124 printf("\n");
1125 #endif
1126
1127 if (offset < md->offset_max)
1128 {
1129 matched_once = FALSE;
1130 code_offset = (int)(ecode - md->start_code);
1131
1132 save_offset1 = md->offset_vector[offset];
1133 save_offset2 = md->offset_vector[offset+1];
1134 save_offset3 = md->offset_vector[md->offset_end - number];
1135 save_capture_last = md->capture_last;
1136
1137 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1138
1139 /* Each time round the loop, save the current subject position for use
1140 when the group matches. For MATCH_MATCH, the group has matched, so we
1141 restart it with a new subject starting position, remembering that we had
1142 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1143 usual. If we haven't matched any alternatives in any iteration, check to
1144 see if a previous iteration matched. If so, the group has matched;
1145 continue from afterwards. Otherwise it has failed; restore the previous
1146 capture values before returning NOMATCH. */
1147
1148 for (;;)
1149 {
1150 md->offset_vector[md->offset_end - number] =
1151 (int)(eptr - md->start_subject);
1152 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1153 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1154 eptrb, RM63);
1155 if (rrc == MATCH_KETRPOS)
1156 {
1157 offset_top = md->end_offset_top;
1158 eptr = md->end_match_ptr;
1159 ecode = md->start_code + code_offset;
1160 save_capture_last = md->capture_last;
1161 matched_once = TRUE;
1162 continue;
1163 }
1164
1165 /* See comment in the code for capturing groups above about handling
1166 THEN. */
1167
1168 if (rrc == MATCH_THEN)
1169 {
1170 next = ecode + GET(ecode,1);
1171 if (md->start_match_ptr < next &&
1172 (*ecode == OP_ALT || *next == OP_ALT))
1173 rrc = MATCH_NOMATCH;
1174 }
1175
1176 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1177 md->capture_last = save_capture_last;
1178 ecode += GET(ecode, 1);
1179 if (*ecode != OP_ALT) break;
1180 }
1181
1182 if (!matched_once)
1183 {
1184 md->offset_vector[offset] = save_offset1;
1185 md->offset_vector[offset+1] = save_offset2;
1186 md->offset_vector[md->offset_end - number] = save_offset3;
1187 }
1188
1189 if (allow_zero || matched_once)
1190 {
1191 ecode += 1 + LINK_SIZE;
1192 break;
1193 }
1194
1195 RRETURN(MATCH_NOMATCH);
1196 }
1197
1198 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1199 as a non-capturing bracket. */
1200
1201 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1202 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1203
1204 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1205
1206 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1207 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1208
1209 /* Non-capturing possessive bracket with unlimited repeat. We come here
1210 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1211 without the capturing complication. It is written out separately for speed
1212 and cleanliness. */
1213
1214 case OP_BRAPOS:
1215 case OP_SBRAPOS:
1216 allow_zero = FALSE;
1217
1218 POSSESSIVE_NON_CAPTURE:
1219 matched_once = FALSE;
1220 code_offset = (int)(ecode - md->start_code);
1221
1222 for (;;)
1223 {
1224 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1225 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1226 eptrb, RM48);
1227 if (rrc == MATCH_KETRPOS)
1228 {
1229 offset_top = md->end_offset_top;
1230 eptr = md->end_match_ptr;
1231 ecode = md->start_code + code_offset;
1232 matched_once = TRUE;
1233 continue;
1234 }
1235
1236 /* See comment in the code for capturing groups above about handling
1237 THEN. */
1238
1239 if (rrc == MATCH_THEN)
1240 {
1241 next = ecode + GET(ecode,1);
1242 if (md->start_match_ptr < next &&
1243 (*ecode == OP_ALT || *next == OP_ALT))
1244 rrc = MATCH_NOMATCH;
1245 }
1246
1247 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1248 ecode += GET(ecode, 1);
1249 if (*ecode != OP_ALT) break;
1250 }
1251
1252 if (matched_once || allow_zero)
1253 {
1254 ecode += 1 + LINK_SIZE;
1255 break;
1256 }
1257 RRETURN(MATCH_NOMATCH);
1258
1259 /* Control never reaches here. */
1260
1261 /* Conditional group: compilation checked that there are no more than
1262 two branches. If the condition is false, skipping the first branch takes us
1263 past the end if there is only one branch, but that's OK because that is
1264 exactly what going to the ket would do. */
1265
1266 case OP_COND:
1267 case OP_SCOND:
1268 codelink = GET(ecode, 1);
1269
1270 /* Because of the way auto-callout works during compile, a callout item is
1271 inserted between OP_COND and an assertion condition. */
1272
1273 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1274 {
1275 if (PUBL(callout) != NULL)
1276 {
1277 PUBL(callout_block) cb;
1278 cb.version = 2; /* Version 2 of the callout block */
1279 cb.callout_number = ecode[LINK_SIZE+2];
1280 cb.offset_vector = md->offset_vector;
1281 #if defined COMPILE_PCRE8
1282 cb.subject = (PCRE_SPTR)md->start_subject;
1283 #elif defined COMPILE_PCRE16
1284 cb.subject = (PCRE_SPTR16)md->start_subject;
1285 #elif defined COMPILE_PCRE32
1286 cb.subject = (PCRE_SPTR32)md->start_subject;
1287 #endif
1288 cb.subject_length = (int)(md->end_subject - md->start_subject);
1289 cb.start_match = (int)(mstart - md->start_subject);
1290 cb.current_position = (int)(eptr - md->start_subject);
1291 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1292 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1293 cb.capture_top = offset_top/2;
1294 cb.capture_last = md->capture_last;
1295 cb.callout_data = md->callout_data;
1296 cb.mark = md->nomatch_mark;
1297 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1298 if (rrc < 0) RRETURN(rrc);
1299 }
1300 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1301 }
1302
1303 condcode = ecode[LINK_SIZE+1];
1304
1305 /* Now see what the actual condition is */
1306
1307 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1308 {
1309 if (md->recursive == NULL) /* Not recursing => FALSE */
1310 {
1311 condition = FALSE;
1312 ecode += GET(ecode, 1);
1313 }
1314 else
1315 {
1316 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1317 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1318
1319 /* If the test is for recursion into a specific subpattern, and it is
1320 false, but the test was set up by name, scan the table to see if the
1321 name refers to any other numbers, and test them. The condition is true
1322 if any one is set. */
1323
1324 if (!condition && condcode == OP_NRREF)
1325 {
1326 pcre_uchar *slotA = md->name_table;
1327 for (i = 0; i < md->name_count; i++)
1328 {
1329 if (GET2(slotA, 0) == recno) break;
1330 slotA += md->name_entry_size;
1331 }
1332
1333 /* Found a name for the number - there can be only one; duplicate
1334 names for different numbers are allowed, but not vice versa. First
1335 scan down for duplicates. */
1336
1337 if (i < md->name_count)
1338 {
1339 pcre_uchar *slotB = slotA;
1340 while (slotB > md->name_table)
1341 {
1342 slotB -= md->name_entry_size;
1343 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1344 {
1345 condition = GET2(slotB, 0) == md->recursive->group_num;
1346 if (condition) break;
1347 }
1348 else break;
1349 }
1350
1351 /* Scan up for duplicates */
1352
1353 if (!condition)
1354 {
1355 slotB = slotA;
1356 for (i++; i < md->name_count; i++)
1357 {
1358 slotB += md->name_entry_size;
1359 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1360 {
1361 condition = GET2(slotB, 0) == md->recursive->group_num;
1362 if (condition) break;
1363 }
1364 else break;
1365 }
1366 }
1367 }
1368 }
1369
1370 /* Choose branch according to the condition */
1371
1372 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1373 }
1374 }
1375
1376 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1377 {
1378 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1379 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1380
1381 /* If the numbered capture is unset, but the reference was by name,
1382 scan the table to see if the name refers to any other numbers, and test
1383 them. The condition is true if any one is set. This is tediously similar
1384 to the code above, but not close enough to try to amalgamate. */
1385
1386 if (!condition && condcode == OP_NCREF)
1387 {
1388 unsigned int refno = offset >> 1;
1389 pcre_uchar *slotA = md->name_table;
1390
1391 for (i = 0; i < md->name_count; i++)
1392 {
1393 if (GET2(slotA, 0) == refno) break;
1394 slotA += md->name_entry_size;
1395 }
1396
1397 /* Found a name for the number - there can be only one; duplicate names
1398 for different numbers are allowed, but not vice versa. First scan down
1399 for duplicates. */
1400
1401 if (i < md->name_count)
1402 {
1403 pcre_uchar *slotB = slotA;
1404 while (slotB > md->name_table)
1405 {
1406 slotB -= md->name_entry_size;
1407 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1408 {
1409 offset = GET2(slotB, 0) << 1;
1410 condition = offset < offset_top &&
1411 md->offset_vector[offset] >= 0;
1412 if (condition) break;
1413 }
1414 else break;
1415 }
1416
1417 /* Scan up for duplicates */
1418
1419 if (!condition)
1420 {
1421 slotB = slotA;
1422 for (i++; i < md->name_count; i++)
1423 {
1424 slotB += md->name_entry_size;
1425 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1426 {
1427 offset = GET2(slotB, 0) << 1;
1428 condition = offset < offset_top &&
1429 md->offset_vector[offset] >= 0;
1430 if (condition) break;
1431 }
1432 else break;
1433 }
1434 }
1435 }
1436 }
1437
1438 /* Choose branch according to the condition */
1439
1440 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1441 }
1442
1443 else if (condcode == OP_DEF) /* DEFINE - always false */
1444 {
1445 condition = FALSE;
1446 ecode += GET(ecode, 1);
1447 }
1448
1449 /* The condition is an assertion. Call match() to evaluate it - setting
1450 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1451 an assertion. */
1452
1453 else
1454 {
1455 md->match_function_type = MATCH_CONDASSERT;
1456 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1457 if (rrc == MATCH_MATCH)
1458 {
1459 if (md->end_offset_top > offset_top)
1460 offset_top = md->end_offset_top; /* Captures may have happened */
1461 condition = TRUE;
1462 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1463 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1464 }
1465
1466 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1467 assertion; it is therefore treated as NOMATCH. */
1468
1469 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1470 {
1471 RRETURN(rrc); /* Need braces because of following else */
1472 }
1473 else
1474 {
1475 condition = FALSE;
1476 ecode += codelink;
1477 }
1478 }
1479
1480 /* We are now at the branch that is to be obeyed. As there is only one, can
1481 use tail recursion to avoid using another stack frame, except when there is
1482 unlimited repeat of a possibly empty group. In the latter case, a recursive
1483 call to match() is always required, unless the second alternative doesn't
1484 exist, in which case we can just plough on. Note that, for compatibility
1485 with Perl, the | in a conditional group is NOT treated as creating two
1486 alternatives. If a THEN is encountered in the branch, it propagates out to
1487 the enclosing alternative (unless nested in a deeper set of alternatives,
1488 of course). */
1489
1490 if (condition || *ecode == OP_ALT)
1491 {
1492 if (op != OP_SCOND)
1493 {
1494 ecode += 1 + LINK_SIZE;
1495 goto TAIL_RECURSE;
1496 }
1497
1498 md->match_function_type = MATCH_CBEGROUP;
1499 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1500 RRETURN(rrc);
1501 }
1502
1503 /* Condition false & no alternative; continue after the group. */
1504
1505 else
1506 {
1507 ecode += 1 + LINK_SIZE;
1508 }
1509 break;
1510
1511
1512 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1513 to close any currently open capturing brackets. */
1514
1515 case OP_CLOSE:
1516 number = GET2(ecode, 1);
1517 offset = number << 1;
1518
1519 #ifdef PCRE_DEBUG
1520 printf("end bracket %d at *ACCEPT", number);
1521 printf("\n");
1522 #endif
1523
1524 md->capture_last = number;
1525 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1526 {
1527 md->offset_vector[offset] =
1528 md->offset_vector[md->offset_end - number];
1529 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1530 if (offset_top <= offset) offset_top = offset + 2;
1531 }
1532 ecode += 1 + IMM2_SIZE;
1533 break;
1534
1535
1536 /* End of the pattern, either real or forced. */
1537
1538 case OP_END:
1539 case OP_ACCEPT:
1540 case OP_ASSERT_ACCEPT:
1541
1542 /* If we have matched an empty string, fail if not in an assertion and not
1543 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1544 is set and we have matched at the start of the subject. In both cases,
1545 backtracking will then try other alternatives, if any. */
1546
1547 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1548 md->recursive == NULL &&
1549 (md->notempty ||
1550 (md->notempty_atstart &&
1551 mstart == md->start_subject + md->start_offset)))
1552 RRETURN(MATCH_NOMATCH);
1553
1554 /* Otherwise, we have a match. */
1555
1556 md->end_match_ptr = eptr; /* Record where we ended */
1557 md->end_offset_top = offset_top; /* and how many extracts were taken */
1558 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1559
1560 /* For some reason, the macros don't work properly if an expression is
1561 given as the argument to RRETURN when the heap is in use. */
1562
1563 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1564 RRETURN(rrc);
1565
1566 /* Assertion brackets. Check the alternative branches in turn - the
1567 matching won't pass the KET for an assertion. If any one branch matches,
1568 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1569 start of each branch to move the current point backwards, so the code at
1570 this level is identical to the lookahead case. When the assertion is part
1571 of a condition, we want to return immediately afterwards. The caller of
1572 this incarnation of the match() function will have set MATCH_CONDASSERT in
1573 md->match_function_type, and one of these opcodes will be the first opcode
1574 that is processed. We use a local variable that is preserved over calls to
1575 match() to remember this case. */
1576
1577 case OP_ASSERT:
1578 case OP_ASSERTBACK:
1579 save_mark = md->mark;
1580 if (md->match_function_type == MATCH_CONDASSERT)
1581 {
1582 condassert = TRUE;
1583 md->match_function_type = 0;
1584 }
1585 else condassert = FALSE;
1586
1587 do
1588 {
1589 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1590 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1591 {
1592 mstart = md->start_match_ptr; /* In case \K reset it */
1593 break;
1594 }
1595 md->mark = save_mark;
1596
1597 /* A COMMIT failure must fail the entire assertion, without trying any
1598 subsequent branches. */
1599
1600 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1601
1602 /* PCRE does not allow THEN to escape beyond an assertion; it
1603 is treated as NOMATCH. */
1604
1605 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1606 ecode += GET(ecode, 1);
1607 }
1608 while (*ecode == OP_ALT);
1609
1610 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1611
1612 /* If checking an assertion for a condition, return MATCH_MATCH. */
1613
1614 if (condassert) RRETURN(MATCH_MATCH);
1615
1616 /* Continue from after the assertion, updating the offsets high water
1617 mark, since extracts may have been taken during the assertion. */
1618
1619 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1620 ecode += 1 + LINK_SIZE;
1621 offset_top = md->end_offset_top;
1622 continue;
1623
1624 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1625 PRUNE, or COMMIT means we must assume failure without checking subsequent
1626 branches. */
1627
1628 case OP_ASSERT_NOT:
1629 case OP_ASSERTBACK_NOT:
1630 save_mark = md->mark;
1631 if (md->match_function_type == MATCH_CONDASSERT)
1632 {
1633 condassert = TRUE;
1634 md->match_function_type = 0;
1635 }
1636 else condassert = FALSE;
1637
1638 do
1639 {
1640 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1641 md->mark = save_mark;
1642 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1643 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1644 {
1645 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1646 break;
1647 }
1648
1649 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1650 as NOMATCH. */
1651
1652 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1653 ecode += GET(ecode,1);
1654 }
1655 while (*ecode == OP_ALT);
1656
1657 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1658
1659 ecode += 1 + LINK_SIZE;
1660 continue;
1661
1662 /* Move the subject pointer back. This occurs only at the start of
1663 each branch of a lookbehind assertion. If we are too close to the start to
1664 move back, this match function fails. When working with UTF-8 we move
1665 back a number of characters, not bytes. */
1666
1667 case OP_REVERSE:
1668 #ifdef SUPPORT_UTF
1669 if (utf)
1670 {
1671 i = GET(ecode, 1);
1672 while (i-- > 0)
1673 {
1674 eptr--;
1675 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1676 BACKCHAR(eptr);
1677 }
1678 }
1679 else
1680 #endif
1681
1682 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1683
1684 {
1685 eptr -= GET(ecode, 1);
1686 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1687 }
1688
1689 /* Save the earliest consulted character, then skip to next op code */
1690
1691 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1692 ecode += 1 + LINK_SIZE;
1693 break;
1694
1695 /* The callout item calls an external function, if one is provided, passing
1696 details of the match so far. This is mainly for debugging, though the
1697 function is able to force a failure. */
1698
1699 case OP_CALLOUT:
1700 if (PUBL(callout) != NULL)
1701 {
1702 PUBL(callout_block) cb;
1703 cb.version = 2; /* Version 2 of the callout block */
1704 cb.callout_number = ecode[1];
1705 cb.offset_vector = md->offset_vector;
1706 #if defined COMPILE_PCRE8
1707 cb.subject = (PCRE_SPTR)md->start_subject;
1708 #elif defined COMPILE_PCRE16
1709 cb.subject = (PCRE_SPTR16)md->start_subject;
1710 #elif defined COMPILE_PCRE32
1711 cb.subject = (PCRE_SPTR32)md->start_subject;
1712 #endif
1713 cb.subject_length = (int)(md->end_subject - md->start_subject);
1714 cb.start_match = (int)(mstart - md->start_subject);
1715 cb.current_position = (int)(eptr - md->start_subject);
1716 cb.pattern_position = GET(ecode, 2);
1717 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1718 cb.capture_top = offset_top/2;
1719 cb.capture_last = md->capture_last;
1720 cb.callout_data = md->callout_data;
1721 cb.mark = md->nomatch_mark;
1722 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1723 if (rrc < 0) RRETURN(rrc);
1724 }
1725 ecode += 2 + 2*LINK_SIZE;
1726 break;
1727
1728 /* Recursion either matches the current regex, or some subexpression. The
1729 offset data is the offset to the starting bracket from the start of the
1730 whole pattern. (This is so that it works from duplicated subpatterns.)
1731
1732 The state of the capturing groups is preserved over recursion, and
1733 re-instated afterwards. We don't know how many are started and not yet
1734 finished (offset_top records the completed total) so we just have to save
1735 all the potential data. There may be up to 65535 such values, which is too
1736 large to put on the stack, but using malloc for small numbers seems
1737 expensive. As a compromise, the stack is used when there are no more than
1738 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1739
1740 There are also other values that have to be saved. We use a chained
1741 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1742 for the original version of this logic. It has, however, been hacked around
1743 a lot, so he is not to blame for the current way it works. */
1744
1745 case OP_RECURSE:
1746 {
1747 recursion_info *ri;
1748 unsigned int recno;
1749
1750 callpat = md->start_code + GET(ecode, 1);
1751 recno = (callpat == md->start_code)? 0 :
1752 GET2(callpat, 1 + LINK_SIZE);
1753
1754 /* Check for repeating a recursion without advancing the subject pointer.
1755 This should catch convoluted mutual recursions. (Some simple cases are
1756 caught at compile time.) */
1757
1758 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1759 if (recno == ri->group_num && eptr == ri->subject_position)
1760 RRETURN(PCRE_ERROR_RECURSELOOP);
1761
1762 /* Add to "recursing stack" */
1763
1764 new_recursive.group_num = recno;
1765 new_recursive.subject_position = eptr;
1766 new_recursive.prevrec = md->recursive;
1767 md->recursive = &new_recursive;
1768
1769 /* Where to continue from afterwards */
1770
1771 ecode += 1 + LINK_SIZE;
1772
1773 /* Now save the offset data */
1774
1775 new_recursive.saved_max = md->offset_end;
1776 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1777 new_recursive.offset_save = stacksave;
1778 else
1779 {
1780 new_recursive.offset_save =
1781 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1782 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1783 }
1784 memcpy(new_recursive.offset_save, md->offset_vector,
1785 new_recursive.saved_max * sizeof(int));
1786
1787 /* OK, now we can do the recursion. After processing each alternative,
1788 restore the offset data. If there were nested recursions, md->recursive
1789 might be changed, so reset it before looping. */
1790
1791 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1792 cbegroup = (*callpat >= OP_SBRA);
1793 do
1794 {
1795 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1796 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1797 md, eptrb, RM6);
1798 memcpy(md->offset_vector, new_recursive.offset_save,
1799 new_recursive.saved_max * sizeof(int));
1800 md->recursive = new_recursive.prevrec;
1801 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1802 {
1803 DPRINTF(("Recursion matched\n"));
1804 if (new_recursive.offset_save != stacksave)
1805 (PUBL(free))(new_recursive.offset_save);
1806
1807 /* Set where we got to in the subject, and reset the start in case
1808 it was changed by \K. This *is* propagated back out of a recursion,
1809 for Perl compatibility. */
1810
1811 eptr = md->end_match_ptr;
1812 mstart = md->start_match_ptr;
1813 goto RECURSION_MATCHED; /* Exit loop; end processing */
1814 }
1815
1816 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1817 is treated as NOMATCH. */
1818
1819 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1820 rrc != MATCH_COMMIT)
1821 {
1822 DPRINTF(("Recursion gave error %d\n", rrc));
1823 if (new_recursive.offset_save != stacksave)
1824 (PUBL(free))(new_recursive.offset_save);
1825 RRETURN(rrc);
1826 }
1827
1828 md->recursive = &new_recursive;
1829 callpat += GET(callpat, 1);
1830 }
1831 while (*callpat == OP_ALT);
1832
1833 DPRINTF(("Recursion didn't match\n"));
1834 md->recursive = new_recursive.prevrec;
1835 if (new_recursive.offset_save != stacksave)
1836 (PUBL(free))(new_recursive.offset_save);
1837 RRETURN(MATCH_NOMATCH);
1838 }
1839
1840 RECURSION_MATCHED:
1841 break;
1842
1843 /* An alternation is the end of a branch; scan along to find the end of the
1844 bracketed group and go to there. */
1845
1846 case OP_ALT:
1847 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1848 break;
1849
1850 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1851 indicating that it may occur zero times. It may repeat infinitely, or not
1852 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1853 with fixed upper repeat limits are compiled as a number of copies, with the
1854 optional ones preceded by BRAZERO or BRAMINZERO. */
1855
1856 case OP_BRAZERO:
1857 next = ecode + 1;
1858 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1860 do next += GET(next, 1); while (*next == OP_ALT);
1861 ecode = next + 1 + LINK_SIZE;
1862 break;
1863
1864 case OP_BRAMINZERO:
1865 next = ecode + 1;
1866 do next += GET(next, 1); while (*next == OP_ALT);
1867 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1869 ecode++;
1870 break;
1871
1872 case OP_SKIPZERO:
1873 next = ecode+1;
1874 do next += GET(next,1); while (*next == OP_ALT);
1875 ecode = next + 1 + LINK_SIZE;
1876 break;
1877
1878 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1879 here; just jump to the group, with allow_zero set TRUE. */
1880
1881 case OP_BRAPOSZERO:
1882 op = *(++ecode);
1883 allow_zero = TRUE;
1884 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1885 goto POSSESSIVE_NON_CAPTURE;
1886
1887 /* End of a group, repeated or non-repeating. */
1888
1889 case OP_KET:
1890 case OP_KETRMIN:
1891 case OP_KETRMAX:
1892 case OP_KETRPOS:
1893 prev = ecode - GET(ecode, 1);
1894
1895 /* If this was a group that remembered the subject start, in order to break
1896 infinite repeats of empty string matches, retrieve the subject start from
1897 the chain. Otherwise, set it NULL. */
1898
1899 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1900 {
1901 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1902 eptrb = eptrb->epb_prev; /* Backup to previous group */
1903 }
1904 else saved_eptr = NULL;
1905
1906 /* If we are at the end of an assertion group or a non-capturing atomic
1907 group, stop matching and return MATCH_MATCH, but record the current high
1908 water mark for use by positive assertions. We also need to record the match
1909 start in case it was changed by \K. */
1910
1911 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1912 *prev == OP_ONCE_NC)
1913 {
1914 md->end_match_ptr = eptr; /* For ONCE_NC */
1915 md->end_offset_top = offset_top;
1916 md->start_match_ptr = mstart;
1917 RRETURN(MATCH_MATCH); /* Sets md->mark */
1918 }
1919
1920 /* For capturing groups we have to check the group number back at the start
1921 and if necessary complete handling an extraction by setting the offsets and
1922 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1923 into group 0, so it won't be picked up here. Instead, we catch it when the
1924 OP_END is reached. Other recursion is handled here. We just have to record
1925 the current subject position and start match pointer and give a MATCH
1926 return. */
1927
1928 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1929 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1930 {
1931 number = GET2(prev, 1+LINK_SIZE);
1932 offset = number << 1;
1933
1934 #ifdef PCRE_DEBUG
1935 printf("end bracket %d", number);
1936 printf("\n");
1937 #endif
1938
1939 /* Handle a recursively called group. */
1940
1941 if (md->recursive != NULL && md->recursive->group_num == number)
1942 {
1943 md->end_match_ptr = eptr;
1944 md->start_match_ptr = mstart;
1945 RRETURN(MATCH_MATCH);
1946 }
1947
1948 /* Deal with capturing */
1949
1950 md->capture_last = number;
1951 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1952 {
1953 /* If offset is greater than offset_top, it means that we are
1954 "skipping" a capturing group, and that group's offsets must be marked
1955 unset. In earlier versions of PCRE, all the offsets were unset at the
1956 start of matching, but this doesn't work because atomic groups and
1957 assertions can cause a value to be set that should later be unset.
1958 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1959 part of the atomic group, but this is not on the final matching path,
1960 so must be unset when 2 is set. (If there is no group 2, there is no
1961 problem, because offset_top will then be 2, indicating no capture.) */
1962
1963 if (offset > offset_top)
1964 {
1965 register int *iptr = md->offset_vector + offset_top;
1966 register int *iend = md->offset_vector + offset;
1967 while (iptr < iend) *iptr++ = -1;
1968 }
1969
1970 /* Now make the extraction */
1971
1972 md->offset_vector[offset] =
1973 md->offset_vector[md->offset_end - number];
1974 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1975 if (offset_top <= offset) offset_top = offset + 2;
1976 }
1977 }
1978
1979 /* For an ordinary non-repeating ket, just continue at this level. This
1980 also happens for a repeating ket if no characters were matched in the
1981 group. This is the forcible breaking of infinite loops as implemented in
1982 Perl 5.005. For a non-repeating atomic group that includes captures,
1983 establish a backup point by processing the rest of the pattern at a lower
1984 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1985 original OP_ONCE level, thereby bypassing intermediate backup points, but
1986 resetting any captures that happened along the way. */
1987
1988 if (*ecode == OP_KET || eptr == saved_eptr)
1989 {
1990 if (*prev == OP_ONCE)
1991 {
1992 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1994 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1995 RRETURN(MATCH_ONCE);
1996 }
1997 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1998 break;
1999 }
2000
2001 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2002 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2003 at a time from the outer level, thus saving stack. */
2004
2005 if (*ecode == OP_KETRPOS)
2006 {
2007 md->end_match_ptr = eptr;
2008 md->end_offset_top = offset_top;
2009 RRETURN(MATCH_KETRPOS);
2010 }
2011
2012 /* The normal repeating kets try the rest of the pattern or restart from
2013 the preceding bracket, in the appropriate order. In the second case, we can
2014 use tail recursion to avoid using another stack frame, unless we have an
2015 atomic group or an unlimited repeat of a group that can match an empty
2016 string. */
2017
2018 if (*ecode == OP_KETRMIN)
2019 {
2020 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (*prev == OP_ONCE)
2023 {
2024 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2027 RRETURN(MATCH_ONCE);
2028 }
2029 if (*prev >= OP_SBRA) /* Could match an empty string */
2030 {
2031 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2032 RRETURN(rrc);
2033 }
2034 ecode = prev;
2035 goto TAIL_RECURSE;
2036 }
2037 else /* OP_KETRMAX */
2038 {
2039 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2040 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2041 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2042 if (*prev == OP_ONCE)
2043 {
2044 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046 md->once_target = prev;
2047 RRETURN(MATCH_ONCE);
2048 }
2049 ecode += 1 + LINK_SIZE;
2050 goto TAIL_RECURSE;
2051 }
2052 /* Control never gets here */
2053
2054 /* Not multiline mode: start of subject assertion, unless notbol. */
2055
2056 case OP_CIRC:
2057 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2058
2059 /* Start of subject assertion */
2060
2061 case OP_SOD:
2062 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2063 ecode++;
2064 break;
2065
2066 /* Multiline mode: start of subject unless notbol, or after any newline. */
2067
2068 case OP_CIRCM:
2069 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2070 if (eptr != md->start_subject &&
2071 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2072 RRETURN(MATCH_NOMATCH);
2073 ecode++;
2074 break;
2075
2076 /* Start of match assertion */
2077
2078 case OP_SOM:
2079 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2080 ecode++;
2081 break;
2082
2083 /* Reset the start of match point */
2084
2085 case OP_SET_SOM:
2086 mstart = eptr;
2087 ecode++;
2088 break;
2089
2090 /* Multiline mode: assert before any newline, or before end of subject
2091 unless noteol is set. */
2092
2093 case OP_DOLLM:
2094 if (eptr < md->end_subject)
2095 {
2096 if (!IS_NEWLINE(eptr))
2097 {
2098 if (md->partial != 0 &&
2099 eptr + 1 >= md->end_subject &&
2100 NLBLOCK->nltype == NLTYPE_FIXED &&
2101 NLBLOCK->nllen == 2 &&
2102 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2103 {
2104 md->hitend = TRUE;
2105 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2106 }
2107 RRETURN(MATCH_NOMATCH);
2108 }
2109 }
2110 else
2111 {
2112 if (md->noteol) RRETURN(MATCH_NOMATCH);
2113 SCHECK_PARTIAL();
2114 }
2115 ecode++;
2116 break;
2117
2118 /* Not multiline mode: assert before a terminating newline or before end of
2119 subject unless noteol is set. */
2120
2121 case OP_DOLL:
2122 if (md->noteol) RRETURN(MATCH_NOMATCH);
2123 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2124
2125 /* ... else fall through for endonly */
2126
2127 /* End of subject assertion (\z) */
2128
2129 case OP_EOD:
2130 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2131 SCHECK_PARTIAL();
2132 ecode++;
2133 break;
2134
2135 /* End of subject or ending \n assertion (\Z) */
2136
2137 case OP_EODN:
2138 ASSERT_NL_OR_EOS:
2139 if (eptr < md->end_subject &&
2140 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2141 {
2142 if (md->partial != 0 &&
2143 eptr + 1 >= md->end_subject &&
2144 NLBLOCK->nltype == NLTYPE_FIXED &&
2145 NLBLOCK->nllen == 2 &&
2146 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2147 {
2148 md->hitend = TRUE;
2149 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2150 }
2151 RRETURN(MATCH_NOMATCH);
2152 }
2153
2154 /* Either at end of string or \n before end. */
2155
2156 SCHECK_PARTIAL();
2157 ecode++;
2158 break;
2159
2160 /* Word boundary assertions */
2161
2162 case OP_NOT_WORD_BOUNDARY:
2163 case OP_WORD_BOUNDARY:
2164 {
2165
2166 /* Find out if the previous and current characters are "word" characters.
2167 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2168 be "non-word" characters. Remember the earliest consulted character for
2169 partial matching. */
2170
2171 #ifdef SUPPORT_UTF
2172 if (utf)
2173 {
2174 /* Get status of previous character */
2175
2176 if (eptr == md->start_subject) prev_is_word = FALSE; else
2177 {
2178 PCRE_PUCHAR lastptr = eptr - 1;
2179 BACKCHAR(lastptr);
2180 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2181 GETCHAR(c, lastptr);
2182 #ifdef SUPPORT_UCP
2183 if (md->use_ucp)
2184 {
2185 if (c == '_') prev_is_word = TRUE; else
2186 {
2187 int cat = UCD_CATEGORY(c);
2188 prev_is_word = (cat == ucp_L || cat == ucp_N);
2189 }
2190 }
2191 else
2192 #endif
2193 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2194 }
2195
2196 /* Get status of next character */
2197
2198 if (eptr >= md->end_subject)
2199 {
2200 SCHECK_PARTIAL();
2201 cur_is_word = FALSE;
2202 }
2203 else
2204 {
2205 GETCHAR(c, eptr);
2206 #ifdef SUPPORT_UCP
2207 if (md->use_ucp)
2208 {
2209 if (c == '_') cur_is_word = TRUE; else
2210 {
2211 int cat = UCD_CATEGORY(c);
2212 cur_is_word = (cat == ucp_L || cat == ucp_N);
2213 }
2214 }
2215 else
2216 #endif
2217 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2218 }
2219 }
2220 else
2221 #endif
2222
2223 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2224 consistency with the behaviour of \w we do use it in this case. */
2225
2226 {
2227 /* Get status of previous character */
2228
2229 if (eptr == md->start_subject) prev_is_word = FALSE; else
2230 {
2231 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2232 #ifdef SUPPORT_UCP
2233 if (md->use_ucp)
2234 {
2235 c = eptr[-1];
2236 if (c == '_') prev_is_word = TRUE; else
2237 {
2238 int cat = UCD_CATEGORY(c);
2239 prev_is_word = (cat == ucp_L || cat == ucp_N);
2240 }
2241 }
2242 else
2243 #endif
2244 prev_is_word = MAX_255(eptr[-1])
2245 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2246 }
2247
2248 /* Get status of next character */
2249
2250 if (eptr >= md->end_subject)
2251 {
2252 SCHECK_PARTIAL();
2253 cur_is_word = FALSE;
2254 }
2255 else
2256 #ifdef SUPPORT_UCP
2257 if (md->use_ucp)
2258 {
2259 c = *eptr;
2260 if (c == '_') cur_is_word = TRUE; else
2261 {
2262 int cat = UCD_CATEGORY(c);
2263 cur_is_word = (cat == ucp_L || cat == ucp_N);
2264 }
2265 }
2266 else
2267 #endif
2268 cur_is_word = MAX_255(*eptr)
2269 && ((md->ctypes[*eptr] & ctype_word) != 0);
2270 }
2271
2272 /* Now see if the situation is what we want */
2273
2274 if ((*ecode++ == OP_WORD_BOUNDARY)?
2275 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2276 RRETURN(MATCH_NOMATCH);
2277 }
2278 break;
2279
2280 /* Match any single character type except newline; have to take care with
2281 CRLF newlines and partial matching. */
2282
2283 case OP_ANY:
2284 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2285 if (md->partial != 0 &&
2286 eptr + 1 >= md->end_subject &&
2287 NLBLOCK->nltype == NLTYPE_FIXED &&
2288 NLBLOCK->nllen == 2 &&
2289 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2290 {
2291 md->hitend = TRUE;
2292 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2293 }
2294
2295 /* Fall through */
2296
2297 /* Match any single character whatsoever. */
2298
2299 case OP_ALLANY:
2300 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2301 { /* not be updated before SCHECK_PARTIAL. */
2302 SCHECK_PARTIAL();
2303 RRETURN(MATCH_NOMATCH);
2304 }
2305 eptr++;
2306 #ifdef SUPPORT_UTF
2307 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2308 #endif
2309 ecode++;
2310 break;
2311
2312 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2313 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2314
2315 case OP_ANYBYTE:
2316 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2317 { /* not be updated before SCHECK_PARTIAL. */
2318 SCHECK_PARTIAL();
2319 RRETURN(MATCH_NOMATCH);
2320 }
2321 eptr++;
2322 ecode++;
2323 break;
2324
2325 case OP_NOT_DIGIT:
2326 if (eptr >= md->end_subject)
2327 {
2328 SCHECK_PARTIAL();
2329 RRETURN(MATCH_NOMATCH);
2330 }
2331 GETCHARINCTEST(c, eptr);
2332 if (
2333 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2334 c < 256 &&
2335 #endif
2336 (md->ctypes[c] & ctype_digit) != 0
2337 )
2338 RRETURN(MATCH_NOMATCH);
2339 ecode++;
2340 break;
2341
2342 case OP_DIGIT:
2343 if (eptr >= md->end_subject)
2344 {
2345 SCHECK_PARTIAL();
2346 RRETURN(MATCH_NOMATCH);
2347 }
2348 GETCHARINCTEST(c, eptr);
2349 if (
2350 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2351 c > 255 ||
2352 #endif
2353 (md->ctypes[c] & ctype_digit) == 0
2354 )
2355 RRETURN(MATCH_NOMATCH);
2356 ecode++;
2357 break;
2358
2359 case OP_NOT_WHITESPACE:
2360 if (eptr >= md->end_subject)
2361 {
2362 SCHECK_PARTIAL();
2363 RRETURN(MATCH_NOMATCH);
2364 }
2365 GETCHARINCTEST(c, eptr);
2366 if (
2367 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2368 c < 256 &&
2369 #endif
2370 (md->ctypes[c] & ctype_space) != 0
2371 )
2372 RRETURN(MATCH_NOMATCH);
2373 ecode++;
2374 break;
2375
2376 case OP_WHITESPACE:
2377 if (eptr >= md->end_subject)
2378 {
2379 SCHECK_PARTIAL();
2380 RRETURN(MATCH_NOMATCH);
2381 }
2382 GETCHARINCTEST(c, eptr);
2383 if (
2384 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2385 c > 255 ||
2386 #endif
2387 (md->ctypes[c] & ctype_space) == 0
2388 )
2389 RRETURN(MATCH_NOMATCH);
2390 ecode++;
2391 break;
2392
2393 case OP_NOT_WORDCHAR:
2394 if (eptr >= md->end_subject)
2395 {
2396 SCHECK_PARTIAL();
2397 RRETURN(MATCH_NOMATCH);
2398 }
2399 GETCHARINCTEST(c, eptr);
2400 if (
2401 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2402 c < 256 &&
2403 #endif
2404 (md->ctypes[c] & ctype_word) != 0
2405 )
2406 RRETURN(MATCH_NOMATCH);
2407 ecode++;
2408 break;
2409
2410 case OP_WORDCHAR:
2411 if (eptr >= md->end_subject)
2412 {
2413 SCHECK_PARTIAL();
2414 RRETURN(MATCH_NOMATCH);
2415 }
2416 GETCHARINCTEST(c, eptr);
2417 if (
2418 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2419 c > 255 ||
2420 #endif
2421 (md->ctypes[c] & ctype_word) == 0
2422 )
2423 RRETURN(MATCH_NOMATCH);
2424 ecode++;
2425 break;
2426
2427 case OP_ANYNL:
2428 if (eptr >= md->end_subject)
2429 {
2430 SCHECK_PARTIAL();
2431 RRETURN(MATCH_NOMATCH);
2432 }
2433 GETCHARINCTEST(c, eptr);
2434 switch(c)
2435 {
2436 default: RRETURN(MATCH_NOMATCH);
2437
2438 case CHAR_CR:
2439 if (eptr >= md->end_subject)
2440 {
2441 SCHECK_PARTIAL();
2442 }
2443 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2444 break;
2445
2446 case CHAR_LF:
2447 break;
2448
2449 case CHAR_VT:
2450 case CHAR_FF:
2451 case CHAR_NEL:
2452 #ifndef EBCDIC
2453 case 0x2028:
2454 case 0x2029:
2455 #endif /* Not EBCDIC */
2456 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2457 break;
2458 }
2459 ecode++;
2460 break;
2461
2462 case OP_NOT_HSPACE:
2463 if (eptr >= md->end_subject)
2464 {
2465 SCHECK_PARTIAL();
2466 RRETURN(MATCH_NOMATCH);
2467 }
2468 GETCHARINCTEST(c, eptr);
2469 switch(c)
2470 {
2471 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2472 default: break;
2473 }
2474 ecode++;
2475 break;
2476
2477 case OP_HSPACE:
2478 if (eptr >= md->end_subject)
2479 {
2480 SCHECK_PARTIAL();
2481 RRETURN(MATCH_NOMATCH);
2482 }
2483 GETCHARINCTEST(c, eptr);
2484 switch(c)
2485 {
2486 HSPACE_CASES: break; /* Byte and multibyte cases */
2487 default: RRETURN(MATCH_NOMATCH);
2488 }
2489 ecode++;
2490 break;
2491
2492 case OP_NOT_VSPACE:
2493 if (eptr >= md->end_subject)
2494 {
2495 SCHECK_PARTIAL();
2496 RRETURN(MATCH_NOMATCH);
2497 }
2498 GETCHARINCTEST(c, eptr);
2499 switch(c)
2500 {
2501 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2502 default: break;
2503 }
2504 ecode++;
2505 break;
2506
2507 case OP_VSPACE:
2508 if (eptr >= md->end_subject)
2509 {
2510 SCHECK_PARTIAL();
2511 RRETURN(MATCH_NOMATCH);
2512 }
2513 GETCHARINCTEST(c, eptr);
2514 switch(c)
2515 {
2516 VSPACE_CASES: break;
2517 default: RRETURN(MATCH_NOMATCH);
2518 }
2519 ecode++;
2520 break;
2521
2522 #ifdef SUPPORT_UCP
2523 /* Check the next character by Unicode property. We will get here only
2524 if the support is in the binary; otherwise a compile-time error occurs. */
2525
2526 case OP_PROP:
2527 case OP_NOTPROP:
2528 if (eptr >= md->end_subject)
2529 {
2530 SCHECK_PARTIAL();
2531 RRETURN(MATCH_NOMATCH);
2532 }
2533 GETCHARINCTEST(c, eptr);
2534 {
2535 const pcre_uint32 *cp;
2536 const ucd_record *prop = GET_UCD(c);
2537
2538 switch(ecode[1])
2539 {
2540 case PT_ANY:
2541 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2542 break;
2543
2544 case PT_LAMP:
2545 if ((prop->chartype == ucp_Lu ||
2546 prop->chartype == ucp_Ll ||
2547 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2548 RRETURN(MATCH_NOMATCH);
2549 break;
2550
2551 case PT_GC:
2552 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2553 RRETURN(MATCH_NOMATCH);
2554 break;
2555
2556 case PT_PC:
2557 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2558 RRETURN(MATCH_NOMATCH);
2559 break;
2560
2561 case PT_SC:
2562 if ((ecode[2] != prop->script) == (op == OP_PROP))
2563 RRETURN(MATCH_NOMATCH);
2564 break;
2565
2566 /* These are specials */
2567
2568 case PT_ALNUM:
2569 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2570 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2571 RRETURN(MATCH_NOMATCH);
2572 break;
2573
2574 case PT_SPACE: /* Perl space */
2575 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2576 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2577 == (op == OP_NOTPROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2580
2581 case PT_PXSPACE: /* POSIX space */
2582 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2583 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2584 c == CHAR_FF || c == CHAR_CR)
2585 == (op == OP_NOTPROP))
2586 RRETURN(MATCH_NOMATCH);
2587 break;
2588
2589 case PT_WORD:
2590 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2591 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2592 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2593 RRETURN(MATCH_NOMATCH);
2594 break;
2595
2596 case PT_CLIST:
2597 cp = PRIV(ucd_caseless_sets) + prop->caseset;
2598 for (;;)
2599 {
2600 if (c < *cp)
2601 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2602 if (c == *cp++)
2603 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2604 }
2605 break;
2606
2607 /* This should never occur */
2608
2609 default:
2610 RRETURN(PCRE_ERROR_INTERNAL);
2611 }
2612
2613 ecode += 3;
2614 }
2615 break;
2616
2617 /* Match an extended Unicode sequence. We will get here only if the support
2618 is in the binary; otherwise a compile-time error occurs. */
2619
2620 case OP_EXTUNI:
2621 if (eptr >= md->end_subject)
2622 {
2623 SCHECK_PARTIAL();
2624 RRETURN(MATCH_NOMATCH);
2625 }
2626 else
2627 {
2628 int lgb, rgb;
2629 GETCHARINCTEST(c, eptr);
2630 lgb = UCD_GRAPHBREAK(c);
2631 while (eptr < md->end_subject)
2632 {
2633 int len = 1;
2634 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2635 rgb = UCD_GRAPHBREAK(c);
2636 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2637 lgb = rgb;
2638 eptr += len;
2639 }
2640 }
2641 CHECK_PARTIAL();
2642 ecode++;
2643 break;
2644 #endif /* SUPPORT_UCP */
2645
2646
2647 /* Match a back reference, possibly repeatedly. Look past the end of the
2648 item to see if there is repeat information following. The code is similar
2649 to that for character classes, but repeated for efficiency. Then obey
2650 similar code to character type repeats - written out again for speed.
2651 However, if the referenced string is the empty string, always treat
2652 it as matched, any number of times (otherwise there could be infinite
2653 loops). */
2654
2655 case OP_REF:
2656 case OP_REFI:
2657 caseless = op == OP_REFI;
2658 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2659 ecode += 1 + IMM2_SIZE;
2660
2661 /* If the reference is unset, there are two possibilities:
2662
2663 (a) In the default, Perl-compatible state, set the length negative;
2664 this ensures that every attempt at a match fails. We can't just fail
2665 here, because of the possibility of quantifiers with zero minima.
2666
2667 (b) If the JavaScript compatibility flag is set, set the length to zero
2668 so that the back reference matches an empty string.
2669
2670 Otherwise, set the length to the length of what was matched by the
2671 referenced subpattern. */
2672
2673 if (offset >= offset_top || md->offset_vector[offset] < 0)
2674 length = (md->jscript_compat)? 0 : -1;
2675 else
2676 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2677
2678 /* Set up for repetition, or handle the non-repeated case */
2679
2680 switch (*ecode)
2681 {
2682 case OP_CRSTAR:
2683 case OP_CRMINSTAR:
2684 case OP_CRPLUS:
2685 case OP_CRMINPLUS:
2686 case OP_CRQUERY:
2687 case OP_CRMINQUERY:
2688 c = *ecode++ - OP_CRSTAR;
2689 minimize = (c & 1) != 0;
2690 min = rep_min[c]; /* Pick up values from tables; */
2691 max = rep_max[c]; /* zero for max => infinity */
2692 if (max == 0) max = INT_MAX;
2693 break;
2694
2695 case OP_CRRANGE:
2696 case OP_CRMINRANGE:
2697 minimize = (*ecode == OP_CRMINRANGE);
2698 min = GET2(ecode, 1);
2699 max = GET2(ecode, 1 + IMM2_SIZE);
2700 if (max == 0) max = INT_MAX;
2701 ecode += 1 + 2 * IMM2_SIZE;
2702 break;
2703
2704 default: /* No repeat follows */
2705 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2706 {
2707 if (length == -2) eptr = md->end_subject; /* Partial match */
2708 CHECK_PARTIAL();
2709 RRETURN(MATCH_NOMATCH);
2710 }
2711 eptr += length;
2712 continue; /* With the main loop */
2713 }
2714
2715 /* Handle repeated back references. If the length of the reference is
2716 zero, just continue with the main loop. If the length is negative, it
2717 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2718 zero, we can continue at the same level without recursion. For any other
2719 minimum, carrying on will result in NOMATCH. */
2720
2721 if (length == 0) continue;
2722 if (length < 0 && min == 0) continue;
2723
2724 /* First, ensure the minimum number of matches are present. We get back
2725 the length of the reference string explicitly rather than passing the
2726 address of eptr, so that eptr can be a register variable. */
2727
2728 for (i = 1; i <= min; i++)
2729 {
2730 int slength;
2731 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2732 {
2733 if (slength == -2) eptr = md->end_subject; /* Partial match */
2734 CHECK_PARTIAL();
2735 RRETURN(MATCH_NOMATCH);
2736 }
2737 eptr += slength;
2738 }
2739
2740 /* If min = max, continue at the same level without recursion.
2741 They are not both allowed to be zero. */
2742
2743 if (min == max) continue;
2744
2745 /* If minimizing, keep trying and advancing the pointer */
2746
2747 if (minimize)
2748 {
2749 for (fi = min;; fi++)
2750 {
2751 int slength;
2752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2754 if (fi >= max) RRETURN(MATCH_NOMATCH);
2755 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2756 {
2757 if (slength == -2) eptr = md->end_subject; /* Partial match */
2758 CHECK_PARTIAL();
2759 RRETURN(MATCH_NOMATCH);
2760 }
2761 eptr += slength;
2762 }
2763 /* Control never gets here */
2764 }
2765
2766 /* If maximizing, find the longest string and work backwards */
2767
2768 else
2769 {
2770 pp = eptr;
2771 for (i = min; i < max; i++)
2772 {
2773 int slength;
2774 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2775 {
2776 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2777 the soft partial matching case. */
2778
2779 if (slength == -2 && md->partial != 0 &&
2780 md->end_subject > md->start_used_ptr)
2781 {
2782 md->hitend = TRUE;
2783 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2784 }
2785 break;
2786 }
2787 eptr += slength;
2788 }
2789
2790 while (eptr >= pp)
2791 {
2792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2794 eptr -= length;
2795 }
2796 RRETURN(MATCH_NOMATCH);
2797 }
2798 /* Control never gets here */
2799
2800 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2801 used when all the characters in the class have values in the range 0-255,
2802 and either the matching is caseful, or the characters are in the range
2803 0-127 when UTF-8 processing is enabled. The only difference between
2804 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2805 encountered.
2806
2807 First, look past the end of the item to see if there is repeat information
2808 following. Then obey similar code to character type repeats - written out
2809 again for speed. */
2810
2811 case OP_NCLASS:
2812 case OP_CLASS:
2813 {
2814 /* The data variable is saved across frames, so the byte map needs to
2815 be stored there. */
2816 #define BYTE_MAP ((pcre_uint8 *)data)
2817 data = ecode + 1; /* Save for matching */
2818 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2819
2820 switch (*ecode)
2821 {
2822 case OP_CRSTAR:
2823 case OP_CRMINSTAR:
2824 case OP_CRPLUS:
2825 case OP_CRMINPLUS:
2826 case OP_CRQUERY:
2827 case OP_CRMINQUERY:
2828 c = *ecode++ - OP_CRSTAR;
2829 minimize = (c & 1) != 0;
2830 min = rep_min[c]; /* Pick up values from tables; */
2831 max = rep_max[c]; /* zero for max => infinity */
2832 if (max == 0) max = INT_MAX;
2833 break;
2834
2835 case OP_CRRANGE:
2836 case OP_CRMINRANGE:
2837 minimize = (*ecode == OP_CRMINRANGE);
2838 min = GET2(ecode, 1);
2839 max = GET2(ecode, 1 + IMM2_SIZE);
2840 if (max == 0) max = INT_MAX;
2841 ecode += 1 + 2 * IMM2_SIZE;
2842 break;
2843
2844 default: /* No repeat follows */
2845 min = max = 1;
2846 break;
2847 }
2848
2849 /* First, ensure the minimum number of matches are present. */
2850
2851 #ifdef SUPPORT_UTF
2852 if (utf)
2853 {
2854 for (i = 1; i <= min; i++)
2855 {
2856 if (eptr >= md->end_subject)
2857 {
2858 SCHECK_PARTIAL();
2859 RRETURN(MATCH_NOMATCH);
2860 }
2861 GETCHARINC(c, eptr);
2862 if (c > 255)
2863 {
2864 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2865 }
2866 else
2867 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2868 }
2869 }
2870 else
2871 #endif
2872 /* Not UTF mode */
2873 {
2874 for (i = 1; i <= min; i++)
2875 {
2876 if (eptr >= md->end_subject)
2877 {
2878 SCHECK_PARTIAL();
2879 RRETURN(MATCH_NOMATCH);
2880 }
2881 c = *eptr++;
2882 #ifndef COMPILE_PCRE8
2883 if (c > 255)
2884 {
2885 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2886 }
2887 else
2888 #endif
2889 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2890 }
2891 }
2892
2893 /* If max == min we can continue with the main loop without the
2894 need to recurse. */
2895
2896 if (min == max) continue;
2897
2898 /* If minimizing, keep testing the rest of the expression and advancing
2899 the pointer while it matches the class. */
2900
2901 if (minimize)
2902 {
2903 #ifdef SUPPORT_UTF
2904 if (utf)
2905 {
2906 for (fi = min;; fi++)
2907 {
2908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 if (fi >= max) RRETURN(MATCH_NOMATCH);
2911 if (eptr >= md->end_subject)
2912 {
2913 SCHECK_PARTIAL();
2914 RRETURN(MATCH_NOMATCH);
2915 }
2916 GETCHARINC(c, eptr);
2917 if (c > 255)
2918 {
2919 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2920 }
2921 else
2922 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2923 }
2924 }
2925 else
2926 #endif
2927 /* Not UTF mode */
2928 {
2929 for (fi = min;; fi++)
2930 {
2931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2933 if (fi >= max) RRETURN(MATCH_NOMATCH);
2934 if (eptr >= md->end_subject)
2935 {
2936 SCHECK_PARTIAL();
2937 RRETURN(MATCH_NOMATCH);
2938 }
2939 c = *eptr++;
2940 #ifndef COMPILE_PCRE8
2941 if (c > 255)
2942 {
2943 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2944 }
2945 else
2946 #endif
2947 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2948 }
2949 }
2950 /* Control never gets here */
2951 }
2952
2953 /* If maximizing, find the longest possible run, then work backwards. */
2954
2955 else
2956 {
2957 pp = eptr;
2958
2959 #ifdef SUPPORT_UTF
2960 if (utf)
2961 {
2962 for (i = min; i < max; i++)
2963 {
2964 int len = 1;
2965 if (eptr >= md->end_subject)
2966 {
2967 SCHECK_PARTIAL();
2968 break;
2969 }
2970 GETCHARLEN(c, eptr, len);
2971 if (c > 255)
2972 {
2973 if (op == OP_CLASS) break;
2974 }
2975 else
2976 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2977 eptr += len;
2978 }
2979 for (;;)
2980 {
2981 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2983 if (eptr-- == pp) break; /* Stop if tried at original pos */
2984 BACKCHAR(eptr);
2985 }
2986 }
2987 else
2988 #endif
2989 /* Not UTF mode */
2990 {
2991 for (i = min; i < max; i++)
2992 {
2993 if (eptr >= md->end_subject)
2994 {
2995 SCHECK_PARTIAL();
2996 break;
2997 }
2998 c = *eptr;
2999 #ifndef COMPILE_PCRE8
3000 if (c > 255)
3001 {
3002 if (op == OP_CLASS) break;
3003 }
3004 else
3005 #endif
3006 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3007 eptr++;
3008 }
3009 while (eptr >= pp)
3010 {
3011 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3013 eptr--;
3014 }
3015 }
3016
3017 RRETURN(MATCH_NOMATCH);
3018 }
3019 #undef BYTE_MAP
3020 }
3021 /* Control never gets here */
3022
3023
3024 /* Match an extended character class. This opcode is encountered only
3025 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3026 mode, because Unicode properties are supported in non-UTF-8 mode. */
3027
3028 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3029 case OP_XCLASS:
3030 {
3031 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3032 ecode += GET(ecode, 1); /* Advance past the item */
3033
3034 switch (*ecode)
3035 {
3036 case OP_CRSTAR:
3037 case OP_CRMINSTAR:
3038 case OP_CRPLUS:
3039 case OP_CRMINPLUS:
3040 case OP_CRQUERY:
3041 case OP_CRMINQUERY:
3042 c = *ecode++ - OP_CRSTAR;
3043 minimize = (c & 1) != 0;
3044 min = rep_min[c]; /* Pick up values from tables; */
3045 max = rep_max[c]; /* zero for max => infinity */
3046 if (max == 0) max = INT_MAX;
3047 break;
3048
3049 case OP_CRRANGE:
3050 case OP_CRMINRANGE:
3051 minimize = (*ecode == OP_CRMINRANGE);
3052 min = GET2(ecode, 1);
3053 max = GET2(ecode, 1 + IMM2_SIZE);
3054 if (max == 0) max = INT_MAX;
3055 ecode += 1 + 2 * IMM2_SIZE;
3056 break;
3057
3058 default: /* No repeat follows */
3059 min = max = 1;
3060 break;
3061 }
3062
3063 /* First, ensure the minimum number of matches are present. */
3064
3065 for (i = 1; i <= min; i++)
3066 {
3067 if (eptr >= md->end_subject)
3068 {
3069 SCHECK_PARTIAL();
3070 RRETURN(MATCH_NOMATCH);
3071 }
3072 GETCHARINCTEST(c, eptr);
3073 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3074 }
3075
3076 /* If max == min we can continue with the main loop without the
3077 need to recurse. */
3078
3079 if (min == max) continue;
3080
3081 /* If minimizing, keep testing the rest of the expression and advancing
3082 the pointer while it matches the class. */
3083
3084 if (minimize)
3085 {
3086 for (fi = min;; fi++)
3087 {
3088 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3089 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3090 if (fi >= max) RRETURN(MATCH_NOMATCH);
3091 if (eptr >= md->end_subject)
3092 {
3093 SCHECK_PARTIAL();
3094 RRETURN(MATCH_NOMATCH);
3095 }
3096 GETCHARINCTEST(c, eptr);
3097 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3098 }
3099 /* Control never gets here */
3100 }
3101
3102 /* If maximizing, find the longest possible run, then work backwards. */
3103
3104 else
3105 {
3106 pp = eptr;
3107 for (i = min; i < max; i++)
3108 {
3109 int len = 1;
3110 if (eptr >= md->end_subject)
3111 {
3112 SCHECK_PARTIAL();
3113 break;
3114 }
3115 #ifdef SUPPORT_UTF
3116 GETCHARLENTEST(c, eptr, len);
3117 #else
3118 c = *eptr;
3119 #endif
3120 if (!PRIV(xclass)(c, data, utf)) break;
3121 eptr += len;
3122 }
3123 for(;;)
3124 {
3125 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3127 if (eptr-- == pp) break; /* Stop if tried at original pos */
3128 #ifdef SUPPORT_UTF
3129 if (utf) BACKCHAR(eptr);
3130 #endif
3131 }
3132 RRETURN(MATCH_NOMATCH);
3133 }
3134
3135 /* Control never gets here */
3136 }
3137 #endif /* End of XCLASS */
3138
3139 /* Match a single character, casefully */
3140
3141 case OP_CHAR:
3142 #ifdef SUPPORT_UTF
3143 if (utf)
3144 {
3145 length = 1;
3146 ecode++;
3147 GETCHARLEN(fc, ecode, length);
3148 if (length > md->end_subject - eptr)
3149 {
3150 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3151 RRETURN(MATCH_NOMATCH);
3152 }
3153 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3154 }
3155 else
3156 #endif
3157 /* Not UTF mode */
3158 {
3159 if (md->end_subject - eptr < 1)
3160 {
3161 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3162 RRETURN(MATCH_NOMATCH);
3163 }
3164 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3165 ecode += 2;
3166 }
3167 break;
3168
3169 /* Match a single character, caselessly. If we are at the end of the
3170 subject, give up immediately. */
3171
3172 case OP_CHARI:
3173 if (eptr >= md->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178
3179 #ifdef SUPPORT_UTF
3180 if (utf)
3181 {
3182 length = 1;
3183 ecode++;
3184 GETCHARLEN(fc, ecode, length);
3185
3186 /* If the pattern character's value is < 128, we have only one byte, and
3187 we know that its other case must also be one byte long, so we can use the
3188 fast lookup table. We know that there is at least one byte left in the
3189 subject. */
3190
3191 if (fc < 128)
3192 {
3193 pcre_uchar cc = RAWUCHAR(eptr);
3194 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3195 ecode++;
3196 eptr++;
3197 }
3198
3199 /* Otherwise we must pick up the subject character. Note that we cannot
3200 use the value of "length" to check for sufficient bytes left, because the
3201 other case of the character may have more or fewer bytes. */
3202
3203 else
3204 {
3205 pcre_uint32 dc;
3206 GETCHARINC(dc, eptr);
3207 ecode += length;
3208
3209 /* If we have Unicode property support, we can use it to test the other
3210 case of the character, if there is one. */
3211
3212 if (fc != dc)
3213 {
3214 #ifdef SUPPORT_UCP
3215 if (dc != UCD_OTHERCASE(fc))
3216 #endif
3217 RRETURN(MATCH_NOMATCH);
3218 }
3219 }
3220 }
3221 else
3222 #endif /* SUPPORT_UTF */
3223
3224 /* Not UTF mode */
3225 {
3226 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3227 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3228 eptr++;
3229 ecode += 2;
3230 }
3231 break;
3232
3233 /* Match a single character repeatedly. */
3234
3235 case OP_EXACT:
3236 case OP_EXACTI:
3237 min = max = GET2(ecode, 1);
3238 ecode += 1 + IMM2_SIZE;
3239 goto REPEATCHAR;
3240
3241 case OP_POSUPTO:
3242 case OP_POSUPTOI:
3243 possessive = TRUE;
3244 /* Fall through */
3245
3246 case OP_UPTO:
3247 case OP_UPTOI:
3248 case OP_MINUPTO:
3249 case OP_MINUPTOI:
3250 min = 0;
3251 max = GET2(ecode, 1);
3252 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3253 ecode += 1 + IMM2_SIZE;
3254 goto REPEATCHAR;
3255
3256 case OP_POSSTAR:
3257 case OP_POSSTARI:
3258 possessive = TRUE;
3259 min = 0;
3260 max = INT_MAX;
3261 ecode++;
3262 goto REPEATCHAR;
3263
3264 case OP_POSPLUS:
3265 case OP_POSPLUSI:
3266 possessive = TRUE;
3267 min = 1;
3268 max = INT_MAX;
3269 ecode++;
3270 goto REPEATCHAR;
3271
3272 case OP_POSQUERY:
3273 case OP_POSQUERYI:
3274 possessive = TRUE;
3275 min = 0;
3276 max = 1;
3277 ecode++;
3278 goto REPEATCHAR;
3279
3280 case OP_STAR:
3281 case OP_STARI:
3282 case OP_MINSTAR:
3283 case OP_MINSTARI:
3284 case OP_PLUS:
3285 case OP_PLUSI:
3286 case OP_MINPLUS:
3287 case OP_MINPLUSI:
3288 case OP_QUERY:
3289 case OP_QUERYI:
3290 case OP_MINQUERY:
3291 case OP_MINQUERYI:
3292 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3293 minimize = (c & 1) != 0;
3294 min = rep_min[c]; /* Pick up values from tables; */
3295 max = rep_max[c]; /* zero for max => infinity */
3296 if (max == 0) max = INT_MAX;
3297
3298 /* Common code for all repeated single-character matches. */
3299
3300 REPEATCHAR:
3301 #ifdef SUPPORT_UTF
3302 if (utf)
3303 {
3304 length = 1;
3305 charptr = ecode;
3306 GETCHARLEN(fc, ecode, length);
3307 ecode += length;
3308
3309 /* Handle multibyte character matching specially here. There is
3310 support for caseless matching if UCP support is present. */
3311
3312 if (length > 1)
3313 {
3314 #ifdef SUPPORT_UCP
3315 pcre_uint32 othercase;
3316 if (op >= OP_STARI && /* Caseless */
3317 (othercase = UCD_OTHERCASE(fc)) != fc)
3318 oclength = PRIV(ord2utf)(othercase, occhars);
3319 else oclength = 0;
3320 #endif /* SUPPORT_UCP */
3321
3322 for (i = 1; i <= min; i++)
3323 {
3324 if (eptr <= md->end_subject - length &&
3325 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3326 #ifdef SUPPORT_UCP
3327 else if (oclength > 0 &&
3328 eptr <= md->end_subject - oclength &&
3329 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3330 #endif /* SUPPORT_UCP */
3331 else
3332 {
3333 CHECK_PARTIAL();
3334 RRETURN(MATCH_NOMATCH);
3335 }
3336 }
3337
3338 if (min == max) continue;
3339
3340 if (minimize)
3341 {
3342 for (fi = min;; fi++)
3343 {
3344 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3345 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3346 if (fi >= max) RRETURN(MATCH_NOMATCH);
3347 if (eptr <= md->end_subject - length &&
3348 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3349 #ifdef SUPPORT_UCP
3350 else if (oclength > 0 &&
3351 eptr <= md->end_subject - oclength &&
3352 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3353 #endif /* SUPPORT_UCP */
3354 else
3355 {
3356 CHECK_PARTIAL();
3357 RRETURN(MATCH_NOMATCH);
3358 }
3359 }
3360 /* Control never gets here */
3361 }
3362
3363 else /* Maximize */
3364 {
3365 pp = eptr;
3366 for (i = min; i < max; i++)
3367 {
3368 if (eptr <= md->end_subject - length &&
3369 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3370 #ifdef SUPPORT_UCP
3371 else if (oclength > 0 &&
3372 eptr <= md->end_subject - oclength &&
3373 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3374 #endif /* SUPPORT_UCP */
3375 else
3376 {
3377 CHECK_PARTIAL();
3378 break;
3379 }
3380 }
3381
3382 if (possessive) continue;
3383
3384 for(;;)
3385 {
3386 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3387 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3388 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3389 #ifdef SUPPORT_UCP
3390 eptr--;
3391 BACKCHAR(eptr);
3392 #else /* without SUPPORT_UCP */
3393 eptr -= length;
3394 #endif /* SUPPORT_UCP */
3395 }
3396 }
3397 /* Control never gets here */
3398 }
3399
3400 /* If the length of a UTF-8 character is 1, we fall through here, and
3401 obey the code as for non-UTF-8 characters below, though in this case the
3402 value of fc will always be < 128. */
3403 }
3404 else
3405 #endif /* SUPPORT_UTF */
3406 /* When not in UTF-8 mode, load a single-byte character. */
3407 fc = *ecode++;
3408
3409 /* The value of fc at this point is always one character, though we may
3410 or may not be in UTF mode. The code is duplicated for the caseless and
3411 caseful cases, for speed, since matching characters is likely to be quite
3412 common. First, ensure the minimum number of matches are present. If min =
3413 max, continue at the same level without recursing. Otherwise, if
3414 minimizing, keep trying the rest of the expression and advancing one
3415 matching character if failing, up to the maximum. Alternatively, if
3416 maximizing, find the maximum number of characters and work backwards. */
3417
3418 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3419 max, (char *)eptr));
3420
3421 if (op >= OP_STARI) /* Caseless */
3422 {
3423 #ifdef COMPILE_PCRE8
3424 /* fc must be < 128 if UTF is enabled. */
3425 foc = md->fcc[fc];
3426 #else
3427 #ifdef SUPPORT_UTF
3428 #ifdef SUPPORT_UCP
3429 if (utf && fc > 127)
3430 foc = UCD_OTHERCASE(fc);
3431 #else
3432 if (utf && fc > 127)
3433 foc = fc;
3434 #endif /* SUPPORT_UCP */
3435 else
3436 #endif /* SUPPORT_UTF */
3437 foc = TABLE_GET(fc, md->fcc, fc);
3438 #endif /* COMPILE_PCRE8 */
3439
3440 for (i = 1; i <= min; i++)
3441 {
3442 pcre_uchar cc;
3443
3444 if (eptr >= md->end_subject)
3445 {
3446 SCHECK_PARTIAL();
3447 RRETURN(MATCH_NOMATCH);
3448 }
3449 cc = RAWUCHARTEST(eptr);
3450 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3451 eptr++;
3452 }
3453 if (min == max) continue;
3454 if (minimize)
3455 {
3456 for (fi = min;; fi++)
3457 {
3458 pcre_uchar cc;
3459
3460 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3461 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3462 if (fi >= max) RRETURN(MATCH_NOMATCH);
3463 if (eptr >= md->end_subject)
3464 {
3465 SCHECK_PARTIAL();
3466 RRETURN(MATCH_NOMATCH);
3467 }
3468 cc = RAWUCHARTEST(eptr);
3469 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3470 eptr++;
3471 }
3472 /* Control never gets here */
3473 }
3474 else /* Maximize */
3475 {
3476 pp = eptr;
3477 for (i = min; i < max; i++)
3478 {
3479 pcre_uchar cc;
3480
3481 if (eptr >= md->end_subject)
3482 {
3483 SCHECK_PARTIAL();
3484 break;
3485 }
3486 cc = RAWUCHARTEST(eptr);
3487 if (fc != cc && foc != cc) break;
3488 eptr++;
3489 }
3490
3491 if (possessive) continue;
3492
3493 while (eptr >= pp)
3494 {
3495 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3496 eptr--;
3497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3498 }
3499 RRETURN(MATCH_NOMATCH);
3500 }
3501 /* Control never gets here */
3502 }
3503
3504 /* Caseful comparisons (includes all multi-byte characters) */
3505
3506 else
3507 {
3508 for (i = 1; i <= min; i++)
3509 {
3510 if (eptr >= md->end_subject)
3511 {
3512 SCHECK_PARTIAL();
3513 RRETURN(MATCH_NOMATCH);
3514 }
3515 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3516 }
3517
3518 if (min == max) continue;
3519
3520 if (minimize)
3521 {
3522 for (fi = min;; fi++)
3523 {
3524 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3525 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3526 if (fi >= max) RRETURN(MATCH_NOMATCH);
3527 if (eptr >= md->end_subject)
3528 {
3529 SCHECK_PARTIAL();
3530 RRETURN(MATCH_NOMATCH);
3531 }
3532 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3533 }
3534 /* Control never gets here */
3535 }
3536 else /* Maximize */
3537 {
3538 pp = eptr;
3539 for (i = min; i < max; i++)
3540 {
3541 if (eptr >= md->end_subject)
3542 {
3543 SCHECK_PARTIAL();
3544 break;
3545 }
3546 if (fc != RAWUCHARTEST(eptr)) break;
3547 eptr++;
3548 }
3549 if (possessive) continue;
3550
3551 while (eptr >= pp)
3552 {
3553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3554 eptr--;
3555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3556 }
3557 RRETURN(MATCH_NOMATCH);
3558 }
3559 }
3560 /* Control never gets here */
3561
3562 /* Match a negated single character. In UTF mode both the pattern character
3563 and the subject character being checked can be multibyte. */
3564
3565 case OP_NOT:
3566 case OP_NOTI:
3567 if (eptr >= md->end_subject)
3568 {
3569 SCHECK_PARTIAL();
3570 RRETURN(MATCH_NOMATCH);
3571 }
3572 #ifdef SUPPORT_UTF
3573 if (utf)
3574 {
3575 register pcre_uint32 ch, och;
3576
3577 ecode++;
3578 GETCHARINC(ch, ecode);
3579 GETCHARINC(c, eptr);
3580
3581 if (op == OP_NOT)
3582 {
3583 if (ch == c) RRETURN(MATCH_NOMATCH);
3584 }
3585 else
3586 {
3587 #ifdef SUPPORT_UCP
3588 if (ch > 127)
3589 och = UCD_OTHERCASE(ch);
3590 #else
3591 if (ch > 127)
3592 och = ch;
3593 #endif /* SUPPORT_UCP */
3594 else
3595 och = TABLE_GET(ch, md->fcc, ch);
3596 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3597 }
3598 }
3599 else
3600 #endif
3601 {
3602 register pcre_uint32 ch = ecode[1];
3603 c = *eptr++;
3604 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3605 RRETURN(MATCH_NOMATCH);
3606 ecode += 2;
3607 }
3608 break;
3609
3610 /* Match a negated single one-byte character repeatedly. This is almost a
3611 repeat of the code for a repeated single character, but I haven't found a
3612 nice way of commoning these up that doesn't require a test of the
3613 positive/negative option for each character match. Maybe that wouldn't add
3614 very much to the time taken, but character matching *is* what this is all
3615 about... */
3616
3617 case OP_NOTEXACT:
3618 case OP_NOTEXACTI:
3619 min = max = GET2(ecode, 1);
3620 ecode += 1 + IMM2_SIZE;
3621 goto REPEATNOTCHAR;
3622
3623 case OP_NOTUPTO:
3624 case OP_NOTUPTOI:
3625 case OP_NOTMINUPTO:
3626 case OP_NOTMINUPTOI:
3627 min = 0;
3628 max = GET2(ecode, 1);
3629 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3630 ecode += 1 + IMM2_SIZE;
3631 goto REPEATNOTCHAR;
3632
3633 case OP_NOTPOSSTAR:
3634 case OP_NOTPOSSTARI:
3635 possessive = TRUE;
3636 min = 0;
3637 max = INT_MAX;
3638 ecode++;
3639 goto REPEATNOTCHAR;
3640
3641 case OP_NOTPOSPLUS:
3642 case OP_NOTPOSPLUSI:
3643 possessive = TRUE;
3644 min = 1;
3645 max = INT_MAX;
3646 ecode++;
3647 goto REPEATNOTCHAR;
3648
3649 case OP_NOTPOSQUERY:
3650 case OP_NOTPOSQUERYI:
3651 possessive = TRUE;
3652 min = 0;
3653 max = 1;
3654 ecode++;
3655 goto REPEATNOTCHAR;
3656
3657 case OP_NOTPOSUPTO:
3658 case OP_NOTPOSUPTOI:
3659 possessive = TRUE;
3660 min = 0;
3661 max = GET2(ecode, 1);
3662 ecode += 1 + IMM2_SIZE;
3663 goto REPEATNOTCHAR;
3664
3665 case OP_NOTSTAR:
3666 case OP_NOTSTARI:
3667 case OP_NOTMINSTAR:
3668 case OP_NOTMINSTARI:
3669 case OP_NOTPLUS:
3670 case OP_NOTPLUSI:
3671 case OP_NOTMINPLUS:
3672 case OP_NOTMINPLUSI:
3673 case OP_NOTQUERY:
3674 case OP_NOTQUERYI:
3675 case OP_NOTMINQUERY:
3676 case OP_NOTMINQUERYI:
3677 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3678 minimize = (c & 1) != 0;
3679 min = rep_min[c]; /* Pick up values from tables; */
3680 max = rep_max[c]; /* zero for max => infinity */
3681 if (max == 0) max = INT_MAX;
3682
3683 /* Common code for all repeated negated single-character matches. */
3684
3685 REPEATNOTCHAR:
3686 GETCHARINCTEST(fc, ecode);
3687
3688 /* The code is duplicated for the caseless and caseful cases, for speed,
3689 since matching characters is likely to be quite common. First, ensure the
3690 minimum number of matches are present. If min = max, continue at the same
3691 level without recursing. Otherwise, if minimizing, keep trying the rest of
3692 the expression and advancing one matching character if failing, up to the
3693 maximum. Alternatively, if maximizing, find the maximum number of
3694 characters and work backwards. */
3695
3696 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3697 max, (char *)eptr));
3698
3699 if (op >= OP_NOTSTARI) /* Caseless */
3700 {
3701 #ifdef SUPPORT_UTF
3702 #ifdef SUPPORT_UCP
3703 if (utf && fc > 127)
3704 foc = UCD_OTHERCASE(fc);
3705 #else
3706 if (utf && fc > 127)
3707 foc = fc;
3708 #endif /* SUPPORT_UCP */
3709 else
3710 #endif /* SUPPORT_UTF */
3711 foc = TABLE_GET(fc, md->fcc, fc);
3712
3713 #ifdef SUPPORT_UTF
3714 if (utf)
3715 {
3716 register pcre_uint32 d;
3717 for (i = 1; i <= min; i++)
3718 {
3719 if (eptr >= md->end_subject)
3720 {
3721 SCHECK_PARTIAL();
3722 RRETURN(MATCH_NOMATCH);
3723 }
3724 GETCHARINC(d, eptr);
3725 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3726 }
3727 }
3728 else
3729 #endif
3730 /* Not UTF mode */
3731 {
3732 for (i = 1; i <= min; i++)
3733 {
3734 if (eptr >= md->end_subject)
3735 {
3736 SCHECK_PARTIAL();
3737 RRETURN(MATCH_NOMATCH);
3738 }
3739 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3740 eptr++;
3741 }
3742 }
3743
3744 if (min == max) continue;
3745
3746 if (minimize)
3747 {
3748 #ifdef SUPPORT_UTF
3749 if (utf)
3750 {
3751 register pcre_uint32 d;
3752 for (fi = min;; fi++)
3753 {
3754 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3756 if (fi >= max) RRETURN(MATCH_NOMATCH);
3757 if (eptr >= md->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 RRETURN(MATCH_NOMATCH);
3761 }
3762 GETCHARINC(d, eptr);
3763 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3764 }
3765 }
3766 else
3767 #endif
3768 /* Not UTF mode */
3769 {
3770 for (fi = min;; fi++)
3771 {
3772 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3774 if (fi >= max) RRETURN(MATCH_NOMATCH);
3775 if (eptr >= md->end_subject)
3776 {
3777 SCHECK_PARTIAL();
3778 RRETURN(MATCH_NOMATCH);
3779 }
3780 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3781 eptr++;
3782 }
3783 }
3784 /* Control never gets here */
3785 }
3786
3787 /* Maximize case */
3788
3789 else
3790 {
3791 pp = eptr;
3792
3793 #ifdef SUPPORT_UTF
3794 if (utf)
3795 {
3796 register pcre_uint32 d;
3797 for (i = min; i < max; i++)
3798 {
3799 int len = 1;
3800 if (eptr >= md->end_subject)
3801 {
3802 SCHECK_PARTIAL();
3803 break;
3804 }
3805 GETCHARLEN(d, eptr, len);
3806 if (fc == d || (unsigned int)foc == d) break;
3807 eptr += len;
3808 }
3809 if (possessive) continue;
3810 for(;;)
3811 {
3812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3814 if (eptr-- == pp) break; /* Stop if tried at original pos */
3815 BACKCHAR(eptr);
3816 }
3817 }
3818 else
3819 #endif
3820 /* Not UTF mode */
3821 {
3822 for (i = min; i < max; i++)
3823 {
3824 if (eptr >= md->end_subject)
3825 {
3826 SCHECK_PARTIAL();
3827 break;
3828 }
3829 if (fc == *eptr || foc == *eptr) break;
3830 eptr++;
3831 }
3832 if (possessive) continue;
3833 while (eptr >= pp)
3834 {
3835 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3837 eptr--;
3838 }
3839 }
3840
3841 RRETURN(MATCH_NOMATCH);
3842 }
3843 /* Control never gets here */
3844 }
3845
3846 /* Caseful comparisons */
3847
3848 else
3849 {
3850 #ifdef SUPPORT_UTF
3851 if (utf)
3852 {
3853 register pcre_uint32 d;
3854 for (i = 1; i <= min; i++)
3855 {
3856 if (eptr >= md->end_subject)
3857 {
3858 SCHECK_PARTIAL();
3859 RRETURN(MATCH_NOMATCH);
3860 }
3861 GETCHARINC(d, eptr);
3862 if (fc == d) RRETURN(MATCH_NOMATCH);
3863 }
3864 }
3865 else
3866 #endif
3867 /* Not UTF mode */
3868 {
3869 for (i = 1; i <= min; i++)
3870 {
3871 if (eptr >= md->end_subject)
3872 {
3873 SCHECK_PARTIAL();
3874 RRETURN(MATCH_NOMATCH);
3875 }
3876 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3877 }
3878 }
3879
3880 if (min == max) continue;
3881
3882 if (minimize)
3883 {
3884 #ifdef SUPPORT_UTF
3885 if (utf)
3886 {
3887 register pcre_uint32 d;
3888 for (fi = min;; fi++)
3889 {
3890 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3892 if (fi >= max) RRETURN(MATCH_NOMATCH);
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 RRETURN(MATCH_NOMATCH);
3897 }
3898 GETCHARINC(d, eptr);
3899 if (fc == d) RRETURN(MATCH_NOMATCH);
3900 }
3901 }
3902 else
3903 #endif
3904 /* Not UTF mode */
3905 {
3906 for (fi = min;; fi++)
3907 {
3908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3910 if (fi >= max) RRETURN(MATCH_NOMATCH);
3911 if (eptr >= md->end_subject)
3912 {
3913 SCHECK_PARTIAL();
3914 RRETURN(MATCH_NOMATCH);
3915 }
3916 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3917 }
3918 }
3919 /* Control never gets here */
3920 }
3921
3922 /* Maximize case */
3923
3924 else
3925 {
3926 pp = eptr;
3927
3928 #ifdef SUPPORT_UTF
3929 if (utf)
3930 {
3931 register pcre_uint32 d;
3932 for (i = min; i < max; i++)
3933 {
3934 int len = 1;
3935 if (eptr >= md->end_subject)
3936 {
3937 SCHECK_PARTIAL();
3938 break;
3939 }
3940 GETCHARLEN(d, eptr, len);
3941 if (fc == d) break;
3942 eptr += len;
3943 }
3944 if (possessive) continue;
3945 for(;;)
3946 {
3947 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3949 if (eptr-- == pp) break; /* Stop if tried at original pos */
3950 BACKCHAR(eptr);
3951 }
3952 }
3953 else
3954 #endif
3955 /* Not UTF mode */
3956 {
3957 for (i = min; i < max; i++)
3958 {
3959 if (eptr >= md->end_subject)
3960 {
3961 SCHECK_PARTIAL();
3962 break;
3963 }
3964 if (fc == *eptr) break;
3965 eptr++;
3966 }
3967 if (possessive) continue;
3968 while (eptr >= pp)
3969 {
3970 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3972 eptr--;
3973 }
3974 }
3975
3976 RRETURN(MATCH_NOMATCH);
3977 }
3978 }
3979 /* Control never gets here */
3980
3981 /* Match a single character type repeatedly; several different opcodes
3982 share code. This is very similar to the code for single characters, but we
3983 repeat it in the interests of efficiency. */
3984
3985 case OP_TYPEEXACT:
3986 min = max = GET2(ecode, 1);
3987 minimize = TRUE;
3988 ecode += 1 + IMM2_SIZE;
3989 goto REPEATTYPE;
3990
3991 case OP_TYPEUPTO:
3992 case OP_TYPEMINUPTO:
3993 min = 0;
3994 max = GET2(ecode, 1);
3995 minimize = *ecode == OP_TYPEMINUPTO;
3996 ecode += 1 + IMM2_SIZE;
3997 goto REPEATTYPE;
3998
3999 case OP_TYPEPOSSTAR:
4000 possessive = TRUE;
4001 min = 0;
4002 max = INT_MAX;
4003 ecode++;
4004 goto REPEATTYPE;
4005
4006 case OP_TYPEPOSPLUS:
4007 possessive = TRUE;
4008 min = 1;
4009 max = INT_MAX;
4010 ecode++;
4011 goto REPEATTYPE;
4012
4013 case OP_TYPEPOSQUERY:
4014 possessive = TRUE;
4015 min = 0;
4016 max = 1;
4017 ecode++;
4018 goto REPEATTYPE;
4019
4020 case OP_TYPEPOSUPTO:
4021 possessive = TRUE;
4022 min = 0;
4023 max = GET2(ecode, 1);
4024 ecode += 1 + IMM2_SIZE;
4025 goto REPEATTYPE;
4026
4027 case OP_TYPESTAR:
4028 case OP_TYPEMINSTAR:
4029 case OP_TYPEPLUS:
4030 case OP_TYPEMINPLUS:
4031 case OP_TYPEQUERY:
4032 case OP_TYPEMINQUERY:
4033 c = *ecode++ - OP_TYPESTAR;
4034 minimize = (c & 1) != 0;
4035 min = rep_min[c]; /* Pick up values from tables; */
4036 max = rep_max[c]; /* zero for max => infinity */
4037 if (max == 0) max = INT_MAX;
4038
4039 /* Common code for all repeated single character type matches. Note that
4040 in UTF-8 mode, '.' matches a character of any length, but for the other
4041 character types, the valid characters are all one-byte long. */
4042
4043 REPEATTYPE:
4044 ctype = *ecode++; /* Code for the character type */
4045
4046 #ifdef SUPPORT_UCP
4047 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4048 {
4049 prop_fail_result = ctype == OP_NOTPROP;
4050 prop_type = *ecode++;
4051 prop_value = *ecode++;
4052 }
4053 else prop_type = -1;
4054 #endif
4055
4056 /* First, ensure the minimum number of matches are present. Use inline
4057 code for maximizing the speed, and do the type test once at the start
4058 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4059 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4060 and single-bytes. */
4061
4062 if (min > 0)
4063 {
4064 #ifdef SUPPORT_UCP
4065 if (prop_type >= 0)
4066 {
4067 switch(prop_type)
4068 {
4069 case PT_ANY:
4070 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4071 for (i = 1; i <= min; i++)
4072 {
4073 if (eptr >= md->end_subject)
4074 {
4075 SCHECK_PARTIAL();
4076 RRETURN(MATCH_NOMATCH);
4077 }
4078 GETCHARINCTEST(c, eptr);
4079 }
4080 break;
4081
4082 case PT_LAMP:
4083 for (i = 1; i <= min; i++)
4084 {
4085 int chartype;
4086 if (eptr >= md->end_subject)
4087 {
4088 SCHECK_PARTIAL();
4089 RRETURN(MATCH_NOMATCH);
4090 }
4091 GETCHARINCTEST(c, eptr);
4092 chartype = UCD_CHARTYPE(c);
4093 if ((chartype == ucp_Lu ||
4094 chartype == ucp_Ll ||
4095 chartype == ucp_Lt) == prop_fail_result)
4096 RRETURN(MATCH_NOMATCH);
4097 }
4098 break;
4099
4100 case PT_GC:
4101 for (i = 1; i <= min; i++)
4102 {
4103 if (eptr >= md->end_subject)
4104 {
4105 SCHECK_PARTIAL();
4106 RRETURN(MATCH_NOMATCH);
4107 }
4108 GETCHARINCTEST(c, eptr);
4109 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4110 RRETURN(MATCH_NOMATCH);
4111 }
4112 break;
4113
4114 case PT_PC:
4115 for (i = 1; i <= min; i++)
4116 {
4117 if (eptr >= md->end_subject)
4118 {
4119 SCHECK_PARTIAL();
4120 RRETURN(MATCH_NOMATCH);
4121 }
4122 GETCHARINCTEST(c, eptr);
4123 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4124 RRETURN(MATCH_NOMATCH);
4125 }
4126 break;
4127
4128 case PT_SC:
4129 for (i = 1; i <= min; i++)
4130 {
4131 if (eptr >= md->end_subject)
4132 {
4133 SCHECK_PARTIAL();
4134 RRETURN(MATCH_NOMATCH);
4135 }
4136 GETCHARINCTEST(c, eptr);
4137 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4138 RRETURN(MATCH_NOMATCH);
4139 }
4140 break;
4141
4142 case PT_ALNUM:
4143 for (i = 1; i <= min; i++)
4144 {
4145 int category;
4146 if (eptr >= md->end_subject)
4147 {
4148 SCHECK_PARTIAL();
4149 RRETURN(MATCH_NOMATCH);
4150 }
4151 GETCHARINCTEST(c, eptr);
4152 category = UCD_CATEGORY(c);
4153 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4154 RRETURN(MATCH_NOMATCH);
4155 }
4156 break;
4157
4158 case PT_SPACE: /* Perl space */
4159 for (i = 1; i <= min; i++)
4160 {
4161 if (eptr >= md->end_subject)
4162 {
4163 SCHECK_PARTIAL();
4164 RRETURN(MATCH_NOMATCH);
4165 }
4166 GETCHARINCTEST(c, eptr);
4167 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4168 c == CHAR_FF || c == CHAR_CR)
4169 == prop_fail_result)
4170 RRETURN(MATCH_NOMATCH);
4171 }
4172 break;
4173
4174 case PT_PXSPACE: /* POSIX space */
4175 for (i = 1; i <= min; i++)
4176 {
4177 if (eptr >= md->end_subject)
4178 {
4179 SCHECK_PARTIAL();
4180 RRETURN(MATCH_NOMATCH);
4181 }
4182 GETCHARINCTEST(c, eptr);
4183 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4184 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4185 == prop_fail_result)
4186 RRETURN(MATCH_NOMATCH);
4187 }
4188 break;
4189
4190 case PT_WORD:
4191 for (i = 1; i <= min; i++)
4192 {
4193 int category;
4194 if (eptr >= md->end_subject)
4195 {
4196 SCHECK_PARTIAL();
4197 RRETURN(MATCH_NOMATCH);
4198 }
4199 GETCHARINCTEST(c, eptr);
4200 category = UCD_CATEGORY(c);
4201 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4202 == prop_fail_result)
4203 RRETURN(MATCH_NOMATCH);
4204 }
4205 break;
4206
4207 case PT_CLIST:
4208 for (i = 1; i <= min; i++)
4209 {
4210 const pcre_uint32 *cp;
4211 if (eptr >= md->end_subject)
4212 {
4213 SCHECK_PARTIAL();
4214 RRETURN(MATCH_NOMATCH);
4215 }
4216 GETCHARINCTEST(c, eptr);
4217 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
4218 for (;;)
4219 {
4220 if (c < *cp)
4221 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4222 if (c == *cp++)
4223 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4224 }
4225 }
4226 break;
4227
4228 /* This should not occur */
4229
4230 default:
4231 RRETURN(PCRE_ERROR_INTERNAL);
4232 }
4233 }
4234
4235 /* Match extended Unicode sequences. We will get here only if the
4236 support is in the binary; otherwise a compile-time error occurs. */
4237
4238 else if (ctype == OP_EXTUNI)
4239 {
4240 for (i = 1; i <= min; i++)
4241 {
4242 if (eptr >= md->end_subject)
4243 {
4244 SCHECK_PARTIAL();
4245 RRETURN(MATCH_NOMATCH);
4246 }
4247 else
4248 {
4249 int lgb, rgb;
4250 GETCHARINCTEST(c, eptr);
4251 lgb = UCD_GRAPHBREAK(c);
4252 while (eptr < md->end_subject)
4253 {
4254 int len = 1;
4255 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4256 rgb = UCD_GRAPHBREAK(c);
4257 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4258 lgb = rgb;
4259 eptr += len;
4260 }
4261 }
4262 CHECK_PARTIAL();
4263 }
4264 }
4265
4266 else
4267 #endif /* SUPPORT_UCP */
4268
4269 /* Handle all other cases when the coding is UTF-8 */
4270
4271 #ifdef SUPPORT_UTF
4272 if (utf) switch(ctype)
4273 {
4274 case OP_ANY:
4275 for (i = 1; i <= min; i++)
4276 {
4277 if (eptr >= md->end_subject)
4278 {
4279 SCHECK_PARTIAL();
4280 RRETURN(MATCH_NOMATCH);
4281 }
4282 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4283 if (md->partial != 0 &&
4284 eptr + 1 >= md->end_subject &&
4285 NLBLOCK->nltype == NLTYPE_FIXED &&
4286 NLBLOCK->nllen == 2 &&
4287 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4288 {
4289 md->hitend = TRUE;
4290 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4291 }
4292 eptr++;
4293 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4294 }
4295 break;
4296
4297 case OP_ALLANY:
4298 for (i = 1; i <= min; i++)
4299 {
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 RRETURN(MATCH_NOMATCH);
4304 }
4305 eptr++;
4306 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4307 }
4308 break;
4309
4310 case OP_ANYBYTE:
4311 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4312 eptr += min;
4313 break;
4314
4315 case OP_ANYNL:
4316 for (i = 1; i <= min; i++)
4317 {
4318 if (eptr >= md->end_subject)
4319 {
4320 SCHECK_PARTIAL();
4321 RRETURN(MATCH_NOMATCH);
4322 }
4323 GETCHARINC(c, eptr);
4324 switch(c)
4325 {
4326 default: RRETURN(MATCH_NOMATCH);
4327
4328 case CHAR_CR:
4329 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4330 break;
4331
4332 case CHAR_LF:
4333 break;
4334
4335 case CHAR_VT:
4336 case CHAR_FF:
4337 case CHAR_NEL:
4338 #ifndef EBCDIC
4339 case 0x2028:
4340 case 0x2029:
4341 #endif /* Not EBCDIC */
4342 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4343 break;
4344 }
4345 }
4346 break;
4347
4348 case OP_NOT_HSPACE:
4349 for (i = 1; i <= min; i++)
4350 {
4351 if (eptr >= md->end_subject)
4352 {
4353 SCHECK_PARTIAL();
4354 RRETURN(MATCH_NOMATCH);
4355 }
4356 GETCHARINC(c, eptr);
4357 switch(c)
4358 {
4359 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4360 default: break;
4361 }
4362 }
4363 break;
4364
4365 case OP_HSPACE:
4366 for (i = 1; i <= min; i++)
4367 {
4368 if (eptr >= md->end_subject)
4369 {
4370 SCHECK_PARTIAL();
4371 RRETURN(MATCH_NOMATCH);
4372 }
4373 GETCHARINC(c, eptr);
4374 switch(c)
4375 {
4376 HSPACE_CASES: break; /* Byte and multibyte cases */
4377 default: RRETURN(MATCH_NOMATCH);
4378 }
4379 }
4380 break;
4381
4382 case OP_NOT_VSPACE:
4383 for (i = 1; i <= min; i++)
4384 {
4385 if (eptr >= md->end_subject)
4386 {
4387 SCHECK_PARTIAL();
4388 RRETURN(MATCH_NOMATCH);
4389 }
4390 GETCHARINC(c, eptr);
4391 switch(c)
4392 {
4393 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4394 default: break;
4395 }
4396 }
4397 break;
4398
4399 case OP_VSPACE:
4400 for (i = 1; i <= min; i++)
4401 {
4402 if (eptr >= md->end_subject)
4403 {
4404 SCHECK_PARTIAL();
4405 RRETURN(MATCH_NOMATCH);
4406 }
4407 GETCHARINC(c, eptr);
4408 switch(c)
4409 {
4410 VSPACE_CASES: break;
4411 default: RRETURN(MATCH_NOMATCH);
4412 }
4413 }
4414 break;
4415
4416 case OP_NOT_DIGIT:
4417 for (i = 1; i <= min; i++)
4418 {
4419 if (eptr >= md->end_subject)
4420 {
4421 SCHECK_PARTIAL();
4422 RRETURN(MATCH_NOMATCH);
4423 }
4424 GETCHARINC(c, eptr);
4425 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4426 RRETURN(MATCH_NOMATCH);
4427 }
4428 break;
4429
4430 case OP_DIGIT:
4431 for (i = 1; i <= min; i++)
4432 {
4433 pcre_uchar cc;
4434
4435 if (eptr >= md->end_subject)
4436 {
4437 SCHECK_PARTIAL();
4438 RRETURN(MATCH_NOMATCH);
4439 }
4440 cc = RAWUCHAR(eptr);
4441 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4442 RRETURN(MATCH_NOMATCH);
4443 eptr++;
4444 /* No need to skip more bytes - we know it's a 1-byte character */
4445 }
4446 break;
4447
4448 case OP_NOT_WHITESPACE:
4449 for (i = 1; i <= min; i++)
4450 {
4451 pcre_uchar cc;
4452
4453 if (eptr >= md->end_subject)
4454 {
4455 SCHECK_PARTIAL();
4456 RRETURN(MATCH_NOMATCH);
4457 }
4458 cc = RAWUCHAR(eptr);
4459 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4460 RRETURN(MATCH_NOMATCH);
4461 eptr++;
4462 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4463 }
4464 break;
4465
4466 case OP_WHITESPACE:
4467 for (i = 1; i <= min; i++)
4468 {
4469 pcre_uchar cc;
4470
4471 if (eptr >= md->end_subject)
4472 {
4473 SCHECK_PARTIAL();
4474 RRETURN(MATCH_NOMATCH);
4475 }
4476 cc = RAWUCHAR(eptr);
4477 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4478 RRETURN(MATCH_NOMATCH);
4479 eptr++;
4480 /* No need to skip more bytes - we know it's a 1-byte character */
4481 }
4482 break;
4483
4484 case OP_NOT_WORDCHAR:
4485 for (i = 1; i <= min; i++)
4486 {
4487 pcre_uchar cc;
4488
4489 if (eptr >= md->end_subject)
4490 {
4491 SCHECK_PARTIAL();
4492 RRETURN(MATCH_NOMATCH);
4493 }
4494 cc = RAWUCHAR(eptr);
4495 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4496 RRETURN(MATCH_NOMATCH);
4497 eptr++;
4498 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4499 }
4500 break;
4501
4502 case OP_WORDCHAR:
4503 for (i = 1; i <= min; i++)
4504 {
4505 pcre_uchar cc;
4506
4507 if (eptr >= md->end_subject)
4508 {
4509 SCHECK_PARTIAL();
4510 RRETURN(MATCH_NOMATCH);
4511 }
4512 cc = RAWUCHAR(eptr);
4513 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4514 RRETURN(MATCH_NOMATCH);
4515 eptr++;
4516 /* No need to skip more bytes - we know it's a 1-byte character */
4517 }
4518 break;
4519
4520 default:
4521 RRETURN(PCRE_ERROR_INTERNAL);
4522 } /* End switch(ctype) */
4523
4524 else
4525 #endif /* SUPPORT_UTF */
4526
4527 /* Code for the non-UTF-8 case for minimum matching of operators other
4528 than OP_PROP and OP_NOTPROP. */
4529
4530 switch(ctype)
4531 {
4532 case OP_ANY:
4533 for (i = 1; i <= min; i++)
4534 {
4535 if (eptr >= md->end_subject)
4536 {
4537 SCHECK_PARTIAL();
4538 RRETURN(MATCH_NOMATCH);
4539 }
4540 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4541 if (md->partial != 0 &&
4542 eptr + 1 >= md->end_subject &&
4543 NLBLOCK->nltype == NLTYPE_FIXED &&
4544 NLBLOCK->nllen == 2 &&
4545 *eptr == NLBLOCK->nl[0])
4546 {
4547 md->hitend = TRUE;
4548 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4549 }
4550 eptr++;
4551 }
4552 break;
4553
4554 case OP_ALLANY:
4555 if (eptr > md->end_subject - min)
4556 {
4557 SCHECK_PARTIAL();
4558 RRETURN(MATCH_NOMATCH);
4559 }
4560 eptr += min;
4561 break;
4562
4563 case OP_ANYBYTE:
4564 if (eptr > md->end_subject - min)
4565 {
4566 SCHECK_PARTIAL();
4567 RRETURN(MATCH_NOMATCH);
4568 }
4569 eptr += min;
4570 break;
4571
4572 case OP_ANYNL:
4573 for (i = 1; i <= min; i++)
4574 {
4575 if (eptr >= md->end_subject)
4576 {
4577 SCHECK_PARTIAL();
4578 RRETURN(MATCH_NOMATCH);
4579 }
4580 switch(*eptr++)
4581 {
4582 default: RRETURN(MATCH_NOMATCH);
4583
4584 case CHAR_CR:
4585 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4586 break;
4587
4588 case CHAR_LF:
4589 break;
4590
4591 case CHAR_VT:
4592 case CHAR_FF:
4593 case CHAR_NEL:
4594 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4595 case 0x2028:
4596 case 0x2029:
4597 #endif
4598 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4599 break;
4600 }
4601 }
4602 break;
4603
4604 case OP_NOT_HSPACE:
4605 for (i = 1; i <= min; i++)
4606 {
4607 if (eptr >= md->end_subject)
4608 {
4609 SCHECK_PARTIAL();
4610 RRETURN(MATCH_NOMATCH);
4611 }
4612 switch(*eptr++)
4613 {
4614 default: break;
4615 HSPACE_BYTE_CASES:
4616 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4617 HSPACE_MULTIBYTE_CASES:
4618 #endif
4619 RRETURN(MATCH_NOMATCH);
4620 }
4621 }
4622 break;
4623
4624 case OP_HSPACE:
4625 for (i = 1; i <= min; i++)
4626 {
4627 if (eptr >= md->end_subject)
4628 {
4629 SCHECK_PARTIAL();
4630 RRETURN(MATCH_NOMATCH);
4631 }
4632 switch(*eptr++)
4633 {
4634 default: RRETURN(MATCH_NOMATCH);
4635 HSPACE_BYTE_CASES:
4636 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4637 HSPACE_MULTIBYTE_CASES:
4638 #endif
4639 break;
4640 }
4641 }
4642 break;
4643
4644 case OP_NOT_VSPACE:
4645 for (i = 1; i <= min; i++)
4646 {
4647 if (eptr >= md->end_subject)
4648 {
4649 SCHECK_PARTIAL();
4650 RRETURN(MATCH_NOMATCH);
4651 }
4652 switch(*eptr++)
4653 {
4654 VSPACE_BYTE_CASES:
4655 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4656 VSPACE_MULTIBYTE_CASES:
4657 #endif
4658 RRETURN(MATCH_NOMATCH);
4659 default: break;
4660 }
4661 }
4662 break;
4663
4664 case OP_VSPACE:
4665 for (i = 1; i <= min; i++)
4666 {
4667 if (eptr >= md->end_subject)
4668 {
4669 SCHECK_PARTIAL();
4670 RRETURN(MATCH_NOMATCH);
4671 }
4672 switch(*eptr++)
4673 {
4674 default: RRETURN(MATCH_NOMATCH);
4675 VSPACE_BYTE_CASES:
4676 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4677 VSPACE_MULTIBYTE_CASES:
4678 #endif
4679 break;
4680 }
4681 }
4682 break;
4683
4684 case OP_NOT_DIGIT:
4685 for (i = 1; i <= min; i++)
4686 {
4687 if (eptr >= md->end_subject)
4688 {
4689 SCHECK_PARTIAL();
4690 RRETURN(MATCH_NOMATCH);
4691 }
4692 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4693 RRETURN(MATCH_NOMATCH);
4694 eptr++;
4695 }
4696 break;
4697
4698 case OP_DIGIT:
4699 for (i = 1; i <= min; i++)
4700 {
4701 if (eptr >= md->end_subject)
4702 {
4703 SCHECK_PARTIAL();
4704 RRETURN(MATCH_NOMATCH);
4705 }
4706 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4707 RRETURN(MATCH_NOMATCH);
4708 eptr++;
4709 }
4710 break;
4711
4712 case OP_NOT_WHITESPACE:
4713 for (i = 1; i <= min; i++)
4714 {
4715 if (eptr >= md->end_subject)
4716 {
4717 SCHECK_PARTIAL();
4718 RRETURN(MATCH_NOMATCH);
4719 }
4720 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4721 RRETURN(MATCH_NOMATCH);
4722 eptr++;
4723 }
4724 break;
4725
4726 case OP_WHITESPACE:
4727 for (i = 1; i <= min; i++)
4728 {
4729 if (eptr >= md->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4733 }
4734 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4735 RRETURN(MATCH_NOMATCH);
4736 eptr++;
4737 }
4738 break;
4739
4740 case OP_NOT_WORDCHAR:
4741 for (i = 1; i <= min; i++)
4742 {
4743 if (eptr >= md->end_subject)
4744 {
4745 SCHECK_PARTIAL();
4746 RRETURN(MATCH_NOMATCH);
4747 }
4748 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4749 RRETURN(MATCH_NOMATCH);
4750 eptr++;
4751 }
4752 break;
4753
4754 case OP_WORDCHAR:
4755 for (i = 1; i <= min; i++)
4756 {
4757 if (eptr >= md->end_subject)
4758 {
4759 SCHECK_PARTIAL();
4760 RRETURN(MATCH_NOMATCH);
4761 }
4762 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4763 RRETURN(MATCH_NOMATCH);
4764 eptr++;
4765 }
4766 break;
4767
4768 default:
4769 RRETURN(PCRE_ERROR_INTERNAL);
4770 }
4771 }
4772
4773 /* If min = max, continue at the same level without recursing */
4774
4775 if (min == max) continue;
4776
4777 /* If minimizing, we have to test the rest of the pattern before each
4778 subsequent match. Again, separate the UTF-8 case for speed, and also
4779 separate the UCP cases. */
4780
4781 if (minimize)
4782 {
4783 #ifdef SUPPORT_UCP
4784 if (prop_type >= 0)
4785 {
4786 switch(prop_type)
4787 {
4788 case PT_ANY:
4789 for (fi = min;; fi++)
4790 {
4791 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4793 if (fi >= max) RRETURN(MATCH_NOMATCH);
4794 if (eptr >= md->end_subject)
4795 {
4796 SCHECK_PARTIAL();
4797 RRETURN(MATCH_NOMATCH);
4798 }
4799 GETCHARINCTEST(c, eptr);
4800 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4801 }
4802 /* Control never gets here */
4803
4804 case PT_LAMP:
4805 for (fi = min;; fi++)
4806 {
4807 int chartype;
4808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4810 if (fi >= max) RRETURN(MATCH_NOMATCH);
4811 if (eptr >= md->end_subject)
4812 {
4813 SCHECK_PARTIAL();
4814 RRETURN(MATCH_NOMATCH);
4815 }
4816 GETCHARINCTEST(c, eptr);
4817 chartype = UCD_CHARTYPE(c);
4818 if ((chartype == ucp_Lu ||
4819 chartype == ucp_Ll ||
4820 chartype == ucp_Lt) == prop_fail_result)
4821 RRETURN(MATCH_NOMATCH);
4822 }
4823 /* Control never gets here */
4824
4825 case PT_GC:
4826 for (fi = min;; fi++)
4827 {
4828 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4830 if (fi >= max) RRETURN(MATCH_NOMATCH);
4831 if (eptr >= md->end_subject)
4832 {
4833 SCHECK_PARTIAL();
4834 RRETURN(MATCH_NOMATCH);
4835 }
4836 GETCHARINCTEST(c, eptr);
4837 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4838 RRETURN(MATCH_NOMATCH);
4839 }
4840 /* Control never gets here */
4841
4842 case PT_PC:
4843 for (fi = min;; fi++)
4844 {
4845 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4847 if (fi >= max) RRETURN(MATCH_NOMATCH);
4848 if (eptr >= md->end_subject)
4849 {
4850 SCHECK_PARTIAL();
4851 RRETURN(MATCH_NOMATCH);
4852 }
4853 GETCHARINCTEST(c, eptr);
4854 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4855 RRETURN(MATCH_NOMATCH);
4856 }
4857 /* Control never gets here */
4858
4859 case PT_SC:
4860 for (fi = min;; fi++)
4861 {
4862 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4864 if (fi >= max) RRETURN(MATCH_NOMATCH);
4865 if (eptr >= md->end_subject)
4866 {
4867 SCHECK_PARTIAL();
4868 RRETURN(MATCH_NOMATCH);
4869 }
4870 GETCHARINCTEST(c, eptr);
4871 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4872 RRETURN(MATCH_NOMATCH);
4873 }
4874 /* Control never gets here */
4875
4876 case PT_ALNUM:
4877 for (fi = min;; fi++)
4878 {
4879 int category;
4880 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4882 if (fi >= max) RRETURN(MATCH_NOMATCH);
4883 if (eptr >= md->end_subject)
4884 {
4885 SCHECK_PARTIAL();
4886 RRETURN(MATCH_NOMATCH);
4887 }
4888 GETCHARINCTEST(c, eptr);
4889 category = UCD_CATEGORY(c);
4890 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4891 RRETURN(MATCH_NOMATCH);
4892 }
4893 /* Control never gets here */
4894
4895 case PT_SPACE: /* Perl space */
4896 for (fi = min;; fi++)
4897 {
4898 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4899 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4900 if (fi >= max) RRETURN(MATCH_NOMATCH);
4901 if (eptr >= md->end_subject)
4902 {
4903 SCHECK_PARTIAL();
4904 RRETURN(MATCH_NOMATCH);
4905 }
4906 GETCHARINCTEST(c, eptr);
4907 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4908 c == CHAR_FF || c == CHAR_CR)
4909 == prop_fail_result)
4910 RRETURN(MATCH_NOMATCH);
4911 }
4912 /* Control never gets here */
4913
4914 case PT_PXSPACE: /* POSIX space */
4915 for (fi = min;; fi++)
4916 {
4917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919 if (fi >= max) RRETURN(MATCH_NOMATCH);
4920 if (eptr >= md->end_subject)
4921 {
4922 SCHECK_PARTIAL();
4923 RRETURN(MATCH_NOMATCH);
4924 }
4925 GETCHARINCTEST(c, eptr);
4926 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4927 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4928 == prop_fail_result)
4929 RRETURN(MATCH_NOMATCH);
4930 }
4931 /* Control never gets here */
4932
4933 case PT_WORD:
4934 for (fi = min;; fi++)
4935 {
4936 int category;
4937 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4939 if (fi >= max) RRETURN(MATCH_NOMATCH);
4940 if (eptr >= md->end_subject)
4941 {
4942 SCHECK_PARTIAL();
4943 RRETURN(MATCH_NOMATCH);
4944 }
4945 GETCHARINCTEST(c, eptr);
4946 category = UCD_CATEGORY(c);
4947 if ((category == ucp_L ||
4948 category == ucp_N ||
4949 c == CHAR_UNDERSCORE)
4950 == prop_fail_result)
4951 RRETURN(MATCH_NOMATCH);
4952 }
4953 /* Control never gets here */
4954
4955 case PT_CLIST:
4956 for (fi = min;; fi++)
4957 {
4958 const pcre_uint32 *cp;
4959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
4960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4961 if (fi >= max) RRETURN(MATCH_NOMATCH);
4962 if (eptr >= md->end_subject)
4963 {
4964 SCHECK_PARTIAL();
4965 RRETURN(MATCH_NOMATCH);
4966 }
4967 GETCHARINCTEST(c, eptr);
4968 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
4969 for (;;)
4970 {
4971 if (c < *cp)
4972 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4973 if (c == *cp++)
4974 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4975 }
4976 }
4977 /* Control never gets here */
4978
4979 /* This should never occur */
4980 default:
4981 RRETURN(PCRE_ERROR_INTERNAL);
4982 }
4983 }
4984
4985 /* Match extended Unicode sequences. We will get here only if the
4986 support is in the binary; otherwise a compile-time error occurs. */
4987
4988 else if (ctype == OP_EXTUNI)
4989 {
4990 for (fi = min;; fi++)
4991 {
4992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4994 if (fi >= max) RRETURN(MATCH_NOMATCH);
4995 if (eptr >= md->end_subject)
4996 {
4997 SCHECK_PARTIAL();
4998 RRETURN(MATCH_NOMATCH);
4999 }
5000 else
5001 {
5002 int lgb, rgb;
5003 GETCHARINCTEST(c, eptr);
5004 lgb = UCD_GRAPHBREAK(c);
5005 while (eptr < md->end_subject)
5006 {
5007 int len = 1;
5008 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5009 rgb = UCD_GRAPHBREAK(c);
5010 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5011 lgb = rgb;
5012 eptr += len;
5013 }
5014 }
5015 CHECK_PARTIAL();
5016 }
5017 }
5018 else
5019 #endif /* SUPPORT_UCP */
5020
5021 #ifdef SUPPORT_UTF
5022 if (utf)
5023 {
5024 for (fi = min;; fi++)
5025 {
5026 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5027 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5028 if (fi >= max) RRETURN(MATCH_NOMATCH);
5029 if (eptr >= md->end_subject)
5030 {
5031 SCHECK_PARTIAL();
5032 RRETURN(MATCH_NOMATCH);
5033 }
5034 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5035 RRETURN(MATCH_NOMATCH);
5036 GETCHARINC(c, eptr);
5037 switch(ctype)
5038 {
5039 case OP_ANY: /* This is the non-NL case */
5040 if (md->partial != 0 && /* Take care with CRLF partial */
5041 eptr >= md->end_subject &&
5042 NLBLOCK->nltype == NLTYPE_FIXED &&
5043 NLBLOCK->nllen == 2 &&
5044 c == NLBLOCK->nl[0])
5045 {
5046 md->hitend = TRUE;
5047 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5048 }
5049 break;
5050
5051 case OP_ALLANY:
5052 case OP_ANYBYTE:
5053 break;
5054
5055 case OP_ANYNL:
5056 switch(c)
5057 {
5058 default: RRETURN(MATCH_NOMATCH);
5059 case CHAR_CR:
5060 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5061 break;
5062
5063 case CHAR_LF:
5064 break;
5065
5066 case CHAR_VT:
5067 case CHAR_FF:
5068 case CHAR_NEL:
5069 #ifndef EBCDIC
5070 case 0x2028:
5071 case 0x2029:
5072 #endif /* Not EBCDIC */
5073 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5074 break;
5075 }
5076 break;
5077
5078 case OP_NOT_HSPACE:
5079 switch(c)
5080 {
5081 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5082 default: break;
5083 }
5084 break;
5085
5086 case OP_HSPACE:
5087 switch(c)
5088 {
5089 HSPACE_CASES: break;
5090 default: RRETURN(MATCH_NOMATCH);
5091 }
5092 break;
5093
5094 case OP_NOT_VSPACE:
5095 switch(c)
5096 {
5097 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5098 default: break;
5099 }
5100 break;
5101
5102 case OP_VSPACE:
5103 switch(c)
5104 {
5105 VSPACE_CASES: break;
5106 default: RRETURN(MATCH_NOMATCH);
5107 }
5108 break;
5109
5110 case OP_NOT_DIGIT:
5111 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5112 RRETURN(MATCH_NOMATCH);
5113 break;
5114
5115 case OP_DIGIT:
5116 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5117 RRETURN(MATCH_NOMATCH);
5118 break;
5119
5120 case OP_NOT_WHITESPACE:
5121 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5122 RRETURN(MATCH_NOMATCH);
5123 break;
5124
5125 case OP_WHITESPACE:
5126 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5127 RRETURN(MATCH_NOMATCH);
5128 break;
5129
5130 case OP_NOT_WORDCHAR:
5131 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5132 RRETURN(MATCH_NOMATCH);
5133 break;
5134
5135 case OP_WORDCHAR:
5136 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5137 RRETURN(MATCH_NOMATCH);
5138 break;
5139
5140 default:
5141 RRETURN(PCRE_ERROR_INTERNAL);
5142 }
5143 }
5144 }
5145 else
5146 #endif
5147 /* Not UTF mode */
5148 {
5149 for (fi = min;; fi++)
5150 {
5151 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5152 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5153 if (fi >= max) RRETURN(MATCH_NOMATCH);
5154 if (eptr >= md->end_subject)
5155 {
5156 SCHECK_PARTIAL();
5157 RRETURN(MATCH_NOMATCH);
5158 }
5159 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5160 RRETURN(MATCH_NOMATCH);
5161 c = *eptr++;
5162 switch(ctype)
5163 {
5164 case OP_ANY: /* This is the non-NL case */
5165 if (md->partial != 0 && /* Take care with CRLF partial */
5166 eptr >= md->end_subject &&
5167 NLBLOCK->nltype == NLTYPE_FIXED &&
5168 NLBLOCK->nllen == 2 &&
5169 c == NLBLOCK->nl[0])
5170 {
5171 md->hitend = TRUE;
5172 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5173 }
5174 break;
5175
5176 case OP_ALLANY:
5177 case OP_ANYBYTE:
5178 break;
5179
5180 case OP_ANYNL:
5181 switch(c)
5182 {
5183 default: RRETURN(MATCH_NOMATCH);
5184 case CHAR_CR:
5185 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5186 break;
5187
5188 case CHAR_LF:
5189 break;
5190
5191 case CHAR_VT:
5192 case CHAR_FF:
5193 case CHAR_NEL:
5194 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5195 case 0x2028:
5196 case 0x2029:
5197 #endif
5198 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5199 break;
5200 }
5201 break;
5202
5203 case OP_NOT_HSPACE:
5204 switch(c)
5205 {
5206 default: break;
5207 HSPACE_BYTE_CASES:
5208 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5209 HSPACE_MULTIBYTE_CASES:
5210 #endif
5211 RRETURN(MATCH_NOMATCH);
5212 }
5213 break;
5214
5215 case OP_HSPACE:
5216 switch(c)
5217 {
5218 default: RRETURN(MATCH_NOMATCH);
5219 HSPACE_BYTE_CASES:
5220 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5221 HSPACE_MULTIBYTE_CASES:
5222 #endif
5223 break;
5224 }
5225 break;
5226
5227 case OP_NOT_VSPACE:
5228 switch(c)
5229 {
5230 default: break;
5231 VSPACE_BYTE_CASES:
5232 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5233 VSPACE_MULTIBYTE_CASES:
5234 #endif
5235 RRETURN(MATCH_NOMATCH);
5236 }
5237 break;
5238
5239 case OP_VSPACE:
5240 switch(c)
5241 {
5242 default: RRETURN(MATCH_NOMATCH);
5243 VSPACE_BYTE_CASES:
5244 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5245 VSPACE_MULTIBYTE_CASES:
5246 #endif
5247 break;
5248 }
5249 break;
5250
5251 case OP_NOT_DIGIT:
5252 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5253 break;
5254
5255 case OP_DIGIT:
5256 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5257 break;
5258
5259 case OP_NOT_WHITESPACE:
5260 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5261 break;
5262
5263 case OP_WHITESPACE:
5264 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5265 break;
5266
5267 case OP_NOT_WORDCHAR:
5268 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5269 break;
5270
5271 case OP_WORDCHAR:
5272 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5273 break;
5274
5275 default:
5276 RRETURN(PCRE_ERROR_INTERNAL);
5277 }
5278 }
5279 }
5280 /* Control never gets here */
5281 }
5282
5283 /* If maximizing, it is worth using inline code for speed, doing the type
5284 test once at the start (i.e. keep it out of the loop). Again, keep the
5285 UTF-8 and UCP stuff separate. */
5286
5287 else
5288 {
5289 pp = eptr; /* Remember where we started */
5290
5291 #ifdef SUPPORT_UCP
5292 if (prop_type >= 0)
5293 {
5294 switch(prop_type)
5295 {
5296 case PT_ANY:
5297 for (i = min; i < max; i++)
5298 {
5299 int len = 1;
5300 if (eptr >= md->end_subject)
5301 {
5302 SCHECK_PARTIAL();
5303 break;
5304 }
5305 GETCHARLENTEST(c, eptr, len);
5306 if (prop_fail_result) break;
5307 eptr+= len;
5308 }
5309 break;
5310
5311 case PT_LAMP:
5312 for (i = min; i < max; i++)
5313 {
5314 int chartype;
5315 int len = 1;
5316 if (eptr >= md->end_subject)
5317 {
5318 SCHECK_PARTIAL();
5319 break;
5320 }
5321 GETCHARLENTEST(c, eptr, len);
5322 chartype = UCD_CHARTYPE(c);
5323 if ((chartype == ucp_Lu ||
5324 chartype == ucp_Ll ||
5325 chartype == ucp_Lt) == prop_fail_result)
5326 break;
5327 eptr+= len;
5328 }
5329 break;
5330
5331 case PT_GC:
5332 for (i = min; i < max; i++)
5333 {
5334 int len = 1;
5335 if (eptr >= md->end_subject)
5336 {
5337 SCHECK_PARTIAL();
5338 break;
5339 }
5340 GETCHARLENTEST(c, eptr, len);
5341 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5342 eptr+= len;
5343 }
5344 break;
5345
5346 case PT_PC:
5347 for (i = min; i < max; i++)
5348 {
5349 int len = 1;
5350 if (eptr >= md->end_subject)
5351 {
5352 SCHECK_PARTIAL();
5353 break;
5354 }
5355 GETCHARLENTEST(c, eptr, len);
5356 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5357 eptr+= len;
5358 }
5359 break;
5360
5361 case PT_SC:
5362 for (i = min; i < max; i++)
5363 {
5364 int len = 1;
5365 if (eptr >= md->end_subject)
5366 {
5367 SCHECK_PARTIAL();
5368 break;
5369 }
5370 GETCHARLENTEST(c, eptr, len);
5371 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5372 eptr+= len;
5373 }
5374 break;
5375
5376 case PT_ALNUM:
5377 for (i = min; i < max; i++)
5378 {
5379 int category;
5380 int len = 1;
5381 if (eptr >= md->end_subject)
5382 {
5383 SCHECK_PARTIAL();
5384 break;
5385 }
5386 GETCHARLENTEST(c, eptr, len);
5387 category = UCD_CATEGORY(c);
5388 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5389 break;
5390 eptr+= len;
5391 }
5392 break;
5393
5394 case PT_SPACE: /* Perl space */
5395 for (i = min; i < max; i++)
5396 {
5397 int len = 1;
5398 if (eptr >= md->end_subject)
5399 {
5400 SCHECK_PARTIAL();
5401 break;
5402 }
5403 GETCHARLENTEST(c, eptr, len);
5404 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5405 c == CHAR_FF || c == CHAR_CR)
5406 == prop_fail_result)
5407 break;
5408 eptr+= len;
5409 }
5410 break;
5411
5412 case PT_PXSPACE: /* POSIX space */
5413 for (i = min; i < max; i++)
5414 {
5415 int len = 1;
5416 if (eptr >= md->end_subject)
5417 {
5418 SCHECK_PARTIAL();
5419 break;
5420 }
5421 GETCHARLENTEST(c, eptr, len);
5422 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5423 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5424 == prop_fail_result)
5425 break;
5426 eptr+= len;
5427 }
5428 break;
5429
5430 case PT_WORD:
5431 for (i = min; i < max; i++)
5432 {
5433 int category;
5434 int len = 1;
5435 if (eptr >= md->end_subject)
5436 {
5437 SCHECK_PARTIAL();
5438 break;
5439 }
5440 GETCHARLENTEST(c, eptr, len);
5441 category = UCD_CATEGORY(c);
5442 if ((category == ucp_L || category == ucp_N ||
5443 c == CHAR_UNDERSCORE) == prop_fail_result)
5444 break;
5445 eptr+= len;
5446 }
5447 break;
5448
5449 case PT_CLIST:
5450 for (i = min; i < max; i++)
5451 {
5452 const pcre_uint32 *cp;
5453 int len = 1;
5454 if (eptr >= md->end_subject)
5455 {
5456 SCHECK_PARTIAL();
5457 break;
5458 }
5459 GETCHARLENTEST(c, eptr, len);
5460 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
5461 for (;;)
5462 {
5463 if (c < *cp)
5464 { if (prop_fail_result) break; else goto GOT_MAX; }
5465 if (c == *cp++)
5466 { if (prop_fail_result) goto GOT_MAX; else break; }
5467 }
5468 eptr += len;
5469 }
5470 GOT_MAX:
5471 break;
5472
5473 default:
5474 RRETURN(PCRE_ERROR_INTERNAL);
5475 }
5476
5477 /* eptr is now past the end of the maximum run */
5478
5479 if (possessive) continue;
5480 for(;;)
5481 {
5482 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5484 if (eptr-- == pp) break; /* Stop if tried at original pos */
5485 if (utf) BACKCHAR(eptr);
5486 }
5487 }
5488
5489 /* Match extended Unicode sequences. We will get here only if the
5490 support is in the binary; otherwise a compile-time error occurs. */
5491
5492 else if (ctype == OP_EXTUNI)
5493 {
5494 for (i = min; i < max; i++)
5495 {
5496 if (eptr >= md->end_subject)
5497 {
5498 SCHECK_PARTIAL();
5499 break;
5500 }
5501 else
5502 {
5503 int lgb, rgb;
5504 GETCHARINCTEST(c, eptr);
5505 lgb = UCD_GRAPHBREAK(c);
5506 while (eptr < md->end_subject)
5507 {
5508 int len = 1;
5509 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5510 rgb = UCD_GRAPHBREAK(c);
5511 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5512 lgb = rgb;
5513 eptr += len;
5514 }
5515 }
5516 CHECK_PARTIAL();
5517 }
5518
5519 /* eptr is now past the end of the maximum run */
5520
5521 if (possessive) continue;
5522
5523 for(;;)
5524 {
5525 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5527 if (eptr-- == pp) break; /* Stop if tried at original pos */
5528 for (;;) /* Move back over one extended */
5529 {
5530 if (!utf) c = *eptr; else
5531 {
5532 BACKCHAR(eptr);
5533 GETCHAR(c, eptr);
5534 }
5535 if (UCD_CATEGORY(c) != ucp_M) break;
5536 eptr--;
5537 }
5538 }
5539 }
5540
5541 else
5542 #endif /* SUPPORT_UCP */
5543
5544 #ifdef SUPPORT_UTF
5545 if (utf)
5546 {
5547 switch(ctype)
5548 {
5549 case OP_ANY:
5550 if (max < INT_MAX)
5551 {
5552 for (i = min; i < max; i++)
5553 {
5554 if (eptr >= md->end_subject)
5555 {
5556 SCHECK_PARTIAL();
5557 break;
5558 }
5559 if (IS_NEWLINE(eptr)) break;
5560 if (md->partial != 0 && /* Take care with CRLF partial */
5561 eptr + 1 >= md->end_subject &&
5562 NLBLOCK->nltype == NLTYPE_FIXED &&
5563 NLBLOCK->nllen == 2 &&
5564 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5565 {
5566 md->hitend = TRUE;
5567 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5568 }
5569 eptr++;
5570 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5571 }
5572 }
5573
5574 /* Handle unlimited UTF-8 repeat */
5575
5576 else
5577 {
5578 for (i = min; i < max; i++)
5579 {
5580 if (eptr >= md->end_subject)
5581 {
5582 SCHECK_PARTIAL();
5583 break;
5584 }
5585 if (IS_NEWLINE(eptr)) break;
5586 if (md->partial != 0 && /* Take care with CRLF partial */
5587 eptr + 1 >= md->end_subject &&
5588 NLBLOCK->nltype == NLTYPE_FIXED &&
5589 NLBLOCK->nllen == 2 &&
5590 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5591 {
5592 md->hitend = TRUE;
5593 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5594 }
5595 eptr++;
5596 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5597 }
5598 }
5599 break;
5600
5601 case OP_ALLANY:
5602 if (max < INT_MAX)
5603 {
5604 for (i = min; i < max; i++)
5605 {
5606 if (eptr >= md->end_subject)
5607 {
5608 SCHECK_PARTIAL();
5609 break;
5610 }
5611 eptr++;
5612 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5613 }
5614 }
5615 else
5616 {
5617 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5618 SCHECK_PARTIAL();
5619 }
5620 break;
5621
5622 /* The byte case is the same as non-UTF8 */
5623
5624 case OP_ANYBYTE:
5625 c = max - min;
5626 if (c > (unsigned int)(md->end_subject - eptr))
5627 {
5628 eptr = md->end_subject;
5629 SCHECK_PARTIAL();
5630 }
5631 else eptr += c;
5632 break;
5633
5634 case OP_ANYNL:
5635 for (i = min; i < max; i++)
5636 {
5637 int len = 1;
5638 if (eptr >= md->end_subject)
5639 {
5640 SCHECK_PARTIAL();
5641 break;
5642 }
5643 GETCHARLEN(c, eptr, len);
5644 if (c == CHAR_CR)
5645 {
5646 if (++eptr >= md->end_subject) break;
5647 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5648 }
5649 else
5650 {
5651 if (c != CHAR_LF &&
5652 (md->bsr_anycrlf ||
5653 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5654 #ifndef EBCDIC
5655 && c != 0x2028 && c != 0x2029
5656 #endif /* Not EBCDIC */
5657 )))
5658 break;
5659 eptr += len;
5660 }
5661 }
5662 break;
5663
5664 case OP_NOT_HSPACE:
5665 case OP_HSPACE:
5666 for (i = min; i < max; i++)
5667 {
5668 BOOL gotspace;
5669 int len = 1;
5670 if (eptr >= md->end_subject)
5671 {
5672 SCHECK_PARTIAL();
5673 break;
5674 }
5675 GETCHARLEN(c, eptr, len);
5676 switch(c)
5677 {
5678 HSPACE_CASES: gotspace = TRUE; break;
5679 default: gotspace = FALSE; break;
5680 }
5681 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5682 eptr += len;
5683 }
5684 break;
5685
5686 case OP_NOT_VSPACE:
5687 case OP_VSPACE:
5688 for (i = min; i < max; i++)
5689 {
5690 BOOL gotspace;
5691 int len = 1;
5692 if (eptr >= md->end_subject)
5693 {
5694 SCHECK_PARTIAL();
5695 break;
5696 }
5697 GETCHARLEN(c, eptr, len);
5698 switch(c)
5699 {
5700 VSPACE_CASES: gotspace = TRUE; break;
5701 default: gotspace = FALSE; break;
5702 }
5703 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5704 eptr += len;
5705 }
5706 break;
5707
5708 case OP_NOT_DIGIT:
5709 for (i = min; i < max; i++)
5710 {
5711 int len = 1;
5712 if (eptr >= md->end_subject)
5713 {
5714 SCHECK_PARTIAL();
5715 break;
5716 }
5717 GETCHARLEN(c, eptr, len);
5718 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5719 eptr+= len;
5720 }
5721 break;
5722
5723 case OP_DIGIT:
5724 for (i = min; i < max; i++)
5725 {
5726 int len = 1;
5727 if (eptr >= md->end_subject)
5728 {
5729 SCHECK_PARTIAL();
5730 break;
5731 }
5732 GETCHARLEN(c, eptr, len);
5733 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5734 eptr+= len;
5735 }
5736 break;
5737
5738 case OP_NOT_WHITESPACE:
5739 for (i = min; i < max; i++)
5740 {
5741 int len = 1;
5742 if (eptr >= md->end_subject)
5743 {
5744 SCHECK_PARTIAL();
5745 break;
5746 }
5747 GETCHARLEN(c, eptr, len);
5748 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5749 eptr+= len;
5750 }
5751 break;
5752
5753 case OP_WHITESPACE:
5754 for (i = min; i < max; i++)
5755 {
5756 int len = 1;
5757 if (eptr >= md->end_subject)
5758 {
5759 SCHECK_PARTIAL();
5760 break;
5761 }
5762 GETCHARLEN(c, eptr, len);
5763 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5764 eptr+= len;
5765 }
5766 break;
5767
5768 case OP_NOT_WORDCHAR:
5769 for (i = min; i < max; i++)
5770 {
5771 int len = 1;
5772 if (eptr >= md->end_subject)
5773 {
5774 SCHECK_PARTIAL();
5775 break;
5776 }
5777 GETCHARLEN(c, eptr, len);
5778 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5779 eptr+= len;
5780 }
5781 break;
5782
5783 case OP_WORDCHAR:
5784 for (i = min; i < max; i++)
5785 {
5786 int len = 1;
5787 if (eptr >= md->end_subject)
5788 {
5789 SCHECK_PARTIAL();
5790 break;
5791 }
5792 GETCHARLEN(c, eptr, len);
5793 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5794 eptr+= len;
5795 }
5796 break;
5797
5798 default:
5799 RRETURN(PCRE_ERROR_INTERNAL);
5800 }
5801
5802 /* eptr is now past the end of the maximum run. If possessive, we are
5803 done (no backing up). Otherwise, match at this position; anything other
5804 than no match is immediately returned. For nomatch, back up one
5805 character, unless we are matching \R and the last thing matched was
5806 \r\n, in which case, back up two bytes. */
5807
5808 if (possessive) continue;
5809 for(;;)
5810 {
5811 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5813 if (eptr-- == pp) break; /* Stop if tried at original pos */
5814 BACKCHAR(eptr);
5815 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5816 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5817 }
5818 }
5819 else
5820 #endif /* SUPPORT_UTF */
5821 /* Not UTF mode */
5822 {
5823 switch(ctype)
5824 {
5825 case OP_ANY:
5826 for (i = min; i < max; i++)
5827 {
5828 if (eptr >= md->end_subject)
5829 {
5830 SCHECK_PARTIAL();
5831 break;
5832 }
5833 if (IS_NEWLINE(eptr)) break;
5834 if (md->partial != 0 && /* Take care with CRLF partial */
5835 eptr + 1 >= md->end_subject &&
5836 NLBLOCK->nltype == NLTYPE_FIXED &&
5837 NLBLOCK->nllen == 2 &&
5838 *eptr == NLBLOCK->nl[0])
5839 {
5840 md->hitend = TRUE;
5841 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5842 }
5843 eptr++;
5844 }
5845 break;
5846
5847 case OP_ALLANY:
5848 case OP_ANYBYTE:
5849 c = max - min;
5850 if (c > (unsigned int)(md->end_subject - eptr))
5851 {
5852 eptr = md->end_subject;
5853 SCHECK_PARTIAL();
5854 }
5855 else eptr += c;
5856 break;
5857
5858 case OP_ANYNL:
5859 for (i = min; i < max; i++)
5860 {
5861 if (eptr >= md->end_subject)
5862 {
5863 SCHECK_PARTIAL();
5864 break;
5865 }
5866 c = *eptr;
5867 if (c == CHAR_CR)
5868 {
5869 if (++eptr >= md->end_subject) break;
5870 if (*eptr == CHAR_LF) eptr++;
5871 }
5872 else
5873 {
5874 if (c != CHAR_LF && (md->bsr_anycrlf ||
5875 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5876 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5877 && c != 0x2028 && c != 0x2029
5878 #endif
5879 ))) break;
5880 eptr++;
5881 }
5882 }
5883 break;
5884
5885 case OP_NOT_HSPACE:
5886 for (i = min; i < max; i++)
5887 {
5888 if (eptr >= md->end_subject)
5889 {
5890 SCHECK_PARTIAL();
5891 break;
5892 }
5893 switch(*eptr)
5894 {
5895 default: eptr++; break;
5896 HSPACE_BYTE_CASES:
5897 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5898 HSPACE_MULTIBYTE_CASES:
5899 #endif
5900 goto ENDLOOP00;
5901 }
5902 }
5903 ENDLOOP00:
5904 break;
5905
5906 case OP_HSPACE:
5907 for (i = min; i < max; i++)
5908 {
5909 if (eptr >= md->end_subject)
5910 {
5911 SCHECK_PARTIAL();
5912 break;
5913 }
5914 switch(*eptr)
5915 {
5916 default: goto ENDLOOP01;
5917 HSPACE_BYTE_CASES:
5918 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5919 HSPACE_MULTIBYTE_CASES:
5920 #endif
5921 eptr++; break;
5922 }
5923 }
5924 ENDLOOP01:
5925 break;
5926
5927 case OP_NOT_VSPACE:
5928 for (i = min; i < max; i++)
5929 {
5930 if (eptr >= md->end_subject)
5931 {
5932 SCHECK_PARTIAL();
5933 break;
5934 }
5935 switch(*eptr)
5936 {
5937 default: eptr++; break;
5938 VSPACE_BYTE_CASES:
5939 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5940 VSPACE_MULTIBYTE_CASES:
5941 #endif
5942 goto ENDLOOP02;
5943 }
5944 }
5945 ENDLOOP02:
5946 break;
5947
5948 case OP_VSPACE:
5949 for (i = min; i < max; i++)
5950 {
5951 if (eptr >= md->end_subject)
5952 {
5953 SCHECK_PARTIAL();
5954 break;
5955 }
5956 switch(*eptr)
5957 {
5958 default: goto ENDLOOP03;
5959 VSPACE_BYTE_CASES:
5960 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5961 VSPACE_MULTIBYTE_CASES:
5962 #endif
5963 eptr++; break;
5964 }
5965 }
5966 ENDLOOP03:
5967 break;
5968
5969 case OP_NOT_DIGIT:
5970 for (i = min; i < max; i++)
5971 {
5972 if (eptr >= md->end_subject)
5973 {
5974 SCHECK_PARTIAL();
5975 break;
5976 }
5977 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5978 eptr++;
5979 }
5980 break;
5981
5982 case OP_DIGIT:
5983 for (i = min; i < max; i++)
5984 {
5985 if (eptr >= md->end_subject)
5986 {
5987 SCHECK_PARTIAL();
5988 break;
5989 }
5990 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5991 eptr++;
5992 }
5993 break;
5994
5995 case OP_NOT_WHITESPACE:
5996 for (i = min; i < max; i++)
5997 {
5998 if (eptr >= md->end_subject)
5999 {
6000 SCHECK_PARTIAL();
6001 break;
6002 }
6003 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6004 eptr++;
6005 }
6006 break;
6007
6008 case OP_WHITESPACE:
6009 for (i = min; i < max; i++)
6010 {
6011 if (eptr >= md->end_subject)
6012 {
6013 SCHECK_PARTIAL();
6014 break;
6015 }
6016 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6017 eptr++;
6018 }
6019 break;
6020
6021 case OP_NOT_WORDCHAR:
6022 for (i = min; i < max; i++)
6023 {
6024 if (eptr >= md->end_subject)
6025 {
6026 SCHECK_PARTIAL();
6027 break;
6028 }
6029 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6030 eptr++;
6031 }
6032 break;
6033
6034 case OP_WORDCHAR:
6035 for (i = min; i < max; i++)
6036 {
6037 if (eptr >= md->end_subject)
6038 {
6039 SCHECK_PARTIAL();
6040 break;
6041 }
6042 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6043 eptr++;
6044 }
6045 break;
6046
6047 default:
6048 RRETURN(PCRE_ERROR_INTERNAL);
6049 }
6050
6051 /* eptr is now past the end of the maximum run. If possessive, we are
6052 done (no backing up). Otherwise, match at this position; anything other
6053 than no match is immediately returned. For nomatch, back up one
6054 character (byte), unless we are matching \R and the last thing matched
6055 was \r\n, in which case, back up two bytes. */
6056
6057 if (possessive) continue;
6058 while (eptr >= pp)
6059 {
6060 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6062 eptr--;
6063 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6064 eptr[-1] == CHAR_CR) eptr--;
6065 }
6066 }
6067
6068 /* Get here if we can't make it match with any permitted repetitions */
6069
6070 RRETURN(MATCH_NOMATCH);
6071 }
6072 /* Control never gets here */
6073
6074 /* There's been some horrible disaster. Arrival here can only mean there is
6075 something seriously wrong in the code above or the OP_xxx definitions. */
6076
6077 default:
6078 DPRINTF(("Unknown opcode %d\n", *ecode));
6079 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6080 }
6081
6082 /* Do not stick any code in here without much thought; it is assumed
6083 that "continue" in the code above comes out to here to repeat the main
6084 loop. */
6085
6086 } /* End of main loop */
6087 /* Control never reaches here */
6088
6089
6090 /* When compiling to use the heap rather than the stack for recursive calls to
6091 match(), the RRETURN() macro jumps here. The number that is saved in
6092 frame->Xwhere indicates which label we actually want to return to. */
6093
6094 #ifdef NO_RECURSE
6095 #define LBL(val) case val: goto L_RM##val;
6096 HEAP_RETURN:
6097 switch (frame->Xwhere)
6098 {
6099 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6100 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6101 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6102 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6103 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6104 LBL(65) LBL(66)
6105 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6106 LBL(21)
6107 #endif
6108 #ifdef SUPPORT_UTF
6109 LBL(16) LBL(18) LBL(20)
6110 LBL(22) LBL(23) LBL(28) LBL(30)
6111 LBL(32) LBL(34) LBL(42) LBL(46)
6112 #ifdef SUPPORT_UCP
6113 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6114 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6115 #endif /* SUPPORT_UCP */
6116 #endif /* SUPPORT_UTF */
6117 default:
6118 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6119 return PCRE_ERROR_INTERNAL;
6120 }
6121 #undef LBL
6122 #endif /* NO_RECURSE */
6123 }
6124
6125
6126 /***************************************************************************
6127 ****************************************************************************
6128 RECURSION IN THE match() FUNCTION
6129
6130 Undefine all the macros that were defined above to handle this. */
6131
6132 #ifdef NO_RECURSE   /* The names in this group are #undef'd only in a NO_RECURSE build */
6133 #undef eptr
6134 #undef ecode
6135 #undef mstart
6136 #undef offset_top
6137 #undef eptrb
6138 #undef flags
6139
6140 #undef callpat
6141 #undef charptr
6142 #undef data
6143 #undef next
6144 #undef pp
6145 #undef prev
6146 #undef saved_eptr
6147
6148 #undef new_recursive
6149
6150 #undef cur_is_word
6151 #undef condition
6152 #undef prev_is_word
6153
6154 #undef ctype
6155 #undef length
6156 #undef max
6157 #undef min
6158 #undef number
6159 #undef offset
6160 #undef op
6161 #undef save_capture_last
6162 #undef save_offset1
6163 #undef save_offset2
6164 #undef save_offset3
6165 #undef stacksave
6166
6167 #undef newptrb
6168
6169 #endif   /* NO_RECURSE */
6170
6171 /* These two are defined as macros in both cases (with and without
   NO_RECURSE), so they are #undef'd unconditionally, outside the #ifdef. */
6172
6173 #undef fc
6174 #undef fi
6175
6176 /***************************************************************************
6177 ***************************************************************************/
6178
6179
#ifdef NO_RECURSE
/*************************************************
*          Release allocated heap frames         *
*************************************************/

/* Walk the chain of frames linked from the base frame through Xnextframe and
hand every one of them to the stack_free function. The base frame itself is on
the machine stack, so it is never passed to stack_free; only the frames that
follow it in the chain are released.

Argument: frame_base   the address of the base frame
Returns:  nothing
*/

static void
release_match_heapframes (heapframe *frame_base)
{
heapframe *cursor;
heapframe *following;

for (cursor = frame_base->Xnextframe; cursor != NULL; cursor = following)
  {
  following = cursor->Xnextframe;   /* Fetch the link before the frame is freed */
  (PUBL(stack_free))(cursor);
  }
}
#endif /* NO_RECURSE */
6204
6205
6206 /*************************************************
6207 * Execute a Regular Expression *
6208 *************************************************/
6209
6210 /* This function applies a compiled re to a subject string and picks out
6211 portions of the string if it matches. Two elements in the vector are set for
6212 each substring: the offsets to the start and end of the substring.
6213
6214 Arguments:
6215 argument_re points to the compiled expression
6216 extra_data points to extra data or is NULL
6217 subject points to the subject string
6218 length length of subject string (may contain binary zeros)
6219 start_offset where to start in the subject string
6220 options option bits
6221 offsets points to a vector of ints to be filled in with offsets
6222 offsetcount the number of elements in the vector
6223
6224 Returns: > 0 => success; value is the number of elements filled in
6225 = 0 => success, but offsets is not big enough
6226 -1 => failed to match
6227 < -1 => some kind of unexpected problem
6228 */
6229
6230 #if defined COMPILE_PCRE8
6231 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6232 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6233 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6234 int offsetcount)
6235 #elif defined COMPILE_PCRE16
6236 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6237 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6238 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6239 int offsetcount)
6240 #elif defined COMPILE_PCRE32
6241 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6242 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6243 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6244 int offsetcount)
6245 #endif
6246 {
6247 int rc, ocount, arg_offset_max;
6248 int newline;
6249 BOOL using_temporary_offsets = FALSE;
6250 BOOL anchored;
6251 BOOL startline;
6252 BOOL firstline;
6253 BOOL utf;
6254 BOOL has_first_char = FALSE;
6255 BOOL has_req_char = FALSE;
6256 pcre_uchar first_char = 0;
6257 pcre_uchar first_char2 = 0;
6258 pcre_uchar req_char = 0;
6259 pcre_uchar req_char2 = 0;
6260 match_data match_block;
6261 match_data *md = &match_block;
6262 const pcre_uint8 *tables;
6263 const pcre_uint8 *start_bits = NULL;
6264 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6265 PCRE_PUCHAR end_subject;
6266 PCRE_PUCHAR start_partial = NULL;
6267 PCRE_PUCHAR req_char_ptr = start_match - 1;
6268
6269 const pcre_study_data *study;
6270 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6271
6272 #ifdef NO_RECURSE
6273 heapframe frame_zero;
6274 frame_zero.Xprevframe = NULL; /* Marks the top level */
6275 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6276 md->match_frames_base = &frame_zero;
6277 #endif
6278
6279 /* Check for the special magic call that measures the size of the stack used
6280 per recursive call of match(). Without the funny casting for sizeof, a Windows
6281 compiler gave this error: "unary minus operator applied to unsigned type,
6282 result still unsigned". Hopefully the cast fixes that. */
6283
6284 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6285 start_offset == -999)
6286 #ifdef NO_RECURSE
6287 return -((int)sizeof(heapframe));
6288 #else
6289 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6290 #endif
6291
6292 /* Plausibility checks */
6293
6294 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6295 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6296 return PCRE_ERROR_NULL;
6297 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6298 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6299
6300 /* Check that the first field in the block is the magic number. If it is not,
6301 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6302 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6303 means that the pattern is likely compiled with different endianness. */
6304
6305 if (re->magic_number != MAGIC_NUMBER)
6306 return re->magic_number == REVERSED_MAGIC_NUMBER?
6307 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6308 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6309
6310 /* These two settings are used in the code for checking a UTF-8 string that
6311 follows immediately afterwards. Other values in the md block are used only
6312 during "normal" pcre_exec() processing, not when the JIT support is in use,
6313 so they are set up later. */
6314
6315 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6316 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6317 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6318 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6319
6320 /* Check a UTF-8 string if required. Pass back the character offset and error
6321 code for an invalid string if a results vector is available. */
6322
6323 #ifdef SUPPORT_UTF
6324 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6325 {
6326 int erroroffset;
6327 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6328 if (errorcode != 0)
6329 {
6330 if (offsetcount >= 2)
6331 {
6332 offsets[0] = erroroffset;
6333 offsets[1] = errorcode;
6334 }
6335 #if defined COMPILE_PCRE8
6336 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6337 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6338 #elif defined COMPILE_PCRE16
6339 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6340 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6341 #elif defined COMPILE_PCRE32
6342 return PCRE_ERROR_BADUTF32;
6343 #endif
6344 }
6345 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6346 /* Check that a start_offset points to the start of a UTF character. */
6347 if (start_offset > 0 && start_offset < length &&
6348 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6349 return PCRE_ERROR_BADUTF8_OFFSET;
6350 #endif
6351 }
6352 #endif
6353
6354 /* If the pattern was successfully studied with JIT support, run the JIT
6355 executable instead of the rest of this function. Most options must be set at
6356 compile time for the JIT code to be usable. Fallback to the normal code path if
6357 an unsupported flag is set. */
6358
6359 #ifdef SUPPORT_JIT
6360 if (extra_data != NULL
6361 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6362 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6363 && extra_data->executable_jit != NULL
6364 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6365 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6366 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6367 {
6368 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6369 start_offset, options, offsets, offsetcount);
6370
6371 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6372 mode is not compiled. In this case we simply fallback to interpreter. */
6373
6374 if (rc != PCRE_ERROR_NULL) return rc;
6375 }
6376 #endif
6377
6378 /* Carry on with non-JIT matching. This information is for finding all the
6379 numbers associated with a given name, for condition testing. */
6380
6381 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6382 md->name_count = re->name_count;
6383 md->name_entry_size = re->name_entry_size;
6384
6385 /* Fish out the optional data from the extra_data structure, first setting
6386 the default values. */
6387
6388 study = NULL;
6389 md->match_limit = MATCH_LIMIT;
6390 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6391 md->callout_data = NULL;
6392
6393 /* The table pointer is always in native byte order. */
6394
6395 tables = re->tables;
6396
6397 if (extra_data != NULL)
6398 {
6399 register unsigned int flags = extra_data->flags;
6400 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6401 study = (const pcre_study_data *)extra_data->study_data;
6402 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6403 md->match_limit = extra_data->match_limit;
6404 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6405 md->match_limit_recursion = extra_data->match_limit_recursion;
6406 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6407 md->callout_data = extra_data->callout_data;
6408 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6409 }
6410
6411 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6412 is a feature that makes it possible to save compiled regex and re-use them
6413 in other programs later. */
6414
6415 if (tables == NULL) tables = PRIV(default_tables);
6416
6417 /* Set up other data */
6418
6419 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6420 startline = (re->flags & PCRE_STARTLINE) != 0;
6421 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6422
6423 /* The code starts after the real_pcre block and the capture name table. */
6424
6425 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6426 re->name_count * re->name_entry_size;
6427
6428 md->start_subject = (PCRE_PUCHAR)subject;
6429 md->start_offset = start_offset;
6430 md->end_subject = md->start_subject + length;
6431 end_subject = md->end_subject;
6432
6433 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6434 md->use_ucp = (re->options & PCRE_UCP) != 0;
6435 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6436 md->ignore_skip_arg = FALSE;
6437
6438 /* Some options are unpacked into BOOL variables in the hope that testing
6439 them will be faster than individual option bits. */
6440
6441 md->notbol = (options & PCRE_NOTBOL) != 0;
6442 md->noteol = (options & PCRE_NOTEOL) != 0;
6443 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6444 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6445
6446 md->hitend = FALSE;
6447 md->mark = md->nomatch_mark = NULL; /* In case never set */
6448
6449 md->recursive = NULL; /* No recursion at top level */
6450 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6451
6452 md->lcc = tables + lcc_offset;
6453 md->fcc = tables + fcc_offset;
6454 md->ctypes = tables + ctypes_offset;
6455
6456 /* Handle different \R options. */
6457
6458 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6459 {
6460 case 0:
6461 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6462 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6463 else
6464 #ifdef BSR_ANYCRLF
6465 md->bsr_anycrlf = TRUE;
6466 #else
6467 md->bsr_anycrlf = FALSE;
6468 #endif
6469 break;
6470
6471 case PCRE_BSR_ANYCRLF:
6472 md->bsr_anycrlf = TRUE;
6473 break;
6474
6475 case PCRE_BSR_UNICODE:
6476 md->bsr_anycrlf = FALSE;
6477 break;
6478
6479 default: return PCRE_ERROR_BADNEWLINE;
6480 }
6481
6482 /* Handle different types of newline. The three bits give eight cases. If
6483 nothing is set at run time, whatever was used at compile time applies. */
6484
6485 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6486 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6487 {
6488 case 0: newline = NEWLINE; break; /* Compile-time default */
6489 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6490 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6491 case PCRE_NEWLINE_CR+
6492 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6493 case PCRE_NEWLINE_ANY: newline = -1; break;
6494 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6495 default: return PCRE_ERROR_BADNEWLINE;
6496 }
6497
6498 if (newline == -2)
6499 {
6500 md->nltype = NLTYPE_ANYCRLF;
6501 }
6502 else if (newline < 0)
6503 {
6504 md->nltype = NLTYPE_ANY;
6505 }
6506 else
6507 {
6508 md->nltype = NLTYPE_FIXED;
6509 if (newline > 255)
6510 {
6511 md->nllen = 2;
6512 md->nl[0] = (newline >> 8) & 255;
6513 md->nl[1] = newline & 255;
6514 }
6515 else
6516 {
6517 md->nllen = 1;
6518 md->nl[0] = newline;
6519 }
6520 }
6521
6522 /* Partial matching was originally supported only for a restricted set of
6523 regexes; from release 8.00 there are no restrictions, but the bits are still
6524 defined (though never set). So there's no harm in leaving this code. */
6525
6526 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6527 return PCRE_ERROR_BADPARTIAL;
6528
6529 /* If the expression has got more back references than the offsets supplied can
6530 hold, we get a temporary chunk of working store to use during the matching.
6531 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6532 of 3. */
6533
6534 ocount = offsetcount - (offsetcount % 3);
6535 arg_offset_max = (2*ocount)/3;
6536
6537 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6538 {
6539 ocount = re->top_backref * 3 + 3;
6540 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6541 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6542 using_temporary_offsets = TRUE;
6543 DPRINTF(("Got memory to hold back references\n"));
6544 }
6545 else md->offset_vector = offsets;
6546
6547 md->offset_end = ocount;
6548 md->offset_max = (2*ocount)/3;
6549 md->offset_overflow = FALSE;
6550 md->capture_last = -1;
6551
6552 /* Reset the working variable associated with each extraction. These should
6553 never be used unless previously set, but they get saved and restored, and so we
6554 initialize them to avoid reading uninitialized locations. Also, unset the
6555 offsets for the matched string. This is really just for tidiness with callouts,
6556 in case they inspect these fields. */
6557
6558 if (md->offset_vector != NULL)
6559 {
6560 register int *iptr = md->offset_vector + ocount;
6561 register int *iend = iptr - re->top_bracket;
6562 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6563 while (--iptr >= iend) *iptr = -1;
6564 md->offset_vector[0] = md->offset_vector[1] = -1;
6565 }
6566
6567 /* Set up the first character to match, if available. The first_char value is
6568 never set for an anchored regular expression, but the anchoring may be forced
6569 at run time, so we have to test for anchoring. The first char may be unset for
6570 an unanchored pattern, of course. If there's no first char and the pattern was
6571 studied, there may be a bitmap of possible first characters. */
6572
6573 if (!anchored)
6574 {
6575 if ((re->flags & PCRE_FIRSTSET) != 0)
6576 {
6577 has_first_char = TRUE;
6578 first_char = first_char2 = (pcre_uchar)(re->first_char);
6579 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6580 {
6581 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6582 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6583 if (utf && first_char > 127)
6584 first_char2 = UCD_OTHERCASE(first_char);
6585 #endif
6586 }
6587 }
6588 else
6589 if (!startline && study != NULL &&
6590 (study->flags & PCRE_STUDY_MAPPED) != 0)
6591 start_bits = study->start_bits;
6592 }
6593
6594 /* For anchored or unanchored matches, there may be a "last known required
6595 character" set. */
6596
6597 if ((re->flags & PCRE_REQCHSET) != 0)
6598 {
6599 has_req_char = TRUE;
6600 req_char = req_char2 = (pcre_uchar)(re->req_char);
6601 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6602 {
6603 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6604 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6605 if (utf && req_char > 127)
6606 req_char2 = UCD_OTHERCASE(req_char);
6607 #endif
6608 }
6609 }
6610
6611
6612 /* ==========================================================================*/
6613
6614 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6615 the loop runs just once. */
6616
6617 for(;;)
6618 {
6619 PCRE_PUCHAR save_end_subject = end_subject;
6620 PCRE_PUCHAR new_start_match;
6621
6622 /* If firstline is TRUE, the start of the match is constrained to the first
6623 line of a multiline string. That is, the match must be before or at the first
6624 newline. Implement this by temporarily adjusting end_subject so that we stop
6625 scanning at a newline. If the match fails at the newline, later code breaks
6626 this loop. */
6627
6628 if (firstline)
6629 {
6630 PCRE_PUCHAR t = start_match;
6631 #ifdef SUPPORT_UTF
6632 if (utf)
6633 {
6634 while (t < md->end_subject && !IS_NEWLINE(t))
6635 {
6636 t++;
6637 ACROSSCHAR(t < end_subject, *t, t++);
6638 }
6639 }
6640 else
6641 #endif
6642 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6643 end_subject = t;
6644 }
6645
6646 /* There are some optimizations that avoid running the match if a known
6647 starting point is not found, or if a known later character is not present.
6648 However, there is an option that disables these, for testing and for ensuring
6649 that all callouts do actually occur. The option can be set in the regex by
6650 (*NO_START_OPT) or passed in match-time options. */
6651
6652 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6653 {
6654 /* Advance to a unique first char if there is one. */
6655
6656 if (has_first_char)
6657 {
6658 pcre_uchar smc;
6659
6660 if (first_char != first_char2)
6661 while (start_match < end_subject &&
6662 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6663 start_match++;
6664 else
6665 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6666 start_match++;
6667 }
6668
6669 /* Or to just after a linebreak for a multiline match */
6670
6671 else if (startline)
6672 {
6673 if (start_match > md->start_subject + start_offset)
6674 {
6675 #ifdef SUPPORT_UTF
6676 if (utf)
6677 {
6678 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6679 {
6680 start_match++;
6681 ACROSSCHAR(start_match < end_subject, *start_match,
6682 start_match++);
6683 }
6684 }
6685 else
6686 #endif
6687 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6688 start_match++;
6689
6690 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6691 and we are now at a LF, advance the match position by one more character.
6692 */
6693
6694 if (start_match[-1] == CHAR_CR &&
6695 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6696 start_match < end_subject &&
6697 RAWUCHARTEST(start_match) == CHAR_NL)
6698 start_match++;
6699 }
6700 }
6701
6702 /* Or to a non-unique first byte after study */
6703
6704 else if (start_bits != NULL)
6705 {
6706 while (start_match < end_subject)
6707 {
6708 register pcre_uint32 c = RAWUCHARTEST(start_match);
6709 #ifndef COMPILE_PCRE8
6710 if (c > 255) c = 255;
6711 #endif
6712 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6713 {
6714 start_match++;
6715 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6716 /* In non 8-bit mode, the iteration will stop for
6717 characters > 255 at the beginning or not stop at all. */
6718 if (utf)
6719 ACROSSCHAR(start_match < end_subject, *start_match,
6720 start_match++);
6721 #endif
6722 }
6723 else break;
6724 }
6725 }
6726 } /* Starting optimizations */
6727
6728 /* Restore fudged end_subject */
6729
6730 end_subject = save_end_subject;
6731
6732 /* The following two optimizations are disabled for partial matching or if
6733 disabling is explicitly requested. */
6734
6735 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6736 {
6737 /* If the pattern was studied, a minimum subject length may be set. This is
6738 a lower bound; no actual string of that length may actually match the
6739 pattern. Although the value is, strictly, in characters, we treat it as
6740 bytes to avoid spending too much time in this optimization. */
6741
6742 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6743 (pcre_uint32)(end_subject - start_match) < study->minlength)
6744 {
6745 rc = MATCH_NOMATCH;
6746 break;
6747 }
6748
6749 /* If req_char is set, we know that that character must appear in the
6750 subject for the match to succeed. If the first character is set, req_char
6751 must be later in the subject; otherwise the test starts at the match point.
6752 This optimization can save a huge amount of backtracking in patterns with
6753 nested unlimited repeats that aren't going to match. Writing separate code
6754 for cased/caseless versions makes it go faster, as does using an
6755 autoincrement and backing off on a match.
6756
6757 HOWEVER: when the subject string is very, very long, searching to its end
6758 can take a long time, and give bad performance on quite ordinary patterns.
6759 This showed up when somebody was matching something like /^\d+C/ on a
6760 32-megabyte string... so we don't do this when the string is sufficiently
6761 long. */
6762
6763 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6764 {