/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1238 - (show annotations)
Sat Jan 5 16:27:59 2013 UTC (6 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 212061 byte(s)
Small tweaks give performance improvements.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
/* Flags stored in md->match_function_type to mark the two special kinds of
call to match(). They travel through the match data block instead of through
an additional stack variable, because stack usage is to be kept down. */

#define MATCH_CONDASSERT     1   /* Call is checking a condition assertion */
#define MATCH_CBEGROUP       2   /* Unlimited repeat of a could-be-empty group */

/* Ordinary (non-error) results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only results of match(). They are pushed far enough below zero
that they cannot collide with the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_COMMIT         (-998)
#define MATCH_KETRPOS        (-997)
#define MATCH_ONCE           (-996)
#define MATCH_PRUNE          (-995)
#define MATCH_SKIP           (-994)
#define MATCH_SKIP_ARG       (-993)
#define MATCH_THEN           (-992)

/* Largest number of offset-vector ints that a recursive call saves on the
stack; a bigger offset vector is saved with malloc instead. Keep this a
multiple of 3, because the offset vector length is always a multiple of 3. */

#define REC_STACK_SAVE_MAX   30
89
/* Lower and upper repeat counts for the common repeats, one entry per repeat
kind. In rep_max a value of 0 stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
94
95 #ifdef PCRE_DEBUG
96 /*************************************************
97 * Debugging function to print chars *
98 *************************************************/
99
100 /* Print a sequence of chars in printable format, stopping at the end of the
101 subject if the requested.
102
103 Arguments:
104 p points to characters
105 length number to print
106 is_subject TRUE if printing from within md->start_subject
107 md pointer to matching data block, if is_subject is TRUE
108
109 Returns: nothing
110 */
111
112 static void
113 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
114 {
115 pcre_uint32 c;
116 BOOL utf = md->utf;
117 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
118 while (length-- > 0)
119 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
120 }
121 #endif
122
123
124
125 /*************************************************
126 * Match a back-reference *
127 *************************************************/
128
129 /* Normally, if a back reference hasn't been set, the length that is passed is
130 negative, so the match always fails. However, in JavaScript compatibility mode,
131 the length passed is zero. Note that in caseless UTF-8 mode, the number of
132 subject bytes matched may be different to the number of reference bytes.
133
134 Arguments:
135 offset index into the offset vector
136 eptr pointer into the subject
137 length length of reference to be matched (number of bytes)
138 md points to match data block
139 caseless TRUE if caseless
140
141 Returns: >= 0 the number of subject bytes matched
142 -1 no match
143 -2 partial match; always given if at end subject
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152 #ifdef SUPPORT_UTF
153 BOOL utf = md->utf;
154 #endif
155
156 #ifdef PCRE_DEBUG
157 if (eptr >= md->end_subject)
158 printf("matching subject <null>");
159 else
160 {
161 printf("matching subject ");
162 pchars(eptr, length, TRUE, md);
163 }
164 printf(" against backref ");
165 pchars(p, length, FALSE, md);
166 printf("\n");
167 #endif
168
169 /* Always fail if reference not set (and not JavaScript compatible - in that
170 case the length is passed as zero). */
171
172 if (length < 0) return -1;
173
174 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
175 properly if Unicode properties are supported. Otherwise, we can check only
176 ASCII characters. */
177
178 if (caseless)
179 {
180 #ifdef SUPPORT_UTF
181 #ifdef SUPPORT_UCP
182 if (utf)
183 {
184 /* Match characters up to the end of the reference. NOTE: the number of
185 data units matched may differ, because in UTF-8 there are some characters
186 whose upper and lower case versions code have different numbers of bytes.
187 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
188 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
189 sequence of two of the latter. It is important, therefore, to check the
190 length along the reference, not along the subject (earlier code did this
191 wrong). */
192
193 PCRE_PUCHAR endptr = p + length;
194 while (p < endptr)
195 {
196 pcre_uint32 c, d;
197 const ucd_record *ur;
198 if (eptr >= md->end_subject) return -2; /* Partial match */
199 GETCHARINC(c, eptr);
200 GETCHARINC(d, p);
201 ur = GET_UCD(d);
202 if (c != d && c != d + ur->other_case)
203 {
204 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
205 for (;;)
206 {
207 if (c < *pp) return -1;
208 if (c == *pp++) break;
209 }
210 }
211 }
212 }
213 else
214 #endif
215 #endif
216
217 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
218 is no UCP support. */
219 {
220 while (length-- > 0)
221 {
222 pcre_uchar cc, cp;
223 if (eptr >= md->end_subject) return -2; /* Partial match */
224 cc = RAWUCHARTEST(eptr);
225 cp = RAWUCHARTEST(p);
226 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
227 p++;
228 eptr++;
229 }
230 }
231 }
232
233 /* In the caseful case, we can just compare the bytes, whether or not we
234 are in UTF-8 mode. */
235
236 else
237 {
238 while (length-- > 0)
239 {
240 if (eptr >= md->end_subject) return -2; /* Partial match */
241 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
242 }
243 }
244
245 return (int)(eptr - eptr_start);
246 }
247
248
249
250 /***************************************************************************
251 ****************************************************************************
252 RECURSION IN THE match() FUNCTION
253
254 The match() function is highly recursive, though not every recursive call
255 increases the recursive depth. Nevertheless, some regular expressions can cause
256 it to recurse to a great depth. I was writing for Unix, so I just let it call
257 itself recursively. This uses the stack for saving everything that has to be
258 saved for a recursive call. On Unix, the stack can be large, and this works
259 fine.
260
261 It turns out that on some non-Unix-like systems there are problems with
262 programs that use a lot of stack. (This despite the fact that every last chip
263 has oodles of memory these days, and techniques for extending the stack have
264 been known for decades.) So....
265
266 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
267 calls by keeping local variables that need to be preserved in blocks of memory
268 obtained from malloc() instead of on the stack. Macros are used to
269 achieve this so that the actual code doesn't look very different to what it
270 always used to.
271
272 The original heap-recursive code used longjmp(). However, it seems that this
273 can be very slow on some operating systems. Following a suggestion from Stan
274 Switzer, the use of longjmp() has been abolished, at the cost of having to
275 provide a unique number for each call to RMATCH. There is no way of generating
276 a sequence of numbers at compile time in C. I have given them names, to make
277 them stand out more clearly.
278
279 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
280 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
281 tests. Furthermore, not using longjmp() means that local dynamic variables
282 don't have indeterminate values; this has meant that the frame size can be
283 reduced because the result can be "passed back" by straight setting of the
284 variable instead of being passed in the frame.
285 ****************************************************************************
286 ***************************************************************************/
287
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever this list changes, the code at HEAP_RETURN below must be
changed to match. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,  RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49, RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57, RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65, RM66, RM67
};
298
299 /* These versions of the macros use the stack, as normal. There are debugging
300 versions and production versions. Note that the "rw" argument of RMATCH isn't
301 actually used in this definition. */
302
303 #ifndef NO_RECURSE
304 #define REGISTER register
305
#ifdef PCRE_DEBUG

/* Debugging versions. Each is wrapped in do { ... } while (0) so that, like
the production versions below, it is a single statement that needs its
trailing semicolon; this keeps the two builds syntactically interchangeable
(for example when used as the body of an "if" that has an "else"). */

/* Recursive call of match() with tracing. The result is left in rrc. The
"rw" argument is not used by the stack-based versions. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  do \
    { \
    printf("match() called in line %d\n", __LINE__); \
    rrc = match((ra),(rb),mstart,(rc),(rd),(re),rdepth+1); \
    printf("to line %d\n", __LINE__); \
    } \
  while (0)

/* Return from match() with tracing. The argument is evaluated exactly once. */

#define RRETURN(ra) \
  do \
    { \
    int rreturn_value_ = (ra); \
    printf("match() returned %d from line %d\n", rreturn_value_, __LINE__); \
    return rreturn_value_; \
    } \
  while (0)

#else

/* Production versions: a plain recursive call, with the result left in rrc,
and a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match((ra),(rb),mstart,(rc),(rd),(re),rdepth+1)
#define RRETURN(ra) return (ra)

#endif
323
324 #else
325
326
327 /* These versions of the macros manage a private stack on the heap. Note that
328 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
329 argument of match(), which never changes. */
330
331 #define REGISTER
332
/* "Call" match() without true recursion. The frame that follows the current
one is reused if it already exists; otherwise it is obtained from
stack_malloc and linked in. The resume point (rw) is recorded in the current
frame, the argument values are copied into the next frame, that frame becomes
current, and control jumps to HEAP_RECURSE. Execution continues at the L_<rw>
label once control is sent back to it. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *callee = frame->Xnextframe;\
  if (callee == NULL)\
    {\
    callee = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (callee == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    callee->Xnextframe = NULL;\
    frame->Xnextframe = callee;\
    }\
  frame->Xwhere = rw;\
  callee->Xeptr = ra;\
  callee->Xecode = rb;\
  callee->Xmstart = mstart;\
  callee->Xoffset_top = rc;\
  callee->Xeptrb = re;\
  callee->Xrdepth = frame->Xrdepth + 1;\
  callee->Xprevframe = frame;\
  frame = callee;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* "Return" from match(). The current frame is abandoned in favour of its
predecessor. If there is no predecessor this is the outermost level, and a
real return is done; otherwise the result is put into rrc and control goes
to HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  heapframe *finished = frame;\
  frame = finished->Xprevframe;\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
369
370
371 /* Structure for remembering the local variables in a private frame */
372
373 typedef struct heapframe {
374 struct heapframe *Xprevframe;
375 struct heapframe *Xnextframe;
376
377 /* Function arguments that may change */
378
379 PCRE_PUCHAR Xeptr;
380 const pcre_uchar *Xecode;
381 PCRE_PUCHAR Xmstart;
382 int Xoffset_top;
383 eptrblock *Xeptrb;
384 unsigned int Xrdepth;
385
386 /* Function local variables */
387
388 PCRE_PUCHAR Xcallpat;
389 #ifdef SUPPORT_UTF
390 PCRE_PUCHAR Xcharptr;
391 #endif
392 PCRE_PUCHAR Xdata;
393 PCRE_PUCHAR Xnext;
394 PCRE_PUCHAR Xpp;
395 PCRE_PUCHAR Xprev;
396 PCRE_PUCHAR Xsaved_eptr;
397
398 recursion_info Xnew_recursive;
399
400 BOOL Xcur_is_word;
401 BOOL Xcondition;
402 BOOL Xprev_is_word;
403
404 #ifdef SUPPORT_UCP
405 int Xprop_type;
406 unsigned int Xprop_value;
407 int Xprop_fail_result;
408 int Xoclength;
409 pcre_uchar Xocchars[6];
410 #endif
411
412 int Xcodelink;
413 int Xctype;
414 unsigned int Xfc;
415 int Xfi;
416 int Xlength;
417 int Xmax;
418 int Xmin;
419 unsigned int Xnumber;
420 int Xoffset;
421 unsigned int Xop;
422 int Xsave_capture_last;
423 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
424 int Xstacksave[REC_STACK_SAVE_MAX];
425
426 eptrblock Xnewptrb;
427
428 /* Where to jump back to */
429
430 int Xwhere;
431
432 } heapframe;
433
434 #endif
435
436
437 /***************************************************************************
438 ***************************************************************************/
439
440
441
442 /*************************************************
443 * Match from current position *
444 *************************************************/
445
446 /* This function is called recursively in many circumstances. Whenever it
447 returns a negative (error) response, the outer incarnation must also return the
448 same response. */
449
/* Tests used for partial matching, packaged as macros because they appear in
several places in the code. The "hit end" flag is set when the pointer is at
the end of the subject and is also past the start of the subject (that is,
something has been matched). For hard partial matching, an immediate return
follows. */

/* General form: includes the test for being at the end of the subject. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* Short form: for use when it is already known that the pointer is past the
end of the subject, so that test is omitted. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
471
472
473 /* Performance note: It might be tempting to extract commonly used fields from
474 the md structure (e.g. utf, end_subject) into individual variables to improve
475 performance. Tests using gcc on a SPARC disproved this; in the first case, it
476 made performance worse.
477
478 Arguments:
479 eptr pointer to current character in subject
480 ecode pointer to current position in compiled code
481 mstart pointer to the current match start position (can be modified
482 by encountering \K)
483 offset_top current top pointer
484 md pointer to "static" info for the match
485 eptrb pointer to chain of blocks containing eptr at start of
486 brackets - for testing for empty matches
487 rdepth the recursion depth
488
489 Returns: MATCH_MATCH if matched ) these values are >= 0
490 MATCH_NOMATCH if failed to match )
491 a negative MATCH_xxx value for PRUNE, SKIP, etc
492 a negative PCRE_ERROR_xxx value if aborted by an error condition
493 (e.g. stopped by repeated call or recursion limit)
494 */
495
496 static int
497 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
498 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
499 unsigned int rdepth)
500 {
501 /* These variables do not need to be preserved over recursion in this function,
502 so they can be ordinary variables in all cases. Mark some of them with
503 "register" because they are used a lot in loops. */
504
505 register int rrc; /* Returns from recursive calls */
506 register int i; /* Used for loops not involving calls to RMATCH() */
507 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
508 register BOOL utf; /* Local copy of UTF flag for speed */
509
510 BOOL minimize, possessive; /* Quantifier options */
511 BOOL caseless;
512 int condcode;
513
514 /* When recursion is not being used, all "local" variables that have to be
515 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
516 frame on the stack here; subsequent instantiations are obtained from the heap
517 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
518 the top-level on the stack rather than malloc-ing them all gives a performance
519 boost in many cases where there is not much "recursion". */
520
521 #ifdef NO_RECURSE
522 heapframe *frame = (heapframe *)md->match_frames_base;
523
524 /* Copy in the original argument variables */
525
526 frame->Xeptr = eptr;
527 frame->Xecode = ecode;
528 frame->Xmstart = mstart;
529 frame->Xoffset_top = offset_top;
530 frame->Xeptrb = eptrb;
531 frame->Xrdepth = rdepth;
532
533 /* This is where control jumps back to to effect "recursion" */
534
535 HEAP_RECURSE:
536
537 /* Macros make the argument variables come from the current frame */
538
539 #define eptr frame->Xeptr
540 #define ecode frame->Xecode
541 #define mstart frame->Xmstart
542 #define offset_top frame->Xoffset_top
543 #define eptrb frame->Xeptrb
544 #define rdepth frame->Xrdepth
545
546 /* Ditto for the local variables */
547
548 #ifdef SUPPORT_UTF
549 #define charptr frame->Xcharptr
550 #endif
551 #define callpat frame->Xcallpat
552 #define codelink frame->Xcodelink
553 #define data frame->Xdata
554 #define next frame->Xnext
555 #define pp frame->Xpp
556 #define prev frame->Xprev
557 #define saved_eptr frame->Xsaved_eptr
558
559 #define new_recursive frame->Xnew_recursive
560
561 #define cur_is_word frame->Xcur_is_word
562 #define condition frame->Xcondition
563 #define prev_is_word frame->Xprev_is_word
564
565 #ifdef SUPPORT_UCP
566 #define prop_type frame->Xprop_type
567 #define prop_value frame->Xprop_value
568 #define prop_fail_result frame->Xprop_fail_result
569 #define oclength frame->Xoclength
570 #define occhars frame->Xocchars
571 #endif
572
573 #define ctype frame->Xctype
574 #define fc frame->Xfc
575 #define fi frame->Xfi
576 #define length frame->Xlength
577 #define max frame->Xmax
578 #define min frame->Xmin
579 #define number frame->Xnumber
580 #define offset frame->Xoffset
581 #define op frame->Xop
582 #define save_capture_last frame->Xsave_capture_last
583 #define save_offset1 frame->Xsave_offset1
584 #define save_offset2 frame->Xsave_offset2
585 #define save_offset3 frame->Xsave_offset3
586 #define stacksave frame->Xstacksave
587
588 #define newptrb frame->Xnewptrb
589
590 /* When recursion is being used, local variables are allocated on the stack and
591 get preserved during recursion in the normal way. In this environment, fi and
592 i, and fc and c, can be the same variables. */
593
594 #else /* NO_RECURSE not defined */
595 #define fi i
596 #define fc c
597
598 /* Many of the following variables are used only in small blocks of the code.
599 My normal style of coding would have declared them within each of those blocks.
600 However, in order to accommodate the version of this code that uses an external
601 "stack" implemented on the heap, it is easier to declare them all here, so the
602 declarations can be cut out in a block. The only declarations within blocks
603 below are for variables that do not have to be preserved over a recursive call
604 to RMATCH(). */
605
606 #ifdef SUPPORT_UTF
607 const pcre_uchar *charptr;
608 #endif
609 const pcre_uchar *callpat;
610 const pcre_uchar *data;
611 const pcre_uchar *next;
612 PCRE_PUCHAR pp;
613 const pcre_uchar *prev;
614 PCRE_PUCHAR saved_eptr;
615
616 recursion_info new_recursive;
617
618 BOOL cur_is_word;
619 BOOL condition;
620 BOOL prev_is_word;
621
622 #ifdef SUPPORT_UCP
623 int prop_type;
624 unsigned int prop_value;
625 int prop_fail_result;
626 int oclength;
627 pcre_uchar occhars[6];
628 #endif
629
630 int codelink;
631 int ctype;
632 int length;
633 int max;
634 int min;
635 unsigned int number;
636 int offset;
637 unsigned int op;
638 int save_capture_last;
639 int save_offset1, save_offset2, save_offset3;
640 int stacksave[REC_STACK_SAVE_MAX];
641
642 eptrblock newptrb;
643
644 /* There is a special fudge for calling match() in a way that causes it to
645 measure the size of its basic stack frame when the stack is being used for
646 recursion. The second argument (ecode) being NULL triggers this behaviour. It
647 cannot normally ever be NULL. The return is the negated value of the frame
648 size. */
649
650 if (ecode == NULL)
651 {
652 if (rdepth == 0)
653 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
654 else
655 {
656 int len = (char *)&rdepth - (char *)eptr;
657 return (len > 0)? -len : len;
658 }
659 }
660 #endif /* NO_RECURSE */
661
662 /* To save space on the stack and in the heap frame, I have doubled up on some
663 of the local variables that are used only in localised parts of the code, but
664 still need to be preserved over recursive calls of match(). These macros define
665 the alternative names that are used. */
666
667 #define allow_zero cur_is_word
668 #define cbegroup condition
669 #define code_offset codelink
670 #define condassert condition
671 #define matched_once prev_is_word
672 #define foc number
673 #define save_mark data
674
675 /* These statements are here to stop the compiler complaining about uninitialized
676 variables. */
677
678 #ifdef SUPPORT_UCP
679 prop_value = 0;
680 prop_fail_result = 0;
681 #endif
682
683
684 /* This label is used for tail recursion, which is used in a few cases even
685 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
686 used. Thanks to Ian Taylor for noticing this possibility and sending the
687 original patch. */
688
689 TAIL_RECURSE:
690
691 /* OK, now we can get on with the real code of the function. Recursive calls
692 are specified by the macro RMATCH and RRETURN is used to return. When
693 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
694 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
695 defined). However, RMATCH isn't like a function call because it's quite a
696 complicated macro. It has to be used in one particular way. This shouldn't,
697 however, impact performance when true recursion is being used. */
698
699 #ifdef SUPPORT_UTF
700 utf = md->utf; /* Local copy of the flag */
701 #else
702 utf = FALSE;
703 #endif
704
705 /* First check that we haven't called match() too many times, or that we
706 haven't exceeded the recursive call limit. */
707
708 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
709 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
710
711 /* At the start of a group with an unlimited repeat that may match an empty
712 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
713 done this way to save having to use another function argument, which would take
714 up space on the stack. See also MATCH_CONDASSERT below.
715
716 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
717 such remembered pointers, to be checked when we hit the closing ket, in order
718 to break infinite loops that match no characters. When match() is called in
719 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
720 NOT be used with tail recursion, because the memory block that is used is on
721 the stack, so a new one may be required for each match(). */
722
723 if (md->match_function_type == MATCH_CBEGROUP)
724 {
725 newptrb.epb_saved_eptr = eptr;
726 newptrb.epb_prev = eptrb;
727 eptrb = &newptrb;
728 md->match_function_type = 0;
729 }
730
731 /* Now start processing the opcodes. */
732
733 for (;;)
734 {
735 minimize = possessive = FALSE;
736 op = *ecode;
737
738 switch(op)
739 {
740 case OP_MARK:
741 md->nomatch_mark = ecode + 2;
742 md->mark = NULL; /* In case previously set by assertion */
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 eptrb, RM55);
745 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746 md->mark == NULL) md->mark = ecode + 2;
747
748 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
749 argument, and we must check whether that argument matches this MARK's
750 argument. It is passed back in md->start_match_ptr (an overloading of that
751 variable). If it does match, we reset that variable to the current subject
752 position and return MATCH_SKIP. Otherwise, pass back the return code
753 unaltered. */
754
755 else if (rrc == MATCH_SKIP_ARG &&
756 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
757 {
758 md->start_match_ptr = eptr;
759 RRETURN(MATCH_SKIP);
760 }
761 RRETURN(rrc);
762
763 case OP_FAIL:
764 RRETURN(MATCH_NOMATCH);
765
766 /* COMMIT overrides PRUNE, SKIP, and THEN */
767
768 case OP_COMMIT:
769 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
770 eptrb, RM52);
771 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
772 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
773 rrc != MATCH_THEN)
774 RRETURN(rrc);
775 RRETURN(MATCH_COMMIT);
776
777 /* PRUNE overrides THEN */
778
779 case OP_PRUNE:
780 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
781 eptrb, RM51);
782 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
783 RRETURN(MATCH_PRUNE);
784
785 case OP_PRUNE_ARG:
786 md->nomatch_mark = ecode + 2;
787 md->mark = NULL; /* In case previously set by assertion */
788 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
789 eptrb, RM56);
790 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
791 md->mark == NULL) md->mark = ecode + 2;
792 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
793 RRETURN(MATCH_PRUNE);
794
795 /* SKIP overrides PRUNE and THEN */
796
797 case OP_SKIP:
798 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
799 eptrb, RM53);
800 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
801 RRETURN(rrc);
802 md->start_match_ptr = eptr; /* Pass back current position */
803 RRETURN(MATCH_SKIP);
804
805 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
806 nomatch_mark. There is a flag that disables this opcode when re-matching a
807 pattern that ended with a SKIP for which there was not a matching MARK. */
808
809 case OP_SKIP_ARG:
810 if (md->ignore_skip_arg)
811 {
812 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
813 break;
814 }
815 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
816 eptrb, RM57);
817 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
818 RRETURN(rrc);
819
820 /* Pass back the current skip name by overloading md->start_match_ptr and
821 returning the special MATCH_SKIP_ARG return code. This will either be
822 caught by a matching MARK, or get to the top, where it causes a rematch
823 with the md->ignore_skip_arg flag set. */
824
825 md->start_match_ptr = ecode + 2;
826 RRETURN(MATCH_SKIP_ARG);
827
828 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
829 the branch in which it occurs can be determined. Overload the start of
830 match pointer to do this. */
831
832 case OP_THEN:
833 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
834 eptrb, RM54);
835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
836 md->start_match_ptr = ecode;
837 RRETURN(MATCH_THEN);
838
839 case OP_THEN_ARG:
840 md->nomatch_mark = ecode + 2;
841 md->mark = NULL; /* In case previously set by assertion */
842 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
843 md, eptrb, RM58);
844 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
845 md->mark == NULL) md->mark = ecode + 2;
846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
847 md->start_match_ptr = ecode;
848 RRETURN(MATCH_THEN);
849
850 /* Handle an atomic group that does not contain any capturing parentheses.
851 This can be handled like an assertion. Prior to 8.13, all atomic groups
852 were handled this way. In 8.13, the code was changed as below for ONCE, so
853 that backups pass through the group and thereby reset captured values.
854 However, this uses a lot more stack, so in 8.20, atomic groups that do not
855 contain any captures generate OP_ONCE_NC, which can be handled in the old,
856 less stack intensive way.
857
858 Check the alternative branches in turn - the matching won't pass the KET
859 for this kind of subpattern. If any one branch matches, we carry on as at
860 the end of a normal bracket, leaving the subject pointer, but resetting
861 the start-of-match value in case it was changed by \K. */
862
863 case OP_ONCE_NC:
864 prev = ecode;
865 saved_eptr = eptr;
866 save_mark = md->mark;
867 do
868 {
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
870 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
871 {
872 mstart = md->start_match_ptr;
873 break;
874 }
875 if (rrc == MATCH_THEN)
876 {
877 next = ecode + GET(ecode,1);
878 if (md->start_match_ptr < next &&
879 (*ecode == OP_ALT || *next == OP_ALT))
880 rrc = MATCH_NOMATCH;
881 }
882
883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
884 ecode += GET(ecode,1);
885 md->mark = save_mark;
886 }
887 while (*ecode == OP_ALT);
888
889 /* If hit the end of the group (which could be repeated), fail */
890
891 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
892
893 /* Continue as from after the group, updating the offsets high water
894 mark, since extracts may have been taken. */
895
896 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
897
898 offset_top = md->end_offset_top;
899 eptr = md->end_match_ptr;
900
901 /* For a non-repeating ket, just continue at this level. This also
902 happens for a repeating ket if no characters were matched in the group.
903 This is the forcible breaking of infinite loops as implemented in Perl
904 5.005. */
905
906 if (*ecode == OP_KET || eptr == saved_eptr)
907 {
908 ecode += 1+LINK_SIZE;
909 break;
910 }
911
912 /* The repeating kets try the rest of the pattern or restart from the
913 preceding bracket, in the appropriate order. The second "call" of match()
914 uses tail recursion, to avoid using another stack frame. */
915
916 if (*ecode == OP_KETRMIN)
917 {
918 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
920 ecode = prev;
921 goto TAIL_RECURSE;
922 }
923 else /* OP_KETRMAX */
924 {
925 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
927 ecode += 1 + LINK_SIZE;
928 goto TAIL_RECURSE;
929 }
930 /* Control never gets here */
931
932 /* Handle a capturing bracket, other than those that are possessive with an
933 unlimited repeat. If there is space in the offset vector, save the current
934 subject position in the working slot at the top of the vector. We mustn't
935 change the current values of the data slot, because they may be set from a
936 previous iteration of this group, and be referred to by a reference inside
937 the group. A failure to match might occur after the group has succeeded,
938 if something later on doesn't match. For this reason, we need to restore
939 the working value and also the values of the final offsets, in case they
940 were set by a previous iteration of the same bracket.
941
942 If there isn't enough space in the offset vector, treat this as if it were
943 a non-capturing bracket. Don't worry about setting the flag for the error
944 case here; that is handled in the code for KET. */
945
946 case OP_CBRA:
947 case OP_SCBRA:
948 number = GET2(ecode, 1+LINK_SIZE);
949 offset = number << 1;
950
951 #ifdef PCRE_DEBUG
952 printf("start bracket %d\n", number);
953 printf("subject=");
954 pchars(eptr, 16, TRUE, md);
955 printf("\n");
956 #endif
957
958 if (offset < md->offset_max)
959 {
960 save_offset1 = md->offset_vector[offset];
961 save_offset2 = md->offset_vector[offset+1];
962 save_offset3 = md->offset_vector[md->offset_end - number];
963 save_capture_last = md->capture_last;
964 save_mark = md->mark;
965
966 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
967 md->offset_vector[md->offset_end - number] =
968 (int)(eptr - md->start_subject);
969
970 for (;;)
971 {
972 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
973 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
974 eptrb, RM1);
975 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
976
977 /* If we backed up to a THEN, check whether it is within the current
978 branch by comparing the address of the THEN that is passed back with
979 the end of the branch. If it is within the current branch, and the
980 branch is one of two or more alternatives (it either starts or ends
981 with OP_ALT), we have reached the limit of THEN's action, so convert
982 the return code to NOMATCH, which will cause normal backtracking to
983 happen from now on. Otherwise, THEN is passed back to an outer
984 alternative. This implements Perl's treatment of parenthesized groups,
985 where a group not containing | does not affect the current alternative,
986 that is, (X) is NOT the same as (X|(*F)). */
987
988 if (rrc == MATCH_THEN)
989 {
990 next = ecode + GET(ecode,1);
991 if (md->start_match_ptr < next &&
992 (*ecode == OP_ALT || *next == OP_ALT))
993 rrc = MATCH_NOMATCH;
994 }
995
996 /* Anything other than NOMATCH is passed back. */
997
998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
999 md->capture_last = save_capture_last;
1000 ecode += GET(ecode, 1);
1001 md->mark = save_mark;
1002 if (*ecode != OP_ALT) break;
1003 }
1004
1005 DPRINTF(("bracket %d failed\n", number));
1006 md->offset_vector[offset] = save_offset1;
1007 md->offset_vector[offset+1] = save_offset2;
1008 md->offset_vector[md->offset_end - number] = save_offset3;
1009
1010 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1011
1012 RRETURN(rrc);
1013 }
1014
1015 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1016 as a non-capturing bracket. */
1017
1018 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1019 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1020
1021 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1022
1023 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1024 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1025
1026 /* Non-capturing or atomic group, except for possessive with unlimited
1027 repeat and ONCE group with no captures. Loop for all the alternatives.
1028
1029 When we get to the final alternative within the brackets, we used to return
1030 the result of a recursive call to match() whatever happened so it was
1031 possible to reduce stack usage by turning this into a tail recursion,
1032 except in the case of a possibly empty group. However, now that there is
1033 the possibility of (*THEN) occurring in the final alternative, this
1034 optimization is no longer always possible.
1035
1036 We can optimize if we know there are no (*THEN)s in the pattern; at present
1037 this is the best that can be done.
1038
1039 MATCH_ONCE is returned when the end of an atomic group is successfully
1040 reached, but subsequent matching fails. It passes back up the tree (causing
1041 captured values to be reset) until the original atomic group level is
1042 reached. This is tested by comparing md->once_target with the start of the
1043 group. At this point, the return is converted into MATCH_NOMATCH so that
1044 previous backup points can be taken. */
1045
1046 case OP_ONCE:
1047 case OP_BRA:
1048 case OP_SBRA:
1049 DPRINTF(("start non-capturing bracket\n"));
1050
1051 for (;;)
1052 {
1053 if (op >= OP_SBRA || op == OP_ONCE)
1054 md->match_function_type = MATCH_CBEGROUP;
1055
1056 /* If this is not a possibly empty group, and there are no (*THEN)s in
1057 the pattern, and this is the final alternative, optimize as described
1058 above. */
1059
1060 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1061 {
1062 ecode += PRIV(OP_lengths)[*ecode];
1063 goto TAIL_RECURSE;
1064 }
1065
1066 /* In all other cases, we have to make another call to match(). */
1067
1068 save_mark = md->mark;
1069 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1070 RM2);
1071
1072 /* See comment in the code for capturing groups above about handling
1073 THEN. */
1074
1075 if (rrc == MATCH_THEN)
1076 {
1077 next = ecode + GET(ecode,1);
1078 if (md->start_match_ptr < next &&
1079 (*ecode == OP_ALT || *next == OP_ALT))
1080 rrc = MATCH_NOMATCH;
1081 }
1082
1083 if (rrc != MATCH_NOMATCH)
1084 {
1085 if (rrc == MATCH_ONCE)
1086 {
1087 const pcre_uchar *scode = ecode;
1088 if (*scode != OP_ONCE) /* If not at start, find it */
1089 {
1090 while (*scode == OP_ALT) scode += GET(scode, 1);
1091 scode -= GET(scode, 1);
1092 }
1093 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1094 }
1095 RRETURN(rrc);
1096 }
1097 ecode += GET(ecode, 1);
1098 md->mark = save_mark;
1099 if (*ecode != OP_ALT) break;
1100 }
1101
1102 RRETURN(MATCH_NOMATCH);
1103
1104 /* Handle possessive capturing brackets with an unlimited repeat. We come
1105 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1106 handled similarly to the normal case above. However, the matching is
1107 different. The end of these brackets will always be OP_KETRPOS, which
1108 returns MATCH_KETRPOS without going further in the pattern. By this means
1109 we can handle the group by iteration rather than recursion, thereby
1110 reducing the amount of stack needed. */
1111
1112 case OP_CBRAPOS:
1113 case OP_SCBRAPOS:
1114 allow_zero = FALSE;
1115
1116 POSSESSIVE_CAPTURE:
1117 number = GET2(ecode, 1+LINK_SIZE);
1118 offset = number << 1;
1119
1120 #ifdef PCRE_DEBUG
1121 printf("start possessive bracket %d\n", number);
1122 printf("subject=");
1123 pchars(eptr, 16, TRUE, md);
1124 printf("\n");
1125 #endif
1126
1127 if (offset < md->offset_max)
1128 {
1129 matched_once = FALSE;
1130 code_offset = (int)(ecode - md->start_code);
1131
1132 save_offset1 = md->offset_vector[offset];
1133 save_offset2 = md->offset_vector[offset+1];
1134 save_offset3 = md->offset_vector[md->offset_end - number];
1135 save_capture_last = md->capture_last;
1136
1137 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1138
1139 /* Each time round the loop, save the current subject position for use
1140 when the group matches. For MATCH_MATCH, the group has matched, so we
1141 restart it with a new subject starting position, remembering that we had
1142 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1143 usual. If we haven't matched any alternatives in any iteration, check to
1144 see if a previous iteration matched. If so, the group has matched;
1145 continue from afterwards. Otherwise it has failed; restore the previous
1146 capture values before returning NOMATCH. */
1147
1148 for (;;)
1149 {
1150 md->offset_vector[md->offset_end - number] =
1151 (int)(eptr - md->start_subject);
1152 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1153 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1154 eptrb, RM63);
1155 if (rrc == MATCH_KETRPOS)
1156 {
1157 offset_top = md->end_offset_top;
1158 eptr = md->end_match_ptr;
1159 ecode = md->start_code + code_offset;
1160 save_capture_last = md->capture_last;
1161 matched_once = TRUE;
1162 continue;
1163 }
1164
1165 /* See comment in the code for capturing groups above about handling
1166 THEN. */
1167
1168 if (rrc == MATCH_THEN)
1169 {
1170 next = ecode + GET(ecode,1);
1171 if (md->start_match_ptr < next &&
1172 (*ecode == OP_ALT || *next == OP_ALT))
1173 rrc = MATCH_NOMATCH;
1174 }
1175
1176 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1177 md->capture_last = save_capture_last;
1178 ecode += GET(ecode, 1);
1179 if (*ecode != OP_ALT) break;
1180 }
1181
1182 if (!matched_once)
1183 {
1184 md->offset_vector[offset] = save_offset1;
1185 md->offset_vector[offset+1] = save_offset2;
1186 md->offset_vector[md->offset_end - number] = save_offset3;
1187 }
1188
1189 if (allow_zero || matched_once)
1190 {
1191 ecode += 1 + LINK_SIZE;
1192 break;
1193 }
1194
1195 RRETURN(MATCH_NOMATCH);
1196 }
1197
1198 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1199 as a non-capturing bracket. */
1200
1201 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1202 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1203
1204 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1205
1206 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1207 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1208
1209 /* Non-capturing possessive bracket with unlimited repeat. We come here
1210 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1211 without the capturing complication. It is written out separately for speed
1212 and cleanliness. */
1213
1214 case OP_BRAPOS:
1215 case OP_SBRAPOS:
1216 allow_zero = FALSE;
1217
1218 POSSESSIVE_NON_CAPTURE:
1219 matched_once = FALSE;
1220 code_offset = (int)(ecode - md->start_code);
1221
1222 for (;;)
1223 {
1224 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1225 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1226 eptrb, RM48);
1227 if (rrc == MATCH_KETRPOS)
1228 {
1229 offset_top = md->end_offset_top;
1230 eptr = md->end_match_ptr;
1231 ecode = md->start_code + code_offset;
1232 matched_once = TRUE;
1233 continue;
1234 }
1235
1236 /* See comment in the code for capturing groups above about handling
1237 THEN. */
1238
1239 if (rrc == MATCH_THEN)
1240 {
1241 next = ecode + GET(ecode,1);
1242 if (md->start_match_ptr < next &&
1243 (*ecode == OP_ALT || *next == OP_ALT))
1244 rrc = MATCH_NOMATCH;
1245 }
1246
1247 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1248 ecode += GET(ecode, 1);
1249 if (*ecode != OP_ALT) break;
1250 }
1251
1252 if (matched_once || allow_zero)
1253 {
1254 ecode += 1 + LINK_SIZE;
1255 break;
1256 }
1257 RRETURN(MATCH_NOMATCH);
1258
1259 /* Control never reaches here. */
1260
1261 /* Conditional group: compilation checked that there are no more than
1262 two branches. If the condition is false, skipping the first branch takes us
1263 past the end if there is only one branch, but that's OK because that is
1264 exactly what going to the ket would do. */
1265
1266 case OP_COND:
1267 case OP_SCOND:
1268 codelink = GET(ecode, 1);
1269
1270 /* Because of the way auto-callout works during compile, a callout item is
1271 inserted between OP_COND and an assertion condition. */
1272
1273 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1274 {
1275 if (PUBL(callout) != NULL)
1276 {
1277 PUBL(callout_block) cb;
1278 cb.version = 2; /* Version 2 of the callout block */
1279 cb.callout_number = ecode[LINK_SIZE+2];
1280 cb.offset_vector = md->offset_vector;
1281 #if defined COMPILE_PCRE8
1282 cb.subject = (PCRE_SPTR)md->start_subject;
1283 #elif defined COMPILE_PCRE16
1284 cb.subject = (PCRE_SPTR16)md->start_subject;
1285 #elif defined COMPILE_PCRE32
1286 cb.subject = (PCRE_SPTR32)md->start_subject;
1287 #endif
1288 cb.subject_length = (int)(md->end_subject - md->start_subject);
1289 cb.start_match = (int)(mstart - md->start_subject);
1290 cb.current_position = (int)(eptr - md->start_subject);
1291 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1292 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1293 cb.capture_top = offset_top/2;
1294 cb.capture_last = md->capture_last;
1295 cb.callout_data = md->callout_data;
1296 cb.mark = md->nomatch_mark;
1297 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1298 if (rrc < 0) RRETURN(rrc);
1299 }
1300 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1301 }
1302
1303 condcode = ecode[LINK_SIZE+1];
1304
1305 /* Now see what the actual condition is */
1306
1307 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1308 {
1309 if (md->recursive == NULL) /* Not recursing => FALSE */
1310 {
1311 condition = FALSE;
1312 ecode += GET(ecode, 1);
1313 }
1314 else
1315 {
1316 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1317 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1318
1319 /* If the test is for recursion into a specific subpattern, and it is
1320 false, but the test was set up by name, scan the table to see if the
1321 name refers to any other numbers, and test them. The condition is true
1322 if any one is set. */
1323
1324 if (!condition && condcode == OP_NRREF)
1325 {
1326 pcre_uchar *slotA = md->name_table;
1327 for (i = 0; i < md->name_count; i++)
1328 {
1329 if (GET2(slotA, 0) == recno) break;
1330 slotA += md->name_entry_size;
1331 }
1332
1333 /* Found a name for the number - there can be only one; duplicate
1334 names for different numbers are allowed, but not vice versa. First
1335 scan down for duplicates. */
1336
1337 if (i < md->name_count)
1338 {
1339 pcre_uchar *slotB = slotA;
1340 while (slotB > md->name_table)
1341 {
1342 slotB -= md->name_entry_size;
1343 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1344 {
1345 condition = GET2(slotB, 0) == md->recursive->group_num;
1346 if (condition) break;
1347 }
1348 else break;
1349 }
1350
1351 /* Scan up for duplicates */
1352
1353 if (!condition)
1354 {
1355 slotB = slotA;
1356 for (i++; i < md->name_count; i++)
1357 {
1358 slotB += md->name_entry_size;
1359 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1360 {
1361 condition = GET2(slotB, 0) == md->recursive->group_num;
1362 if (condition) break;
1363 }
1364 else break;
1365 }
1366 }
1367 }
1368 }
1369
1370 /* Choose branch according to the condition */
1371
1372 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1373 }
1374 }
1375
1376 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1377 {
1378 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1379 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1380
1381 /* If the numbered capture is unset, but the reference was by name,
1382 scan the table to see if the name refers to any other numbers, and test
1383 them. The condition is true if any one is set. This is tediously similar
1384 to the code above, but not close enough to try to amalgamate. */
1385
1386 if (!condition && condcode == OP_NCREF)
1387 {
1388 unsigned int refno = offset >> 1;
1389 pcre_uchar *slotA = md->name_table;
1390
1391 for (i = 0; i < md->name_count; i++)
1392 {
1393 if (GET2(slotA, 0) == refno) break;
1394 slotA += md->name_entry_size;
1395 }
1396
1397 /* Found a name for the number - there can be only one; duplicate names
1398 for different numbers are allowed, but not vice versa. First scan down
1399 for duplicates. */
1400
1401 if (i < md->name_count)
1402 {
1403 pcre_uchar *slotB = slotA;
1404 while (slotB > md->name_table)
1405 {
1406 slotB -= md->name_entry_size;
1407 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1408 {
1409 offset = GET2(slotB, 0) << 1;
1410 condition = offset < offset_top &&
1411 md->offset_vector[offset] >= 0;
1412 if (condition) break;
1413 }
1414 else break;
1415 }
1416
1417 /* Scan up for duplicates */
1418
1419 if (!condition)
1420 {
1421 slotB = slotA;
1422 for (i++; i < md->name_count; i++)
1423 {
1424 slotB += md->name_entry_size;
1425 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1426 {
1427 offset = GET2(slotB, 0) << 1;
1428 condition = offset < offset_top &&
1429 md->offset_vector[offset] >= 0;
1430 if (condition) break;
1431 }
1432 else break;
1433 }
1434 }
1435 }
1436 }
1437
1438 /* Choose branch according to the condition */
1439
1440 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1441 }
1442
1443 else if (condcode == OP_DEF) /* DEFINE - always false */
1444 {
1445 condition = FALSE;
1446 ecode += GET(ecode, 1);
1447 }
1448
1449 /* The condition is an assertion. Call match() to evaluate it - setting
1450 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1451 an assertion. */
1452
1453 else
1454 {
1455 md->match_function_type = MATCH_CONDASSERT;
1456 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1457 if (rrc == MATCH_MATCH)
1458 {
1459 if (md->end_offset_top > offset_top)
1460 offset_top = md->end_offset_top; /* Captures may have happened */
1461 condition = TRUE;
1462 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1463 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1464 }
1465
1466 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1467 assertion; it is therefore treated as NOMATCH. */
1468
1469 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1470 {
1471 RRETURN(rrc); /* Need braces because of following else */
1472 }
1473 else
1474 {
1475 condition = FALSE;
1476 ecode += codelink;
1477 }
1478 }
1479
1480 /* We are now at the branch that is to be obeyed. As there is only one, can
1481 use tail recursion to avoid using another stack frame, except when there is
1482 unlimited repeat of a possibly empty group. In the latter case, a recursive
1483 call to match() is always required, unless the second alternative doesn't
1484 exist, in which case we can just plough on. Note that, for compatibility
1485 with Perl, the | in a conditional group is NOT treated as creating two
1486 alternatives. If a THEN is encountered in the branch, it propagates out to
1487 the enclosing alternative (unless nested in a deeper set of alternatives,
1488 of course). */
1489
1490 if (condition || *ecode == OP_ALT)
1491 {
1492 if (op != OP_SCOND)
1493 {
1494 ecode += 1 + LINK_SIZE;
1495 goto TAIL_RECURSE;
1496 }
1497
1498 md->match_function_type = MATCH_CBEGROUP;
1499 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1500 RRETURN(rrc);
1501 }
1502
1503 /* Condition false & no alternative; continue after the group. */
1504
1505 else
1506 {
1507 ecode += 1 + LINK_SIZE;
1508 }
1509 break;
1510
1511
1512 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1513 to close any currently open capturing brackets. */
1514
1515 case OP_CLOSE:
1516 number = GET2(ecode, 1);
1517 offset = number << 1;
1518
1519 #ifdef PCRE_DEBUG
1520 printf("end bracket %d at *ACCEPT", number);
1521 printf("\n");
1522 #endif
1523
1524 md->capture_last = number;
1525 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1526 {
1527 md->offset_vector[offset] =
1528 md->offset_vector[md->offset_end - number];
1529 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1530 if (offset_top <= offset) offset_top = offset + 2;
1531 }
1532 ecode += 1 + IMM2_SIZE;
1533 break;
1534
1535
1536 /* End of the pattern, either real or forced. */
1537
1538 case OP_END:
1539 case OP_ACCEPT:
1540 case OP_ASSERT_ACCEPT:
1541
1542 /* If we have matched an empty string, fail if not in an assertion and not
1543 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1544 is set and we have matched at the start of the subject. In both cases,
1545 backtracking will then try other alternatives, if any. */
1546
1547 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1548 md->recursive == NULL &&
1549 (md->notempty ||
1550 (md->notempty_atstart &&
1551 mstart == md->start_subject + md->start_offset)))
1552 RRETURN(MATCH_NOMATCH);
1553
1554 /* Otherwise, we have a match. */
1555
1556 md->end_match_ptr = eptr; /* Record where we ended */
1557 md->end_offset_top = offset_top; /* and how many extracts were taken */
1558 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1559
1560 /* For some reason, the macros don't work properly if an expression is
1561 given as the argument to RRETURN when the heap is in use. */
1562
1563 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1564 RRETURN(rrc);
1565
1566 /* Assertion brackets. Check the alternative branches in turn - the
1567 matching won't pass the KET for an assertion. If any one branch matches,
1568 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1569 start of each branch to move the current point backwards, so the code at
1570 this level is identical to the lookahead case. When the assertion is part
1571 of a condition, we want to return immediately afterwards. The caller of
1572 this incarnation of the match() function will have set MATCH_CONDASSERT in
1573 md->match_function_type, and one of these opcodes will be the first opcode
1574 that is processed. We use a local variable that is preserved over calls to
1575 match() to remember this case. */
1576
1577 case OP_ASSERT:
1578 case OP_ASSERTBACK:
1579 save_mark = md->mark;
1580 if (md->match_function_type == MATCH_CONDASSERT)
1581 {
1582 condassert = TRUE;
1583 md->match_function_type = 0;
1584 }
1585 else condassert = FALSE;
1586
1587 do
1588 {
1589 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1590 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1591 {
1592 mstart = md->start_match_ptr; /* In case \K reset it */
1593 break;
1594 }
1595 md->mark = save_mark;
1596
1597 /* A COMMIT failure must fail the entire assertion, without trying any
1598 subsequent branches. */
1599
1600 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1601
1602 /* PCRE does not allow THEN to escape beyond an assertion; it
1603 is treated as NOMATCH. */
1604
1605 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1606 ecode += GET(ecode, 1);
1607 }
1608 while (*ecode == OP_ALT);
1609
1610 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1611
1612 /* If checking an assertion for a condition, return MATCH_MATCH. */
1613
1614 if (condassert) RRETURN(MATCH_MATCH);
1615
1616 /* Continue from after the assertion, updating the offsets high water
1617 mark, since extracts may have been taken during the assertion. */
1618
1619 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1620 ecode += 1 + LINK_SIZE;
1621 offset_top = md->end_offset_top;
1622 continue;
1623
1624 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1625 PRUNE, or COMMIT means we must assume failure without checking subsequent
1626 branches. */
1627
1628 case OP_ASSERT_NOT:
1629 case OP_ASSERTBACK_NOT:
1630 save_mark = md->mark;
1631 if (md->match_function_type == MATCH_CONDASSERT)
1632 {
1633 condassert = TRUE;
1634 md->match_function_type = 0;
1635 }
1636 else condassert = FALSE;
1637
1638 do
1639 {
1640 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1641 md->mark = save_mark;
1642 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1643 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1644 {
1645 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1646 break;
1647 }
1648
1649 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1650 as NOMATCH. */
1651
1652 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1653 ecode += GET(ecode,1);
1654 }
1655 while (*ecode == OP_ALT);
1656
1657 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1658
1659 ecode += 1 + LINK_SIZE;
1660 continue;
1661
1662 /* Move the subject pointer back. This occurs only at the start of
1663 each branch of a lookbehind assertion. If we are too close to the start to
1664 move back, this match function fails. When working with UTF-8 we move
1665 back a number of characters, not bytes. */
1666
1667 case OP_REVERSE:
1668 #ifdef SUPPORT_UTF
1669 if (utf)
1670 {
1671 i = GET(ecode, 1);
1672 while (i-- > 0)
1673 {
1674 eptr--;
1675 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1676 BACKCHAR(eptr);
1677 }
1678 }
1679 else
1680 #endif
1681
1682 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1683
1684 {
1685 eptr -= GET(ecode, 1);
1686 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1687 }
1688
1689 /* Save the earliest consulted character, then skip to next op code */
1690
1691 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1692 ecode += 1 + LINK_SIZE;
1693 break;
1694
1695 /* The callout item calls an external function, if one is provided, passing
1696 details of the match so far. This is mainly for debugging, though the
1697 function is able to force a failure. */
1698
1699 case OP_CALLOUT:
1700 if (PUBL(callout) != NULL)
1701 {
1702 PUBL(callout_block) cb;
1703 cb.version = 2; /* Version 2 of the callout block */
1704 cb.callout_number = ecode[1];
1705 cb.offset_vector = md->offset_vector;
1706 #if defined COMPILE_PCRE8
1707 cb.subject = (PCRE_SPTR)md->start_subject;
1708 #elif defined COMPILE_PCRE16
1709 cb.subject = (PCRE_SPTR16)md->start_subject;
1710 #elif defined COMPILE_PCRE32
1711 cb.subject = (PCRE_SPTR32)md->start_subject;
1712 #endif
1713 cb.subject_length = (int)(md->end_subject - md->start_subject);
1714 cb.start_match = (int)(mstart - md->start_subject);
1715 cb.current_position = (int)(eptr - md->start_subject);
1716 cb.pattern_position = GET(ecode, 2);
1717 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1718 cb.capture_top = offset_top/2;
1719 cb.capture_last = md->capture_last;
1720 cb.callout_data = md->callout_data;
1721 cb.mark = md->nomatch_mark;
1722 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1723 if (rrc < 0) RRETURN(rrc);
1724 }
1725 ecode += 2 + 2*LINK_SIZE;
1726 break;
1727
1728 /* Recursion either matches the current regex, or some subexpression. The
1729 offset data is the offset to the starting bracket from the start of the
1730 whole pattern. (This is so that it works from duplicated subpatterns.)
1731
1732 The state of the capturing groups is preserved over recursion, and
1733 re-instated afterwards. We don't know how many are started and not yet
1734 finished (offset_top records the completed total) so we just have to save
1735 all the potential data. There may be up to 65535 such values, which is too
1736 large to put on the stack, but using malloc for small numbers seems
1737 expensive. As a compromise, the stack is used when there are no more than
1738 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1739
1740 There are also other values that have to be saved. We use a chained
1741 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1742 for the original version of this logic. It has, however, been hacked around
1743 a lot, so he is not to blame for the current way it works. */
1744
1745 case OP_RECURSE:
1746 {
1747 recursion_info *ri;
1748 unsigned int recno;
1749
1750 callpat = md->start_code + GET(ecode, 1);
1751 recno = (callpat == md->start_code)? 0 :
1752 GET2(callpat, 1 + LINK_SIZE);
1753
1754 /* Check for repeating a recursion without advancing the subject pointer.
1755 This should catch convoluted mutual recursions. (Some simple cases are
1756 caught at compile time.) */
1757
1758 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1759 if (recno == ri->group_num && eptr == ri->subject_position)
1760 RRETURN(PCRE_ERROR_RECURSELOOP);
1761
1762 /* Add to "recursing stack" */
1763
1764 new_recursive.group_num = recno;
1765 new_recursive.subject_position = eptr;
1766 new_recursive.prevrec = md->recursive;
1767 md->recursive = &new_recursive;
1768
1769 /* Where to continue from afterwards */
1770
1771 ecode += 1 + LINK_SIZE;
1772
1773 /* Now save the offset data */
1774
1775 new_recursive.saved_max = md->offset_end;
1776 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1777 new_recursive.offset_save = stacksave;
1778 else
1779 {
1780 new_recursive.offset_save =
1781 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1782 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1783 }
1784 memcpy(new_recursive.offset_save, md->offset_vector,
1785 new_recursive.saved_max * sizeof(int));
1786
1787 /* OK, now we can do the recursion. After processing each alternative,
1788 restore the offset data. If there were nested recursions, md->recursive
1789 might be changed, so reset it before looping. */
1790
1791 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1792 cbegroup = (*callpat >= OP_SBRA);
1793 do
1794 {
1795 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1796 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1797 md, eptrb, RM6);
1798 memcpy(md->offset_vector, new_recursive.offset_save,
1799 new_recursive.saved_max * sizeof(int));
1800 md->recursive = new_recursive.prevrec;
1801 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1802 {
1803 DPRINTF(("Recursion matched\n"));
1804 if (new_recursive.offset_save != stacksave)
1805 (PUBL(free))(new_recursive.offset_save);
1806
1807 /* Set where we got to in the subject, and reset the start in case
1808 it was changed by \K. This *is* propagated back out of a recursion,
1809 for Perl compatibility. */
1810
1811 eptr = md->end_match_ptr;
1812 mstart = md->start_match_ptr;
1813 goto RECURSION_MATCHED; /* Exit loop; end processing */
1814 }
1815
1816 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1817 is treated as NOMATCH. */
1818
1819 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1820 rrc != MATCH_COMMIT)
1821 {
1822 DPRINTF(("Recursion gave error %d\n", rrc));
1823 if (new_recursive.offset_save != stacksave)
1824 (PUBL(free))(new_recursive.offset_save);
1825 RRETURN(rrc);
1826 }
1827
1828 md->recursive = &new_recursive;
1829 callpat += GET(callpat, 1);
1830 }
1831 while (*callpat == OP_ALT);
1832
1833 DPRINTF(("Recursion didn't match\n"));
1834 md->recursive = new_recursive.prevrec;
1835 if (new_recursive.offset_save != stacksave)
1836 (PUBL(free))(new_recursive.offset_save);
1837 RRETURN(MATCH_NOMATCH);
1838 }
1839
1840 RECURSION_MATCHED:
1841 break;
1842
1843 /* An alternation is the end of a branch; scan along to find the end of the
1844 bracketed group and go to there. */
1845
1846 case OP_ALT:
1847 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1848 break;
1849
1850 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1851 indicating that it may occur zero times. It may repeat infinitely, or not
1852 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1853 with fixed upper repeat limits are compiled as a number of copies, with the
1854 optional ones preceded by BRAZERO or BRAMINZERO. */
1855
1856 case OP_BRAZERO:
1857 next = ecode + 1;
1858 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1860 do next += GET(next, 1); while (*next == OP_ALT);
1861 ecode = next + 1 + LINK_SIZE;
1862 break;
1863
1864 case OP_BRAMINZERO:
1865 next = ecode + 1;
1866 do next += GET(next, 1); while (*next == OP_ALT);
1867 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1869 ecode++;
1870 break;
1871
1872 case OP_SKIPZERO:
1873 next = ecode+1;
1874 do next += GET(next,1); while (*next == OP_ALT);
1875 ecode = next + 1 + LINK_SIZE;
1876 break;
1877
1878 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1879 here; just jump to the group, with allow_zero set TRUE. */
1880
1881 case OP_BRAPOSZERO:
1882 op = *(++ecode);
1883 allow_zero = TRUE;
1884 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1885 goto POSSESSIVE_NON_CAPTURE;
1886
1887 /* End of a group, repeated or non-repeating. */
1888
1889 case OP_KET:
1890 case OP_KETRMIN:
1891 case OP_KETRMAX:
1892 case OP_KETRPOS:
1893 prev = ecode - GET(ecode, 1);
1894
1895 /* If this was a group that remembered the subject start, in order to break
1896 infinite repeats of empty string matches, retrieve the subject start from
1897 the chain. Otherwise, set it NULL. */
1898
1899 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1900 {
1901 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1902 eptrb = eptrb->epb_prev; /* Backup to previous group */
1903 }
1904 else saved_eptr = NULL;
1905
1906 /* If we are at the end of an assertion group or a non-capturing atomic
1907 group, stop matching and return MATCH_MATCH, but record the current high
1908 water mark for use by positive assertions. We also need to record the match
1909 start in case it was changed by \K. */
1910
1911 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1912 *prev == OP_ONCE_NC)
1913 {
1914 md->end_match_ptr = eptr; /* For ONCE_NC */
1915 md->end_offset_top = offset_top;
1916 md->start_match_ptr = mstart;
1917 RRETURN(MATCH_MATCH); /* Sets md->mark */
1918 }
1919
1920 /* For capturing groups we have to check the group number back at the start
1921 and if necessary complete handling an extraction by setting the offsets and
1922 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1923 into group 0, so it won't be picked up here. Instead, we catch it when the
1924 OP_END is reached. Other recursion is handled here. We just have to record
1925 the current subject position and start match pointer and give a MATCH
1926 return. */
1927
1928 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1929 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1930 {
1931 number = GET2(prev, 1+LINK_SIZE);
1932 offset = number << 1;
1933
1934 #ifdef PCRE_DEBUG
1935 printf("end bracket %d", number);
1936 printf("\n");
1937 #endif
1938
1939 /* Handle a recursively called group. */
1940
1941 if (md->recursive != NULL && md->recursive->group_num == number)
1942 {
1943 md->end_match_ptr = eptr;
1944 md->start_match_ptr = mstart;
1945 RRETURN(MATCH_MATCH);
1946 }
1947
1948 /* Deal with capturing */
1949
1950 md->capture_last = number;
1951 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1952 {
1953 /* If offset is greater than offset_top, it means that we are
1954 "skipping" a capturing group, and that group's offsets must be marked
1955 unset. In earlier versions of PCRE, all the offsets were unset at the
1956 start of matching, but this doesn't work because atomic groups and
1957 assertions can cause a value to be set that should later be unset.
1958 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1959 part of the atomic group, but this is not on the final matching path,
1960 so must be unset when 2 is set. (If there is no group 2, there is no
1961 problem, because offset_top will then be 2, indicating no capture.) */
1962
1963 if (offset > offset_top)
1964 {
1965 register int *iptr = md->offset_vector + offset_top;
1966 register int *iend = md->offset_vector + offset;
1967 while (iptr < iend) *iptr++ = -1;
1968 }
1969
1970 /* Now make the extraction */
1971
1972 md->offset_vector[offset] =
1973 md->offset_vector[md->offset_end - number];
1974 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1975 if (offset_top <= offset) offset_top = offset + 2;
1976 }
1977 }
1978
1979 /* For an ordinary non-repeating ket, just continue at this level. This
1980 also happens for a repeating ket if no characters were matched in the
1981 group. This is the forcible breaking of infinite loops as implemented in
1982 Perl 5.005. For a non-repeating atomic group that includes captures,
1983 establish a backup point by processing the rest of the pattern at a lower
1984 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1985 original OP_ONCE level, thereby bypassing intermediate backup points, but
1986 resetting any captures that happened along the way. */
1987
1988 if (*ecode == OP_KET || eptr == saved_eptr)
1989 {
1990 if (*prev == OP_ONCE)
1991 {
1992 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1994 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1995 RRETURN(MATCH_ONCE);
1996 }
1997 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1998 break;
1999 }
2000
2001 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2002 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2003 at a time from the outer level, thus saving stack. */
2004
2005 if (*ecode == OP_KETRPOS)
2006 {
2007 md->end_match_ptr = eptr;
2008 md->end_offset_top = offset_top;
2009 RRETURN(MATCH_KETRPOS);
2010 }
2011
2012 /* The normal repeating kets try the rest of the pattern or restart from
2013 the preceding bracket, in the appropriate order. In the second case, we can
2014 use tail recursion to avoid using another stack frame, unless we have an
2015 atomic group or an unlimited repeat of a group that can match an empty
2016 string. */
2017
2018 if (*ecode == OP_KETRMIN)
2019 {
2020 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (*prev == OP_ONCE)
2023 {
2024 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2027 RRETURN(MATCH_ONCE);
2028 }
2029 if (*prev >= OP_SBRA) /* Could match an empty string */
2030 {
2031 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2032 RRETURN(rrc);
2033 }
2034 ecode = prev;
2035 goto TAIL_RECURSE;
2036 }
2037 else /* OP_KETRMAX */
2038 {
2039 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2040 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2041 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2042 if (*prev == OP_ONCE)
2043 {
2044 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046 md->once_target = prev;
2047 RRETURN(MATCH_ONCE);
2048 }
2049 ecode += 1 + LINK_SIZE;
2050 goto TAIL_RECURSE;
2051 }
2052 /* Control never gets here */
2053
2054 /* Not multiline mode: start of subject assertion, unless notbol. */
2055
2056 case OP_CIRC:
2057 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2058
2059 /* Start of subject assertion */
2060
2061 case OP_SOD:
2062 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2063 ecode++;
2064 break;
2065
2066 /* Multiline mode: start of subject unless notbol, or after any newline. */
2067
2068 case OP_CIRCM:
2069 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2070 if (eptr != md->start_subject &&
2071 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2072 RRETURN(MATCH_NOMATCH);
2073 ecode++;
2074 break;
2075
2076 /* Start of match assertion */
2077
2078 case OP_SOM:
2079 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2080 ecode++;
2081 break;
2082
2083 /* Reset the start of match point */
2084
2085 case OP_SET_SOM:
2086 mstart = eptr;
2087 ecode++;
2088 break;
2089
2090 /* Multiline mode: assert before any newline, or before end of subject
2091 unless noteol is set. */
2092
2093 case OP_DOLLM:
2094 if (eptr < md->end_subject)
2095 {
2096 if (!IS_NEWLINE(eptr))
2097 {
2098 if (md->partial != 0 &&
2099 eptr + 1 >= md->end_subject &&
2100 NLBLOCK->nltype == NLTYPE_FIXED &&
2101 NLBLOCK->nllen == 2 &&
2102 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2103 {
2104 md->hitend = TRUE;
2105 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2106 }
2107 RRETURN(MATCH_NOMATCH);
2108 }
2109 }
2110 else
2111 {
2112 if (md->noteol) RRETURN(MATCH_NOMATCH);
2113 SCHECK_PARTIAL();
2114 }
2115 ecode++;
2116 break;
2117
2118 /* Not multiline mode: assert before a terminating newline or before end of
2119 subject unless noteol is set. */
2120
2121 case OP_DOLL:
2122 if (md->noteol) RRETURN(MATCH_NOMATCH);
2123 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2124
2125 /* ... else fall through for endonly */
2126
2127 /* End of subject assertion (\z) */
2128
2129 case OP_EOD:
2130 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2131 SCHECK_PARTIAL();
2132 ecode++;
2133 break;
2134
2135 /* End of subject or ending \n assertion (\Z) */
2136
2137 case OP_EODN:
2138 ASSERT_NL_OR_EOS:
2139 if (eptr < md->end_subject &&
2140 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2141 {
2142 if (md->partial != 0 &&
2143 eptr + 1 >= md->end_subject &&
2144 NLBLOCK->nltype == NLTYPE_FIXED &&
2145 NLBLOCK->nllen == 2 &&
2146 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2147 {
2148 md->hitend = TRUE;
2149 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2150 }
2151 RRETURN(MATCH_NOMATCH);
2152 }
2153
2154 /* Either at end of string or \n before end. */
2155
2156 SCHECK_PARTIAL();
2157 ecode++;
2158 break;
2159
2160 /* Word boundary assertions */
2161
2162 case OP_NOT_WORD_BOUNDARY:
2163 case OP_WORD_BOUNDARY:
2164 {
2165
2166 /* Find out if the previous and current characters are "word" characters.
2167 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2168 be "non-word" characters. Remember the earliest consulted character for
2169 partial matching. */
2170
2171 #ifdef SUPPORT_UTF
2172 if (utf)
2173 {
2174 /* Get status of previous character */
2175
2176 if (eptr == md->start_subject) prev_is_word = FALSE; else
2177 {
2178 PCRE_PUCHAR lastptr = eptr - 1;
2179 BACKCHAR(lastptr);
2180 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2181 GETCHAR(c, lastptr);
2182 #ifdef SUPPORT_UCP
2183 if (md->use_ucp)
2184 {
2185 if (c == '_') prev_is_word = TRUE; else
2186 {
2187 int cat = UCD_CATEGORY(c);
2188 prev_is_word = (cat == ucp_L || cat == ucp_N);
2189 }
2190 }
2191 else
2192 #endif
2193 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2194 }
2195
2196 /* Get status of next character */
2197
2198 if (eptr >= md->end_subject)
2199 {
2200 SCHECK_PARTIAL();
2201 cur_is_word = FALSE;
2202 }
2203 else
2204 {
2205 GETCHAR(c, eptr);
2206 #ifdef SUPPORT_UCP
2207 if (md->use_ucp)
2208 {
2209 if (c == '_') cur_is_word = TRUE; else
2210 {
2211 int cat = UCD_CATEGORY(c);
2212 cur_is_word = (cat == ucp_L || cat == ucp_N);
2213 }
2214 }
2215 else
2216 #endif
2217 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2218 }
2219 }
2220 else
2221 #endif
2222
2223 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2224 consistency with the behaviour of \w we do use it in this case. */
2225
2226 {
2227 /* Get status of previous character */
2228
2229 if (eptr == md->start_subject) prev_is_word = FALSE; else
2230 {
2231 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2232 #ifdef SUPPORT_UCP
2233 if (md->use_ucp)
2234 {
2235 c = eptr[-1];
2236 if (c == '_') prev_is_word = TRUE; else
2237 {
2238 int cat = UCD_CATEGORY(c);
2239 prev_is_word = (cat == ucp_L || cat == ucp_N);
2240 }
2241 }
2242 else
2243 #endif
2244 prev_is_word = MAX_255(eptr[-1])
2245 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2246 }
2247
2248 /* Get status of next character */
2249
2250 if (eptr >= md->end_subject)
2251 {
2252 SCHECK_PARTIAL();
2253 cur_is_word = FALSE;
2254 }
2255 else
2256 #ifdef SUPPORT_UCP
2257 if (md->use_ucp)
2258 {
2259 c = *eptr;
2260 if (c == '_') cur_is_word = TRUE; else
2261 {
2262 int cat = UCD_CATEGORY(c);
2263 cur_is_word = (cat == ucp_L || cat == ucp_N);
2264 }
2265 }
2266 else
2267 #endif
2268 cur_is_word = MAX_255(*eptr)
2269 && ((md->ctypes[*eptr] & ctype_word) != 0);
2270 }
2271
2272 /* Now see if the situation is what we want */
2273
2274 if ((*ecode++ == OP_WORD_BOUNDARY)?
2275 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2276 RRETURN(MATCH_NOMATCH);
2277 }
2278 break;
2279
2280 /* Match any single character type except newline; have to take care with
2281 CRLF newlines and partial matching. */
2282
2283 case OP_ANY:
2284 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2285 if (md->partial != 0 &&
2286 eptr + 1 >= md->end_subject &&
2287 NLBLOCK->nltype == NLTYPE_FIXED &&
2288 NLBLOCK->nllen == 2 &&
2289 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2290 {
2291 md->hitend = TRUE;
2292 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2293 }
2294
2295 /* Fall through */
2296
2297 /* Match any single character whatsoever. */
2298
2299 case OP_ALLANY:
2300 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2301 { /* not be updated before SCHECK_PARTIAL. */
2302 SCHECK_PARTIAL();
2303 RRETURN(MATCH_NOMATCH);
2304 }
2305 eptr++;
2306 #ifdef SUPPORT_UTF
2307 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2308 #endif
2309 ecode++;
2310 break;
2311
2312 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2313 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2314
2315 case OP_ANYBYTE:
2316 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2317 { /* not be updated before SCHECK_PARTIAL. */
2318 SCHECK_PARTIAL();
2319 RRETURN(MATCH_NOMATCH);
2320 }
2321 eptr++;
2322 ecode++;
2323 break;
2324
2325 case OP_NOT_DIGIT:
2326 if (eptr >= md->end_subject)
2327 {
2328 SCHECK_PARTIAL();
2329 RRETURN(MATCH_NOMATCH);
2330 }
2331 GETCHARINCTEST(c, eptr);
2332 if (
2333 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2334 c < 256 &&
2335 #endif
2336 (md->ctypes[c] & ctype_digit) != 0
2337 )
2338 RRETURN(MATCH_NOMATCH);
2339 ecode++;
2340 break;
2341
2342 case OP_DIGIT:
2343 if (eptr >= md->end_subject)
2344 {
2345 SCHECK_PARTIAL();
2346 RRETURN(MATCH_NOMATCH);
2347 }
2348 GETCHARINCTEST(c, eptr);
2349 if (
2350 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2351 c > 255 ||
2352 #endif
2353 (md->ctypes[c] & ctype_digit) == 0
2354 )
2355 RRETURN(MATCH_NOMATCH);
2356 ecode++;
2357 break;
2358
2359 case OP_NOT_WHITESPACE:
2360 if (eptr >= md->end_subject)
2361 {
2362 SCHECK_PARTIAL();
2363 RRETURN(MATCH_NOMATCH);
2364 }
2365 GETCHARINCTEST(c, eptr);
2366 if (
2367 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2368 c < 256 &&
2369 #endif
2370 (md->ctypes[c] & ctype_space) != 0
2371 )
2372 RRETURN(MATCH_NOMATCH);
2373 ecode++;
2374 break;
2375
2376 case OP_WHITESPACE:
2377 if (eptr >= md->end_subject)
2378 {
2379 SCHECK_PARTIAL();
2380 RRETURN(MATCH_NOMATCH);
2381 }
2382 GETCHARINCTEST(c, eptr);
2383 if (
2384 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2385 c > 255 ||
2386 #endif
2387 (md->ctypes[c] & ctype_space) == 0
2388 )
2389 RRETURN(MATCH_NOMATCH);
2390 ecode++;
2391 break;
2392
2393 case OP_NOT_WORDCHAR:
2394 if (eptr >= md->end_subject)
2395 {
2396 SCHECK_PARTIAL();
2397 RRETURN(MATCH_NOMATCH);
2398 }
2399 GETCHARINCTEST(c, eptr);
2400 if (
2401 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2402 c < 256 &&
2403 #endif
2404 (md->ctypes[c] & ctype_word) != 0
2405 )
2406 RRETURN(MATCH_NOMATCH);
2407 ecode++;
2408 break;
2409
2410 case OP_WORDCHAR:
2411 if (eptr >= md->end_subject)
2412 {
2413 SCHECK_PARTIAL();
2414 RRETURN(MATCH_NOMATCH);
2415 }
2416 GETCHARINCTEST(c, eptr);
2417 if (
2418 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2419 c > 255 ||
2420 #endif
2421 (md->ctypes[c] & ctype_word) == 0
2422 )
2423 RRETURN(MATCH_NOMATCH);
2424 ecode++;
2425 break;
2426
2427 case OP_ANYNL:
2428 if (eptr >= md->end_subject)
2429 {
2430 SCHECK_PARTIAL();
2431 RRETURN(MATCH_NOMATCH);
2432 }
2433 GETCHARINCTEST(c, eptr);
2434 switch(c)
2435 {
2436 default: RRETURN(MATCH_NOMATCH);
2437
2438 case CHAR_CR:
2439 if (eptr >= md->end_subject)
2440 {
2441 SCHECK_PARTIAL();
2442 }
2443 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2444 break;
2445
2446 case CHAR_LF:
2447 break;
2448
2449 case CHAR_VT:
2450 case CHAR_FF:
2451 case CHAR_NEL:
2452 #ifndef EBCDIC
2453 case 0x2028:
2454 case 0x2029:
2455 #endif /* Not EBCDIC */
2456 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2457 break;
2458 }
2459 ecode++;
2460 break;
2461
2462 case OP_NOT_HSPACE:
2463 if (eptr >= md->end_subject)
2464 {
2465 SCHECK_PARTIAL();
2466 RRETURN(MATCH_NOMATCH);
2467 }
2468 GETCHARINCTEST(c, eptr);
2469 switch(c)
2470 {
2471 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2472 default: break;
2473 }
2474 ecode++;
2475 break;
2476
2477 case OP_HSPACE:
2478 if (eptr >= md->end_subject)
2479 {
2480 SCHECK_PARTIAL();
2481 RRETURN(MATCH_NOMATCH);
2482 }
2483 GETCHARINCTEST(c, eptr);
2484 switch(c)
2485 {
2486 HSPACE_CASES: break; /* Byte and multibyte cases */
2487 default: RRETURN(MATCH_NOMATCH);
2488 }
2489 ecode++;
2490 break;
2491
2492 case OP_NOT_VSPACE:
2493 if (eptr >= md->end_subject)
2494 {
2495 SCHECK_PARTIAL();
2496 RRETURN(MATCH_NOMATCH);
2497 }
2498 GETCHARINCTEST(c, eptr);
2499 switch(c)
2500 {
2501 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2502 default: break;
2503 }
2504 ecode++;
2505 break;
2506
2507 case OP_VSPACE:
2508 if (eptr >= md->end_subject)
2509 {
2510 SCHECK_PARTIAL();
2511 RRETURN(MATCH_NOMATCH);
2512 }
2513 GETCHARINCTEST(c, eptr);
2514 switch(c)
2515 {
2516 VSPACE_CASES: break;
2517 default: RRETURN(MATCH_NOMATCH);
2518 }
2519 ecode++;
2520 break;
2521
2522 #ifdef SUPPORT_UCP
2523 /* Check the next character by Unicode property. We will get here only
2524 if the support is in the binary; otherwise a compile-time error occurs. */
2525
2526 case OP_PROP:
2527 case OP_NOTPROP:
2528 if (eptr >= md->end_subject)
2529 {
2530 SCHECK_PARTIAL();
2531 RRETURN(MATCH_NOMATCH);
2532 }
2533 GETCHARINCTEST(c, eptr);
2534 {
2535 const pcre_uint32 *cp;
2536 const ucd_record *prop = GET_UCD(c);
2537
2538 switch(ecode[1])
2539 {
2540 case PT_ANY:
2541 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2542 break;
2543
2544 case PT_LAMP:
2545 if ((prop->chartype == ucp_Lu ||
2546 prop->chartype == ucp_Ll ||
2547 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2548 RRETURN(MATCH_NOMATCH);
2549 break;
2550
2551 case PT_GC:
2552 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2553 RRETURN(MATCH_NOMATCH);
2554 break;
2555
2556 case PT_PC:
2557 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2558 RRETURN(MATCH_NOMATCH);
2559 break;
2560
2561 case PT_SC:
2562 if ((ecode[2] != prop->script) == (op == OP_PROP))
2563 RRETURN(MATCH_NOMATCH);
2564 break;
2565
2566 /* These are specials */
2567
2568 case PT_ALNUM:
2569 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2570 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2571 RRETURN(MATCH_NOMATCH);
2572 break;
2573
2574 case PT_SPACE: /* Perl space */
2575 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2576 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2577 == (op == OP_NOTPROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2580
2581 case PT_PXSPACE: /* POSIX space */
2582 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2583 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2584 c == CHAR_FF || c == CHAR_CR)
2585 == (op == OP_NOTPROP))
2586 RRETURN(MATCH_NOMATCH);
2587 break;
2588
2589 case PT_WORD:
2590 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2591 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2592 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2593 RRETURN(MATCH_NOMATCH);
2594 break;
2595
2596 case PT_CLIST:
2597 cp = PRIV(ucd_caseless_sets) + ecode[2];
2598 for (;;)
2599 {
2600 if (c < *cp)
2601 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2602 if (c == *cp++)
2603 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2604 }
2605 break;
2606
2607 /* This should never occur */
2608
2609 default:
2610 RRETURN(PCRE_ERROR_INTERNAL);
2611 }
2612
2613 ecode += 3;
2614 }
2615 break;
2616
2617 /* Match an extended Unicode sequence. We will get here only if the support
2618 is in the binary; otherwise a compile-time error occurs. */
2619
2620 case OP_EXTUNI:
2621 if (eptr >= md->end_subject)
2622 {
2623 SCHECK_PARTIAL();
2624 RRETURN(MATCH_NOMATCH);
2625 }
2626 else
2627 {
2628 int lgb, rgb;
2629 GETCHARINCTEST(c, eptr);
2630 lgb = UCD_GRAPHBREAK(c);
2631 while (eptr < md->end_subject)
2632 {
2633 int len = 1;
2634 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2635 rgb = UCD_GRAPHBREAK(c);
2636 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2637 lgb = rgb;
2638 eptr += len;
2639 }
2640 }
2641 CHECK_PARTIAL();
2642 ecode++;
2643 break;
2644 #endif /* SUPPORT_UCP */
2645
2646
2647 /* Match a back reference, possibly repeatedly. Look past the end of the
2648 item to see if there is repeat information following. The code is similar
2649 to that for character classes, but repeated for efficiency. Then obey
2650 similar code to character type repeats - written out again for speed.
2651 However, if the referenced string is the empty string, always treat
2652 it as matched, any number of times (otherwise there could be infinite
2653 loops). */
2654
2655 case OP_REF:
2656 case OP_REFI:
2657 caseless = op == OP_REFI;
2658 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2659 ecode += 1 + IMM2_SIZE;
2660
2661 /* If the reference is unset, there are two possibilities:
2662
2663 (a) In the default, Perl-compatible state, set the length negative;
2664 this ensures that every attempt at a match fails. We can't just fail
2665 here, because of the possibility of quantifiers with zero minima.
2666
2667 (b) If the JavaScript compatibility flag is set, set the length to zero
2668 so that the back reference matches an empty string.
2669
2670 Otherwise, set the length to the length of what was matched by the
2671 referenced subpattern. */
2672
2673 if (offset >= offset_top || md->offset_vector[offset] < 0)
2674 length = (md->jscript_compat)? 0 : -1;
2675 else
2676 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2677
2678 /* Set up for repetition, or handle the non-repeated case */
2679
2680 switch (*ecode)
2681 {
2682 case OP_CRSTAR:
2683 case OP_CRMINSTAR:
2684 case OP_CRPLUS:
2685 case OP_CRMINPLUS:
2686 case OP_CRQUERY:
2687 case OP_CRMINQUERY:
2688 c = *ecode++ - OP_CRSTAR;
2689 minimize = (c & 1) != 0;
2690 min = rep_min[c]; /* Pick up values from tables; */
2691 max = rep_max[c]; /* zero for max => infinity */
2692 if (max == 0) max = INT_MAX;
2693 break;
2694
2695 case OP_CRRANGE:
2696 case OP_CRMINRANGE:
2697 minimize = (*ecode == OP_CRMINRANGE);
2698 min = GET2(ecode, 1);
2699 max = GET2(ecode, 1 + IMM2_SIZE);
2700 if (max == 0) max = INT_MAX;
2701 ecode += 1 + 2 * IMM2_SIZE;
2702 break;
2703
2704 default: /* No repeat follows */
2705 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2706 {
2707 if (length == -2) eptr = md->end_subject; /* Partial match */
2708 CHECK_PARTIAL();
2709 RRETURN(MATCH_NOMATCH);
2710 }
2711 eptr += length;
2712 continue; /* With the main loop */
2713 }
2714
2715 /* Handle repeated back references. If the length of the reference is
2716 zero, just continue with the main loop. If the length is negative, it
2717 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2718 zero, we can continue at the same level without recursion. For any other
2719 minimum, carrying on will result in NOMATCH. */
2720
2721 if (length == 0) continue;
2722 if (length < 0 && min == 0) continue;
2723
2724 /* First, ensure the minimum number of matches are present. We get back
2725 the length of the reference string explicitly rather than passing the
2726 address of eptr, so that eptr can be a register variable. */
2727
2728 for (i = 1; i <= min; i++)
2729 {
2730 int slength;
2731 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2732 {
2733 if (slength == -2) eptr = md->end_subject; /* Partial match */
2734 CHECK_PARTIAL();
2735 RRETURN(MATCH_NOMATCH);
2736 }
2737 eptr += slength;
2738 }
2739
2740 /* If min = max, continue at the same level without recursion.
2741 They are not both allowed to be zero. */
2742
2743 if (min == max) continue;
2744
2745 /* If minimizing, keep trying and advancing the pointer */
2746
2747 if (minimize)
2748 {
2749 for (fi = min;; fi++)
2750 {
2751 int slength;
2752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2754 if (fi >= max) RRETURN(MATCH_NOMATCH);
2755 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2756 {
2757 if (slength == -2) eptr = md->end_subject; /* Partial match */
2758 CHECK_PARTIAL();
2759 RRETURN(MATCH_NOMATCH);
2760 }
2761 eptr += slength;
2762 }
2763 /* Control never gets here */
2764 }
2765
2766 /* If maximizing, find the longest string and work backwards */
2767
2768 else
2769 {
2770 pp = eptr;
2771 for (i = min; i < max; i++)
2772 {
2773 int slength;
2774 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2775 {
2776 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2777 the soft partial matching case. */
2778
2779 if (slength == -2 && md->partial != 0 &&
2780 md->end_subject > md->start_used_ptr)
2781 {
2782 md->hitend = TRUE;
2783 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2784 }
2785 break;
2786 }
2787 eptr += slength;
2788 }
2789
2790 while (eptr >= pp)
2791 {
2792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2794 eptr -= length;
2795 }
2796 RRETURN(MATCH_NOMATCH);
2797 }
2798 /* Control never gets here */
2799
2800 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2801 used when all the characters in the class have values in the range 0-255,
2802 and either the matching is caseful, or the characters are in the range
2803 0-127 when UTF-8 processing is enabled. The only difference between
2804 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2805 encountered.
2806
2807 First, look past the end of the item to see if there is repeat information
2808 following. Then obey similar code to character type repeats - written out
2809 again for speed. */
2810
2811 case OP_NCLASS:
2812 case OP_CLASS:
2813 {
2814 /* The data variable is saved across frames, so the byte map needs to
2815 be stored there. */
2816 #define BYTE_MAP ((pcre_uint8 *)data)
2817 data = ecode + 1; /* Save for matching */
2818 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2819
2820 switch (*ecode)
2821 {
2822 case OP_CRSTAR:
2823 case OP_CRMINSTAR:
2824 case OP_CRPLUS:
2825 case OP_CRMINPLUS:
2826 case OP_CRQUERY:
2827 case OP_CRMINQUERY:
2828 c = *ecode++ - OP_CRSTAR;
2829 minimize = (c & 1) != 0;
2830 min = rep_min[c]; /* Pick up values from tables; */
2831 max = rep_max[c]; /* zero for max => infinity */
2832 if (max == 0) max = INT_MAX;
2833 break;
2834
2835 case OP_CRRANGE:
2836 case OP_CRMINRANGE:
2837 minimize = (*ecode == OP_CRMINRANGE);
2838 min = GET2(ecode, 1);
2839 max = GET2(ecode, 1 + IMM2_SIZE);
2840 if (max == 0) max = INT_MAX;
2841 ecode += 1 + 2 * IMM2_SIZE;
2842 break;
2843
2844 default: /* No repeat follows */
2845 min = max = 1;
2846 break;
2847 }
2848
2849 /* First, ensure the minimum number of matches are present. */
2850
2851 #ifdef SUPPORT_UTF
2852 if (utf)
2853 {
2854 for (i = 1; i <= min; i++)
2855 {
2856 if (eptr >= md->end_subject)
2857 {
2858 SCHECK_PARTIAL();
2859 RRETURN(MATCH_NOMATCH);
2860 }
2861 GETCHARINC(c, eptr);
2862 if (c > 255)
2863 {
2864 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2865 }
2866 else
2867 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2868 }
2869 }
2870 else
2871 #endif
2872 /* Not UTF mode */
2873 {
2874 for (i = 1; i <= min; i++)
2875 {
2876 if (eptr >= md->end_subject)
2877 {
2878 SCHECK_PARTIAL();
2879 RRETURN(MATCH_NOMATCH);
2880 }
2881 c = *eptr++;
2882 #ifndef COMPILE_PCRE8
2883 if (c > 255)
2884 {
2885 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2886 }
2887 else
2888 #endif
2889 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2890 }
2891 }
2892
2893 /* If max == min we can continue with the main loop without the
2894 need to recurse. */
2895
2896 if (min == max) continue;
2897
2898 /* If minimizing, keep testing the rest of the expression and advancing
2899 the pointer while it matches the class. */
2900
2901 if (minimize)
2902 {
2903 #ifdef SUPPORT_UTF
2904 if (utf)
2905 {
2906 for (fi = min;; fi++)
2907 {
2908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 if (fi >= max) RRETURN(MATCH_NOMATCH);
2911 if (eptr >= md->end_subject)
2912 {
2913 SCHECK_PARTIAL();
2914 RRETURN(MATCH_NOMATCH);
2915 }
2916 GETCHARINC(c, eptr);
2917 if (c > 255)
2918 {
2919 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2920 }
2921 else
2922 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2923 }
2924 }
2925 else
2926 #endif
2927 /* Not UTF mode */
2928 {
2929 for (fi = min;; fi++)
2930 {
2931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2933 if (fi >= max) RRETURN(MATCH_NOMATCH);
2934 if (eptr >= md->end_subject)
2935 {
2936 SCHECK_PARTIAL();
2937 RRETURN(MATCH_NOMATCH);
2938 }
2939 c = *eptr++;
2940 #ifndef COMPILE_PCRE8
2941 if (c > 255)
2942 {
2943 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2944 }
2945 else
2946 #endif
2947 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2948 }
2949 }
2950 /* Control never gets here */
2951 }
2952
2953 /* If maximizing, find the longest possible run, then work backwards. */
2954
2955 else
2956 {
2957 pp = eptr;
2958
2959 #ifdef SUPPORT_UTF
2960 if (utf)
2961 {
2962 for (i = min; i < max; i++)
2963 {
2964 int len = 1;
2965 if (eptr >= md->end_subject)
2966 {
2967 SCHECK_PARTIAL();
2968 break;
2969 }
2970 GETCHARLEN(c, eptr, len);
2971 if (c > 255)
2972 {
2973 if (op == OP_CLASS) break;
2974 }
2975 else
2976 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2977 eptr += len;
2978 }
2979 for (;;)
2980 {
2981 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2983 if (eptr-- == pp) break; /* Stop if tried at original pos */
2984 BACKCHAR(eptr);
2985 }
2986 }
2987 else
2988 #endif
2989 /* Not UTF mode */
2990 {
2991 for (i = min; i < max; i++)
2992 {
2993 if (eptr >= md->end_subject)
2994 {
2995 SCHECK_PARTIAL();
2996 break;
2997 }
2998 c = *eptr;
2999 #ifndef COMPILE_PCRE8
3000 if (c > 255)
3001 {
3002 if (op == OP_CLASS) break;
3003 }
3004 else
3005 #endif
3006 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3007 eptr++;
3008 }
3009 while (eptr >= pp)
3010 {
3011 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3013 eptr--;
3014 }
3015 }
3016
3017 RRETURN(MATCH_NOMATCH);
3018 }
3019 #undef BYTE_MAP
3020 }
3021 /* Control never gets here */
3022
3023
3024 /* Match an extended character class. This opcode is encountered only
3025 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3026 mode, because Unicode properties are supported in non-UTF-8 mode. */
3027
3028 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3029 case OP_XCLASS:
3030 {
3031 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3032 ecode += GET(ecode, 1); /* Advance past the item */
3033
3034 switch (*ecode)
3035 {
3036 case OP_CRSTAR:
3037 case OP_CRMINSTAR:
3038 case OP_CRPLUS:
3039 case OP_CRMINPLUS:
3040 case OP_CRQUERY:
3041 case OP_CRMINQUERY:
3042 c = *ecode++ - OP_CRSTAR;
3043 minimize = (c & 1) != 0;
3044 min = rep_min[c]; /* Pick up values from tables; */
3045 max = rep_max[c]; /* zero for max => infinity */
3046 if (max == 0) max = INT_MAX;
3047 break;
3048
3049 case OP_CRRANGE:
3050 case OP_CRMINRANGE:
3051 minimize = (*ecode == OP_CRMINRANGE);
3052 min = GET2(ecode, 1);
3053 max = GET2(ecode, 1 + IMM2_SIZE);
3054 if (max == 0) max = INT_MAX;
3055 ecode += 1 + 2 * IMM2_SIZE;
3056 break;
3057
3058 default: /* No repeat follows */
3059 min = max = 1;
3060 break;
3061 }
3062
3063 /* First, ensure the minimum number of matches are present. */
3064
3065 for (i = 1; i <= min; i++)
3066 {
3067 if (eptr >= md->end_subject)
3068 {
3069 SCHECK_PARTIAL();
3070 RRETURN(MATCH_NOMATCH);
3071 }
3072 GETCHARINCTEST(c, eptr);
3073 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3074 }
3075
3076 /* If max == min we can continue with the main loop without the
3077 need to recurse. */
3078
3079 if (min == max) continue;
3080
3081 /* If minimizing, keep testing the rest of the expression and advancing
3082 the pointer while it matches the class. */
3083
3084 if (minimize)
3085 {
3086 for (fi = min;; fi++)
3087 {
3088 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3089 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3090 if (fi >= max) RRETURN(MATCH_NOMATCH);
3091 if (eptr >= md->end_subject)
3092 {
3093 SCHECK_PARTIAL();
3094 RRETURN(MATCH_NOMATCH);
3095 }
3096 GETCHARINCTEST(c, eptr);
3097 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3098 }
3099 /* Control never gets here */
3100 }
3101
3102 /* If maximizing, find the longest possible run, then work backwards. */
3103
3104 else
3105 {
3106 pp = eptr;
3107 for (i = min; i < max; i++)
3108 {
3109 int len = 1;
3110 if (eptr >= md->end_subject)
3111 {
3112 SCHECK_PARTIAL();
3113 break;
3114 }
3115 #ifdef SUPPORT_UTF
3116 GETCHARLENTEST(c, eptr, len);
3117 #else
3118 c = *eptr;
3119 #endif
3120 if (!PRIV(xclass)(c, data, utf)) break;
3121 eptr += len;
3122 }
3123 for(;;)
3124 {
3125 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3127 if (eptr-- == pp) break; /* Stop if tried at original pos */
3128 #ifdef SUPPORT_UTF
3129 if (utf) BACKCHAR(eptr);
3130 #endif
3131 }
3132 RRETURN(MATCH_NOMATCH);
3133 }
3134
3135 /* Control never gets here */
3136 }
3137 #endif /* End of XCLASS */
3138
3139 /* Match a single character, casefully */
3140
3141 case OP_CHAR:
3142 #ifdef SUPPORT_UTF
3143 if (utf)
3144 {
3145 length = 1;
3146 ecode++;
3147 GETCHARLEN(fc, ecode, length);
3148 if (length > md->end_subject - eptr)
3149 {
3150 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3151 RRETURN(MATCH_NOMATCH);
3152 }
3153 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3154 }
3155 else
3156 #endif
3157 /* Not UTF mode */
3158 {
3159 if (md->end_subject - eptr < 1)
3160 {
3161 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3162 RRETURN(MATCH_NOMATCH);
3163 }
3164 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3165 ecode += 2;
3166 }
3167 break;
3168
3169 /* Match a single character, caselessly. If we are at the end of the
3170 subject, give up immediately. */
3171
3172 case OP_CHARI:
3173 if (eptr >= md->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178
3179 #ifdef SUPPORT_UTF
3180 if (utf)
3181 {
3182 length = 1;
3183 ecode++;
3184 GETCHARLEN(fc, ecode, length);
3185
3186 /* If the pattern character's value is < 128, we have only one byte, and
3187 we know that its other case must also be one byte long, so we can use the
3188 fast lookup table. We know that there is at least one byte left in the
3189 subject. */
3190
3191 if (fc < 128)
3192 {
3193 pcre_uchar cc = RAWUCHAR(eptr);
3194 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3195 ecode++;
3196 eptr++;
3197 }
3198
3199 /* Otherwise we must pick up the subject character. Note that we cannot
3200 use the value of "length" to check for sufficient bytes left, because the
3201 other case of the character may have more or fewer bytes. */
3202
3203 else
3204 {
3205 pcre_uint32 dc;
3206 GETCHARINC(dc, eptr);
3207 ecode += length;
3208
3209 /* If we have Unicode property support, we can use it to test the other
3210 case of the character, if there is one. */
3211
3212 if (fc != dc)
3213 {
3214 #ifdef SUPPORT_UCP
3215 if (dc != UCD_OTHERCASE(fc))
3216 #endif
3217 RRETURN(MATCH_NOMATCH);
3218 }
3219 }
3220 }
3221 else
3222 #endif /* SUPPORT_UTF */
3223
3224 /* Not UTF mode */
3225 {
3226 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3227 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3228 eptr++;
3229 ecode += 2;
3230 }
3231 break;
3232
3233 /* Match a single character repeatedly. */
3234
3235 case OP_EXACT:
3236 case OP_EXACTI:
3237 min = max = GET2(ecode, 1);
3238 ecode += 1 + IMM2_SIZE;
3239 goto REPEATCHAR;
3240
3241 case OP_POSUPTO:
3242 case OP_POSUPTOI:
3243 possessive = TRUE;
3244 /* Fall through */
3245
3246 case OP_UPTO:
3247 case OP_UPTOI:
3248 case OP_MINUPTO:
3249 case OP_MINUPTOI:
3250 min = 0;
3251 max = GET2(ecode, 1);
3252 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3253 ecode += 1 + IMM2_SIZE;
3254 goto REPEATCHAR;
3255
3256 case OP_POSSTAR:
3257 case OP_POSSTARI:
3258 possessive = TRUE;
3259 min = 0;
3260 max = INT_MAX;
3261 ecode++;
3262 goto REPEATCHAR;
3263
3264 case OP_POSPLUS:
3265 case OP_POSPLUSI:
3266 possessive = TRUE;
3267 min = 1;
3268 max = INT_MAX;
3269 ecode++;
3270 goto REPEATCHAR;
3271
3272 case OP_POSQUERY:
3273 case OP_POSQUERYI:
3274 possessive = TRUE;
3275 min = 0;
3276 max = 1;
3277 ecode++;
3278 goto REPEATCHAR;
3279
3280 case OP_STAR:
3281 case OP_STARI:
3282 case OP_MINSTAR:
3283 case OP_MINSTARI:
3284 case OP_PLUS:
3285 case OP_PLUSI:
3286 case OP_MINPLUS:
3287 case OP_MINPLUSI:
3288 case OP_QUERY:
3289 case OP_QUERYI:
3290 case OP_MINQUERY:
3291 case OP_MINQUERYI:
3292 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3293 minimize = (c & 1) != 0;
3294 min = rep_min[c]; /* Pick up values from tables; */
3295 max = rep_max[c]; /* zero for max => infinity */
3296 if (max == 0) max = INT_MAX;
3297
3298 /* Common code for all repeated single-character matches. */
3299
3300 REPEATCHAR:
3301 #ifdef SUPPORT_UTF
3302 if (utf)
3303 {
3304 length = 1;
3305 charptr = ecode;
3306 GETCHARLEN(fc, ecode, length);
3307 ecode += length;
3308
3309 /* Handle multibyte character matching specially here. There is
3310 support for caseless matching if UCP support is present. */
3311
3312 if (length > 1)
3313 {
3314 #ifdef SUPPORT_UCP
3315 pcre_uint32 othercase;
3316 if (op >= OP_STARI && /* Caseless */
3317 (othercase = UCD_OTHERCASE(fc)) != fc)
3318 oclength = PRIV(ord2utf)(othercase, occhars);
3319 else oclength = 0;
3320 #endif /* SUPPORT_UCP */
3321
3322 for (i = 1; i <= min; i++)
3323 {
3324 if (eptr <= md->end_subject - length &&
3325 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3326 #ifdef SUPPORT_UCP
3327 else if (oclength > 0 &&
3328 eptr <= md->end_subject - oclength &&
3329 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3330 #endif /* SUPPORT_UCP */
3331 else
3332 {
3333 CHECK_PARTIAL();
3334 RRETURN(MATCH_NOMATCH);
3335 }
3336 }
3337
3338 if (min == max) continue;
3339
3340 if (minimize)
3341 {
3342 for (fi = min;; fi++)
3343 {
3344 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3345 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3346 if (fi >= max) RRETURN(MATCH_NOMATCH);
3347 if (eptr <= md->end_subject - length &&
3348 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3349 #ifdef SUPPORT_UCP
3350 else if (oclength > 0 &&
3351 eptr <= md->end_subject - oclength &&
3352 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3353 #endif /* SUPPORT_UCP */
3354 else
3355 {
3356 CHECK_PARTIAL();
3357 RRETURN(MATCH_NOMATCH);
3358 }
3359 }
3360 /* Control never gets here */
3361 }
3362
3363 else /* Maximize */
3364 {
3365 pp = eptr;
3366 for (i = min; i < max; i++)
3367 {
3368 if (eptr <= md->end_subject - length &&
3369 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3370 #ifdef SUPPORT_UCP
3371 else if (oclength > 0 &&
3372 eptr <= md->end_subject - oclength &&
3373 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3374 #endif /* SUPPORT_UCP */
3375 else
3376 {
3377 CHECK_PARTIAL();
3378 break;
3379 }
3380 }
3381
3382 if (possessive) continue;
3383
3384 for(;;)
3385 {
3386 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3387 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3388 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3389 #ifdef SUPPORT_UCP
3390 eptr--;
3391 BACKCHAR(eptr);
3392 #else /* without SUPPORT_UCP */
3393 eptr -= length;
3394 #endif /* SUPPORT_UCP */
3395 }
3396 }
3397 /* Control never gets here */
3398 }
3399
3400 /* If the length of a UTF-8 character is 1, we fall through here, and
3401 obey the code as for non-UTF-8 characters below, though in this case the
3402 value of fc will always be < 128. */
3403 }
3404 else
3405 #endif /* SUPPORT_UTF */
3406 /* When not in UTF-8 mode, load a single-byte character. */
3407 fc = *ecode++;
3408
3409 /* The value of fc at this point is always one character, though we may
3410 or may not be in UTF mode. The code is duplicated for the caseless and
3411 caseful cases, for speed, since matching characters is likely to be quite
3412 common. First, ensure the minimum number of matches are present. If min =
3413 max, continue at the same level without recursing. Otherwise, if
3414 minimizing, keep trying the rest of the expression and advancing one
3415 matching character if failing, up to the maximum. Alternatively, if
3416 maximizing, find the maximum number of characters and work backwards. */
3417
3418 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3419 max, (char *)eptr));
3420
3421 if (op >= OP_STARI) /* Caseless */
3422 {
3423 #ifdef COMPILE_PCRE8
3424 /* fc must be < 128 if UTF is enabled. */
3425 foc = md->fcc[fc];
3426 #else
3427 #ifdef SUPPORT_UTF
3428 #ifdef SUPPORT_UCP
3429 if (utf && fc > 127)
3430 foc = UCD_OTHERCASE(fc);
3431 #else
3432 if (utf && fc > 127)
3433 foc = fc;
3434 #endif /* SUPPORT_UCP */
3435 else
3436 #endif /* SUPPORT_UTF */
3437 foc = TABLE_GET(fc, md->fcc, fc);
3438 #endif /* COMPILE_PCRE8 */
3439
3440 for (i = 1; i <= min; i++)
3441 {
3442 pcre_uint32 cc; /* Faster than pcre_uchar */
3443 if (eptr >= md->end_subject)
3444 {
3445 SCHECK_PARTIAL();
3446 RRETURN(MATCH_NOMATCH);
3447 }
3448 cc = RAWUCHARTEST(eptr);
3449 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3450 eptr++;
3451 }
3452 if (min == max) continue;
3453 if (minimize)
3454 {
3455 for (fi = min;; fi++)
3456 {
3457 pcre_uint32 cc; /* Faster than pcre_uchar */
3458 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3460 if (fi >= max) RRETURN(MATCH_NOMATCH);
3461 if (eptr >= md->end_subject)
3462 {
3463 SCHECK_PARTIAL();
3464 RRETURN(MATCH_NOMATCH);
3465 }
3466 cc = RAWUCHARTEST(eptr);
3467 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3468 eptr++;
3469 }
3470 /* Control never gets here */
3471 }
3472 else /* Maximize */
3473 {
3474 pp = eptr;
3475 for (i = min; i < max; i++)
3476 {
3477 pcre_uint32 cc; /* Faster than pcre_uchar */
3478 if (eptr >= md->end_subject)
3479 {
3480 SCHECK_PARTIAL();
3481 break;
3482 }
3483 cc = RAWUCHARTEST(eptr);
3484 if (fc != cc && foc != cc) break;
3485 eptr++;
3486 }
3487
3488 if (possessive) continue;
3489
3490 while (eptr >= pp)
3491 {
3492 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3493 eptr--;
3494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3495 }
3496 RRETURN(MATCH_NOMATCH);
3497 }
3498 /* Control never gets here */
3499 }
3500
3501 /* Caseful comparisons (includes all multi-byte characters) */
3502
3503 else
3504 {
3505 for (i = 1; i <= min; i++)
3506 {
3507 if (eptr >= md->end_subject)
3508 {
3509 SCHECK_PARTIAL();
3510 RRETURN(MATCH_NOMATCH);
3511 }
3512 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3513 }
3514
3515 if (min == max) continue;
3516
3517 if (minimize)
3518 {
3519 for (fi = min;; fi++)
3520 {
3521 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3522 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3523 if (fi >= max) RRETURN(MATCH_NOMATCH);
3524 if (eptr >= md->end_subject)
3525 {
3526 SCHECK_PARTIAL();
3527 RRETURN(MATCH_NOMATCH);
3528 }
3529 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3530 }
3531 /* Control never gets here */
3532 }
3533 else /* Maximize */
3534 {
3535 pp = eptr;
3536 for (i = min; i < max; i++)
3537 {
3538 if (eptr >= md->end_subject)
3539 {
3540 SCHECK_PARTIAL();
3541 break;
3542 }
3543 if (fc != RAWUCHARTEST(eptr)) break;
3544 eptr++;
3545 }
3546 if (possessive) continue;
3547
3548 while (eptr >= pp)
3549 {
3550 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3551 eptr--;
3552 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3553 }
3554 RRETURN(MATCH_NOMATCH);
3555 }
3556 }
3557 /* Control never gets here */
3558
3559 /* Match a negated single character. In UTF mode both the pattern character
3560 and the subject character can be multibyte. */
3561
3562 case OP_NOT:
3563 case OP_NOTI:
3564 if (eptr >= md->end_subject)
3565 {
3566 SCHECK_PARTIAL();
3567 RRETURN(MATCH_NOMATCH);
3568 }
3569 #ifdef SUPPORT_UTF
3570 if (utf)
3571 {
3572 register pcre_uint32 ch, och;
3573
3574 ecode++;
3575 GETCHARINC(ch, ecode);
3576 GETCHARINC(c, eptr);
3577
3578 if (op == OP_NOT)
3579 {
3580 if (ch == c) RRETURN(MATCH_NOMATCH);
3581 }
3582 else
3583 {
3584 #ifdef SUPPORT_UCP
3585 if (ch > 127)
3586 och = UCD_OTHERCASE(ch);
3587 #else
3588 if (ch > 127)
3589 och = ch;
3590 #endif /* SUPPORT_UCP */
3591 else
3592 och = TABLE_GET(ch, md->fcc, ch);
3593 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3594 }
3595 }
3596 else
3597 #endif
3598 {
3599 register pcre_uint32 ch = ecode[1];
3600 c = *eptr++;
3601 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3602 RRETURN(MATCH_NOMATCH);
3603 ecode += 2;
3604 }
3605 break;
3606
3607 /* Match a negated single character repeatedly. This is almost a
3608 repeat of the code for a repeated single character, but I haven't found a
3609 nice way of commoning these up that doesn't require a test of the
3610 positive/negative option for each character match. Maybe that wouldn't add
3611 very much to the time taken, but character matching *is* what this is all
3612 about... */
3613
3614 case OP_NOTEXACT:
3615 case OP_NOTEXACTI:
3616 min = max = GET2(ecode, 1);
3617 ecode += 1 + IMM2_SIZE;
3618 goto REPEATNOTCHAR;
3619
3620 case OP_NOTUPTO:
3621 case OP_NOTUPTOI:
3622 case OP_NOTMINUPTO:
3623 case OP_NOTMINUPTOI:
3624 min = 0;
3625 max = GET2(ecode, 1);
3626 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3627 ecode += 1 + IMM2_SIZE;
3628 goto REPEATNOTCHAR;
3629
3630 case OP_NOTPOSSTAR:
3631 case OP_NOTPOSSTARI:
3632 possessive = TRUE;
3633 min = 0;
3634 max = INT_MAX;
3635 ecode++;
3636 goto REPEATNOTCHAR;
3637
3638 case OP_NOTPOSPLUS:
3639 case OP_NOTPOSPLUSI:
3640 possessive = TRUE;
3641 min = 1;
3642 max = INT_MAX;
3643 ecode++;
3644 goto REPEATNOTCHAR;
3645
3646 case OP_NOTPOSQUERY:
3647 case OP_NOTPOSQUERYI:
3648 possessive = TRUE;
3649 min = 0;
3650 max = 1;
3651 ecode++;
3652 goto REPEATNOTCHAR;
3653
3654 case OP_NOTPOSUPTO:
3655 case OP_NOTPOSUPTOI:
3656 possessive = TRUE;
3657 min = 0;
3658 max = GET2(ecode, 1);
3659 ecode += 1 + IMM2_SIZE;
3660 goto REPEATNOTCHAR;
3661
3662 case OP_NOTSTAR:
3663 case OP_NOTSTARI:
3664 case OP_NOTMINSTAR:
3665 case OP_NOTMINSTARI:
3666 case OP_NOTPLUS:
3667 case OP_NOTPLUSI:
3668 case OP_NOTMINPLUS:
3669 case OP_NOTMINPLUSI:
3670 case OP_NOTQUERY:
3671 case OP_NOTQUERYI:
3672 case OP_NOTMINQUERY:
3673 case OP_NOTMINQUERYI:
3674 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3675 minimize = (c & 1) != 0;
3676 min = rep_min[c]; /* Pick up values from tables; */
3677 max = rep_max[c]; /* zero for max => infinity */
3678 if (max == 0) max = INT_MAX;
3679
3680 /* Common code for all repeated negated single-character matches. */
3681
3682 REPEATNOTCHAR:
3683 GETCHARINCTEST(fc, ecode);
3684
3685 /* The code is duplicated for the caseless and caseful cases, for speed,
3686 since matching characters is likely to be quite common. First, ensure the
3687 minimum number of matches are present. If min = max, continue at the same
3688 level without recursing. Otherwise, if minimizing, keep trying the rest of
3689 the expression and advancing one matching character if failing, up to the
3690 maximum. Alternatively, if maximizing, find the maximum number of
3691 characters and work backwards. */
3692
3693 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3694 max, (char *)eptr));
3695
3696 if (op >= OP_NOTSTARI) /* Caseless */
3697 {
3698 #ifdef SUPPORT_UTF
3699 #ifdef SUPPORT_UCP
3700 if (utf && fc > 127)
3701 foc = UCD_OTHERCASE(fc);
3702 #else
3703 if (utf && fc > 127)
3704 foc = fc;
3705 #endif /* SUPPORT_UCP */
3706 else
3707 #endif /* SUPPORT_UTF */
3708 foc = TABLE_GET(fc, md->fcc, fc);
3709
3710 #ifdef SUPPORT_UTF
3711 if (utf)
3712 {
3713 register pcre_uint32 d;
3714 for (i = 1; i <= min; i++)
3715 {
3716 if (eptr >= md->end_subject)
3717 {
3718 SCHECK_PARTIAL();
3719 RRETURN(MATCH_NOMATCH);
3720 }
3721 GETCHARINC(d, eptr);
3722 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3723 }
3724 }
3725 else
3726 #endif
3727 /* Not UTF mode */
3728 {
3729 for (i = 1; i <= min; i++)
3730 {
3731 if (eptr >= md->end_subject)
3732 {
3733 SCHECK_PARTIAL();
3734 RRETURN(MATCH_NOMATCH);
3735 }
3736 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3737 eptr++;
3738 }
3739 }
3740
3741 if (min == max) continue;
3742
3743 if (minimize)
3744 {
3745 #ifdef SUPPORT_UTF
3746 if (utf)
3747 {
3748 register pcre_uint32 d;
3749 for (fi = min;; fi++)
3750 {
3751 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3753 if (fi >= max) RRETURN(MATCH_NOMATCH);
3754 if (eptr >= md->end_subject)
3755 {
3756 SCHECK_PARTIAL();
3757 RRETURN(MATCH_NOMATCH);
3758 }
3759 GETCHARINC(d, eptr);
3760 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3761 }
3762 }
3763 else
3764 #endif
3765 /* Not UTF mode */
3766 {
3767 for (fi = min;; fi++)
3768 {
3769 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3770 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3772 if (eptr >= md->end_subject)
3773 {
3774 SCHECK_PARTIAL();
3775 RRETURN(MATCH_NOMATCH);
3776 }
3777 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3778 eptr++;
3779 }
3780 }
3781 /* Control never gets here */
3782 }
3783
3784 /* Maximize case */
3785
3786 else
3787 {
3788 pp = eptr;
3789
3790 #ifdef SUPPORT_UTF
3791 if (utf)
3792 {
3793 register pcre_uint32 d;
3794 for (i = min; i < max; i++)
3795 {
3796 int len = 1;
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 break;
3801 }
3802 GETCHARLEN(d, eptr, len);
3803 if (fc == d || (unsigned int)foc == d) break;
3804 eptr += len;
3805 }
3806 if (possessive) continue;
3807 for(;;)
3808 {
3809 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3811 if (eptr-- == pp) break; /* Stop if tried at original pos */
3812 BACKCHAR(eptr);
3813 }
3814 }
3815 else
3816 #endif
3817 /* Not UTF mode */
3818 {
3819 for (i = min; i < max; i++)
3820 {
3821 if (eptr >= md->end_subject)
3822 {
3823 SCHECK_PARTIAL();
3824 break;
3825 }
3826 if (fc == *eptr || foc == *eptr) break;
3827 eptr++;
3828 }
3829 if (possessive) continue;
3830 while (eptr >= pp)
3831 {
3832 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3834 eptr--;
3835 }
3836 }
3837
3838 RRETURN(MATCH_NOMATCH);
3839 }
3840 /* Control never gets here */
3841 }
3842
3843 /* Caseful comparisons */
3844
3845 else
3846 {
3847 #ifdef SUPPORT_UTF
3848 if (utf)
3849 {
3850 register pcre_uint32 d;
3851 for (i = 1; i <= min; i++)
3852 {
3853 if (eptr >= md->end_subject)
3854 {
3855 SCHECK_PARTIAL();
3856 RRETURN(MATCH_NOMATCH);
3857 }
3858 GETCHARINC(d, eptr);
3859 if (fc == d) RRETURN(MATCH_NOMATCH);
3860 }
3861 }
3862 else
3863 #endif
3864 /* Not UTF mode */
3865 {
3866 for (i = 1; i <= min; i++)
3867 {
3868 if (eptr >= md->end_subject)
3869 {
3870 SCHECK_PARTIAL();
3871 RRETURN(MATCH_NOMATCH);
3872 }
3873 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3874 }
3875 }
3876
3877 if (min == max) continue;
3878
3879 if (minimize)
3880 {
3881 #ifdef SUPPORT_UTF
3882 if (utf)
3883 {
3884 register pcre_uint32 d;
3885 for (fi = min;; fi++)
3886 {
3887 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3889 if (fi >= max) RRETURN(MATCH_NOMATCH);
3890 if (eptr >= md->end_subject)
3891 {
3892 SCHECK_PARTIAL();
3893 RRETURN(MATCH_NOMATCH);
3894 }
3895 GETCHARINC(d, eptr);
3896 if (fc == d) RRETURN(MATCH_NOMATCH);
3897 }
3898 }
3899 else
3900 #endif
3901 /* Not UTF mode */
3902 {
3903 for (fi = min;; fi++)
3904 {
3905 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3906 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3907 if (fi >= max) RRETURN(MATCH_NOMATCH);
3908 if (eptr >= md->end_subject)
3909 {
3910 SCHECK_PARTIAL();
3911 RRETURN(MATCH_NOMATCH);
3912 }
3913 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3914 }
3915 }
3916 /* Control never gets here */
3917 }
3918
3919 /* Maximize case */
3920
3921 else
3922 {
3923 pp = eptr;
3924
3925 #ifdef SUPPORT_UTF
3926 if (utf)
3927 {
3928 register pcre_uint32 d;
3929 for (i = min; i < max; i++)
3930 {
3931 int len = 1;
3932 if (eptr >= md->end_subject)
3933 {
3934 SCHECK_PARTIAL();
3935 break;
3936 }
3937 GETCHARLEN(d, eptr, len);
3938 if (fc == d) break;
3939 eptr += len;
3940 }
3941 if (possessive) continue;
3942 for(;;)
3943 {
3944 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3946 if (eptr-- == pp) break; /* Stop if tried at original pos */
3947 BACKCHAR(eptr);
3948 }
3949 }
3950 else
3951 #endif
3952 /* Not UTF mode */
3953 {
3954 for (i = min; i < max; i++)
3955 {
3956 if (eptr >= md->end_subject)
3957 {
3958 SCHECK_PARTIAL();
3959 break;
3960 }
3961 if (fc == *eptr) break;
3962 eptr++;
3963 }
3964 if (possessive) continue;
3965 while (eptr >= pp)
3966 {
3967 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3969 eptr--;
3970 }
3971 }
3972
3973 RRETURN(MATCH_NOMATCH);
3974 }
3975 }
3976 /* Control never gets here */
3977
3978 /* Match a single character type repeatedly; several different opcodes
3979 share code. This is very similar to the code for single characters, but we
3980 repeat it in the interests of efficiency. */
3981
3982 case OP_TYPEEXACT:
3983 min = max = GET2(ecode, 1);
3984 minimize = TRUE;
3985 ecode += 1 + IMM2_SIZE;
3986 goto REPEATTYPE;
3987
3988 case OP_TYPEUPTO:
3989 case OP_TYPEMINUPTO:
3990 min = 0;
3991 max = GET2(ecode, 1);
3992 minimize = *ecode == OP_TYPEMINUPTO;
3993 ecode += 1 + IMM2_SIZE;
3994 goto REPEATTYPE;
3995
3996 case OP_TYPEPOSSTAR:
3997 possessive = TRUE;
3998 min = 0;
3999 max = INT_MAX;
4000 ecode++;
4001 goto REPEATTYPE;
4002
4003 case OP_TYPEPOSPLUS:
4004 possessive = TRUE;
4005 min = 1;
4006 max = INT_MAX;
4007 ecode++;
4008 goto REPEATTYPE;
4009
4010 case OP_TYPEPOSQUERY:
4011 possessive = TRUE;
4012 min = 0;
4013 max = 1;
4014 ecode++;
4015 goto REPEATTYPE;
4016
4017 case OP_TYPEPOSUPTO:
4018 possessive = TRUE;
4019 min = 0;
4020 max = GET2(ecode, 1);
4021 ecode += 1 + IMM2_SIZE;
4022 goto REPEATTYPE;
4023
4024 case OP_TYPESTAR:
4025 case OP_TYPEMINSTAR:
4026 case OP_TYPEPLUS:
4027 case OP_TYPEMINPLUS:
4028 case OP_TYPEQUERY:
4029 case OP_TYPEMINQUERY:
4030 c = *ecode++ - OP_TYPESTAR;
4031 minimize = (c & 1) != 0;
4032 min = rep_min[c]; /* Pick up values from tables; */
4033 max = rep_max[c]; /* zero for max => infinity */
4034 if (max == 0) max = INT_MAX;
4035
4036 /* Common code for all repeated single character type matches. Note that
4037 in UTF mode, '.' matches a character of any length; OP_DIGIT and OP_WHITESPACE
4038 accept only one-byte characters, but other types can match multibyte ones. */
4039
4040 REPEATTYPE:
4041 ctype = *ecode++; /* Code for the character type */
4042
4043 #ifdef SUPPORT_UCP
4044 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4045 {
4046 prop_fail_result = ctype == OP_NOTPROP;
4047 prop_type = *ecode++;
4048 prop_value = *ecode++;
4049 }
4050 else prop_type = -1;
4051 #endif
4052
4053 /* First, ensure the minimum number of matches are present. Use inline
4054 code for maximizing the speed, and do the type test once at the start
4055 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4056 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4057 and single-bytes. */
4058
4059 if (min > 0)
4060 {
4061 #ifdef SUPPORT_UCP
4062 if (prop_type >= 0)
4063 {
4064 switch(prop_type)
4065 {
4066 case PT_ANY:
4067 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4068 for (i = 1; i <= min; i++)
4069 {
4070 if (eptr >= md->end_subject)
4071 {
4072 SCHECK_PARTIAL();
4073 RRETURN(MATCH_NOMATCH);
4074 }
4075 GETCHARINCTEST(c, eptr);
4076 }
4077 break;
4078
4079 case PT_LAMP:
4080 for (i = 1; i <= min; i++)
4081 {
4082 int chartype;
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 RRETURN(MATCH_NOMATCH);
4087 }
4088 GETCHARINCTEST(c, eptr);
4089 chartype = UCD_CHARTYPE(c);
4090 if ((chartype == ucp_Lu ||
4091 chartype == ucp_Ll ||
4092 chartype == ucp_Lt) == prop_fail_result)
4093 RRETURN(MATCH_NOMATCH);
4094 }
4095 break;
4096
4097 case PT_GC:
4098 for (i = 1; i <= min; i++)
4099 {
4100 if (eptr >= md->end_subject)
4101 {
4102 SCHECK_PARTIAL();
4103 RRETURN(MATCH_NOMATCH);
4104 }
4105 GETCHARINCTEST(c, eptr);
4106 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4107 RRETURN(MATCH_NOMATCH);
4108 }
4109 break;
4110
4111 case PT_PC:
4112 for (i = 1; i <= min; i++)
4113 {
4114 if (eptr >= md->end_subject)
4115 {
4116 SCHECK_PARTIAL();
4117 RRETURN(MATCH_NOMATCH);
4118 }
4119 GETCHARINCTEST(c, eptr);
4120 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4121 RRETURN(MATCH_NOMATCH);
4122 }
4123 break;
4124
4125 case PT_SC:
4126 for (i = 1; i <= min; i++)
4127 {
4128 if (eptr >= md->end_subject)
4129 {
4130 SCHECK_PARTIAL();
4131 RRETURN(MATCH_NOMATCH);
4132 }
4133 GETCHARINCTEST(c, eptr);
4134 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4135 RRETURN(MATCH_NOMATCH);
4136 }
4137 break;
4138
4139 case PT_ALNUM:
4140 for (i = 1; i <= min; i++)
4141 {
4142 int category;
4143 if (eptr >= md->end_subject)
4144 {
4145 SCHECK_PARTIAL();
4146 RRETURN(MATCH_NOMATCH);
4147 }
4148 GETCHARINCTEST(c, eptr);
4149 category = UCD_CATEGORY(c);
4150 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4151 RRETURN(MATCH_NOMATCH);
4152 }
4153 break;
4154
4155 case PT_SPACE: /* Perl space */
4156 for (i = 1; i <= min; i++)
4157 {
4158 if (eptr >= md->end_subject)
4159 {
4160 SCHECK_PARTIAL();
4161 RRETURN(MATCH_NOMATCH);
4162 }
4163 GETCHARINCTEST(c, eptr);
4164 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4165 c == CHAR_FF || c == CHAR_CR)
4166 == prop_fail_result)
4167 RRETURN(MATCH_NOMATCH);
4168 }
4169 break;
4170
4171 case PT_PXSPACE: /* POSIX space */
4172 for (i = 1; i <= min; i++)
4173 {
4174 if (eptr >= md->end_subject)
4175 {
4176 SCHECK_PARTIAL();
4177 RRETURN(MATCH_NOMATCH);
4178 }
4179 GETCHARINCTEST(c, eptr);
4180 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4181 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4182 == prop_fail_result)
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 break;
4186
4187 case PT_WORD:
4188 for (i = 1; i <= min; i++)
4189 {
4190 int category;
4191 if (eptr >= md->end_subject)
4192 {
4193 SCHECK_PARTIAL();
4194 RRETURN(MATCH_NOMATCH);
4195 }
4196 GETCHARINCTEST(c, eptr);
4197 category = UCD_CATEGORY(c);
4198 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4199 == prop_fail_result)
4200 RRETURN(MATCH_NOMATCH);
4201 }
4202 break;
4203
4204 case PT_CLIST:
4205 for (i = 1; i <= min; i++)
4206 {
4207 const pcre_uint32 *cp;
4208 if (eptr >= md->end_subject)
4209 {
4210 SCHECK_PARTIAL();
4211 RRETURN(MATCH_NOMATCH);
4212 }
4213 GETCHARINCTEST(c, eptr);
4214 cp = PRIV(ucd_caseless_sets) + prop_value;
4215 for (;;)
4216 {
4217 if (c < *cp)
4218 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4219 if (c == *cp++)
4220 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4221 }
4222 }
4223 break;
4224
4225 /* This should not occur */
4226
4227 default:
4228 RRETURN(PCRE_ERROR_INTERNAL);
4229 }
4230 }
4231
4232 /* Match extended Unicode sequences. We will get here only if the
4233 support is in the binary; otherwise a compile-time error occurs. */
4234
4235 else if (ctype == OP_EXTUNI)
4236 {
4237 for (i = 1; i <= min; i++)
4238 {
4239 if (eptr >= md->end_subject)
4240 {
4241 SCHECK_PARTIAL();
4242 RRETURN(MATCH_NOMATCH);
4243 }
4244 else
4245 {
4246 int lgb, rgb;
4247 GETCHARINCTEST(c, eptr);
4248 lgb = UCD_GRAPHBREAK(c);
4249 while (eptr < md->end_subject)
4250 {
4251 int len = 1;
4252 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4253 rgb = UCD_GRAPHBREAK(c);
4254 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4255 lgb = rgb;
4256 eptr += len;
4257 }
4258 }
4259 CHECK_PARTIAL();
4260 }
4261 }
4262
4263 else
4264 #endif /* SUPPORT_UCP */
4265
4266 /* Handle all other cases when the coding is UTF-8 */
4267
4268 #ifdef SUPPORT_UTF
4269 if (utf) switch(ctype)
4270 {
4271 case OP_ANY:
4272 for (i = 1; i <= min; i++)
4273 {
4274 if (eptr >= md->end_subject)
4275 {
4276 SCHECK_PARTIAL();
4277 RRETURN(MATCH_NOMATCH);
4278 }
4279 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4280 if (md->partial != 0 &&
4281 eptr + 1 >= md->end_subject &&
4282 NLBLOCK->nltype == NLTYPE_FIXED &&
4283 NLBLOCK->nllen == 2 &&
4284 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4285 {
4286 md->hitend = TRUE;
4287 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4288 }
4289 eptr++;
4290 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4291 }
4292 break;
4293
4294 case OP_ALLANY:
4295 for (i = 1; i <= min; i++)
4296 {
4297 if (eptr >= md->end_subject)
4298 {
4299 SCHECK_PARTIAL();
4300 RRETURN(MATCH_NOMATCH);
4301 }
4302 eptr++;
4303 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4304 }
4305 break;
4306
4307 case OP_ANYBYTE:
4308 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4309 eptr += min;
4310 break;
4311
4312 case OP_ANYNL:
4313 for (i = 1; i <= min; i++)
4314 {
4315 if (eptr >= md->end_subject)
4316 {
4317 SCHECK_PARTIAL();
4318 RRETURN(MATCH_NOMATCH);
4319 }
4320 GETCHARINC(c, eptr);
4321 switch(c)
4322 {
4323 default: RRETURN(MATCH_NOMATCH);
4324
4325 case CHAR_CR:
4326 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4327 break;
4328
4329 case CHAR_LF:
4330 break;
4331
4332 case CHAR_VT:
4333 case CHAR_FF:
4334 case CHAR_NEL:
4335 #ifndef EBCDIC
4336 case 0x2028:
4337 case 0x2029:
4338 #endif /* Not EBCDIC */
4339 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4340 break;
4341 }
4342 }
4343 break;
4344
4345 case OP_NOT_HSPACE:
4346 for (i = 1; i <= min; i++)
4347 {
4348 if (eptr >= md->end_subject)
4349 {
4350 SCHECK_PARTIAL();
4351 RRETURN(MATCH_NOMATCH);
4352 }
4353 GETCHARINC(c, eptr);
4354 switch(c)
4355 {
4356 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4357 default: break;
4358 }
4359 }
4360 break;
4361
4362 case OP_HSPACE:
4363 for (i = 1; i <= min; i++)
4364 {
4365 if (eptr >= md->end_subject)
4366 {
4367 SCHECK_PARTIAL();
4368 RRETURN(MATCH_NOMATCH);
4369 }
4370 GETCHARINC(c, eptr);
4371 switch(c)
4372 {
4373 HSPACE_CASES: break; /* Byte and multibyte cases */
4374 default: RRETURN(MATCH_NOMATCH);
4375 }
4376 }
4377 break;
4378
4379 case OP_NOT_VSPACE:
4380 for (i = 1; i <= min; i++)
4381 {
4382 if (eptr >= md->end_subject)
4383 {
4384 SCHECK_PARTIAL();
4385 RRETURN(MATCH_NOMATCH);
4386 }
4387 GETCHARINC(c, eptr);
4388 switch(c)
4389 {
4390 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4391 default: break;
4392 }
4393 }
4394 break;
4395
4396 case OP_VSPACE:
4397 for (i = 1; i <= min; i++)
4398 {
4399 if (eptr >= md->end_subject)
4400 {
4401 SCHECK_PARTIAL();
4402 RRETURN(MATCH_NOMATCH);
4403 }
4404 GETCHARINC(c, eptr);
4405 switch(c)
4406 {
4407 VSPACE_CASES: break;
4408 default: RRETURN(MATCH_NOMATCH);
4409 }
4410 }
4411 break;
4412
4413 case OP_NOT_DIGIT:
4414 for (i = 1; i <= min; i++)
4415 {
4416 if (eptr >= md->end_subject)
4417 {
4418 SCHECK_PARTIAL();
4419 RRETURN(MATCH_NOMATCH);
4420 }
4421 GETCHARINC(c, eptr);
4422 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4423 RRETURN(MATCH_NOMATCH);
4424 }
4425 break;
4426
4427 case OP_DIGIT:
4428 for (i = 1; i <= min; i++)
4429 {
4430 pcre_uchar cc;
4431
4432 if (eptr >= md->end_subject)
4433 {
4434 SCHECK_PARTIAL();
4435 RRETURN(MATCH_NOMATCH);
4436 }
4437 cc = RAWUCHAR(eptr);
4438 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4439 RRETURN(MATCH_NOMATCH);
4440 eptr++;
4441 /* No need to skip more bytes - we know it's a 1-byte character */
4442 }
4443 break;
4444
4445 case OP_NOT_WHITESPACE:
4446 for (i = 1; i <= min; i++)
4447 {
4448 pcre_uchar cc;
4449
4450 if (eptr >= md->end_subject)
4451 {
4452 SCHECK_PARTIAL();
4453 RRETURN(MATCH_NOMATCH);
4454 }
4455 cc = RAWUCHAR(eptr);
4456 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4457 RRETURN(MATCH_NOMATCH);
4458 eptr++;
4459 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4460 }
4461 break;
4462
4463 case OP_WHITESPACE:
4464 for (i = 1; i <= min; i++)
4465 {
4466 pcre_uchar cc;
4467
4468 if (eptr >= md->end_subject)
4469 {
4470 SCHECK_PARTIAL();
4471 RRETURN(MATCH_NOMATCH);
4472 }
4473 cc = RAWUCHAR(eptr);
4474 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4475 RRETURN(MATCH_NOMATCH);
4476 eptr++;
4477 /* No need to skip more bytes - we know it's a 1-byte character */
4478 }
4479 break;
4480
4481 case OP_NOT_WORDCHAR:
4482 for (i = 1; i <= min; i++)
4483 {
4484 pcre_uchar cc;
4485
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 RRETURN(MATCH_NOMATCH);
4490 }
4491 cc = RAWUCHAR(eptr);
4492 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4493 RRETURN(MATCH_NOMATCH);
4494 eptr++;
4495 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4496 }
4497 break;
4498
4499 case OP_WORDCHAR:
4500 for (i = 1; i <= min; i++)
4501 {
4502 pcre_uchar cc;
4503
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 RRETURN(MATCH_NOMATCH);
4508 }
4509 cc = RAWUCHAR(eptr);
4510 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4511 RRETURN(MATCH_NOMATCH);
4512 eptr++;
4513 /* No need to skip more bytes - we know it's a 1-byte character */
4514 }
4515 break;
4516
4517 default:
4518 RRETURN(PCRE_ERROR_INTERNAL);
4519 } /* End switch(ctype) */
4520
4521 else
4522 #endif /* SUPPORT_UTF */
4523
4524 /* Code for the non-UTF case for minimum matching of operators other
4525 than OP_PROP and OP_NOTPROP. */
4526
4527 switch(ctype)
4528 {
4529 case OP_ANY:
4530 for (i = 1; i <= min; i++)
4531 {
4532 if (eptr >= md->end_subject)
4533 {
4534 SCHECK_PARTIAL();
4535 RRETURN(MATCH_NOMATCH);
4536 }
4537 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4538 if (md->partial != 0 &&
4539 eptr + 1 >= md->end_subject &&
4540 NLBLOCK->nltype == NLTYPE_FIXED &&
4541 NLBLOCK->nllen == 2 &&
4542 *eptr == NLBLOCK->nl[0])
4543 {
4544 md->hitend = TRUE;
4545 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4546 }
4547 eptr++;
4548 }
4549 break;
4550
4551 case OP_ALLANY:
4552 if (eptr > md->end_subject - min)
4553 {
4554 SCHECK_PARTIAL();
4555 RRETURN(MATCH_NOMATCH);
4556 }
4557 eptr += min;
4558 break;
4559
4560 case OP_ANYBYTE:
4561 if (eptr > md->end_subject - min)
4562 {
4563 SCHECK_PARTIAL();
4564 RRETURN(MATCH_NOMATCH);
4565 }
4566 eptr += min;
4567 break;
4568
4569 case OP_ANYNL:
4570 for (i = 1; i <= min; i++)
4571 {
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 RRETURN(MATCH_NOMATCH);
4576 }
4577 switch(*eptr++)
4578 {
4579 default: RRETURN(MATCH_NOMATCH);
4580
4581 case CHAR_CR:
4582 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4583 break;
4584
4585 case CHAR_LF:
4586 break;
4587
4588 case CHAR_VT:
4589 case CHAR_FF:
4590 case CHAR_NEL:
4591 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4592 case 0x2028:
4593 case 0x2029:
4594 #endif
4595 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4596 break;
4597 }
4598 }
4599 break;
4600
4601 case OP_NOT_HSPACE:
4602 for (i = 1; i <= min; i++)
4603 {
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 RRETURN(MATCH_NOMATCH);
4608 }
4609 switch(*eptr++)
4610 {
4611 default: break;
4612 HSPACE_BYTE_CASES:
4613 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4614 HSPACE_MULTIBYTE_CASES:
4615 #endif
4616 RRETURN(MATCH_NOMATCH);
4617 }
4618 }
4619 break;
4620
4621 case OP_HSPACE:
4622 for (i = 1; i <= min; i++)
4623 {
4624 if (eptr >= md->end_subject)
4625 {
4626 SCHECK_PARTIAL();
4627 RRETURN(MATCH_NOMATCH);
4628 }
4629 switch(*eptr++)
4630 {
4631 default: RRETURN(MATCH_NOMATCH);
4632 HSPACE_BYTE_CASES:
4633 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4634 HSPACE_MULTIBYTE_CASES:
4635 #endif
4636 break;
4637 }
4638 }
4639 break;
4640
4641 case OP_NOT_VSPACE:
4642 for (i = 1; i <= min; i++)
4643 {
4644 if (eptr >= md->end_subject)
4645 {
4646 SCHECK_PARTIAL();
4647 RRETURN(MATCH_NOMATCH);
4648 }
4649 switch(*eptr++)
4650 {
4651 VSPACE_BYTE_CASES:
4652 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4653 VSPACE_MULTIBYTE_CASES:
4654 #endif
4655 RRETURN(MATCH_NOMATCH);
4656 default: break;
4657 }
4658 }
4659 break;
4660
4661 case OP_VSPACE:
4662 for (i = 1; i <= min; i++)
4663 {
4664 if (eptr >= md->end_subject)
4665 {
4666 SCHECK_PARTIAL();
4667 RRETURN(MATCH_NOMATCH);
4668 }
4669 switch(*eptr++)
4670 {
4671 default: RRETURN(MATCH_NOMATCH);
4672 VSPACE_BYTE_CASES:
4673 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4674 VSPACE_MULTIBYTE_CASES:
4675 #endif
4676 break;
4677 }
4678 }
4679 break;
4680
4681 case OP_NOT_DIGIT:
4682 for (i = 1; i <= min; i++)
4683 {
4684 if (eptr >= md->end_subject)
4685 {
4686 SCHECK_PARTIAL();
4687 RRETURN(MATCH_NOMATCH);
4688 }
4689 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4690 RRETURN(MATCH_NOMATCH);
4691 eptr++;
4692 }
4693 break;
4694
4695 case OP_DIGIT:
4696 for (i = 1; i <= min; i++)
4697 {
4698 if (eptr >= md->end_subject)
4699 {
4700 SCHECK_PARTIAL();
4701 RRETURN(MATCH_NOMATCH);
4702 }
4703 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4704 RRETURN(MATCH_NOMATCH);
4705 eptr++;
4706 }
4707 break;
4708
4709 case OP_NOT_WHITESPACE:
4710 for (i = 1; i <= min; i++)
4711 {
4712 if (eptr >= md->end_subject)
4713 {
4714 SCHECK_PARTIAL();
4715 RRETURN(MATCH_NOMATCH);
4716 }
4717 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4718 RRETURN(MATCH_NOMATCH);
4719 eptr++;
4720 }
4721 break;
4722
4723 case OP_WHITESPACE:
4724 for (i = 1; i <= min; i++)
4725 {
4726 if (eptr >= md->end_subject)
4727 {
4728 SCHECK_PARTIAL();
4729 RRETURN(MATCH_NOMATCH);
4730 }
4731 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4732 RRETURN(MATCH_NOMATCH);
4733 eptr++;
4734 }
4735 break;
4736
4737 case OP_NOT_WORDCHAR:
4738 for (i = 1; i <= min; i++)
4739 {
4740 if (eptr >= md->end_subject)
4741 {
4742 SCHECK_PARTIAL();
4743 RRETURN(MATCH_NOMATCH);
4744 }
4745 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4746 RRETURN(MATCH_NOMATCH);
4747 eptr++;
4748 }
4749 break;
4750
4751 case OP_WORDCHAR:
4752 for (i = 1; i <= min; i++)
4753 {
4754 if (eptr >= md->end_subject)
4755 {
4756 SCHECK_PARTIAL();
4757 RRETURN(MATCH_NOMATCH);
4758 }
4759 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4760 RRETURN(MATCH_NOMATCH);
4761 eptr++;
4762 }
4763 break;
4764
4765 default:
4766 RRETURN(PCRE_ERROR_INTERNAL);
4767 }
4768 }
4769
4770 /* If min = max, continue at the same level without recursing */
4771
4772 if (min == max) continue;
4773
4774 /* If minimizing, we have to test the rest of the pattern before each
4775 subsequent match. Again, separate the UTF case for speed, and also
4776 separate the UCP cases. */
4777
4778 if (minimize)
4779 {
4780 #ifdef SUPPORT_UCP
4781 if (prop_type >= 0)
4782 {
4783 switch(prop_type)
4784 {
4785 case PT_ANY:
4786 for (fi = min;; fi++)
4787 {
4788 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4790 if (fi >= max) RRETURN(MATCH_NOMATCH);
4791 if (eptr >= md->end_subject)
4792 {
4793 SCHECK_PARTIAL();
4794 RRETURN(MATCH_NOMATCH);
4795 }
4796 GETCHARINCTEST(c, eptr);
4797 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4798 }
4799 /* Control never gets here */
4800
4801 case PT_LAMP:
4802 for (fi = min;; fi++)
4803 {
4804 int chartype;
4805 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4807 if (fi >= max) RRETURN(MATCH_NOMATCH);
4808 if (eptr >= md->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 RRETURN(MATCH_NOMATCH);
4812 }
4813 GETCHARINCTEST(c, eptr);
4814 chartype = UCD_CHARTYPE(c);
4815 if ((chartype == ucp_Lu ||
4816 chartype == ucp_Ll ||
4817 chartype == ucp_Lt) == prop_fail_result)
4818 RRETURN(MATCH_NOMATCH);
4819 }
4820 /* Control never gets here */
4821
4822 case PT_GC:
4823 for (fi = min;; fi++)
4824 {
4825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4827 if (fi >= max) RRETURN(MATCH_NOMATCH);
4828 if (eptr >= md->end_subject)
4829 {
4830 SCHECK_PARTIAL();
4831 RRETURN(MATCH_NOMATCH);
4832 }
4833 GETCHARINCTEST(c, eptr);
4834 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4835 RRETURN(MATCH_NOMATCH);
4836 }
4837 /* Control never gets here */
4838
4839 case PT_PC:
4840 for (fi = min;; fi++)
4841 {
4842 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4844 if (fi >= max) RRETURN(MATCH_NOMATCH);
4845 if (eptr >= md->end_subject)
4846 {
4847 SCHECK_PARTIAL();
4848 RRETURN(MATCH_NOMATCH);
4849 }
4850 GETCHARINCTEST(c, eptr);
4851 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4852 RRETURN(MATCH_NOMATCH);
4853 }
4854 /* Control never gets here */
4855
4856 case PT_SC:
4857 for (fi = min;; fi++)
4858 {
4859 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4861 if (fi >= max) RRETURN(MATCH_NOMATCH);
4862 if (eptr >= md->end_subject)
4863 {
4864 SCHECK_PARTIAL();
4865 RRETURN(MATCH_NOMATCH);
4866 }
4867 GETCHARINCTEST(c, eptr);
4868 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4869 RRETURN(MATCH_NOMATCH);
4870 }
4871 /* Control never gets here */
4872
4873 case PT_ALNUM:
4874 for (fi = min;; fi++)
4875 {
4876 int category;
4877 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4879 if (fi >= max) RRETURN(MATCH_NOMATCH);
4880 if (eptr >= md->end_subject)
4881 {
4882 SCHECK_PARTIAL();
4883 RRETURN(MATCH_NOMATCH);
4884 }
4885 GETCHARINCTEST(c, eptr);
4886 category = UCD_CATEGORY(c);
4887 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4888 RRETURN(MATCH_NOMATCH);
4889 }
4890 /* Control never gets here */
4891
4892 case PT_SPACE: /* Perl space */
4893 for (fi = min;; fi++)
4894 {
4895 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4896 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4897 if (fi >= max) RRETURN(MATCH_NOMATCH);
4898 if (eptr >= md->end_subject)
4899 {
4900 SCHECK_PARTIAL();
4901 RRETURN(MATCH_NOMATCH);
4902 }
4903 GETCHARINCTEST(c, eptr);
4904 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4905 c == CHAR_FF || c == CHAR_CR)
4906 == prop_fail_result)
4907 RRETURN(MATCH_NOMATCH);
4908 }
4909 /* Control never gets here */
4910
4911 case PT_PXSPACE: /* POSIX space */
4912 for (fi = min;; fi++)
4913 {
4914 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4916 if (fi >= max) RRETURN(MATCH_NOMATCH);
4917 if (eptr >= md->end_subject)
4918 {
4919 SCHECK_PARTIAL();
4920 RRETURN(MATCH_NOMATCH);
4921 }
4922 GETCHARINCTEST(c, eptr);
4923 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4924 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4925 == prop_fail_result)
4926 RRETURN(MATCH_NOMATCH);
4927 }
4928 /* Control never gets here */
4929
4930 case PT_WORD:
4931 for (fi = min;; fi++)
4932 {
4933 int category;
4934 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4936 if (fi >= max) RRETURN(MATCH_NOMATCH);
4937 if (eptr >= md->end_subject)
4938 {
4939 SCHECK_PARTIAL();
4940 RRETURN(MATCH_NOMATCH);
4941 }
4942 GETCHARINCTEST(c, eptr);
4943 category = UCD_CATEGORY(c);
4944 if ((category == ucp_L ||
4945 category == ucp_N ||
4946 c == CHAR_UNDERSCORE)
4947 == prop_fail_result)
4948 RRETURN(MATCH_NOMATCH);
4949 }
4950 /* Control never gets here */
4951
4952 case PT_CLIST:
4953 for (fi = min;; fi++)
4954 {
4955 const pcre_uint32 *cp;
4956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
4957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4958 if (fi >= max) RRETURN(MATCH_NOMATCH);
4959 if (eptr >= md->end_subject)
4960 {
4961 SCHECK_PARTIAL();
4962 RRETURN(MATCH_NOMATCH);
4963 }
4964 GETCHARINCTEST(c, eptr);
4965 cp = PRIV(ucd_caseless_sets) + prop_value;
4966 for (;;)
4967 {
4968 if (c < *cp)
4969 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4970 if (c == *cp++)
4971 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4972 }
4973 }
4974 /* Control never gets here */
4975
4976 /* This should never occur */
4977 default:
4978 RRETURN(PCRE_ERROR_INTERNAL);
4979 }
4980 }
4981
4982 /* Match extended Unicode sequences. We will get here only if the
4983 support is in the binary; otherwise a compile-time error occurs. */
4984
4985 else if (ctype == OP_EXTUNI)
4986 {
4987 for (fi = min;; fi++)
4988 {
4989 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4991 if (fi >= max) RRETURN(MATCH_NOMATCH);
4992 if (eptr >= md->end_subject)
4993 {
4994 SCHECK_PARTIAL();
4995 RRETURN(MATCH_NOMATCH);
4996 }
4997 else
4998 {
4999 int lgb, rgb;
5000 GETCHARINCTEST(c, eptr);
5001 lgb = UCD_GRAPHBREAK(c);
5002 while (eptr < md->end_subject)
5003 {
5004 int len = 1;
5005 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5006 rgb = UCD_GRAPHBREAK(c);
5007 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5008 lgb = rgb;
5009 eptr += len;
5010 }
5011 }
5012 CHECK_PARTIAL();
5013 }
5014 }
5015 else
5016 #endif /* SUPPORT_UCP */
5017
5018 #ifdef SUPPORT_UTF
5019 if (utf)
5020 {
5021 for (fi = min;; fi++)
5022 {
5023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5025 if (fi >= max) RRETURN(MATCH_NOMATCH);
5026 if (eptr >= md->end_subject)
5027 {
5028 SCHECK_PARTIAL();
5029 RRETURN(MATCH_NOMATCH);
5030 }
5031 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5032 RRETURN(MATCH_NOMATCH);
5033 GETCHARINC(c, eptr);
5034 switch(ctype)
5035 {
5036 case OP_ANY: /* This is the non-NL case */
5037 if (md->partial != 0 && /* Take care with CRLF partial */
5038 eptr >= md->end_subject &&
5039 NLBLOCK->nltype == NLTYPE_FIXED &&
5040 NLBLOCK->nllen == 2 &&
5041 c == NLBLOCK->nl[0])
5042 {
5043 md->hitend = TRUE;
5044 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5045 }
5046 break;
5047
5048 case OP_ALLANY:
5049 case OP_ANYBYTE:
5050 break;
5051
5052 case OP_ANYNL:
5053 switch(c)
5054 {
5055 default: RRETURN(MATCH_NOMATCH);
5056 case CHAR_CR:
5057 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5058 break;
5059
5060 case CHAR_LF:
5061 break;
5062
5063 case CHAR_VT:
5064 case CHAR_FF:
5065 case CHAR_NEL:
5066 #ifndef EBCDIC
5067 case 0x2028:
5068 case 0x2029:
5069 #endif /* Not EBCDIC */
5070 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5071 break;
5072 }
5073 break;
5074
5075 case OP_NOT_HSPACE:
5076 switch(c)
5077 {
5078 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5079 default: break;
5080 }
5081 break;
5082
5083 case OP_HSPACE:
5084 switch(c)
5085 {
5086 HSPACE_CASES: break;
5087 default: RRETURN(MATCH_NOMATCH);
5088 }
5089 break;
5090
5091 case OP_NOT_VSPACE:
5092 switch(c)
5093 {
5094 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5095 default: break;
5096 }
5097 break;
5098
5099 case OP_VSPACE:
5100 switch(c)
5101 {
5102 VSPACE_CASES: break;
5103 default: RRETURN(MATCH_NOMATCH);
5104 }
5105 break;
5106
5107 case OP_NOT_DIGIT:
5108 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5109 RRETURN(MATCH_NOMATCH);
5110 break;
5111
5112 case OP_DIGIT:
5113 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5114 RRETURN(MATCH_NOMATCH);
5115 break;
5116
5117 case OP_NOT_WHITESPACE:
5118 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5119 RRETURN(MATCH_NOMATCH);
5120 break;
5121
5122 case OP_WHITESPACE:
5123 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5124 RRETURN(MATCH_NOMATCH);
5125 break;
5126
5127 case OP_NOT_WORDCHAR:
5128 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5129 RRETURN(MATCH_NOMATCH);
5130 break;
5131
5132 case OP_WORDCHAR:
5133 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5134 RRETURN(MATCH_NOMATCH);
5135 break;
5136
5137 default:
5138 RRETURN(PCRE_ERROR_INTERNAL);
5139 }
5140 }
5141 }
5142 else
5143 #endif
5144 /* Not UTF mode */
5145 {
5146 for (fi = min;; fi++)
5147 {
5148 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5149 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5150 if (fi >= max) RRETURN(MATCH_NOMATCH);
5151 if (eptr >= md->end_subject)
5152 {
5153 SCHECK_PARTIAL();
5154 RRETURN(MATCH_NOMATCH);
5155 }
5156 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5157 RRETURN(MATCH_NOMATCH);
5158 c = *eptr++;
5159 switch(ctype)
5160 {
5161 case OP_ANY: /* This is the non-NL case */
5162 if (md->partial != 0 && /* Take care with CRLF partial */
5163 eptr >= md->end_subject &&
5164 NLBLOCK->nltype == NLTYPE_FIXED &&
5165 NLBLOCK->nllen == 2 &&
5166 c == NLBLOCK->nl[0])
5167 {
5168 md->hitend = TRUE;
5169 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5170 }
5171 break;
5172
5173 case OP_ALLANY:
5174 case OP_ANYBYTE:
5175 break;
5176
5177 case OP_ANYNL:
5178 switch(c)
5179 {
5180 default: RRETURN(MATCH_NOMATCH);
5181 case CHAR_CR:
5182 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5183 break;
5184
5185 case CHAR_LF:
5186 break;
5187
5188 case CHAR_VT:
5189 case CHAR_FF:
5190 case CHAR_NEL:
5191 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5192 case 0x2028:
5193 case 0x2029:
5194 #endif
5195 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5196 break;
5197 }
5198 break;
5199
5200 case OP_NOT_HSPACE:
5201 switch(c)
5202 {
5203 default: break;
5204 HSPACE_BYTE_CASES:
5205 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5206 HSPACE_MULTIBYTE_CASES:
5207 #endif
5208 RRETURN(MATCH_NOMATCH);
5209 }
5210 break;
5211
5212 case OP_HSPACE:
5213 switch(c)
5214 {
5215 default: RRETURN(MATCH_NOMATCH);
5216 HSPACE_BYTE_CASES:
5217 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5218 HSPACE_MULTIBYTE_CASES:
5219 #endif
5220 break;
5221 }
5222 break;
5223
5224 case OP_NOT_VSPACE:
5225 switch(c)
5226 {
5227 default: break;
5228 VSPACE_BYTE_CASES:
5229 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5230 VSPACE_MULTIBYTE_CASES:
5231 #endif
5232 RRETURN(MATCH_NOMATCH);
5233 }
5234 break;
5235
5236 case OP_VSPACE:
5237 switch(c)
5238 {
5239 default: RRETURN(MATCH_NOMATCH);
5240 VSPACE_BYTE_CASES:
5241 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5242 VSPACE_MULTIBYTE_CASES:
5243 #endif
5244 break;
5245 }
5246 break;
5247
5248 case OP_NOT_DIGIT:
5249 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5250 break;
5251
5252 case OP_DIGIT:
5253 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5254 break;
5255
5256 case OP_NOT_WHITESPACE:
5257 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5258 break;
5259
5260 case OP_WHITESPACE:
5261 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5262 break;
5263
5264 case OP_NOT_WORDCHAR:
5265 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5266 break;
5267
5268 case OP_WORDCHAR:
5269 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5270 break;
5271
5272 default:
5273 RRETURN(PCRE_ERROR_INTERNAL);
5274 }
5275 }
5276 }
5277 /* Control never gets here */
5278 }
5279
5280 /* If maximizing, it is worth using inline code for speed, doing the type
5281 test once at the start (i.e. keep it out of the loop). Again, keep the
5282 UTF and UCP code separate. */
5283
5284 else
5285 {
5286 pp = eptr; /* Remember where we started */
5287
5288 #ifdef SUPPORT_UCP
5289 if (prop_type >= 0)
5290 {
5291 switch(prop_type)
5292 {
5293 case PT_ANY:
5294 for (i = min; i < max; i++)
5295 {
5296 int len = 1;
5297 if (eptr >= md->end_subject)
5298 {
5299 SCHECK_PARTIAL();
5300 break;
5301 }
5302 GETCHARLENTEST(c, eptr, len);
5303 if (prop_fail_result) break;
5304 eptr+= len;
5305 }
5306 break;
5307
5308 case PT_LAMP:
5309 for (i = min; i < max; i++)
5310 {
5311 int chartype;
5312 int len = 1;
5313 if (eptr >= md->end_subject)
5314 {
5315 SCHECK_PARTIAL();
5316 break;
5317 }
5318 GETCHARLENTEST(c, eptr, len);
5319 chartype = UCD_CHARTYPE(c);
5320 if ((chartype == ucp_Lu ||
5321 chartype == ucp_Ll ||
5322 chartype == ucp_Lt) == prop_fail_result)
5323 break;
5324 eptr+= len;
5325 }
5326 break;
5327
5328 case PT_GC:
5329 for (i = min; i < max; i++)
5330 {
5331 int len = 1;
5332 if (eptr >= md->end_subject)
5333 {
5334 SCHECK_PARTIAL();
5335 break;
5336 }
5337 GETCHARLENTEST(c, eptr, len);
5338 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5339 eptr+= len;
5340 }
5341 break;
5342
5343 case PT_PC:
5344 for (i = min; i < max; i++)
5345 {
5346 int len = 1;
5347 if (eptr >= md->end_subject)
5348 {
5349 SCHECK_PARTIAL();
5350 break;
5351 }
5352 GETCHARLENTEST(c, eptr, len);
5353 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5354 eptr+= len;
5355 }
5356 break;
5357
5358 case PT_SC:
5359 for (i = min; i < max; i++)
5360 {
5361 int len = 1;
5362 if (eptr >= md->end_subject)
5363 {
5364 SCHECK_PARTIAL();
5365 break;
5366 }
5367 GETCHARLENTEST(c, eptr, len);
5368 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5369 eptr+= len;
5370 }
5371 break;
5372
5373 case PT_ALNUM:
5374 for (i = min; i < max; i++)
5375 {
5376 int category;
5377 int len = 1;
5378 if (eptr >= md->end_subject)
5379 {
5380 SCHECK_PARTIAL();
5381 break;
5382 }
5383 GETCHARLENTEST(c, eptr, len);
5384 category = UCD_CATEGORY(c);
5385 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5386 break;
5387 eptr+= len;
5388 }
5389 break;
5390
5391 case PT_SPACE: /* Perl space */
5392 for (i = min; i < max; i++)
5393 {
5394 int len = 1;
5395 if (eptr >= md->end_subject)
5396 {
5397 SCHECK_PARTIAL();
5398 break;
5399 }
5400 GETCHARLENTEST(c, eptr, len);
5401 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5402 c == CHAR_FF || c == CHAR_CR)
5403 == prop_fail_result)
5404 break;
5405 eptr+= len;
5406 }
5407 break;
5408
5409 case PT_PXSPACE: /* POSIX space */
5410 for (i = min; i < max; i++)
5411 {
5412 int len = 1;
5413 if (eptr >= md->end_subject)
5414 {
5415 SCHECK_PARTIAL();
5416 break;
5417 }
5418 GETCHARLENTEST(c, eptr, len);
5419 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5420 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5421 == prop_fail_result)
5422 break;
5423 eptr+= len;
5424 }
5425 break;
5426
5427 case PT_WORD:
5428 for (i = min; i < max; i++)
5429 {
5430 int category;
5431 int len = 1;
5432 if (eptr >= md->end_subject)
5433 {
5434 SCHECK_PARTIAL();
5435 break;
5436 }
5437 GETCHARLENTEST(c, eptr, len);
5438 category = UCD_CATEGORY(c);
5439 if ((category == ucp_L || category == ucp_N ||
5440 c == CHAR_UNDERSCORE) == prop_fail_result)
5441 break;
5442 eptr+= len;
5443 }
5444 break;
5445
5446 case PT_CLIST:
5447 for (i = min; i < max; i++)
5448 {
5449 const pcre_uint32 *cp;
5450 int len = 1;
5451 if (eptr >= md->end_subject)
5452 {
5453 SCHECK_PARTIAL();
5454 break;
5455 }
5456 GETCHARLENTEST(c, eptr, len);
5457 cp = PRIV(ucd_caseless_sets) + prop_value;
5458 for (;;)
5459 {
5460 if (c < *cp)
5461 { if (prop_fail_result) break; else goto GOT_MAX; }
5462 if (c == *cp++)
5463 { if (prop_fail_result) goto GOT_MAX; else break; }
5464 }
5465 eptr += len;
5466 }
5467 GOT_MAX:
5468 break;
5469
5470 default:
5471 RRETURN(PCRE_ERROR_INTERNAL);
5472 }
5473
5474 /* eptr is now past the end of the maximum run */
5475
5476 if (possessive) continue;
5477 for(;;)
5478 {
5479 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5481 if (eptr-- == pp) break; /* Stop if tried at original pos */
5482 if (utf) BACKCHAR(eptr);
5483 }
5484 }
5485
5486 /* Match extended Unicode sequences. We will get here only if the
5487 support is in the binary; otherwise a compile-time error occurs. */
5488
5489 else if (ctype == OP_EXTUNI)
5490 {
5491 for (i = min; i < max; i++)
5492 {
5493 if (eptr >= md->end_subject)
5494 {
5495 SCHECK_PARTIAL();
5496 break;
5497 }
5498 else
5499 {
5500 int lgb, rgb;
5501 GETCHARINCTEST(c, eptr);
5502 lgb = UCD_GRAPHBREAK(c);
5503 while (eptr < md->end_subject)
5504 {
5505 int len = 1;
5506 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5507 rgb = UCD_GRAPHBREAK(c);
5508 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5509 lgb = rgb;
5510 eptr += len;
5511 }
5512 }
5513 CHECK_PARTIAL();
5514 }
5515
5516 /* eptr is now past the end of the maximum run */
5517
5518 if (possessive) continue;
5519
5520 for(;;)
5521 {
5522 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5523 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5524 if (eptr-- == pp) break; /* Stop if tried at original pos */
5525 for (;;) /* Move back over one extended */
5526 {
5527 if (!utf) c = *eptr; else
5528 {
5529 BACKCHAR(eptr);
5530 GETCHAR(c, eptr);
5531 }
5532 if (UCD_CATEGORY(c) != ucp_M) break;
5533 eptr--;
5534 }
5535 }
5536 }
5537
5538 else
5539 #endif /* SUPPORT_UCP */
5540
5541 #ifdef SUPPORT_UTF
5542 if (utf)
5543 {
5544 switch(ctype)
5545 {
5546 case OP_ANY:
5547 if (max < INT_MAX)
5548 {
5549 for (i = min; i < max; i++)
5550 {
5551 if (eptr >= md->end_subject)
5552 {
5553 SCHECK_PARTIAL();
5554 break;
5555 }
5556 if (IS_NEWLINE(eptr)) break;
5557 if (md->partial != 0 && /* Take care with CRLF partial */
5558 eptr + 1 >= md->end_subject &&
5559 NLBLOCK->nltype == NLTYPE_FIXED &&
5560 NLBLOCK->nllen == 2 &&
5561 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5562 {
5563 md->hitend = TRUE;
5564 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5565 }
5566 eptr++;
5567 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5568 }
5569 }
5570
5571 /* Handle unlimited UTF-8 repeat */
5572
5573 else
5574 {
5575 for (i = min; i < max; i++)
5576 {
5577 if (eptr >= md->end_subject)
5578 {
5579 SCHECK_PARTIAL();
5580 break;
5581 }
5582 if (IS_NEWLINE(eptr)) break;
5583 if (md->partial != 0 && /* Take care with CRLF partial */
5584 eptr + 1 >= md->end_subject &&
5585 NLBLOCK->nltype == NLTYPE_FIXED &&
5586 NLBLOCK->nllen == 2 &&
5587 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5588 {
5589 md->hitend = TRUE;
5590 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5591 }
5592 eptr++;
5593 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5594 }
5595 }
5596 break;
5597
5598 case OP_ALLANY:
5599 if (max < INT_MAX)
5600 {
5601 for (i = min; i < max; i++)
5602 {
5603 if (eptr >= md->end_subject)
5604 {
5605 SCHECK_PARTIAL();
5606 break;
5607 }
5608 eptr++;
5609 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5610 }
5611 }
5612 else
5613 {
5614 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5615 SCHECK_PARTIAL();
5616 }
5617 break;
5618
5619 /* The byte case is the same as non-UTF8 */
5620
5621 case OP_ANYBYTE:
5622 c = max - min;
5623 if (c > (unsigned int)(md->end_subject - eptr))
5624 {
5625 eptr = md->end_subject;
5626 SCHECK_PARTIAL();
5627 }
5628 else eptr += c;
5629 break;
5630
5631 case OP_ANYNL:
5632 for (i = min; i < max; i++)
5633 {
5634 int len = 1;
5635 if (eptr >= md->end_subject)
5636 {
5637 SCHECK_PARTIAL();
5638 break;
5639 }
5640 GETCHARLEN(c, eptr, len);
5641 if (c == CHAR_CR)
5642 {
5643 if (++eptr >= md->end_subject) break;
5644 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5645 }
5646 else
5647 {
5648 if (c != CHAR_LF &&
5649 (md->bsr_anycrlf ||
5650 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5651 #ifndef EBCDIC
5652 && c != 0x2028 && c != 0x2029
5653 #endif /* Not EBCDIC */
5654 )))
5655 break;
5656 eptr += len;
5657 }
5658 }
5659 break;
5660
5661 case OP_NOT_HSPACE:
5662 case OP_HSPACE:
5663 for (i = min; i < max; i++)
5664 {
5665 BOOL gotspace;
5666 int len = 1;
5667 if (eptr >= md->end_subject)
5668 {
5669 SCHECK_PARTIAL();
5670 break;
5671 }
5672 GETCHARLEN(c, eptr, len);
5673 switch(c)
5674 {
5675 HSPACE_CASES: gotspace = TRUE; break;
5676 default: gotspace = FALSE; break;
5677 }
5678 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5679 eptr += len;
5680 }
5681 break;
5682
5683 case OP_NOT_VSPACE:
5684 case OP_VSPACE:
5685 for (i = min; i < max; i++)
5686 {
5687 BOOL gotspace;
5688 int len = 1;
5689 if (eptr >= md->end_subject)
5690 {
5691 SCHECK_PARTIAL();
5692 break;
5693 }
5694 GETCHARLEN(c, eptr, len);
5695 switch(c)
5696 {
5697 VSPACE_CASES: gotspace = TRUE; break;
5698 default: gotspace = FALSE; break;
5699 }
5700 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5701 eptr += len;
5702 }
5703 break;
5704
5705 case OP_NOT_DIGIT:
5706 for (i = min; i < max; i++)
5707 {
5708 int len = 1;
5709 if (eptr >= md->end_subject)
5710 {
5711 SCHECK_PARTIAL();
5712 break;
5713 }
5714 GETCHARLEN(c, eptr, len);
5715 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5716 eptr+= len;
5717 }
5718 break;
5719
5720 case OP_DIGIT:
5721 for (i = min; i < max; i++)
5722 {
5723 int len = 1;
5724 if (eptr >= md->end_subject)
5725 {
5726 SCHECK_PARTIAL();
5727 break;
5728 }
5729 GETCHARLEN(c, eptr, len);
5730 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5731 eptr+= len;
5732 }
5733 break;
5734
5735 case OP_NOT_WHITESPACE:
5736 for (i = min; i < max; i++)
5737 {
5738 int len = 1;
5739 if (eptr >= md->end_subject)
5740 {
5741 SCHECK_PARTIAL();
5742 break;
5743 }
5744 GETCHARLEN(c, eptr, len);
5745 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5746 eptr+= len;
5747 }
5748 break;
5749
5750 case OP_WHITESPACE:
5751 for (i = min; i < max; i++)
5752 {
5753 int len = 1;
5754 if (eptr >= md->end_subject)
5755 {
5756 SCHECK_PARTIAL();
5757 break;
5758 }
5759 GETCHARLEN(c, eptr, len);
5760 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5761 eptr+= len;
5762 }
5763 break;
5764
5765 case OP_NOT_WORDCHAR:
5766 for (i = min; i < max; i++)
5767 {
5768 int len = 1;
5769 if (eptr >= md->end_subject)
5770 {
5771 SCHECK_PARTIAL();
5772 break;
5773 }
5774 GETCHARLEN(c, eptr, len);
5775 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5776 eptr+= len;
5777 }
5778 break;
5779
5780 case OP_WORDCHAR:
5781 for (i = min; i < max; i++)
5782 {
5783 int len = 1;
5784 if (eptr >= md->end_subject)
5785 {
5786 SCHECK_PARTIAL();
5787 break;
5788 }
5789 GETCHARLEN(c, eptr, len);
5790 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5791 eptr+= len;
5792 }
5793 break;
5794
5795 default:
5796 RRETURN(PCRE_ERROR_INTERNAL);
5797 }
5798
5799 /* eptr is now past the end of the maximum run. If possessive, we are
5800 done (no backing up). Otherwise, match at this position; anything other
5801 than no match is immediately returned. For nomatch, back up one
5802 character, unless we are matching \R and the last thing matched was
5803 \r\n, in which case, back up over both code units (CR and LF). */
5804
5805 if (possessive) continue;
5806 for(;;)
5807 {
5808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5810 if (eptr-- == pp) break; /* Stop if tried at original pos */
5811 BACKCHAR(eptr);
5812 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5813 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5814 }
5815 }
5816 else
5817 #endif /* SUPPORT_UTF */
5818 /* Not UTF mode */
5819 {
5820 switch(ctype)
5821 {
5822 case OP_ANY:
5823 for (i = min; i < max; i++)
5824 {
5825 if (eptr >= md->end_subject)
5826 {
5827 SCHECK_PARTIAL();
5828 break;
5829 }
5830 if (IS_NEWLINE(eptr)) break;
5831 if (md->partial != 0 && /* Take care with CRLF partial */
5832 eptr + 1 >= md->end_subject &&
5833 NLBLOCK->nltype == NLTYPE_FIXED &&
5834 NLBLOCK->nllen == 2 &&
5835 *eptr == NLBLOCK->nl[0])
5836 {
5837 md->hitend = TRUE;
5838 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5839 }
5840 eptr++;
5841 }
5842 break;
5843
5844 case OP_ALLANY:
5845 case OP_ANYBYTE:
5846 c = max - min;
5847 if (c > (unsigned int)(md->end_subject - eptr))
5848 {
5849 eptr = md->end_subject;
5850 SCHECK_PARTIAL();
5851 }
5852 else eptr += c;
5853 break;
5854
5855 case OP_ANYNL:
5856 for (i = min; i < max; i++)
5857 {
5858 if (eptr >= md->end_subject)
5859 {
5860 SCHECK_PARTIAL();
5861 break;
5862 }
5863 c = *eptr;
5864 if (c == CHAR_CR)
5865 {
5866 if (++eptr >= md->end_subject) break;
5867 if (*eptr == CHAR_LF) eptr++;
5868 }
5869 else
5870 {
5871 if (c != CHAR_LF && (md->bsr_anycrlf ||
5872 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5873 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5874 && c != 0x2028 && c != 0x2029
5875 #endif
5876 ))) break;
5877 eptr++;
5878 }
5879 }
5880 break;
5881
5882 case OP_NOT_HSPACE:
5883 for (i = min; i < max; i++)
5884 {
5885 if (eptr >= md->end_subject)
5886 {
5887 SCHECK_PARTIAL();
5888 break;
5889 }
5890 switch(*eptr)
5891 {
5892 default: eptr++; break;
5893 HSPACE_BYTE_CASES:
5894 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5895 HSPACE_MULTIBYTE_CASES:
5896 #endif
5897 goto ENDLOOP00;
5898 }
5899 }
5900 ENDLOOP00:
5901 break;
5902
5903 case OP_HSPACE:
5904 for (i = min; i < max; i++)
5905 {
5906 if (eptr >= md->end_subject)
5907 {
5908 SCHECK_PARTIAL();
5909 break;
5910 }
5911 switch(*eptr)
5912 {
5913 default: goto ENDLOOP01;
5914 HSPACE_BYTE_CASES:
5915 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5916 HSPACE_MULTIBYTE_CASES:
5917 #endif
5918 eptr++; break;
5919 }
5920 }
5921 ENDLOOP01:
5922 break;
5923
5924 case OP_NOT_VSPACE:
5925 for (i = min; i < max; i++)
5926 {
5927 if (eptr >= md->end_subject)
5928 {
5929 SCHECK_PARTIAL();
5930 break;
5931 }
5932 switch(*eptr)
5933 {
5934 default: eptr++; break;
5935 VSPACE_BYTE_CASES:
5936 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5937 VSPACE_MULTIBYTE_CASES:
5938 #endif
5939 goto ENDLOOP02;
5940 }
5941 }
5942 ENDLOOP02:
5943 break;
5944
5945 case OP_VSPACE:
5946 for (i = min; i < max; i++)
5947 {
5948 if (eptr >= md->end_subject)
5949 {
5950 SCHECK_PARTIAL();
5951 break;
5952 }
5953 switch(*eptr)
5954 {
5955 default: goto ENDLOOP03;
5956 VSPACE_BYTE_CASES:
5957 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5958 VSPACE_MULTIBYTE_CASES:
5959 #endif
5960 eptr++; break;
5961 }
5962 }
5963 ENDLOOP03:
5964 break;
5965
5966 case OP_NOT_DIGIT:
5967 for (i = min; i < max; i++)
5968 {
5969 if (eptr >= md->end_subject)
5970 {
5971 SCHECK_PARTIAL();
5972 break;
5973 }
5974 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5975 eptr++;
5976 }
5977 break;
5978
5979 case OP_DIGIT:
5980 for (i = min; i < max; i++)
5981 {
5982 if (eptr >= md->end_subject)
5983 {
5984 SCHECK_PARTIAL();
5985 break;
5986 }
5987 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5988 eptr++;
5989 }
5990 break;
5991
5992 case OP_NOT_WHITESPACE:
5993 for (i = min; i < max; i++)
5994 {
5995 if (eptr >= md->end_subject)
5996 {
5997 SCHECK_PARTIAL();
5998 break;
5999 }
6000 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6001 eptr++;
6002 }
6003 break;
6004
6005 case OP_WHITESPACE:
6006 for (i = min; i < max; i++)
6007 {
6008 if (eptr >= md->end_subject)
6009 {
6010 SCHECK_PARTIAL();
6011 break;
6012 }
6013 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6014 eptr++;
6015 }
6016 break;
6017
6018 case OP_NOT_WORDCHAR:
6019 for (i = min; i < max; i++)
6020 {
6021 if (eptr >= md->end_subject)
6022 {
6023 SCHECK_PARTIAL();
6024 break;
6025 }
6026 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6027 eptr++;
6028 }
6029 break;
6030
6031 case OP_WORDCHAR:
6032 for (i = min; i < max; i++)
6033 {
6034 if (eptr >= md->end_subject)
6035 {
6036 SCHECK_PARTIAL();
6037 break;
6038 }
6039 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6040 eptr++;
6041 }
6042 break;
6043
6044 default:
6045 RRETURN(PCRE_ERROR_INTERNAL);
6046 }
6047
6048 /* eptr is now past the end of the maximum run. If possessive, we are
6049 done (no backing up). Otherwise, match at this position; anything other
6050 than no match is immediately returned. For nomatch, back up one
6051 character (byte), unless we are matching \R and the last thing matched
6052 was \r\n, in which case, back up two bytes. */
6053
6054 if (possessive) continue;
6055 while (eptr >= pp)
6056 {
6057 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6059 eptr--;
6060 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6061 eptr[-1] == CHAR_CR) eptr--;
6062 }
6063 }
6064
6065 /* Get here if we can't make it match with any permitted repetitions */
6066
6067 RRETURN(MATCH_NOMATCH);
6068 }
6069 /* Control never gets here */
6070
6071 /* There's been some horrible disaster. Arrival here can only mean there is
6072 something seriously wrong in the code above or the OP_xxx definitions. */
6073
6074 default:
6075 DPRINTF(("Unknown opcode %d\n", *ecode));
6076 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6077 }
6078
6079 /* Do not stick any code in here without much thought; it is assumed
6080 that "continue" in the code above comes out to here to repeat the main
6081 loop. */
6082
6083 } /* End of main loop */
6084 /* Control never reaches here */
6085
6086
6087 /* When compiling to use the heap rather than the stack for recursive calls to
6088 match(), the RRETURN() macro jumps here. The number that is saved in
6089 frame->Xwhere indicates which label we actually want to return to. */
6090
6091 #ifdef NO_RECURSE
6092 #define LBL(val) case val: goto L_RM##val;
6093 HEAP_RETURN:
6094 switch (frame->Xwhere)
6095 {
6096 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6097 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6098 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6099 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6100 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6101 LBL(65) LBL(66)
6102 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6103 LBL(21)
6104 #endif
6105 #ifdef SUPPORT_UTF
6106 LBL(16) LBL(18) LBL(20)
6107 LBL(22) LBL(23) LBL(28) LBL(30)
6108 LBL(32) LBL(34) LBL(42) LBL(46)
6109 #ifdef SUPPORT_UCP
6110 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6111 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6112 #endif /* SUPPORT_UCP */
6113 #endif /* SUPPORT_UTF */
6114 default:
6115 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6116 return PCRE_ERROR_INTERNAL;
6117 }
6118 #undef LBL
6119 #endif /* NO_RECURSE */
6120 }
6121
6122
6123 /***************************************************************************
6124 ****************************************************************************
6125 RECURSION IN THE match() FUNCTION
6126
6127 Undefine all the macros that were defined above to handle this. */
6128
6129 #ifdef NO_RECURSE
6130 #undef eptr
6131 #undef ecode
6132 #undef mstart
6133 #undef offset_top
6134 #undef eptrb
6135 #undef flags
6136
6137 #undef callpat
6138 #undef charptr
6139 #undef data
6140 #undef next
6141 #undef pp
6142 #undef prev
6143 #undef saved_eptr
6144
6145 #undef new_recursive
6146
6147 #undef cur_is_word
6148 #undef condition
6149 #undef prev_is_word
6150
6151 #undef ctype
6152 #undef length
6153 #undef max
6154 #undef min
6155 #undef number
6156 #undef offset
6157 #undef op
6158 #undef save_capture_last
6159 #undef save_offset1
6160 #undef save_offset2
6161 #undef save_offset3
6162 #undef stacksave
6163
6164 #undef newptrb
6165
6166 #endif
6167
6168 /* These two are defined as macros in both cases */
6169
6170 #undef fc
6171 #undef fi
6172
6173 /***************************************************************************
6174 ***************************************************************************/
6175
6176
6177 #ifdef NO_RECURSE
6178 /*************************************************
6179 * Release allocated heap frames *
6180 *************************************************/
6181
6182 /* This function releases all the allocated frames. The base frame is on the
6183 machine stack, and so must not be freed.
6184
6185 Argument: the address of the base frame
6186 Returns: nothing
6187 */
6188
6189 static void
6190 release_match_heapframes (heapframe *frame_base)
6191 {
6192 heapframe *nextframe = frame_base->Xnextframe;
6193 while (nextframe != NULL)
6194 {
6195 heapframe *oldframe = nextframe;
6196 nextframe = nextframe->Xnextframe;
6197 (PUBL(stack_free))(oldframe);
6198 }
6199 }
6200 #endif
6201
6202
6203 /*************************************************
6204 * Execute a Regular Expression *
6205 *************************************************/
6206
6207 /* This function applies a compiled re to a subject string and picks out
6208 portions of the string if it matches. Two elements in the vector are set for
6209 each substring: the offsets to the start and end of the substring.
6210
6211 Arguments:
6212 argument_re points to the compiled expression
6213 extra_data points to extra data or is NULL
6214 subject points to the subject string
6215 length length of subject string (may contain binary zeros)
6216 start_offset where to start in the subject string
6217 options option bits
6218 offsets points to a vector of ints to be filled in with offsets
6219 offsetcount the number of elements in the vector
6220
6221 Returns: > 0 => success; value is the number of elements filled in
6222 = 0 => success, but offsets is not big enough
6223 -1 => failed to match
6224 < -1 => some kind of unexpected problem
6225 */
6226
6227 #if defined COMPILE_PCRE8
6228 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6229 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6230 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6231 int offsetcount)
6232 #elif defined COMPILE_PCRE16
6233 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6234 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6235 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6236 int offsetcount)
6237 #elif defined COMPILE_PCRE32
6238 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6239 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6240 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6241 int offsetcount)
6242 #endif
6243 {
6244 int rc, ocount, arg_offset_max;
6245 int newline;
6246 BOOL using_temporary_offsets = FALSE;
6247 BOOL anchored;
6248 BOOL startline;
6249 BOOL firstline;
6250 BOOL utf;
6251 BOOL has_first_char = FALSE;
6252 BOOL has_req_char = FALSE;
6253 pcre_uchar first_char = 0;
6254 pcre_uchar first_char2 = 0;
6255 pcre_uchar req_char = 0;
6256 pcre_uchar req_char2 = 0;
6257 match_data match_block;
6258 match_data *md = &match_block;
6259 const pcre_uint8 *tables;
6260 const pcre_uint8 *start_bits = NULL;
6261 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6262 PCRE_PUCHAR end_subject;
6263 PCRE_PUCHAR start_partial = NULL;
6264 PCRE_PUCHAR req_char_ptr = start_match - 1;
6265
6266 const pcre_study_data *study;
6267 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6268
6269 #ifdef NO_RECURSE
6270 heapframe frame_zero;
6271 frame_zero.Xprevframe = NULL; /* Marks the top level */
6272 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6273 md->match_frames_base = &frame_zero;
6274 #endif
6275
6276 /* Check for the special magic call that measures the size of the stack used
6277 per recursive call of match(). Without the funny casting for sizeof, a Windows
6278 compiler gave this error: "unary minus operator applied to unsigned type,
6279 result still unsigned". Hopefully the cast fixes that. */
6280
6281 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6282 start_offset == -999)
6283 #ifdef NO_RECURSE
6284 return -((int)sizeof(heapframe));
6285 #else
6286 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6287 #endif
6288
6289 /* Plausibility checks */
6290
6291 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6292 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6293 return PCRE_ERROR_NULL;
6294 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6295 if (length < 0) return PCRE_ERROR_BADLENGTH;
6296 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6297
6298 /* Check that the first field in the block is the magic number. If it is not,
6299 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6300 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6301 means that the pattern is likely compiled with different endianness. */
6302
6303 if (re->magic_number != MAGIC_NUMBER)
6304 return re->magic_number == REVERSED_MAGIC_NUMBER?
6305 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6306 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6307
6308 /* These two settings are used in the code for checking a UTF-8 string that
6309 follows immediately afterwards. Other values in the md block are used only
6310 during "normal" pcre_exec() processing, not when the JIT support is in use,
6311 so they are set up later. */
6312
6313 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6314 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6315 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6316 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6317
6318 /* Check a UTF-8 string if required. Pass back the character offset and error
6319 code for an invalid string if a results vector is available. */
6320
6321 #ifdef SUPPORT_UTF
6322 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6323 {
6324 int erroroffset;
6325 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6326 if (errorcode != 0)
6327 {
6328 if (offsetcount >= 2)
6329 {
6330 offsets[0] = erroroffset;
6331 offsets[1] = errorcode;
6332 }
6333 #if defined COMPILE_PCRE8
6334 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6335 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6336 #elif defined COMPILE_PCRE16
6337 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6338 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6339 #elif defined COMPILE_PCRE32
6340 return PCRE_ERROR_BADUTF32;
6341 #endif
6342 }
6343 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6344 /* Check that a start_offset points to the start of a UTF character. */
6345 if (start_offset > 0 && start_offset < length &&
6346 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6347 return PCRE_ERROR_BADUTF8_OFFSET;
6348 #endif
6349 }
6350 #endif
6351
6352 /* If the pattern was successfully studied with JIT support, run the JIT
6353 executable instead of the rest of this function. Most options must be set at
6354 compile time for the JIT code to be usable. Fall back to the normal code path if
6355 an unsupported flag is set. */
6356
6357 #ifdef SUPPORT_JIT
6358 if (extra_data != NULL
6359 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6360 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6361 && extra_data->executable_jit != NULL
6362 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6363 {
6364 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6365 start_offset, options, offsets, offsetcount);
6366
6367 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6368 mode is not compiled. In this case we simply fall back to the interpreter. */
6369
6370 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6371 }
6372 #endif
6373
6374 /* Carry on with non-JIT matching. This information is for finding all the
6375 numbers associated with a given name, for condition testing. */
6376
6377 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6378 md->name_count = re->name_count;
6379 md->name_entry_size = re->name_entry_size;
6380
6381 /* Fish out the optional data from the extra_data structure, first setting
6382 the default values. */
6383
6384 study = NULL;
6385 md->match_limit = MATCH_LIMIT;
6386 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6387 md->callout_data = NULL;
6388
6389 /* The table pointer is always in native byte order. */
6390
6391 tables = re->tables;
6392
6393 if (extra_data != NULL)
6394 {
6395 register unsigned int flags = extra_data->flags;
6396 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6397 study = (const pcre_study_data *)extra_data->study_data;
6398 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6399 md->match_limit = extra_data->match_limit;
6400 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6401 md->match_limit_recursion = extra_data->match_limit_recursion;
6402 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6403 md->callout_data = extra_data->callout_data;
6404 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6405 }
6406
6407 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6408 is a feature that makes it possible to save compiled regex and re-use them
6409 in other programs later. */
6410
6411 if (tables == NULL) tables = PRIV(default_tables);
6412
6413 /* Set up other data */
6414
6415 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6416 startline = (re->flags & PCRE_STARTLINE) != 0;
6417 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6418
6419 /* The code starts after the real_pcre block and the capture name table. */
6420
6421 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6422 re->name_count * re->name_entry_size;
6423
6424 md->start_subject = (PCRE_PUCHAR)subject;
6425 md->start_offset = start_offset;
6426 md->end_subject = md->start_subject + length;
6427 end_subject = md->end_subject;
6428
6429 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6430 md->use_ucp = (re->options & PCRE_UCP) != 0;
6431 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6432 md->ignore_skip_arg = FALSE;
6433
6434 /* Some options are unpacked into BOOL variables in the hope that testing
6435 them will be faster than individual option bits. */
6436
6437 md->notbol = (options & PCRE_NOTBOL) != 0;
6438 md->noteol = (options & PCRE_NOTEOL) != 0;
6439 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6440 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6441
6442 md->hitend = FALSE;
6443 md->mark = md->nomatch_mark = NULL; /* In case never set */
6444
6445 md->recursive = NULL; /* No recursion at top level */
6446 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6447
6448 md->lcc = tables + lcc_offset;
6449 md->fcc = tables + fcc_offset;
6450 md->ctypes = tables + ctypes_offset;
6451
6452 /* Handle different \R options. */
6453
6454 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6455 {
6456 case 0:
6457 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6458 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6459 else
6460 #ifdef BSR_ANYCRLF
6461 md->bsr_anycrlf = TRUE;
6462 #else
6463 md->bsr_anycrlf = FALSE;
6464 #endif
6465 break;
6466
6467 case PCRE_BSR_ANYCRLF:
6468 md->bsr_anycrlf = TRUE;
6469 break;
6470
6471 case PCRE_BSR_UNICODE:
6472 md->bsr_anycrlf = FALSE;
6473 break;
6474
6475 default: return PCRE_ERROR_BADNEWLINE;
6476 }
6477
6478 /* Handle different types of newline. The three bits give eight cases. If
6479 nothing is set at run time, whatever was used at compile time applies. */
6480
6481 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6482 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6483 {
6484 case 0: newline = NEWLINE; break; /* Compile-time default */
6485 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6486 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6487 case PCRE_NEWLINE_CR+
6488 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6489 case PCRE_NEWLINE_ANY: newline = -1; break;
6490 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6491 default: return PCRE_ERROR_BADNEWLINE;
6492 }
6493
6494 if (newline == -2)
6495 {
6496 md->nltype = NLTYPE_ANYCRLF;
6497 }
6498 else if (newline < 0)
6499 {
6500 md->nltype = NLTYPE_ANY;
6501 }
6502 else
6503 {
6504 md->nltype = NLTYPE_FIXED;
6505 if (newline > 255)
6506 {
6507 md->nllen = 2;
6508 md->nl[0] = (newline >> 8) & 255;
6509 md->nl[1] = newline & 255;
6510 }
6511 else
6512 {
6513 md->nllen = 1;
6514 md->nl[0] = newline;
6515 }
6516 }
6517
6518 /* Partial matching was originally supported only for a restricted set of
6519 regexes; from release 8.00 there are no restrictions, but the bits are still
6520 defined (though never set). So there's no harm in leaving this code. */
6521
6522 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6523 return PCRE_ERROR_BADPARTIAL;
6524
6525 /* If the expression has got more back references than the offsets supplied can
6526 hold, we get a temporary chunk of working store to use during the matching.
6527 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6528 of 3. */
6529
6530 ocount = offsetcount - (offsetcount % 3);
6531 arg_offset_max = (2*ocount)/3;
6532
6533 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6534 {
6535 ocount = re->top_backref * 3 + 3;
6536 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6537 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6538 using_temporary_offsets = TRUE;
6539 DPRINTF(("Got memory to hold back references\n"));
6540 }
6541 else md->offset_vector = offsets;
6542
6543 md->offset_end = ocount;
6544 md->offset_max = (2*ocount)/3;
6545 md->offset_overflow = FALSE;
6546 md->capture_last = -1;
6547
6548 /* Reset the working variable associated with each extraction. These should
6549 never be used unless previously set, but they get saved and restored, and so we
6550 initialize them to avoid reading uninitialized locations. Also, unset the
6551 offsets for the matched string. This is really just for tidiness with callouts,
6552 in case they inspect these fields. */
6553
6554 if (md->offset_vector != NULL)
6555 {
6556 register int *iptr = md->offset_vector + ocount;
6557 register int *iend = iptr - re->top_bracket;
6558 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6559 while (--iptr >= iend) *iptr = -1;
6560 md->offset_vector[0] = md->offset_vector[1] = -1;
6561 }
6562
6563 /* Set up the first character to match, if available. The first_char value is
6564 never set for an anchored regular expression, but the anchoring may be forced
6565 at run time, so we have to test for anchoring. The first char may be unset for
6566 an unanchored pattern, of course. If there's no first char and the pattern was
6567 studied, there may be a bitmap of possible first characters. */
6568
6569 if (!anchored)
6570 {
6571 if ((re->flags & PCRE_FIRSTSET) != 0)
6572 {
6573 has_first_char = TRUE;
6574 first_char = first_char2 = (pcre_uchar)(re->first_char);
6575 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6576 {
6577 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6578 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6579 if (utf && first_char > 127)
6580 first_char2 = UCD_OTHERCASE(first_char);
6581 #endif
6582 }
6583 }
6584 else
6585 if (!startline && study != NULL &&
6586 (study->flags & PCRE_STUDY_MAPPED) != 0)
6587 start_bits = study->start_bits;
6588 }
6589
6590 /* For anchored or unanchored matches, there may be a "last known required
6591 character" set. */
6592
6593 if ((re->flags & PCRE_REQCHSET) != 0)
6594 {
6595 has_req_char = TRUE;
6596 req_char = req_char2 = (pcre_uchar)(re->req_char);
6597 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6598 {
6599 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6600 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6601 if (utf && req_char > 127)
6602 req_char2 = UCD_OTHERCASE(req_char);
6603 #endif
6604 }
6605 }
6606
6607
6608 /* ==========================================================================*/
6609
6610 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6611 the loop runs just once. */
6612
6613 for(;;)
6614 {
6615 PCRE_PUCHAR save_end_subject = end_subject;
6616 PCRE_PUCHAR new_start_match;
6617
6618 /* If firstline is TRUE, the start of the match is constrained to the first
6619 line of a multiline string. That is, the match must be before or at the first
6620 newline. Implement this by temporarily adjusting end_subject so that we stop
6621 scanning at a newline. If the match fails at the newline, later code breaks
6622 this loop. */
6623
6624 if (firstline)
6625 {
6626 PCRE_PUCHAR t = start_match;
6627 #ifdef SUPPORT_UTF
6628 if (utf)
6629 {
6630 while (t < md->end_subject && !IS_NEWLINE(t))
6631 {
6632 t++;
6633 ACROSSCHAR(t < end_subject, *t, t++);
6634 }
6635 }
6636 else
6637 #endif
6638 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6639 end_subject = t;
6640 }
6641
6642 /* There are some optimizations that avoid running the match if a known
6643 starting point is not found, or if a known later character is not present.
6644 However, there is an option that disables these, for testing and for ensuring
6645 that all callouts do actually occur. The option can be set in the regex by
6646 (*NO_START_OPT) or passed in match-time options. */
6647
6648 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6649 {
6650 /* Advance to a unique first char if there is one. */
6651
6652 if (has_first_char)
6653 {
6654 pcre_uchar smc;
6655
6656 if (first_char != first_char2)
6657 while (start_match < end_subject &&
6658 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6659 start_match++;
6660 else
6661 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6662 start_match++;
6663 }
6664
6665 /* Or to just after a linebreak for a multiline match */
6666
6667 else if (startline)
6668 {
6669 if (start_match > md->start_subject + start_offset)
6670 {
6671 #ifdef SUPPORT_UTF
6672 if (utf)
6673 {
6674 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6675 {
6676 start_match++;
6677 ACROSSCHAR(start_match < end_subject, *start_match,
6678 start_match++);
6679 }
6680 }
6681 else
6682 #endif
6683 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6684 start_match++;
6685
6686 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6687 and we are now at a LF, advance the match position by one more character.
6688 */
6689
6690 if (start_match[-1] == CHAR_CR &&
6691 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6692 start_match < end_subject &&
6693 RAWUCHARTEST(start_match) == CHAR_NL)
6694 start_match++;
6695 }
6696 }
6697
6698 /* Or to a non-unique first byte after study */
6699
6700 else if (start_bits != NULL)
6701 {
6702 while (start_match < end_subject)
6703 {
6704 register pcre_uint32 c = RAWUCHARTEST(start_match);
6705 #ifndef COMPILE_PCRE8
6706 if (c > 255) c = 255;
6707 #endif
6708 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6709 {
6710 start_match++;
6711 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6712 /* In non 8-bit mode, the iteration will stop for
6713 characters > 255 at the beginning or not stop at all. */
6714 if (utf)
6715 ACROSSCHAR(start_match < end_subject, *start_match,
6716 start_match++);
6717 #endif
6718 }
6719 else break;
6720 }
6721 }
6722 } /* Starting optimizations */
6723
6724 /* Restore fudged end_subject */
6725
6726 end_subject = save_end_subject;
6727
6728 /* The following two optimizations are disabled for partial matching or if
6729 disabling is explicitly requested. */
6730
6731 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6732 {
6733 /* If the pattern was studied, a minimum subject length may be set. This is
6734 a lower bound; no actual string of that length may actually match the
6735 pattern. Although the value is, strictly, in characters, we treat it as
6736 bytes to avoid spending too much time in this optimization. */
6737
6738 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6739 (pcre_uint32)(end_subject - start_match) < study->minlength)
6740 {
6741 rc = MATCH_NOMATCH;
6742 break;
6743 }
6744
6745 /* If req_char is set, we know that that character must appear in the
6746 subject for the match to succeed. If the first character is set, req_char
6747 must be later in the subject; otherwise the test starts at the match point.
6748 This optimization can save a huge amount of backtracking in patterns with
6749 nested unlimited repeats that aren't going to match. Writing separate code
6750 for cased/caseless versions makes it go faster, as does using an
6751 autoincrement and backing off on a match.
6752
6753 HOWEVER: when the subject string is very, very long, searching to its end
6754 can take a long time, and give bad performance on quite ordinary patterns.
6755 This showed up when somebody was matching something like /^\d+C/ on a
6756 32-megabyte string... so we don't do this when the string is sufficiently
6757 long. */
6758
6759 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6760 {
6761 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6762
6763 /* We don't need to repeat the search if we haven't yet reached the
6764 place we found it at last time. */