/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1033 - (show annotations)
Mon Sep 10 11:02:48 2012 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 220007 byte(s)
General spring-clean of EBCDIC-related issues in the code, which had decayed 
over time. Also the documentation. Added one test that can be run in an ASCII
world to do a little testing of EBCDIC-related things. 
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
/* Short aliases defined ahead of the pcre_internal.h include. Presumably the
generic macros in that header refer to these names - confirm against the header.
Each one expands to an identifier that must be in scope at the point of use. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
62
#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
89
90 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
91
/* Six-entry lookup tables giving the minimum and maximum repeat count for each
of the common repeat forms. A maximum of 0 stands for "no upper limit".
NOTE(review): the entry order presumably follows the opcode order of the
star/plus/query repeats and their minimizing variants - confirm at the use
sites. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
94
95
96
97 #ifdef PCRE_DEBUG
98 /*************************************************
99 * Debugging function to print chars *
100 *************************************************/
101
102 /* Print a sequence of chars in printable format, stopping at the end of the
103 subject if requested.
104
105 Arguments:
106 p points to characters
107 length number to print
108 is_subject TRUE if printing from within md->start_subject
109 md pointer to matching data block, if is_subject is TRUE
110
111 Returns: nothing
112 */
113
114 static void
115 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
116 {
117 unsigned int c;
118 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
119 while (length-- > 0)
120 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121 }
122 #endif
123
124
125
126 /*************************************************
127 * Match a back-reference *
128 *************************************************/
129
130 /* Normally, if a back reference hasn't been set, the length that is passed is
131 negative, so the match always fails. However, in JavaScript compatibility mode,
132 the length passed is zero. Note that in caseless UTF-8 mode, the number of
133 subject bytes matched may be different to the number of reference bytes.
134
135 Arguments:
136 offset index into the offset vector
137 eptr pointer into the subject
138 length length of reference to be matched (number of bytes)
139 md points to match data block
140 caseless TRUE if caseless
141
142 Returns: >= 0 the number of subject bytes matched
143 -1 no match
144 -2 partial match; always given if at end subject
145 */
146
147 static int
148 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
149 BOOL caseless)
150 {
151 PCRE_PUCHAR eptr_start = eptr;
152 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if reference not set (and not JavaScript compatible - in that
168 case the length is passed as zero). */
169
170 if (length < 0) return -1;
171
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
175
176 if (caseless)
177 {
178 #ifdef SUPPORT_UTF
179 #ifdef SUPPORT_UCP
180 if (md->utf)
181 {
182 /* Match characters up to the end of the reference. NOTE: the number of
183 bytes matched may differ, because there are some characters whose upper and
184 lower case versions code as different numbers of bytes. For example, U+023A
185 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
186 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
187 the latter. It is important, therefore, to check the length along the
188 reference, not along the subject (earlier code did this wrong). */
189
190 PCRE_PUCHAR endptr = p + length;
191 while (p < endptr)
192 {
193 int c, d;
194 if (eptr >= md->end_subject) return -2; /* Partial match */
195 GETCHARINC(c, eptr);
196 GETCHARINC(d, p);
197 if (c != d && c != UCD_OTHERCASE(d)) return -1;
198 }
199 }
200 else
201 #endif
202 #endif
203
204 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
205 is no UCP support. */
206 {
207 while (length-- > 0)
208 {
209 if (eptr >= md->end_subject) return -2; /* Partial match */
210 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
211 p++;
212 eptr++;
213 }
214 }
215 }
216
217 /* In the caseful case, we can just compare the bytes, whether or not we
218 are in UTF-8 mode. */
219
220 else
221 {
222 while (length-- > 0)
223 {
224 if (eptr >= md->end_subject) return -2; /* Partial match */
225 if (*p++ != *eptr++) return -1;
226 }
227 }
228
229 return (int)(eptr - eptr_start);
230 }
231
232
233
234 /***************************************************************************
235 ****************************************************************************
236 RECURSION IN THE match() FUNCTION
237
238 The match() function is highly recursive, though not every recursive call
239 increases the recursive depth. Nevertheless, some regular expressions can cause
240 it to recurse to a great depth. I was writing for Unix, so I just let it call
241 itself recursively. This uses the stack for saving everything that has to be
242 saved for a recursive call. On Unix, the stack can be large, and this works
243 fine.
244
245 It turns out that on some non-Unix-like systems there are problems with
246 programs that use a lot of stack. (This despite the fact that every last chip
247 has oodles of memory these days, and techniques for extending the stack have
248 been known for decades.) So....
249
250 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
251 calls by keeping local variables that need to be preserved in blocks of memory
252 obtained from malloc() instead of on the stack. Macros are used to
253 achieve this so that the actual code doesn't look very different to what it
254 always used to.
255
256 The original heap-recursive code used longjmp(). However, it seems that this
257 can be very slow on some operating systems. Following a suggestion from Stan
258 Switzer, the use of longjmp() has been abolished, at the cost of having to
259 provide a unique number for each call to RMATCH. There is no way of generating
260 a sequence of numbers at compile time in C. I have given them names, to make
261 them stand out more clearly.
262
263 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
264 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
265 tests. Furthermore, not using longjmp() means that local dynamic variables
266 don't have indeterminate values; this has meant that the frame size can be
267 reduced because the result can be "passed back" by straight setting of the
268 variable instead of being passed in the frame.
269 ****************************************************************************
270 ***************************************************************************/
271
272 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
273 below must be updated in sync. */
274
/* One distinct identifier per RMATCH call site, numbered consecutively from 1.
In the heap-based build the identifier is stored in the frame and pasted into a
label name (L_RMnn) by the RMATCH macro. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66 };
282
283 /* These versions of the macros use the stack, as normal. There are debugging
284 versions and production versions. Note that the "rw" argument of RMATCH isn't
285 actually used in this definition. */
286
#ifndef NO_RECURSE

/* Stack-recursive build. RMATCH assigns the result of a real recursive call of
match() to rrc, implicitly passing the caller's mstart and rdepth+1; RRETURN is
a plain return. The debugging variants additionally trace the call and the
return on stdout. */

#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* "Call": take the frame chained after the current one, allocating it with
PUBL(stack_malloc) only if none is chained yet (allocation failure returns
PCRE_ERROR_NOMEMORY through RRETURN). The resume point rw is recorded in the
current frame, the new frame is filled with the argument values, and control
jumps to HEAP_RECURSE. The label L_<rw> is where execution resumes afterwards. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* "Return": step back to the previous frame. If there is one, hand the result
over in rrc and jump to HEAP_RETURN; otherwise this was the outermost frame, so
really return from the function. The frame being left is not freed here. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame to resume when this one returns */
  struct heapframe *Xnextframe;   /* Frame chained after this one, or NULL */

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to */

  int Xwhere;

} heapframe;

#endif
419
420
421 /***************************************************************************
422 ***************************************************************************/
423
424
425
426 /*************************************************
427 * Match from current position *
428 *************************************************/
429
430 /* This function is called recursively in many circumstances. Whenever it
431 returns a negative (error) response, the outer incarnation must also return the
432 same response. */
433
434 /* These macros pack up tests that are used for partial matching, and which
435 appear several times in the code. We set the "hit end" flag if the pointer is
436 at the end of the subject and also past the start of the subject (i.e.
437 something has been matched). For hard partial matching, we then return
438 immediately. The second one is used when we already know we are past the end of
439 the subject. */
440
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0) and eptr
is at or beyond md->end_subject and also beyond md->start_used_ptr, record that
the end was hit; for md->partial > 1 leave immediately with PCRE_ERROR_PARTIAL.
Both macros expand to a complete if-statement and use the caller's eptr and
md, and RRETURN. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* SCHECK_PARTIAL: the same, but without the end-of-subject comparison, for
call sites that already know the end of the subject has been reached. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
455
456
457 /* Performance note: It might be tempting to extract commonly used fields from
458 the md structure (e.g. utf, end_subject) into individual variables to improve
459 performance. Tests using gcc on a SPARC disproved this; in the first case, it
460 made performance worse.
461
462 Arguments:
463 eptr pointer to current character in subject
464 ecode pointer to current position in compiled code
465 mstart pointer to the current match start position (can be modified
466 by encountering \K)
467 offset_top current top pointer
468 md pointer to "static" info for the match
469 eptrb pointer to chain of blocks containing eptr at start of
470 brackets - for testing for empty matches
471 rdepth the recursion depth
472
473 Returns: MATCH_MATCH if matched ) these values are >= 0
474 MATCH_NOMATCH if failed to match )
475 a negative MATCH_xxx value for PRUNE, SKIP, etc
476 a negative PCRE_ERROR_xxx value if aborted by an error condition
477 (e.g. stopped by repeated call or recursion limit)
478 */
479
480 static int
481 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
482 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
483 unsigned int rdepth)
484 {
485 /* These variables do not need to be preserved over recursion in this function,
486 so they can be ordinary variables in all cases. Mark some of them with
487 "register" because they are used a lot in loops. */
488
489 register int rrc; /* Returns from recursive calls */
490 register int i; /* Used for loops not involving calls to RMATCH() */
491 register unsigned int c; /* Character values not kept over RMATCH() calls */
492 register BOOL utf; /* Local copy of UTF flag for speed */
493
494 BOOL minimize, possessive; /* Quantifier options */
495 BOOL caseless;
496 int condcode;
497
498 /* When recursion is not being used, all "local" variables that have to be
499 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
500 frame on the stack here; subsequent instantiations are obtained from the heap
501 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
502 the top-level on the stack rather than malloc-ing them all gives a performance
503 boost in many cases where there is not much "recursion". */
504
505 #ifdef NO_RECURSE
506 heapframe *frame = (heapframe *)md->match_frames_base;
507
508 /* Copy in the original argument variables */
509
510 frame->Xeptr = eptr;
511 frame->Xecode = ecode;
512 frame->Xmstart = mstart;
513 frame->Xoffset_top = offset_top;
514 frame->Xeptrb = eptrb;
515 frame->Xrdepth = rdepth;
516
517 /* This is where control jumps back to to effect "recursion" */
518
519 HEAP_RECURSE:
520
521 /* Macros make the argument variables come from the current frame */
522
523 #define eptr frame->Xeptr
524 #define ecode frame->Xecode
525 #define mstart frame->Xmstart
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define oclength frame->Xoclength
554 #define occhars frame->Xocchars
555 #endif
556
557 #define ctype frame->Xctype
558 #define fc frame->Xfc
559 #define fi frame->Xfi
560 #define length frame->Xlength
561 #define max frame->Xmax
562 #define min frame->Xmin
563 #define number frame->Xnumber
564 #define offset frame->Xoffset
565 #define op frame->Xop
566 #define save_capture_last frame->Xsave_capture_last
567 #define save_offset1 frame->Xsave_offset1
568 #define save_offset2 frame->Xsave_offset2
569 #define save_offset3 frame->Xsave_offset3
570 #define stacksave frame->Xstacksave
571
572 #define newptrb frame->Xnewptrb
573
574 /* When recursion is being used, local variables are allocated on the stack and
575 get preserved during recursion in the normal way. In this environment, fi and
576 i, and fc and c, can be the same variables. */
577
578 #else /* NO_RECURSE not defined */
579 #define fi i
580 #define fc c
581
582 /* Many of the following variables are used only in small blocks of the code.
583 My normal style of coding would have declared them within each of those blocks.
584 However, in order to accommodate the version of this code that uses an external
585 "stack" implemented on the heap, it is easier to declare them all here, so the
586 declarations can be cut out in a block. The only declarations within blocks
587 below are for variables that do not have to be preserved over a recursive call
588 to RMATCH(). */
589
590 #ifdef SUPPORT_UTF
591 const pcre_uchar *charptr;
592 #endif
593 const pcre_uchar *callpat;
594 const pcre_uchar *data;
595 const pcre_uchar *next;
596 PCRE_PUCHAR pp;
597 const pcre_uchar *prev;
598 PCRE_PUCHAR saved_eptr;
599
600 recursion_info new_recursive;
601
602 BOOL cur_is_word;
603 BOOL condition;
604 BOOL prev_is_word;
605
606 #ifdef SUPPORT_UCP
607 int prop_type;
608 int prop_value;
609 int prop_fail_result;
610 int oclength;
611 pcre_uchar occhars[6];
612 #endif
613
614 int codelink;
615 int ctype;
616 int length;
617 int max;
618 int min;
619 int number;
620 int offset;
621 int op;
622 int save_capture_last;
623 int save_offset1, save_offset2, save_offset3;
624 int stacksave[REC_STACK_SAVE_MAX];
625
626 eptrblock newptrb;
627
628 /* There is a special fudge for calling match() in a way that causes it to
629 measure the size of its basic stack frame when the stack is being used for
630 recursion. The second argument (ecode) being NULL triggers this behaviour. It
631 cannot normally ever be NULL. The return is the negated value of the frame
632 size. */
633
634 if (ecode == NULL)
635 {
636 if (rdepth == 0)
637 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
638 else
639 {
640 int len = (char *)&rdepth - (char *)eptr;
641 return (len > 0)? -len : len;
642 }
643 }
644 #endif /* NO_RECURSE */
645
646 /* To save space on the stack and in the heap frame, I have doubled up on some
647 of the local variables that are used only in localised parts of the code, but
648 still need to be preserved over recursive calls of match(). These macros define
649 the alternative names that are used. */
650
651 #define allow_zero cur_is_word
652 #define cbegroup condition
653 #define code_offset codelink
654 #define condassert condition
655 #define matched_once prev_is_word
656 #define foc number
657 #define save_mark data
658
659 /* These statements are here to stop the compiler complaining about uninitialized
660 variables. */
661
662 #ifdef SUPPORT_UCP
663 prop_value = 0;
664 prop_fail_result = 0;
665 #endif
666
667
668 /* This label is used for tail recursion, which is used in a few cases even
669 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
670 used. Thanks to Ian Taylor for noticing this possibility and sending the
671 original patch. */
672
673 TAIL_RECURSE:
674
675 /* OK, now we can get on with the real code of the function. Recursive calls
676 are specified by the macro RMATCH and RRETURN is used to return. When
677 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
678 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
679 defined). However, RMATCH isn't like a function call because it's quite a
680 complicated macro. It has to be used in one particular way. This shouldn't,
681 however, impact performance when true recursion is being used. */
682
683 #ifdef SUPPORT_UTF
684 utf = md->utf; /* Local copy of the flag */
685 #else
686 utf = FALSE;
687 #endif
688
689 /* First check that we haven't called match() too many times, or that we
690 haven't exceeded the recursive call limit. */
691
692 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
693 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
694
695 /* At the start of a group with an unlimited repeat that may match an empty
696 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
697 done this way to save having to use another function argument, which would take
698 up space on the stack. See also MATCH_CONDASSERT below.
699
700 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
701 such remembered pointers, to be checked when we hit the closing ket, in order
702 to break infinite loops that match no characters. When match() is called in
703 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
704 NOT be used with tail recursion, because the memory block that is used is on
705 the stack, so a new one may be required for each match(). */
706
707 if (md->match_function_type == MATCH_CBEGROUP)
708 {
709 newptrb.epb_saved_eptr = eptr;
710 newptrb.epb_prev = eptrb;
711 eptrb = &newptrb;
712 md->match_function_type = 0;
713 }
714
715 /* Now start processing the opcodes. */
716
717 for (;;)
718 {
719 minimize = possessive = FALSE;
720 op = *ecode;
721
722 switch(op)
723 {
724 case OP_MARK:
725 md->nomatch_mark = ecode + 2;
726 md->mark = NULL; /* In case previously set by assertion */
727 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
728 eptrb, RM55);
729 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
730 md->mark == NULL) md->mark = ecode + 2;
731
732 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
733 argument, and we must check whether that argument matches this MARK's
734 argument. It is passed back in md->start_match_ptr (an overloading of that
735 variable). If it does match, we reset that variable to the current subject
736 position and return MATCH_SKIP. Otherwise, pass back the return code
737 unaltered. */
738
739 else if (rrc == MATCH_SKIP_ARG &&
740 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
741 {
742 md->start_match_ptr = eptr;
743 RRETURN(MATCH_SKIP);
744 }
745 RRETURN(rrc);
746
747 case OP_FAIL:
748 RRETURN(MATCH_NOMATCH);
749
750 /* COMMIT overrides PRUNE, SKIP, and THEN */
751
752 case OP_COMMIT:
753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 eptrb, RM52);
755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
756 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
757 rrc != MATCH_THEN)
758 RRETURN(rrc);
759 RRETURN(MATCH_COMMIT);
760
761 /* PRUNE overrides THEN */
762
763 case OP_PRUNE:
764 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
765 eptrb, RM51);
766 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
767 RRETURN(MATCH_PRUNE);
768
769 case OP_PRUNE_ARG:
770 md->nomatch_mark = ecode + 2;
771 md->mark = NULL; /* In case previously set by assertion */
772 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
773 eptrb, RM56);
774 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
775 md->mark == NULL) md->mark = ecode + 2;
776 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
777 RRETURN(MATCH_PRUNE);
778
779 /* SKIP overrides PRUNE and THEN */
780
781 case OP_SKIP:
782 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
783 eptrb, RM53);
784 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
785 RRETURN(rrc);
786 md->start_match_ptr = eptr; /* Pass back current position */
787 RRETURN(MATCH_SKIP);
788
789 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
790 nomatch_mark. There is a flag that disables this opcode when re-matching a
791 pattern that ended with a SKIP for which there was not a matching MARK. */
792
793 case OP_SKIP_ARG:
794 if (md->ignore_skip_arg)
795 {
796 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
797 break;
798 }
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM57);
801 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
802 RRETURN(rrc);
803
804 /* Pass back the current skip name by overloading md->start_match_ptr and
805 returning the special MATCH_SKIP_ARG return code. This will either be
806 caught by a matching MARK, or get to the top, where it causes a rematch
807 with the md->ignore_skip_arg flag set. */
808
809 md->start_match_ptr = ecode + 2;
810 RRETURN(MATCH_SKIP_ARG);
811
812 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
813 the branch in which it occurs can be determined. Overload the start of
814 match pointer to do this. */
815
816 case OP_THEN:
817 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
818 eptrb, RM54);
819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
820 md->start_match_ptr = ecode;
821 RRETURN(MATCH_THEN);
822
823 case OP_THEN_ARG:
824 md->nomatch_mark = ecode + 2;
825 md->mark = NULL; /* In case previously set by assertion */
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
827 md, eptrb, RM58);
828 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
829 md->mark == NULL) md->mark = ecode + 2;
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 md->start_match_ptr = ecode;
832 RRETURN(MATCH_THEN);
833
834 /* Handle an atomic group that does not contain any capturing parentheses.
835 This can be handled like an assertion. Prior to 8.13, all atomic groups
836 were handled this way. In 8.13, the code was changed as below for ONCE, so
837 that backups pass through the group and thereby reset captured values.
838 However, this uses a lot more stack, so in 8.20, atomic groups that do not
839 contain any captures generate OP_ONCE_NC, which can be handled in the old,
840 less stack intensive way.
841
842 Check the alternative branches in turn - the matching won't pass the KET
843 for this kind of subpattern. If any one branch matches, we carry on as at
844 the end of a normal bracket, leaving the subject pointer, but resetting
845 the start-of-match value in case it was changed by \K. */
846
847 case OP_ONCE_NC:
848 prev = ecode;
849 saved_eptr = eptr;
850 save_mark = md->mark;
851 do
852 {
853 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
854 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
855 {
856 mstart = md->start_match_ptr;
857 break;
858 }
859 if (rrc == MATCH_THEN)
860 {
861 next = ecode + GET(ecode,1);
862 if (md->start_match_ptr < next &&
863 (*ecode == OP_ALT || *next == OP_ALT))
864 rrc = MATCH_NOMATCH;
865 }
866
867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
868 ecode += GET(ecode,1);
869 md->mark = save_mark;
870 }
871 while (*ecode == OP_ALT);
872
873 /* If hit the end of the group (which could be repeated), fail */
874
875 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
876
877 /* Continue as from after the group, updating the offsets high water
878 mark, since extracts may have been taken. */
879
880 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
881
882 offset_top = md->end_offset_top;
883 eptr = md->end_match_ptr;
884
885 /* For a non-repeating ket, just continue at this level. This also
886 happens for a repeating ket if no characters were matched in the group.
887 This is the forcible breaking of infinite loops as implemented in Perl
888 5.005. */
889
890 if (*ecode == OP_KET || eptr == saved_eptr)
891 {
892 ecode += 1+LINK_SIZE;
893 break;
894 }
895
896 /* The repeating kets try the rest of the pattern or restart from the
897 preceding bracket, in the appropriate order. The second "call" of match()
898 uses tail recursion, to avoid using another stack frame. */
899
900 if (*ecode == OP_KETRMIN)
901 {
902 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
904 ecode = prev;
905 goto TAIL_RECURSE;
906 }
907 else /* OP_KETRMAX */
908 {
909 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
911 ecode += 1 + LINK_SIZE;
912 goto TAIL_RECURSE;
913 }
914 /* Control never gets here */
915
916 /* Handle a capturing bracket, other than those that are possessive with an
917 unlimited repeat. If there is space in the offset vector, save the current
918 subject position in the working slot at the top of the vector. We mustn't
919 change the current values of the data slot, because they may be set from a
920 previous iteration of this group, and be referred to by a reference inside
921 the group. A failure to match might occur after the group has succeeded,
922 if something later on doesn't match. For this reason, we need to restore
923 the working value and also the values of the final offsets, in case they
924 were set by a previous iteration of the same bracket.
925
926 If there isn't enough space in the offset vector, treat this as if it were
927 a non-capturing bracket. Don't worry about setting the flag for the error
928 case here; that is handled in the code for KET. */
929
930 case OP_CBRA:
931 case OP_SCBRA:
932 number = GET2(ecode, 1+LINK_SIZE);
933 offset = number << 1;
934
935 #ifdef PCRE_DEBUG
936 printf("start bracket %d\n", number);
937 printf("subject=");
938 pchars(eptr, 16, TRUE, md);
939 printf("\n");
940 #endif
941
942 if (offset < md->offset_max)
943 {
944 save_offset1 = md->offset_vector[offset];
945 save_offset2 = md->offset_vector[offset+1];
946 save_offset3 = md->offset_vector[md->offset_end - number];
947 save_capture_last = md->capture_last;
948 save_mark = md->mark;
949
950 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
951 md->offset_vector[md->offset_end - number] =
952 (int)(eptr - md->start_subject);
953
954 for (;;)
955 {
956 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
957 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
958 eptrb, RM1);
959 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
960
961 /* If we backed up to a THEN, check whether it is within the current
962 branch by comparing the address of the THEN that is passed back with
963 the end of the branch. If it is within the current branch, and the
964 branch is one of two or more alternatives (it either starts or ends
965 with OP_ALT), we have reached the limit of THEN's action, so convert
966 the return code to NOMATCH, which will cause normal backtracking to
967 happen from now on. Otherwise, THEN is passed back to an outer
968 alternative. This implements Perl's treatment of parenthesized groups,
969 where a group not containing | does not affect the current alternative,
970 that is, (X) is NOT the same as (X|(*F)). */
971
972 if (rrc == MATCH_THEN)
973 {
974 next = ecode + GET(ecode,1);
975 if (md->start_match_ptr < next &&
976 (*ecode == OP_ALT || *next == OP_ALT))
977 rrc = MATCH_NOMATCH;
978 }
979
980 /* Anything other than NOMATCH is passed back. */
981
982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
983 md->capture_last = save_capture_last;
984 ecode += GET(ecode, 1);
985 md->mark = save_mark;
986 if (*ecode != OP_ALT) break;
987 }
988
989 DPRINTF(("bracket %d failed\n", number));
990 md->offset_vector[offset] = save_offset1;
991 md->offset_vector[offset+1] = save_offset2;
992 md->offset_vector[md->offset_end - number] = save_offset3;
993
994 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
995
996 RRETURN(rrc);
997 }
998
999 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1000 as a non-capturing bracket. */
1001
1002 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1003 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1004
1005 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1006
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1009
1010 /* Non-capturing or atomic group, except for possessive with unlimited
1011 repeat and ONCE group with no captures. Loop for all the alternatives.
1012
1013 When we get to the final alternative within the brackets, we used to return
1014 the result of a recursive call to match() whatever happened so it was
1015 possible to reduce stack usage by turning this into a tail recursion,
1016 except in the case of a possibly empty group. However, now that there is
1017 the possibility of (*THEN) occurring in the final alternative, this
1018 optimization is no longer always possible.
1019
1020 We can optimize if we know there are no (*THEN)s in the pattern; at present
1021 this is the best that can be done.
1022
1023 MATCH_ONCE is returned when the end of an atomic group is successfully
1024 reached, but subsequent matching fails. It passes back up the tree (causing
1025 captured values to be reset) until the original atomic group level is
1026 reached. This is tested by comparing md->once_target with the start of the
1027 group. At this point, the return is converted into MATCH_NOMATCH so that
1028 previous backup points can be taken. */
1029
1030 case OP_ONCE:
1031 case OP_BRA:
1032 case OP_SBRA:
1033 DPRINTF(("start non-capturing bracket\n"));
1034
1035 for (;;)
1036 {
1037 if (op >= OP_SBRA || op == OP_ONCE)
1038 md->match_function_type = MATCH_CBEGROUP;
1039
1040 /* If this is not a possibly empty group, and there are no (*THEN)s in
1041 the pattern, and this is the final alternative, optimize as described
1042 above. */
1043
1044 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1045 {
1046 ecode += PRIV(OP_lengths)[*ecode];
1047 goto TAIL_RECURSE;
1048 }
1049
1050 /* In all other cases, we have to make another call to match(). */
1051
1052 save_mark = md->mark;
1053 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1054 RM2);
1055
1056 /* See comment in the code for capturing groups above about handling
1057 THEN. */
1058
1059 if (rrc == MATCH_THEN)
1060 {
1061 next = ecode + GET(ecode,1);
1062 if (md->start_match_ptr < next &&
1063 (*ecode == OP_ALT || *next == OP_ALT))
1064 rrc = MATCH_NOMATCH;
1065 }
1066
1067 if (rrc != MATCH_NOMATCH)
1068 {
1069 if (rrc == MATCH_ONCE)
1070 {
1071 const pcre_uchar *scode = ecode;
1072 if (*scode != OP_ONCE) /* If not at start, find it */
1073 {
1074 while (*scode == OP_ALT) scode += GET(scode, 1);
1075 scode -= GET(scode, 1);
1076 }
1077 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1078 }
1079 RRETURN(rrc);
1080 }
1081 ecode += GET(ecode, 1);
1082 md->mark = save_mark;
1083 if (*ecode != OP_ALT) break;
1084 }
1085
1086 RRETURN(MATCH_NOMATCH);
1087
1088 /* Handle possessive capturing brackets with an unlimited repeat. We come
1089 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1090 handled similarly to the normal case above. However, the matching is
1091 different. The end of these brackets will always be OP_KETRPOS, which
1092 returns MATCH_KETRPOS without going further in the pattern. By this means
1093 we can handle the group by iteration rather than recursion, thereby
1094 reducing the amount of stack needed. */
1095
1096 case OP_CBRAPOS:
1097 case OP_SCBRAPOS:
1098 allow_zero = FALSE;
1099
1100 POSSESSIVE_CAPTURE:
1101 number = GET2(ecode, 1+LINK_SIZE);
1102 offset = number << 1;
1103
1104 #ifdef PCRE_DEBUG
1105 printf("start possessive bracket %d\n", number);
1106 printf("subject=");
1107 pchars(eptr, 16, TRUE, md);
1108 printf("\n");
1109 #endif
1110
1111 if (offset < md->offset_max)
1112 {
1113 matched_once = FALSE;
1114 code_offset = (int)(ecode - md->start_code);
1115
1116 save_offset1 = md->offset_vector[offset];
1117 save_offset2 = md->offset_vector[offset+1];
1118 save_offset3 = md->offset_vector[md->offset_end - number];
1119 save_capture_last = md->capture_last;
1120
1121 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1122
1123 /* Each time round the loop, save the current subject position for use
1124 when the group matches. For MATCH_MATCH, the group has matched, so we
1125 restart it with a new subject starting position, remembering that we had
1126 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1127 usual. If we haven't matched any alternatives in any iteration, check to
1128 see if a previous iteration matched. If so, the group has matched;
1129 continue from afterwards. Otherwise it has failed; restore the previous
1130 capture values before returning NOMATCH. */
1131
1132 for (;;)
1133 {
1134 md->offset_vector[md->offset_end - number] =
1135 (int)(eptr - md->start_subject);
1136 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1137 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1138 eptrb, RM63);
1139 if (rrc == MATCH_KETRPOS)
1140 {
1141 offset_top = md->end_offset_top;
1142 eptr = md->end_match_ptr;
1143 ecode = md->start_code + code_offset;
1144 save_capture_last = md->capture_last;
1145 matched_once = TRUE;
1146 continue;
1147 }
1148
1149 /* See comment in the code for capturing groups above about handling
1150 THEN. */
1151
1152 if (rrc == MATCH_THEN)
1153 {
1154 next = ecode + GET(ecode,1);
1155 if (md->start_match_ptr < next &&
1156 (*ecode == OP_ALT || *next == OP_ALT))
1157 rrc = MATCH_NOMATCH;
1158 }
1159
1160 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1161 md->capture_last = save_capture_last;
1162 ecode += GET(ecode, 1);
1163 if (*ecode != OP_ALT) break;
1164 }
1165
1166 if (!matched_once)
1167 {
1168 md->offset_vector[offset] = save_offset1;
1169 md->offset_vector[offset+1] = save_offset2;
1170 md->offset_vector[md->offset_end - number] = save_offset3;
1171 }
1172
1173 if (allow_zero || matched_once)
1174 {
1175 ecode += 1 + LINK_SIZE;
1176 break;
1177 }
1178
1179 RRETURN(MATCH_NOMATCH);
1180 }
1181
1182 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1183 as a non-capturing bracket. */
1184
1185 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187
1188 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1189
1190 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1191 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1192
1193 /* Non-capturing possessive bracket with unlimited repeat. We come here
1194 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1195 without the capturing complication. It is written out separately for speed
1196 and cleanliness. */
1197
1198 case OP_BRAPOS:
1199 case OP_SBRAPOS:
1200 allow_zero = FALSE;
1201
1202 POSSESSIVE_NON_CAPTURE:
1203 matched_once = FALSE;
1204 code_offset = (int)(ecode - md->start_code);
1205
1206 for (;;)
1207 {
1208 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1209 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1210 eptrb, RM48);
1211 if (rrc == MATCH_KETRPOS)
1212 {
1213 offset_top = md->end_offset_top;
1214 eptr = md->end_match_ptr;
1215 ecode = md->start_code + code_offset;
1216 matched_once = TRUE;
1217 continue;
1218 }
1219
1220 /* See comment in the code for capturing groups above about handling
1221 THEN. */
1222
1223 if (rrc == MATCH_THEN)
1224 {
1225 next = ecode + GET(ecode,1);
1226 if (md->start_match_ptr < next &&
1227 (*ecode == OP_ALT || *next == OP_ALT))
1228 rrc = MATCH_NOMATCH;
1229 }
1230
1231 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1232 ecode += GET(ecode, 1);
1233 if (*ecode != OP_ALT) break;
1234 }
1235
1236 if (matched_once || allow_zero)
1237 {
1238 ecode += 1 + LINK_SIZE;
1239 break;
1240 }
1241 RRETURN(MATCH_NOMATCH);
1242
1243 /* Control never reaches here. */
1244
1245 /* Conditional group: compilation checked that there are no more than
1246 two branches. If the condition is false, skipping the first branch takes us
1247 past the end if there is only one branch, but that's OK because that is
1248 exactly what going to the ket would do. */
1249
1250 case OP_COND:
1251 case OP_SCOND:
1252 codelink = GET(ecode, 1);
1253
1254 /* Because of the way auto-callout works during compile, a callout item is
1255 inserted between OP_COND and an assertion condition. */
1256
1257 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1258 {
1259 if (PUBL(callout) != NULL)
1260 {
1261 PUBL(callout_block) cb;
1262 cb.version = 2; /* Version 2 of the callout block */
1263 cb.callout_number = ecode[LINK_SIZE+2];
1264 cb.offset_vector = md->offset_vector;
1265 #ifdef COMPILE_PCRE8
1266 cb.subject = (PCRE_SPTR)md->start_subject;
1267 #else
1268 cb.subject = (PCRE_SPTR16)md->start_subject;
1269 #endif
1270 cb.subject_length = (int)(md->end_subject - md->start_subject);
1271 cb.start_match = (int)(mstart - md->start_subject);
1272 cb.current_position = (int)(eptr - md->start_subject);
1273 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1274 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1275 cb.capture_top = offset_top/2;
1276 cb.capture_last = md->capture_last;
1277 cb.callout_data = md->callout_data;
1278 cb.mark = md->nomatch_mark;
1279 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1280 if (rrc < 0) RRETURN(rrc);
1281 }
1282 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1283 }
1284
1285 condcode = ecode[LINK_SIZE+1];
1286
1287 /* Now see what the actual condition is */
1288
1289 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1290 {
1291 if (md->recursive == NULL) /* Not recursing => FALSE */
1292 {
1293 condition = FALSE;
1294 ecode += GET(ecode, 1);
1295 }
1296 else
1297 {
1298 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1299 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1300
1301 /* If the test is for recursion into a specific subpattern, and it is
1302 false, but the test was set up by name, scan the table to see if the
1303 name refers to any other numbers, and test them. The condition is true
1304 if any one is set. */
1305
1306 if (!condition && condcode == OP_NRREF)
1307 {
1308 pcre_uchar *slotA = md->name_table;
1309 for (i = 0; i < md->name_count; i++)
1310 {
1311 if (GET2(slotA, 0) == recno) break;
1312 slotA += md->name_entry_size;
1313 }
1314
1315 /* Found a name for the number - there can be only one; duplicate
1316 names for different numbers are allowed, but not vice versa. First
1317 scan down for duplicates. */
1318
1319 if (i < md->name_count)
1320 {
1321 pcre_uchar *slotB = slotA;
1322 while (slotB > md->name_table)
1323 {
1324 slotB -= md->name_entry_size;
1325 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1326 {
1327 condition = GET2(slotB, 0) == md->recursive->group_num;
1328 if (condition) break;
1329 }
1330 else break;
1331 }
1332
1333 /* Scan up for duplicates */
1334
1335 if (!condition)
1336 {
1337 slotB = slotA;
1338 for (i++; i < md->name_count; i++)
1339 {
1340 slotB += md->name_entry_size;
1341 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1342 {
1343 condition = GET2(slotB, 0) == md->recursive->group_num;
1344 if (condition) break;
1345 }
1346 else break;
1347 }
1348 }
1349 }
1350 }
1351
1352 /* Choose branch according to the condition */
1353
1354 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1355 }
1356 }
1357
1358 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1359 {
1360 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1361 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1362
1363 /* If the numbered capture is unset, but the reference was by name,
1364 scan the table to see if the name refers to any other numbers, and test
1365 them. The condition is true if any one is set. This is tediously similar
1366 to the code above, but not close enough to try to amalgamate. */
1367
1368 if (!condition && condcode == OP_NCREF)
1369 {
1370 int refno = offset >> 1;
1371 pcre_uchar *slotA = md->name_table;
1372
1373 for (i = 0; i < md->name_count; i++)
1374 {
1375 if (GET2(slotA, 0) == refno) break;
1376 slotA += md->name_entry_size;
1377 }
1378
1379 /* Found a name for the number - there can be only one; duplicate names
1380 for different numbers are allowed, but not vice versa. First scan down
1381 for duplicates. */
1382
1383 if (i < md->name_count)
1384 {
1385 pcre_uchar *slotB = slotA;
1386 while (slotB > md->name_table)
1387 {
1388 slotB -= md->name_entry_size;
1389 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1390 {
1391 offset = GET2(slotB, 0) << 1;
1392 condition = offset < offset_top &&
1393 md->offset_vector[offset] >= 0;
1394 if (condition) break;
1395 }
1396 else break;
1397 }
1398
1399 /* Scan up for duplicates */
1400
1401 if (!condition)
1402 {
1403 slotB = slotA;
1404 for (i++; i < md->name_count; i++)
1405 {
1406 slotB += md->name_entry_size;
1407 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1408 {
1409 offset = GET2(slotB, 0) << 1;
1410 condition = offset < offset_top &&
1411 md->offset_vector[offset] >= 0;
1412 if (condition) break;
1413 }
1414 else break;
1415 }
1416 }
1417 }
1418 }
1419
1420 /* Choose branch according to the condition */
1421
1422 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1423 }
1424
1425 else if (condcode == OP_DEF) /* DEFINE - always false */
1426 {
1427 condition = FALSE;
1428 ecode += GET(ecode, 1);
1429 }
1430
1431 /* The condition is an assertion. Call match() to evaluate it - setting
1432 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1433 an assertion. */
1434
1435 else
1436 {
1437 md->match_function_type = MATCH_CONDASSERT;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1439 if (rrc == MATCH_MATCH)
1440 {
1441 if (md->end_offset_top > offset_top)
1442 offset_top = md->end_offset_top; /* Captures may have happened */
1443 condition = TRUE;
1444 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1445 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1446 }
1447
1448 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1449 assertion; it is therefore treated as NOMATCH. */
1450
1451 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1452 {
1453 RRETURN(rrc); /* Need braces because of following else */
1454 }
1455 else
1456 {
1457 condition = FALSE;
1458 ecode += codelink;
1459 }
1460 }
1461
1462 /* We are now at the branch that is to be obeyed. As there is only one, can
1463 use tail recursion to avoid using another stack frame, except when there is
1464 unlimited repeat of a possibly empty group. In the latter case, a recursive
1465 call to match() is always required, unless the second alternative doesn't
1466 exist, in which case we can just plough on. Note that, for compatibility
1467 with Perl, the | in a conditional group is NOT treated as creating two
1468 alternatives. If a THEN is encountered in the branch, it propagates out to
1469 the enclosing alternative (unless nested in a deeper set of alternatives,
1470 of course). */
1471
1472 if (condition || *ecode == OP_ALT)
1473 {
1474 if (op != OP_SCOND)
1475 {
1476 ecode += 1 + LINK_SIZE;
1477 goto TAIL_RECURSE;
1478 }
1479
1480 md->match_function_type = MATCH_CBEGROUP;
1481 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1482 RRETURN(rrc);
1483 }
1484
1485 /* Condition false & no alternative; continue after the group. */
1486
1487 else
1488 {
1489 ecode += 1 + LINK_SIZE;
1490 }
1491 break;
1492
1493
1494 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1495 to close any currently open capturing brackets. */
1496
1497 case OP_CLOSE:
1498 number = GET2(ecode, 1);
1499 offset = number << 1;
1500
1501 #ifdef PCRE_DEBUG
1502 printf("end bracket %d at *ACCEPT", number);
1503 printf("\n");
1504 #endif
1505
1506 md->capture_last = number;
1507 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1508 {
1509 md->offset_vector[offset] =
1510 md->offset_vector[md->offset_end - number];
1511 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1512 if (offset_top <= offset) offset_top = offset + 2;
1513 }
1514 ecode += 1 + IMM2_SIZE;
1515 break;
1516
1517
1518 /* End of the pattern, either real or forced. */
1519
1520 case OP_END:
1521 case OP_ACCEPT:
1522 case OP_ASSERT_ACCEPT:
1523
1524 /* If we have matched an empty string, fail if not in an assertion and not
1525 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1526 is set and we have matched at the start of the subject. In both cases,
1527 backtracking will then try other alternatives, if any. */
1528
1529 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1530 md->recursive == NULL &&
1531 (md->notempty ||
1532 (md->notempty_atstart &&
1533 mstart == md->start_subject + md->start_offset)))
1534 RRETURN(MATCH_NOMATCH);
1535
1536 /* Otherwise, we have a match. */
1537
1538 md->end_match_ptr = eptr; /* Record where we ended */
1539 md->end_offset_top = offset_top; /* and how many extracts were taken */
1540 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1541
1542 /* For some reason, the macros don't work properly if an expression is
1543 given as the argument to RRETURN when the heap is in use. */
1544
1545 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1546 RRETURN(rrc);
1547
1548 /* Assertion brackets. Check the alternative branches in turn - the
1549 matching won't pass the KET for an assertion. If any one branch matches,
1550 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1551 start of each branch to move the current point backwards, so the code at
1552 this level is identical to the lookahead case. When the assertion is part
1553 of a condition, we want to return immediately afterwards. The caller of
1554 this incarnation of the match() function will have set MATCH_CONDASSERT in
1555 md->match_function_type, and one of these opcodes will be the first opcode
1556 that is processed. We use a local variable that is preserved over calls to
1557 match() to remember this case. */
1558
1559 case OP_ASSERT:
1560 case OP_ASSERTBACK:
1561 save_mark = md->mark;
1562 if (md->match_function_type == MATCH_CONDASSERT)
1563 {
1564 condassert = TRUE;
1565 md->match_function_type = 0;
1566 }
1567 else condassert = FALSE;
1568
1569 do
1570 {
1571 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1572 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1573 {
1574 mstart = md->start_match_ptr; /* In case \K reset it */
1575 break;
1576 }
1577 md->mark = save_mark;
1578
1579 /* A COMMIT failure must fail the entire assertion, without trying any
1580 subsequent branches. */
1581
1582 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1583
1584 /* PCRE does not allow THEN to escape beyond an assertion; it
1585 is treated as NOMATCH. */
1586
1587 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1588 ecode += GET(ecode, 1);
1589 }
1590 while (*ecode == OP_ALT);
1591
1592 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1593
1594 /* If checking an assertion for a condition, return MATCH_MATCH. */
1595
1596 if (condassert) RRETURN(MATCH_MATCH);
1597
1598 /* Continue from after the assertion, updating the offsets high water
1599 mark, since extracts may have been taken during the assertion. */
1600
1601 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1602 ecode += 1 + LINK_SIZE;
1603 offset_top = md->end_offset_top;
1604 continue;
1605
1606 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1607 PRUNE, or COMMIT means we must assume failure without checking subsequent
1608 branches. */
1609
1610 case OP_ASSERT_NOT:
1611 case OP_ASSERTBACK_NOT:
1612 save_mark = md->mark;
1613 if (md->match_function_type == MATCH_CONDASSERT)
1614 {
1615 condassert = TRUE;
1616 md->match_function_type = 0;
1617 }
1618 else condassert = FALSE;
1619
1620 do
1621 {
1622 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1623 md->mark = save_mark;
1624 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1625 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1626 {
1627 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1628 break;
1629 }
1630
1631 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1632 as NOMATCH. */
1633
1634 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1635 ecode += GET(ecode,1);
1636 }
1637 while (*ecode == OP_ALT);
1638
1639 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1640
1641 ecode += 1 + LINK_SIZE;
1642 continue;
1643
1644 /* Move the subject pointer back. This occurs only at the start of
1645 each branch of a lookbehind assertion. If we are too close to the start to
1646 move back, this match function fails. When working with UTF-8 we move
1647 back a number of characters, not bytes. */
1648
1649 case OP_REVERSE:
1650 #ifdef SUPPORT_UTF
1651 if (utf)
1652 {
1653 i = GET(ecode, 1);
1654 while (i-- > 0)
1655 {
1656 eptr--;
1657 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1658 BACKCHAR(eptr);
1659 }
1660 }
1661 else
1662 #endif
1663
1664 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1665
1666 {
1667 eptr -= GET(ecode, 1);
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 }
1670
1671 /* Save the earliest consulted character, then skip to next op code */
1672
1673 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1674 ecode += 1 + LINK_SIZE;
1675 break;
1676
1677 /* The callout item calls an external function, if one is provided, passing
1678 details of the match so far. This is mainly for debugging, though the
1679 function is able to force a failure. */
1680
1681 case OP_CALLOUT:
1682 if (PUBL(callout) != NULL)
1683 {
1684 PUBL(callout_block) cb;
1685 cb.version = 2; /* Version 2 of the callout block */
1686 cb.callout_number = ecode[1];
1687 cb.offset_vector = md->offset_vector;
1688 #ifdef COMPILE_PCRE8
1689 cb.subject = (PCRE_SPTR)md->start_subject;
1690 #else
1691 cb.subject = (PCRE_SPTR16)md->start_subject;
1692 #endif
1693 cb.subject_length = (int)(md->end_subject - md->start_subject);
1694 cb.start_match = (int)(mstart - md->start_subject);
1695 cb.current_position = (int)(eptr - md->start_subject);
1696 cb.pattern_position = GET(ecode, 2);
1697 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1698 cb.capture_top = offset_top/2;
1699 cb.capture_last = md->capture_last;
1700 cb.callout_data = md->callout_data;
1701 cb.mark = md->nomatch_mark;
1702 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1703 if (rrc < 0) RRETURN(rrc);
1704 }
1705 ecode += 2 + 2*LINK_SIZE;
1706 break;
1707
1708 /* Recursion either matches the current regex, or some subexpression. The
1709 offset data is the offset to the starting bracket from the start of the
1710 whole pattern. (This is so that it works from duplicated subpatterns.)
1711
1712 The state of the capturing groups is preserved over recursion, and
1713 re-instated afterwards. We don't know how many are started and not yet
1714 finished (offset_top records the completed total) so we just have to save
1715 all the potential data. There may be up to 65535 such values, which is too
1716 large to put on the stack, but using malloc for small numbers seems
1717 expensive. As a compromise, the stack is used when there are no more than
1718 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1719
1720 There are also other values that have to be saved. We use a chained
1721 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1722 for the original version of this logic. It has, however, been hacked around
1723 a lot, so he is not to blame for the current way it works. */
1724
1725 case OP_RECURSE:
1726 {
1727 recursion_info *ri;
1728 int recno;
1729
1730 callpat = md->start_code + GET(ecode, 1);
1731 recno = (callpat == md->start_code)? 0 :
1732 GET2(callpat, 1 + LINK_SIZE);
1733
1734 /* Check for repeating a recursion without advancing the subject pointer.
1735 This should catch convoluted mutual recursions. (Some simple cases are
1736 caught at compile time.) */
1737
1738 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1739 if (recno == ri->group_num && eptr == ri->subject_position)
1740 RRETURN(PCRE_ERROR_RECURSELOOP);
1741
1742 /* Add to "recursing stack" */
1743
1744 new_recursive.group_num = recno;
1745 new_recursive.subject_position = eptr;
1746 new_recursive.prevrec = md->recursive;
1747 md->recursive = &new_recursive;
1748
1749 /* Where to continue from afterwards */
1750
1751 ecode += 1 + LINK_SIZE;
1752
1753 /* Now save the offset data */
1754
1755 new_recursive.saved_max = md->offset_end;
1756 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1757 new_recursive.offset_save = stacksave;
1758 else
1759 {
1760 new_recursive.offset_save =
1761 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1762 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1763 }
1764 memcpy(new_recursive.offset_save, md->offset_vector,
1765 new_recursive.saved_max * sizeof(int));
1766
1767 /* OK, now we can do the recursion. After processing each alternative,
1768 restore the offset data. If there were nested recursions, md->recursive
1769 might be changed, so reset it before looping. */
1770
1771 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1772 cbegroup = (*callpat >= OP_SBRA);
1773 do
1774 {
1775 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1776 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1777 md, eptrb, RM6);
1778 memcpy(md->offset_vector, new_recursive.offset_save,
1779 new_recursive.saved_max * sizeof(int));
1780 md->recursive = new_recursive.prevrec;
1781 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1782 {
1783 DPRINTF(("Recursion matched\n"));
1784 if (new_recursive.offset_save != stacksave)
1785 (PUBL(free))(new_recursive.offset_save);
1786
1787 /* Set where we got to in the subject, and reset the start in case
1788 it was changed by \K. This *is* propagated back out of a recursion,
1789 for Perl compatibility. */
1790
1791 eptr = md->end_match_ptr;
1792 mstart = md->start_match_ptr;
1793 goto RECURSION_MATCHED; /* Exit loop; end processing */
1794 }
1795
1796 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1797 is treated as NOMATCH. */
1798
1799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1800 rrc != MATCH_COMMIT)
1801 {
1802 DPRINTF(("Recursion gave error %d\n", rrc));
1803 if (new_recursive.offset_save != stacksave)
1804 (PUBL(free))(new_recursive.offset_save);
1805 RRETURN(rrc);
1806 }
1807
1808 md->recursive = &new_recursive;
1809 callpat += GET(callpat, 1);
1810 }
1811 while (*callpat == OP_ALT);
1812
1813 DPRINTF(("Recursion didn't match\n"));
1814 md->recursive = new_recursive.prevrec;
1815 if (new_recursive.offset_save != stacksave)
1816 (PUBL(free))(new_recursive.offset_save);
1817 RRETURN(MATCH_NOMATCH);
1818 }
1819
1820 RECURSION_MATCHED:
1821 break;
1822
1823 /* An alternation is the end of a branch; scan along to find the end of the
1824 bracketed group and go to there. */
1825
1826 case OP_ALT:
1827 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1828 break;
1829
1830 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1831 indicating that it may occur zero times. It may repeat infinitely, or not
1832 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1833 with fixed upper repeat limits are compiled as a number of copies, with the
1834 optional ones preceded by BRAZERO or BRAMINZERO. */
1835
1836 case OP_BRAZERO:
1837 next = ecode + 1;
1838 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840 do next += GET(next, 1); while (*next == OP_ALT);
1841 ecode = next + 1 + LINK_SIZE;
1842 break;
1843
1844 case OP_BRAMINZERO:
1845 next = ecode + 1;
1846 do next += GET(next, 1); while (*next == OP_ALT);
1847 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1849 ecode++;
1850 break;
1851
1852 case OP_SKIPZERO:
1853 next = ecode+1;
1854 do next += GET(next,1); while (*next == OP_ALT);
1855 ecode = next + 1 + LINK_SIZE;
1856 break;
1857
1858 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1859 here; just jump to the group, with allow_zero set TRUE. */
1860
1861 case OP_BRAPOSZERO:
1862 op = *(++ecode);
1863 allow_zero = TRUE;
1864 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1865 goto POSSESSIVE_NON_CAPTURE;
1866
1867 /* End of a group, repeated or non-repeating. */
1868
1869 case OP_KET:
1870 case OP_KETRMIN:
1871 case OP_KETRMAX:
1872 case OP_KETRPOS:
1873 prev = ecode - GET(ecode, 1);
1874
1875 /* If this was a group that remembered the subject start, in order to break
1876 infinite repeats of empty string matches, retrieve the subject start from
1877 the chain. Otherwise, set it NULL. */
1878
1879 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1880 {
1881 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1882 eptrb = eptrb->epb_prev; /* Backup to previous group */
1883 }
1884 else saved_eptr = NULL;
1885
1886 /* If we are at the end of an assertion group or a non-capturing atomic
1887 group, stop matching and return MATCH_MATCH, but record the current high
1888 water mark for use by positive assertions. We also need to record the match
1889 start in case it was changed by \K. */
1890
1891 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1892 *prev == OP_ONCE_NC)
1893 {
1894 md->end_match_ptr = eptr; /* For ONCE_NC */
1895 md->end_offset_top = offset_top;
1896 md->start_match_ptr = mstart;
1897 RRETURN(MATCH_MATCH); /* Sets md->mark */
1898 }
1899
1900 /* For capturing groups we have to check the group number back at the start
1901 and if necessary complete handling an extraction by setting the offsets and
1902 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1903 into group 0, so it won't be picked up here. Instead, we catch it when the
1904 OP_END is reached. Other recursion is handled here. We just have to record
1905 the current subject position and start match pointer and give a MATCH
1906 return. */
1907
1908 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1909 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1910 {
1911 number = GET2(prev, 1+LINK_SIZE);
1912 offset = number << 1;
1913
1914 #ifdef PCRE_DEBUG
1915 printf("end bracket %d", number);
1916 printf("\n");
1917 #endif
1918
1919 /* Handle a recursively called group. */
1920
1921 if (md->recursive != NULL && md->recursive->group_num == number)
1922 {
1923 md->end_match_ptr = eptr;
1924 md->start_match_ptr = mstart;
1925 RRETURN(MATCH_MATCH);
1926 }
1927
1928 /* Deal with capturing */
1929
1930 md->capture_last = number;
1931 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1932 {
1933 /* If offset is greater than offset_top, it means that we are
1934 "skipping" a capturing group, and that group's offsets must be marked
1935 unset. In earlier versions of PCRE, all the offsets were unset at the
1936 start of matching, but this doesn't work because atomic groups and
1937 assertions can cause a value to be set that should later be unset.
1938 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1939 part of the atomic group, but this is not on the final matching path,
1940 so must be unset when 2 is set. (If there is no group 2, there is no
1941 problem, because offset_top will then be 2, indicating no capture.) */
1942
1943 if (offset > offset_top)
1944 {
1945 register int *iptr = md->offset_vector + offset_top;
1946 register int *iend = md->offset_vector + offset;
1947 while (iptr < iend) *iptr++ = -1;
1948 }
1949
1950 /* Now make the extraction */
1951
1952 md->offset_vector[offset] =
1953 md->offset_vector[md->offset_end - number];
1954 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1955 if (offset_top <= offset) offset_top = offset + 2;
1956 }
1957 }
1958
1959 /* For an ordinary non-repeating ket, just continue at this level. This
1960 also happens for a repeating ket if no characters were matched in the
1961 group. This is the forcible breaking of infinite loops as implemented in
1962 Perl 5.005. For a non-repeating atomic group that includes captures,
1963 establish a backup point by processing the rest of the pattern at a lower
1964 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1965 original OP_ONCE level, thereby bypassing intermediate backup points, but
1966 resetting any captures that happened along the way. */
1967
1968 if (*ecode == OP_KET || eptr == saved_eptr)
1969 {
1970 if (*prev == OP_ONCE)
1971 {
1972 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1975 RRETURN(MATCH_ONCE);
1976 }
1977 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1978 break;
1979 }
1980
1981 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1982 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1983 at a time from the outer level, thus saving stack. */
1984
1985 if (*ecode == OP_KETRPOS)
1986 {
1987 md->end_match_ptr = eptr;
1988 md->end_offset_top = offset_top;
1989 RRETURN(MATCH_KETRPOS);
1990 }
1991
1992 /* The normal repeating kets try the rest of the pattern or restart from
1993 the preceding bracket, in the appropriate order. In the second case, we can
1994 use tail recursion to avoid using another stack frame, unless we have an
1995 atomic group or an unlimited repeat of a group that can match an empty
1996 string. */
1997
1998 if (*ecode == OP_KETRMIN)
1999 {
2000 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2002 if (*prev == OP_ONCE)
2003 {
2004 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2006 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2007 RRETURN(MATCH_ONCE);
2008 }
2009 if (*prev >= OP_SBRA) /* Could match an empty string */
2010 {
2011 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2012 RRETURN(rrc);
2013 }
2014 ecode = prev;
2015 goto TAIL_RECURSE;
2016 }
2017 else /* OP_KETRMAX */
2018 {
2019 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2020 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (*prev == OP_ONCE)
2023 {
2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev;
2027 RRETURN(MATCH_ONCE);
2028 }
2029 ecode += 1 + LINK_SIZE;
2030 goto TAIL_RECURSE;
2031 }
2032 /* Control never gets here */
2033
2034 /* Not multiline mode: start of subject assertion, unless notbol. */
2035
2036 case OP_CIRC:
2037 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2038
2039 /* Start of subject assertion */
2040
2041 case OP_SOD:
2042 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2043 ecode++;
2044 break;
2045
2046 /* Multiline mode: start of subject unless notbol, or after any newline. */
2047
2048 case OP_CIRCM:
2049 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2050 if (eptr != md->start_subject &&
2051 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2052 RRETURN(MATCH_NOMATCH);
2053 ecode++;
2054 break;
2055
2056 /* Start of match assertion */
2057
2058 case OP_SOM:
2059 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2060 ecode++;
2061 break;
2062
2063 /* Reset the start of match point */
2064
2065 case OP_SET_SOM:
2066 mstart = eptr;
2067 ecode++;
2068 break;
2069
2070 /* Multiline mode: assert before any newline, or before end of subject
2071 unless noteol is set. */
2072
2073 case OP_DOLLM:
2074 if (eptr < md->end_subject)
2075 {
2076 if (!IS_NEWLINE(eptr))
2077 {
2078 if (md->partial != 0 &&
2079 eptr + 1 >= md->end_subject &&
2080 NLBLOCK->nltype == NLTYPE_FIXED &&
2081 NLBLOCK->nllen == 2 &&
2082 *eptr == NLBLOCK->nl[0])
2083 {
2084 md->hitend = TRUE;
2085 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2086 }
2087 RRETURN(MATCH_NOMATCH);
2088 }
2089 }
2090 else
2091 {
2092 if (md->noteol) RRETURN(MATCH_NOMATCH);
2093 SCHECK_PARTIAL();
2094 }
2095 ecode++;
2096 break;
2097
2098 /* Not multiline mode: assert before a terminating newline or before end of
2099 subject unless noteol is set. */
2100
2101 case OP_DOLL:
2102 if (md->noteol) RRETURN(MATCH_NOMATCH);
2103 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2104
2105 /* ... else fall through for endonly */
2106
2107 /* End of subject assertion (\z) */
2108
2109 case OP_EOD:
2110 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2111 SCHECK_PARTIAL();
2112 ecode++;
2113 break;
2114
2115 /* End of subject or ending \n assertion (\Z) */
2116
2117 case OP_EODN:
2118 ASSERT_NL_OR_EOS:
2119 if (eptr < md->end_subject &&
2120 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2121 {
2122 if (md->partial != 0 &&
2123 eptr + 1 >= md->end_subject &&
2124 NLBLOCK->nltype == NLTYPE_FIXED &&
2125 NLBLOCK->nllen == 2 &&
2126 *eptr == NLBLOCK->nl[0])
2127 {
2128 md->hitend = TRUE;
2129 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2130 }
2131 RRETURN(MATCH_NOMATCH);
2132 }
2133
2134 /* Either at end of string or \n before end. */
2135
2136 SCHECK_PARTIAL();
2137 ecode++;
2138 break;
2139
2140 /* Word boundary assertions */
2141
2142 case OP_NOT_WORD_BOUNDARY:
2143 case OP_WORD_BOUNDARY:
2144 {
2145
2146 /* Find out if the previous and current characters are "word" characters.
2147 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2148 be "non-word" characters. Remember the earliest consulted character for
2149 partial matching. */
2150
2151 #ifdef SUPPORT_UTF
2152 if (utf)
2153 {
2154 /* Get status of previous character */
2155
2156 if (eptr == md->start_subject) prev_is_word = FALSE; else
2157 {
2158 PCRE_PUCHAR lastptr = eptr - 1;
2159 BACKCHAR(lastptr);
2160 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2161 GETCHAR(c, lastptr);
2162 #ifdef SUPPORT_UCP
2163 if (md->use_ucp)
2164 {
2165 if (c == '_') prev_is_word = TRUE; else
2166 {
2167 int cat = UCD_CATEGORY(c);
2168 prev_is_word = (cat == ucp_L || cat == ucp_N);
2169 }
2170 }
2171 else
2172 #endif
2173 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2174 }
2175
2176 /* Get status of next character */
2177
2178 if (eptr >= md->end_subject)
2179 {
2180 SCHECK_PARTIAL();
2181 cur_is_word = FALSE;
2182 }
2183 else
2184 {
2185 GETCHAR(c, eptr);
2186 #ifdef SUPPORT_UCP
2187 if (md->use_ucp)
2188 {
2189 if (c == '_') cur_is_word = TRUE; else
2190 {
2191 int cat = UCD_CATEGORY(c);
2192 cur_is_word = (cat == ucp_L || cat == ucp_N);
2193 }
2194 }
2195 else
2196 #endif
2197 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2198 }
2199 }
2200 else
2201 #endif
2202
2203 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2204 consistency with the behaviour of \w we do use it in this case. */
2205
2206 {
2207 /* Get status of previous character */
2208
2209 if (eptr == md->start_subject) prev_is_word = FALSE; else
2210 {
2211 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2212 #ifdef SUPPORT_UCP
2213 if (md->use_ucp)
2214 {
2215 c = eptr[-1];
2216 if (c == '_') prev_is_word = TRUE; else
2217 {
2218 int cat = UCD_CATEGORY(c);
2219 prev_is_word = (cat == ucp_L || cat == ucp_N);
2220 }
2221 }
2222 else
2223 #endif
2224 prev_is_word = MAX_255(eptr[-1])
2225 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2226 }
2227
2228 /* Get status of next character */
2229
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 cur_is_word = FALSE;
2234 }
2235 else
2236 #ifdef SUPPORT_UCP
2237 if (md->use_ucp)
2238 {
2239 c = *eptr;
2240 if (c == '_') cur_is_word = TRUE; else
2241 {
2242 int cat = UCD_CATEGORY(c);
2243 cur_is_word = (cat == ucp_L || cat == ucp_N);
2244 }
2245 }
2246 else
2247 #endif
2248 cur_is_word = MAX_255(*eptr)
2249 && ((md->ctypes[*eptr] & ctype_word) != 0);
2250 }
2251
2252 /* Now see if the situation is what we want */
2253
2254 if ((*ecode++ == OP_WORD_BOUNDARY)?
2255 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2256 RRETURN(MATCH_NOMATCH);
2257 }
2258 break;
2259
2260 /* Match any single character type except newline; have to take care with
2261 CRLF newlines and partial matching. */
2262
2263 case OP_ANY:
2264 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2265 if (md->partial != 0 &&
2266 eptr + 1 >= md->end_subject &&
2267 NLBLOCK->nltype == NLTYPE_FIXED &&
2268 NLBLOCK->nllen == 2 &&
2269 *eptr == NLBLOCK->nl[0])
2270 {
2271 md->hitend = TRUE;
2272 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2273 }
2274
2275 /* Fall through */
2276
2277 /* Match any single character whatsoever. */
2278
2279 case OP_ALLANY:
2280 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2281 { /* not be updated before SCHECK_PARTIAL. */
2282 SCHECK_PARTIAL();
2283 RRETURN(MATCH_NOMATCH);
2284 }
2285 eptr++;
2286 #ifdef SUPPORT_UTF
2287 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2288 #endif
2289 ecode++;
2290 break;
2291
2292 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2293 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2294
2295 case OP_ANYBYTE:
2296 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2297 { /* not be updated before SCHECK_PARTIAL. */
2298 SCHECK_PARTIAL();
2299 RRETURN(MATCH_NOMATCH);
2300 }
2301 eptr++;
2302 ecode++;
2303 break;
2304
2305 case OP_NOT_DIGIT:
2306 if (eptr >= md->end_subject)
2307 {
2308 SCHECK_PARTIAL();
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 GETCHARINCTEST(c, eptr);
2312 if (
2313 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2314 c < 256 &&
2315 #endif
2316 (md->ctypes[c] & ctype_digit) != 0
2317 )
2318 RRETURN(MATCH_NOMATCH);
2319 ecode++;
2320 break;
2321
2322 case OP_DIGIT:
2323 if (eptr >= md->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 RRETURN(MATCH_NOMATCH);
2327 }
2328 GETCHARINCTEST(c, eptr);
2329 if (
2330 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2331 c > 255 ||
2332 #endif
2333 (md->ctypes[c] & ctype_digit) == 0
2334 )
2335 RRETURN(MATCH_NOMATCH);
2336 ecode++;
2337 break;
2338
2339 case OP_NOT_WHITESPACE:
2340 if (eptr >= md->end_subject)
2341 {
2342 SCHECK_PARTIAL();
2343 RRETURN(MATCH_NOMATCH);
2344 }
2345 GETCHARINCTEST(c, eptr);
2346 if (
2347 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2348 c < 256 &&
2349 #endif
2350 (md->ctypes[c] & ctype_space) != 0
2351 )
2352 RRETURN(MATCH_NOMATCH);
2353 ecode++;
2354 break;
2355
2356 case OP_WHITESPACE:
2357 if (eptr >= md->end_subject)
2358 {
2359 SCHECK_PARTIAL();
2360 RRETURN(MATCH_NOMATCH);
2361 }
2362 GETCHARINCTEST(c, eptr);
2363 if (
2364 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2365 c > 255 ||
2366 #endif
2367 (md->ctypes[c] & ctype_space) == 0
2368 )
2369 RRETURN(MATCH_NOMATCH);
2370 ecode++;
2371 break;
2372
2373 case OP_NOT_WORDCHAR:
2374 if (eptr >= md->end_subject)
2375 {
2376 SCHECK_PARTIAL();
2377 RRETURN(MATCH_NOMATCH);
2378 }
2379 GETCHARINCTEST(c, eptr);
2380 if (
2381 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2382 c < 256 &&
2383 #endif
2384 (md->ctypes[c] & ctype_word) != 0
2385 )
2386 RRETURN(MATCH_NOMATCH);
2387 ecode++;
2388 break;
2389
2390 case OP_WORDCHAR:
2391 if (eptr >= md->end_subject)
2392 {
2393 SCHECK_PARTIAL();
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 GETCHARINCTEST(c, eptr);
2397 if (
2398 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2399 c > 255 ||
2400 #endif
2401 (md->ctypes[c] & ctype_word) == 0
2402 )
2403 RRETURN(MATCH_NOMATCH);
2404 ecode++;
2405 break;
2406
2407 case OP_ANYNL:
2408 if (eptr >= md->end_subject)
2409 {
2410 SCHECK_PARTIAL();
2411 RRETURN(MATCH_NOMATCH);
2412 }
2413 GETCHARINCTEST(c, eptr);
2414 switch(c)
2415 {
2416 default: RRETURN(MATCH_NOMATCH);
2417
2418 case CHAR_CR:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 }
2423 else if (*eptr == CHAR_LF) eptr++;
2424 break;
2425
2426 case CHAR_LF:
2427 break;
2428
2429 case CHAR_VT:
2430 case CHAR_FF:
2431 case CHAR_NEL:
2432 #ifndef EBCDIC
2433 case 0x2028:
2434 case 0x2029:
2435 #endif /* Not EBCDIC */
2436 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2437 break;
2438 }
2439 ecode++;
2440 break;
2441
2442 case OP_NOT_HSPACE:
2443 if (eptr >= md->end_subject)
2444 {
2445 SCHECK_PARTIAL();
2446 RRETURN(MATCH_NOMATCH);
2447 }
2448 GETCHARINCTEST(c, eptr);
2449 switch(c)
2450 {
2451 default: break;
2452 case CHAR_HT:
2453 case CHAR_SPACE:
2454 #ifndef EBCDIC
2455 case 0xa0: /* NBSP */
2456 case 0x1680: /* OGHAM SPACE MARK */
2457 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2458 case 0x2000: /* EN QUAD */
2459 case 0x2001: /* EM QUAD */
2460 case 0x2002: /* EN SPACE */
2461 case 0x2003: /* EM SPACE */
2462 case 0x2004: /* THREE-PER-EM SPACE */
2463 case 0x2005: /* FOUR-PER-EM SPACE */
2464 case 0x2006: /* SIX-PER-EM SPACE */
2465 case 0x2007: /* FIGURE SPACE */
2466 case 0x2008: /* PUNCTUATION SPACE */
2467 case 0x2009: /* THIN SPACE */
2468 case 0x200A: /* HAIR SPACE */
2469 case 0x202f: /* NARROW NO-BREAK SPACE */
2470 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2471 case 0x3000: /* IDEOGRAPHIC SPACE */
2472 #endif /* Not EBCDIC */
2473 RRETURN(MATCH_NOMATCH);
2474 }
2475 ecode++;
2476 break;
2477
2478 case OP_HSPACE:
2479 if (eptr >= md->end_subject)
2480 {
2481 SCHECK_PARTIAL();
2482 RRETURN(MATCH_NOMATCH);
2483 }
2484 GETCHARINCTEST(c, eptr);
2485 switch(c)
2486 {
2487 default: RRETURN(MATCH_NOMATCH);
2488 case CHAR_HT:
2489 case CHAR_SPACE:
2490 #ifndef EBCDIC
2491 case 0xa0: /* NBSP */
2492 case 0x1680: /* OGHAM SPACE MARK */
2493 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2494 case 0x2000: /* EN QUAD */
2495 case 0x2001: /* EM QUAD */
2496 case 0x2002: /* EN SPACE */
2497 case 0x2003: /* EM SPACE */
2498 case 0x2004: /* THREE-PER-EM SPACE */
2499 case 0x2005: /* FOUR-PER-EM SPACE */
2500 case 0x2006: /* SIX-PER-EM SPACE */
2501 case 0x2007: /* FIGURE SPACE */
2502 case 0x2008: /* PUNCTUATION SPACE */
2503 case 0x2009: /* THIN SPACE */
2504 case 0x200A: /* HAIR SPACE */
2505 case 0x202f: /* NARROW NO-BREAK SPACE */
2506 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2507 case 0x3000: /* IDEOGRAPHIC SPACE */
2508 #endif /* Not EBCDIC */
2509 break;
2510 }
2511 ecode++;
2512 break;
2513
2514 case OP_NOT_VSPACE:
2515 if (eptr >= md->end_subject)
2516 {
2517 SCHECK_PARTIAL();
2518 RRETURN(MATCH_NOMATCH);
2519 }
2520 GETCHARINCTEST(c, eptr);
2521 switch(c)
2522 {
2523 default: break;
2524 case CHAR_LF:
2525 case CHAR_VT:
2526 case CHAR_FF:
2527 case CHAR_CR:
2528 case CHAR_NEL:
2529 #ifndef EBCDIC
2530 case 0x2028: /* LINE SEPARATOR */
2531 case 0x2029: /* PARAGRAPH SEPARATOR */
2532 #endif /* Not EBCDIC */
2533 RRETURN(MATCH_NOMATCH);
2534 }
2535 ecode++;
2536 break;
2537
2538 case OP_VSPACE:
2539 if (eptr >= md->end_subject)
2540 {
2541 SCHECK_PARTIAL();
2542 RRETURN(MATCH_NOMATCH);
2543 }
2544 GETCHARINCTEST(c, eptr);
2545 switch(c)
2546 {
2547 default: RRETURN(MATCH_NOMATCH);
2548 case CHAR_LF:
2549 case CHAR_VT:
2550 case CHAR_FF:
2551 case CHAR_CR:
2552 case CHAR_NEL:
2553 #ifndef EBCDIC
2554 case 0x2028: /* LINE SEPARATOR */
2555 case 0x2029: /* PARAGRAPH SEPARATOR */
2556 #endif /* Not EBCDIC */
2557 break;
2558 }
2559 ecode++;
2560 break;
2561
2562 #ifdef SUPPORT_UCP
2563 /* Check the next character by Unicode property. We will get here only
2564 if the support is in the binary; otherwise a compile-time error occurs. */
2565
2566 case OP_PROP:
2567 case OP_NOTPROP:
2568 if (eptr >= md->end_subject)
2569 {
2570 SCHECK_PARTIAL();
2571 RRETURN(MATCH_NOMATCH);
2572 }
2573 GETCHARINCTEST(c, eptr);
2574 {
2575 const ucd_record *prop = GET_UCD(c);
2576
2577 switch(ecode[1])
2578 {
2579 case PT_ANY:
2580 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2581 break;
2582
2583 case PT_LAMP:
2584 if ((prop->chartype == ucp_Lu ||
2585 prop->chartype == ucp_Ll ||
2586 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2587 RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 case PT_GC:
2591 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2592 RRETURN(MATCH_NOMATCH);
2593 break;
2594
2595 case PT_PC:
2596 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2597 RRETURN(MATCH_NOMATCH);
2598 break;
2599
2600 case PT_SC:
2601 if ((ecode[2] != prop->script) == (op == OP_PROP))
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2604
2605 /* These are specials */
2606
2607 case PT_ALNUM:
2608 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2609 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2610 RRETURN(MATCH_NOMATCH);
2611 break;
2612
2613 case PT_SPACE: /* Perl space */
2614 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2615 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2616 == (op == OP_NOTPROP))
2617 RRETURN(MATCH_NOMATCH);
2618 break;
2619
2620 case PT_PXSPACE: /* POSIX space */
2621 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2622 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2623 c == CHAR_FF || c == CHAR_CR)
2624 == (op == OP_NOTPROP))
2625 RRETURN(MATCH_NOMATCH);
2626 break;
2627
2628 case PT_WORD:
2629 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2630 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2631 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2632 RRETURN(MATCH_NOMATCH);
2633 break;
2634
2635 /* This should never occur */
2636
2637 default:
2638 RRETURN(PCRE_ERROR_INTERNAL);
2639 }
2640
2641 ecode += 3;
2642 }
2643 break;
2644
2645 /* Match an extended Unicode sequence. We will get here only if the support
2646 is in the binary; otherwise a compile-time error occurs. */
2647
2648 case OP_EXTUNI:
2649 if (eptr >= md->end_subject)
2650 {
2651 SCHECK_PARTIAL();
2652 RRETURN(MATCH_NOMATCH);
2653 }
2654 else
2655 {
2656 int lgb, rgb;
2657 GETCHARINCTEST(c, eptr);
2658 lgb = UCD_GRAPHBREAK(c);
2659 while (eptr < md->end_subject)
2660 {
2661 int len = 1;
2662 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2663 rgb = UCD_GRAPHBREAK(c);
2664 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2665 lgb = rgb;
2666 eptr += len;
2667 }
2668 }
2669 CHECK_PARTIAL();
2670 ecode++;
2671 break;
2672 #endif
2673
2674
2675 /* Match a back reference, possibly repeatedly. Look past the end of the
2676 item to see if there is repeat information following. The code is similar
2677 to that for character classes, but repeated for efficiency. Then obey
2678 similar code to character type repeats - written out again for speed.
2679 However, if the referenced string is the empty string, always treat
2680 it as matched, any number of times (otherwise there could be infinite
2681 loops). */
2682
2683 case OP_REF:
2684 case OP_REFI:
2685 caseless = op == OP_REFI;
2686 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2687 ecode += 1 + IMM2_SIZE;
2688
2689 /* If the reference is unset, there are two possibilities:
2690
2691 (a) In the default, Perl-compatible state, set the length negative;
2692 this ensures that every attempt at a match fails. We can't just fail
2693 here, because of the possibility of quantifiers with zero minima.
2694
2695 (b) If the JavaScript compatibility flag is set, set the length to zero
2696 so that the back reference matches an empty string.
2697
2698 Otherwise, set the length to the length of what was matched by the
2699 referenced subpattern. */
2700
2701 if (offset >= offset_top || md->offset_vector[offset] < 0)
2702 length = (md->jscript_compat)? 0 : -1;
2703 else
2704 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2705
2706 /* Set up for repetition, or handle the non-repeated case */
2707
2708 switch (*ecode)
2709 {
2710 case OP_CRSTAR:
2711 case OP_CRMINSTAR:
2712 case OP_CRPLUS:
2713 case OP_CRMINPLUS:
2714 case OP_CRQUERY:
2715 case OP_CRMINQUERY:
2716 c = *ecode++ - OP_CRSTAR;
2717 minimize = (c & 1) != 0;
2718 min = rep_min[c]; /* Pick up values from tables; */
2719 max = rep_max[c]; /* zero for max => infinity */
2720 if (max == 0) max = INT_MAX;
2721 break;
2722
2723 case OP_CRRANGE:
2724 case OP_CRMINRANGE:
2725 minimize = (*ecode == OP_CRMINRANGE);
2726 min = GET2(ecode, 1);
2727 max = GET2(ecode, 1 + IMM2_SIZE);
2728 if (max == 0) max = INT_MAX;
2729 ecode += 1 + 2 * IMM2_SIZE;
2730 break;
2731
2732 default: /* No repeat follows */
2733 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2734 {
2735 if (length == -2) eptr = md->end_subject; /* Partial match */
2736 CHECK_PARTIAL();
2737 RRETURN(MATCH_NOMATCH);
2738 }
2739 eptr += length;
2740 continue; /* With the main loop */
2741 }
2742
2743 /* Handle repeated back references. If the length of the reference is
2744 zero, just continue with the main loop. If the length is negative, it
2745 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2746 zero, we can continue at the same level without recursion. For any other
2747 minimum, carrying on will result in NOMATCH. */
2748
2749 if (length == 0) continue;
2750 if (length < 0 && min == 0) continue;
2751
2752 /* First, ensure the minimum number of matches are present. We get back
2753 the length of the reference string explicitly rather than passing the
2754 address of eptr, so that eptr can be a register variable. */
2755
2756 for (i = 1; i <= min; i++)
2757 {
2758 int slength;
2759 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2760 {
2761 if (slength == -2) eptr = md->end_subject; /* Partial match */
2762 CHECK_PARTIAL();
2763 RRETURN(MATCH_NOMATCH);
2764 }
2765 eptr += slength;
2766 }
2767
2768 /* If min = max, continue at the same level without recursion.
2769 They are not both allowed to be zero. */
2770
2771 if (min == max) continue;
2772
2773 /* If minimizing, keep trying and advancing the pointer */
2774
2775 if (minimize)
2776 {
2777 for (fi = min;; fi++)
2778 {
2779 int slength;
2780 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2781 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2782 if (fi >= max) RRETURN(MATCH_NOMATCH);
2783 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2784 {
2785 if (slength == -2) eptr = md->end_subject; /* Partial match */
2786 CHECK_PARTIAL();
2787 RRETURN(MATCH_NOMATCH);
2788 }
2789 eptr += slength;
2790 }
2791 /* Control never gets here */
2792 }
2793
2794 /* If maximizing, find the longest string and work backwards */
2795
2796 else
2797 {
2798 pp = eptr;
2799 for (i = min; i < max; i++)
2800 {
2801 int slength;
2802 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2803 {
2804 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2805 the soft partial matching case. */
2806
2807 if (slength == -2 && md->partial != 0 &&
2808 md->end_subject > md->start_used_ptr)
2809 {
2810 md->hitend = TRUE;
2811 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2812 }
2813 break;
2814 }
2815 eptr += slength;
2816 }
2817
2818 while (eptr >= pp)
2819 {
2820 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2821 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2822 eptr -= length;
2823 }
2824 RRETURN(MATCH_NOMATCH);
2825 }
2826 /* Control never gets here */
2827
2828 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2829 used when all the characters in the class have values in the range 0-255,
2830 and either the matching is caseful, or the characters are in the range
2831 0-127 when UTF-8 processing is enabled. The only difference between
2832 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2833 encountered.
2834
2835 First, look past the end of the item to see if there is repeat information
2836 following. Then obey similar code to character type repeats - written out
2837 again for speed. */
2838
2839 case OP_NCLASS:
2840 case OP_CLASS:
2841 {
2842 /* The data variable is saved across frames, so the byte map needs to
2843 be stored there. */
2844 #define BYTE_MAP ((pcre_uint8 *)data)
2845 data = ecode + 1; /* Save for matching */
2846 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2847
2848 switch (*ecode)
2849 {
2850 case OP_CRSTAR:
2851 case OP_CRMINSTAR:
2852 case OP_CRPLUS:
2853 case OP_CRMINPLUS:
2854 case OP_CRQUERY:
2855 case OP_CRMINQUERY:
2856 c = *ecode++ - OP_CRSTAR;
2857 minimize = (c & 1) != 0;
2858 min = rep_min[c]; /* Pick up values from tables; */
2859 max = rep_max[c]; /* zero for max => infinity */
2860 if (max == 0) max = INT_MAX;
2861 break;
2862
2863 case OP_CRRANGE:
2864 case OP_CRMINRANGE:
2865 minimize = (*ecode == OP_CRMINRANGE);
2866 min = GET2(ecode, 1);
2867 max = GET2(ecode, 1 + IMM2_SIZE);
2868 if (max == 0) max = INT_MAX;
2869 ecode += 1 + 2 * IMM2_SIZE;
2870 break;
2871
2872 default: /* No repeat follows */
2873 min = max = 1;
2874 break;
2875 }
2876
2877 /* First, ensure the minimum number of matches are present. */
2878
2879 #ifdef SUPPORT_UTF
2880 if (utf)
2881 {
2882 for (i = 1; i <= min; i++)
2883 {
2884 if (eptr >= md->end_subject)
2885 {
2886 SCHECK_PARTIAL();
2887 RRETURN(MATCH_NOMATCH);
2888 }
2889 GETCHARINC(c, eptr);
2890 if (c > 255)
2891 {
2892 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2893 }
2894 else
2895 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2896 }
2897 }
2898 else
2899 #endif
2900 /* Not UTF mode */
2901 {
2902 for (i = 1; i <= min; i++)
2903 {
2904 if (eptr >= md->end_subject)
2905 {
2906 SCHECK_PARTIAL();
2907 RRETURN(MATCH_NOMATCH);
2908 }
2909 c = *eptr++;
2910 #ifndef COMPILE_PCRE8
2911 if (c > 255)
2912 {
2913 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2914 }
2915 else
2916 #endif
2917 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2918 }
2919 }
2920
2921 /* If max == min we can continue with the main loop without the
2922 need to recurse. */
2923
2924 if (min == max) continue;
2925
2926 /* If minimizing, keep testing the rest of the expression and advancing
2927 the pointer while it matches the class. */
2928
2929 if (minimize)
2930 {
2931 #ifdef SUPPORT_UTF
2932 if (utf)
2933 {
2934 for (fi = min;; fi++)
2935 {
2936 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2938 if (fi >= max) RRETURN(MATCH_NOMATCH);
2939 if (eptr >= md->end_subject)
2940 {
2941 SCHECK_PARTIAL();
2942 RRETURN(MATCH_NOMATCH);
2943 }
2944 GETCHARINC(c, eptr);
2945 if (c > 255)
2946 {
2947 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2948 }
2949 else
2950 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2951 }
2952 }
2953 else
2954 #endif
2955 /* Not UTF mode */
2956 {
2957 for (fi = min;; fi++)
2958 {
2959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2961 if (fi >= max) RRETURN(MATCH_NOMATCH);
2962 if (eptr >= md->end_subject)
2963 {
2964 SCHECK_PARTIAL();
2965 RRETURN(MATCH_NOMATCH);
2966 }
2967 c = *eptr++;
2968 #ifndef COMPILE_PCRE8
2969 if (c > 255)
2970 {
2971 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2972 }
2973 else
2974 #endif
2975 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2976 }
2977 }
2978 /* Control never gets here */
2979 }
2980
2981 /* If maximizing, find the longest possible run, then work backwards. */
2982
2983 else
2984 {
2985 pp = eptr;
2986
2987 #ifdef SUPPORT_UTF
2988 if (utf)
2989 {
2990 for (i = min; i < max; i++)
2991 {
2992 int len = 1;
2993 if (eptr >= md->end_subject)
2994 {
2995 SCHECK_PARTIAL();
2996 break;
2997 }
2998 GETCHARLEN(c, eptr, len);
2999 if (c > 255)
3000 {
3001 if (op == OP_CLASS) break;
3002 }
3003 else
3004 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3005 eptr += len;
3006 }
3007 for (;;)
3008 {
3009 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3011 if (eptr-- == pp) break; /* Stop if tried at original pos */
3012 BACKCHAR(eptr);
3013 }
3014 }
3015 else
3016 #endif
3017 /* Not UTF mode */
3018 {
3019 for (i = min; i < max; i++)
3020 {
3021 if (eptr >= md->end_subject)
3022 {
3023 SCHECK_PARTIAL();
3024 break;
3025 }
3026 c = *eptr;
3027 #ifndef COMPILE_PCRE8
3028 if (c > 255)
3029 {
3030 if (op == OP_CLASS) break;
3031 }
3032 else
3033 #endif
3034 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3035 eptr++;
3036 }
3037 while (eptr >= pp)
3038 {
3039 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3041 eptr--;
3042 }
3043 }
3044
3045 RRETURN(MATCH_NOMATCH);
3046 }
3047 #undef BYTE_MAP
3048 }
3049 /* Control never gets here */
3050
3051
3052 /* Match an extended character class. This opcode is encountered only
3053 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3054 mode, because Unicode properties are supported in non-UTF-8 mode. */
3055
3056 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3057 case OP_XCLASS:
3058 {
3059 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3060 ecode += GET(ecode, 1); /* Advance past the item */
3061
3062 switch (*ecode)
3063 {
3064 case OP_CRSTAR:
3065 case OP_CRMINSTAR:
3066 case OP_CRPLUS:
3067 case OP_CRMINPLUS:
3068 case OP_CRQUERY:
3069 case OP_CRMINQUERY:
3070 c = *ecode++ - OP_CRSTAR;
3071 minimize = (c & 1) != 0;
3072 min = rep_min[c]; /* Pick up values from tables; */
3073 max = rep_max[c]; /* zero for max => infinity */
3074 if (max == 0) max = INT_MAX;
3075 break;
3076
3077 case OP_CRRANGE:
3078 case OP_CRMINRANGE:
3079 minimize = (*ecode == OP_CRMINRANGE);
3080 min = GET2(ecode, 1);
3081 max = GET2(ecode, 1 + IMM2_SIZE);
3082 if (max == 0) max = INT_MAX;
3083 ecode += 1 + 2 * IMM2_SIZE;
3084 break;
3085
3086 default: /* No repeat follows */
3087 min = max = 1;
3088 break;
3089 }
3090
3091 /* First, ensure the minimum number of matches are present. */
3092
3093 for (i = 1; i <= min; i++)
3094 {
3095 if (eptr >= md->end_subject)
3096 {
3097 SCHECK_PARTIAL();
3098 RRETURN(MATCH_NOMATCH);
3099 }
3100 GETCHARINCTEST(c, eptr);
3101 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3102 }
3103
3104 /* If max == min we can continue with the main loop without the
3105 need to recurse. */
3106
3107 if (min == max) continue;
3108
3109 /* If minimizing, keep testing the rest of the expression and advancing
3110 the pointer while it matches the class. */
3111
3112 if (minimize)
3113 {
3114 for (fi = min;; fi++)
3115 {
3116 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3118 if (fi >= max) RRETURN(MATCH_NOMATCH);
3119 if (eptr >= md->end_subject)
3120 {
3121 SCHECK_PARTIAL();
3122 RRETURN(MATCH_NOMATCH);
3123 }
3124 GETCHARINCTEST(c, eptr);
3125 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3126 }
3127 /* Control never gets here */
3128 }
3129
3130 /* If maximizing, find the longest possible run, then work backwards. */
3131
3132 else
3133 {
3134 pp = eptr;
3135 for (i = min; i < max; i++)
3136 {
3137 int len = 1;
3138 if (eptr >= md->end_subject)
3139 {
3140 SCHECK_PARTIAL();
3141 break;
3142 }
3143 #ifdef SUPPORT_UTF
3144 GETCHARLENTEST(c, eptr, len);
3145 #else
3146 c = *eptr;
3147 #endif
3148 if (!PRIV(xclass)(c, data, utf)) break;
3149 eptr += len;
3150 }
3151 for(;;)
3152 {
3153 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3154 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3155 if (eptr-- == pp) break; /* Stop if tried at original pos */
3156 #ifdef SUPPORT_UTF
3157 if (utf) BACKCHAR(eptr);
3158 #endif
3159 }
3160 RRETURN(MATCH_NOMATCH);
3161 }
3162
3163 /* Control never gets here */
3164 }
3165 #endif /* End of XCLASS */
3166
3167 /* Match a single character, casefully */
3168
3169 case OP_CHAR:
3170 #ifdef SUPPORT_UTF
3171 if (utf)
3172 {
3173 length = 1;
3174 ecode++;
3175 GETCHARLEN(fc, ecode, length);
3176 if (length > md->end_subject - eptr)
3177 {
3178 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3179 RRETURN(MATCH_NOMATCH);
3180 }
3181 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3182 }
3183 else
3184 #endif
3185 /* Not UTF mode */
3186 {
3187 if (md->end_subject - eptr < 1)
3188 {
3189 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3190 RRETURN(MATCH_NOMATCH);
3191 }
3192 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3193 ecode += 2;
3194 }
3195 break;
3196
3197 /* Match a single character, caselessly. If we are at the end of the
3198 subject, give up immediately. */
3199
3200 case OP_CHARI:
3201 if (eptr >= md->end_subject)
3202 {
3203 SCHECK_PARTIAL();
3204 RRETURN(MATCH_NOMATCH);
3205 }
3206
3207 #ifdef SUPPORT_UTF
3208 if (utf)
3209 {
3210 length = 1;
3211 ecode++;
3212 GETCHARLEN(fc, ecode, length);
3213
3214 /* If the pattern character's value is < 128, we have only one byte, and
3215 we know that its other case must also be one byte long, so we can use the
3216 fast lookup table. We know that there is at least one byte left in the
3217 subject. */
3218
3219 if (fc < 128)
3220 {
3221 if (md->lcc[fc]
3222 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3223 ecode++;
3224 eptr++;
3225 }
3226
3227 /* Otherwise we must pick up the subject character. Note that we cannot
3228 use the value of "length" to check for sufficient bytes left, because the
3229 other case of the character may have more or fewer bytes. */
3230
3231 else
3232 {
3233 unsigned int dc;
3234 GETCHARINC(dc, eptr);
3235 ecode += length;
3236
3237 /* If we have Unicode property support, we can use it to test the other
3238 case of the character, if there is one. */
3239
3240 if (fc != dc)
3241 {
3242 #ifdef SUPPORT_UCP
3243 if (dc != UCD_OTHERCASE(fc))
3244 #endif
3245 RRETURN(MATCH_NOMATCH);
3246 }
3247 }
3248 }
3249 else
3250 #endif /* SUPPORT_UTF */
3251
3252 /* Not UTF mode */
3253 {
3254 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3255 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3256 eptr++;
3257 ecode += 2;
3258 }
3259 break;
3260
3261 /* Match a single character repeatedly. */
3262
3263 case OP_EXACT:
3264 case OP_EXACTI:
3265 min = max = GET2(ecode, 1);
3266 ecode += 1 + IMM2_SIZE;
3267 goto REPEATCHAR;
3268
3269 case OP_POSUPTO:
3270 case OP_POSUPTOI:
3271 possessive = TRUE;
3272 /* Fall through */
3273
3274 case OP_UPTO:
3275 case OP_UPTOI:
3276 case OP_MINUPTO:
3277 case OP_MINUPTOI:
3278 min = 0;
3279 max = GET2(ecode, 1);
3280 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3281 ecode += 1 + IMM2_SIZE;
3282 goto REPEATCHAR;
3283
3284 case OP_POSSTAR:
3285 case OP_POSSTARI:
3286 possessive = TRUE;
3287 min = 0;
3288 max = INT_MAX;
3289 ecode++;
3290 goto REPEATCHAR;
3291
3292 case OP_POSPLUS:
3293 case OP_POSPLUSI:
3294 possessive = TRUE;
3295 min = 1;
3296 max = INT_MAX;
3297 ecode++;
3298 goto REPEATCHAR;
3299
3300 case OP_POSQUERY:
3301 case OP_POSQUERYI:
3302 possessive = TRUE;
3303 min = 0;
3304 max = 1;
3305 ecode++;
3306 goto REPEATCHAR;
3307
3308 case OP_STAR:
3309 case OP_STARI:
3310 case OP_MINSTAR:
3311 case OP_MINSTARI:
3312 case OP_PLUS:
3313 case OP_PLUSI:
3314 case OP_MINPLUS:
3315 case OP_MINPLUSI:
3316 case OP_QUERY:
3317 case OP_QUERYI:
3318 case OP_MINQUERY:
3319 case OP_MINQUERYI:
3320 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3321 minimize = (c & 1) != 0;
3322 min = rep_min[c]; /* Pick up values from tables; */
3323 max = rep_max[c]; /* zero for max => infinity */
3324 if (max == 0) max = INT_MAX;
3325
3326 /* Common code for all repeated single-character matches. */
3327
3328 REPEATCHAR:
3329 #ifdef SUPPORT_UTF
3330 if (utf)
3331 {
3332 length = 1;
3333 charptr = ecode;
3334 GETCHARLEN(fc, ecode, length);
3335 ecode += length;
3336
3337 /* Handle multibyte character matching specially here. There is
3338 support for caseless matching if UCP support is present. */
3339
3340 if (length > 1)
3341 {
3342 #ifdef SUPPORT_UCP
3343 unsigned int othercase;
3344 if (op >= OP_STARI && /* Caseless */
3345 (othercase = UCD_OTHERCASE(fc)) != fc)
3346 oclength = PRIV(ord2utf)(othercase, occhars);
3347 else oclength = 0;
3348 #endif /* SUPPORT_UCP */
3349
3350 for (i = 1; i <= min; i++)
3351 {
3352 if (eptr <= md->end_subject - length &&
3353 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3354 #ifdef SUPPORT_UCP
3355 else if (oclength > 0 &&
3356 eptr <= md->end_subject - oclength &&
3357 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3358 #endif /* SUPPORT_UCP */
3359 else
3360 {
3361 CHECK_PARTIAL();
3362 RRETURN(MATCH_NOMATCH);
3363 }
3364 }
3365
3366 if (min == max) continue;
3367
3368 if (minimize)
3369 {
3370 for (fi = min;; fi++)
3371 {
3372 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3374 if (fi >= max) RRETURN(MATCH_NOMATCH);
3375 if (eptr <= md->end_subject - length &&
3376 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3377 #ifdef SUPPORT_UCP
3378 else if (oclength > 0 &&
3379 eptr <= md->end_subject - oclength &&
3380 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3381 #endif /* SUPPORT_UCP */
3382 else
3383 {
3384 CHECK_PARTIAL();
3385 RRETURN(MATCH_NOMATCH);
3386 }
3387 }
3388 /* Control never gets here */
3389 }
3390
3391 else /* Maximize */
3392 {
3393 pp = eptr;
3394 for (i = min; i < max; i++)
3395 {
3396 if (eptr <= md->end_subject - length &&
3397 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3398 #ifdef SUPPORT_UCP
3399 else if (oclength > 0 &&
3400 eptr <= md->end_subject - oclength &&
3401 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3402 #endif /* SUPPORT_UCP */
3403 else
3404 {
3405 CHECK_PARTIAL();
3406 break;
3407 }
3408 }
3409
3410 if (possessive) continue;
3411
3412 for(;;)
3413 {
3414 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3417 #ifdef SUPPORT_UCP
3418 eptr--;
3419 BACKCHAR(eptr);
3420 #else /* without SUPPORT_UCP */
3421 eptr -= length;
3422 #endif /* SUPPORT_UCP */
3423 }
3424 }
3425 /* Control never gets here */
3426 }
3427
3428 /* If the length of a UTF-8 character is 1, we fall through here, and
3429 obey the code as for non-UTF-8 characters below, though in this case the
3430 value of fc will always be < 128. */
3431 }
3432 else
3433 #endif /* SUPPORT_UTF */
3434 /* When not in UTF-8 mode, load a single-byte character. */
3435 fc = *ecode++;
3436
3437 /* The value of fc at this point is always one character, though we may
3438 or may not be in UTF mode. The code is duplicated for the caseless and
3439 caseful cases, for speed, since matching characters is likely to be quite
3440 common. First, ensure the minimum number of matches are present. If min =
3441 max, continue at the same level without recursing. Otherwise, if
3442 minimizing, keep trying the rest of the expression and advancing one
3443 matching character if failing, up to the maximum. Alternatively, if
3444 maximizing, find the maximum number of characters and work backwards. */
3445
3446 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3447 max, (char *)eptr));
3448
3449 if (op >= OP_STARI) /* Caseless */
3450 {
3451 #ifdef COMPILE_PCRE8
3452 /* fc must be < 128 if UTF is enabled. */
3453 foc = md->fcc[fc];
3454 #else
3455 #ifdef SUPPORT_UTF
3456 #ifdef SUPPORT_UCP
3457 if (utf && fc > 127)
3458 foc = UCD_OTHERCASE(fc);
3459 #else
3460 if (utf && fc > 127)
3461 foc = fc;
3462 #endif /* SUPPORT_UCP */
3463 else
3464 #endif /* SUPPORT_UTF */
3465 foc = TABLE_GET(fc, md->fcc, fc);
3466 #endif /* COMPILE_PCRE8 */
3467
3468 for (i = 1; i <= min; i++)
3469 {
3470 if (eptr >= md->end_subject)
3471 {
3472 SCHECK_PARTIAL();
3473 RRETURN(MATCH_NOMATCH);
3474 }
3475 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3476 eptr++;
3477 }
3478 if (min == max) continue;
3479 if (minimize)
3480 {
3481 for (fi = min;; fi++)
3482 {
3483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3485 if (fi >= max) RRETURN(MATCH_NOMATCH);
3486 if (eptr >= md->end_subject)
3487 {
3488 SCHECK_PARTIAL();
3489 RRETURN(MATCH_NOMATCH);
3490 }
3491 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3492 eptr++;
3493 }
3494 /* Control never gets here */
3495 }
3496 else /* Maximize */
3497 {
3498 pp = eptr;
3499 for (i = min; i < max; i++)
3500 {
3501 if (eptr >= md->end_subject)
3502 {
3503 SCHECK_PARTIAL();
3504 break;
3505 }
3506 if (fc != *eptr && foc != *eptr) break;
3507 eptr++;
3508 }
3509
3510 if (possessive) continue;
3511
3512 while (eptr >= pp)
3513 {
3514 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3515 eptr--;
3516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3517 }
3518 RRETURN(MATCH_NOMATCH);
3519 }
3520 /* Control never gets here */
3521 }
3522
3523 /* Caseful comparisons (includes all multi-byte characters) */
3524
3525 else
3526 {
3527 for (i = 1; i <= min; i++)
3528 {
3529 if (eptr >= md->end_subject)
3530 {
3531 SCHECK_PARTIAL();
3532 RRETURN(MATCH_NOMATCH);
3533 }
3534 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3535 }
3536
3537 if (min == max) continue;
3538
3539 if (minimize)
3540 {
3541 for (fi = min;; fi++)
3542 {
3543 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3544 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3545 if (fi >= max) RRETURN(MATCH_NOMATCH);
3546 if (eptr >= md->end_subject)
3547 {
3548 SCHECK_PARTIAL();
3549 RRETURN(MATCH_NOMATCH);
3550 }
3551 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3552 }
3553 /* Control never gets here */
3554 }
3555 else /* Maximize */
3556 {
3557 pp = eptr;
3558 for (i = min; i < max; i++)
3559 {
3560 if (eptr >= md->end_subject)
3561 {
3562 SCHECK_PARTIAL();
3563 break;
3564 }
3565 if (fc != *eptr) break;
3566 eptr++;
3567 }
3568 if (possessive) continue;
3569
3570 while (eptr >= pp)
3571 {
3572 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3573 eptr--;
3574 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3575 }
3576 RRETURN(MATCH_NOMATCH);
3577 }
3578 }
3579 /* Control never gets here */
3580
3581 /* Match a negated single character. In UTF mode both the pattern
3582 character and the subject character can be multibyte. */
3583
3584 case OP_NOT:
3585 case OP_NOTI:
3586 if (eptr >= md->end_subject)
3587 {
3588 SCHECK_PARTIAL();
3589 RRETURN(MATCH_NOMATCH);
3590 }
3591 #ifdef SUPPORT_UTF
3592 if (utf)
3593 {
3594 register unsigned int ch, och;
3595
3596 ecode++;
3597 GETCHARINC(ch, ecode);
3598 GETCHARINC(c, eptr);
3599
3600 if (op == OP_NOT)
3601 {
3602 if (ch == c) RRETURN(MATCH_NOMATCH);
3603 }
3604 else
3605 {
3606 #ifdef SUPPORT_UCP
3607 if (ch > 127)
3608 och = UCD_OTHERCASE(ch);
3609 #else
3610 if (ch > 127)
3611 och = ch;
3612 #endif /* SUPPORT_UCP */
3613 else
3614 och = TABLE_GET(ch, md->fcc, ch);
3615 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3616 }
3617 }
3618 else
3619 #endif
3620 {
3621 register unsigned int ch = ecode[1];
3622 c = *eptr++;
3623 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3624 RRETURN(MATCH_NOMATCH);
3625 ecode += 2;
3626 }
3627 break;
3628
3629 /* Match a negated single one-byte character repeatedly. This is almost a
3630 repeat of the code for a repeated single character, but I haven't found a
3631 nice way of commoning these up that doesn't require a test of the
3632 positive/negative option for each character match. Maybe that wouldn't add
3633 very much to the time taken, but character matching *is* what this is all
3634 about... */
3635
3636 case OP_NOTEXACT:
3637 case OP_NOTEXACTI:
3638 min = max = GET2(ecode, 1);
3639 ecode += 1 + IMM2_SIZE;
3640 goto REPEATNOTCHAR;
3641
3642 case OP_NOTUPTO:
3643 case OP_NOTUPTOI:
3644 case OP_NOTMINUPTO:
3645 case OP_NOTMINUPTOI:
3646 min = 0;
3647 max = GET2(ecode, 1);
3648 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3649 ecode += 1 + IMM2_SIZE;
3650 goto REPEATNOTCHAR;
3651
3652 case OP_NOTPOSSTAR:
3653 case OP_NOTPOSSTARI:
3654 possessive = TRUE;
3655 min = 0;
3656 max = INT_MAX;
3657 ecode++;
3658 goto REPEATNOTCHAR;
3659
3660 case OP_NOTPOSPLUS:
3661 case OP_NOTPOSPLUSI:
3662 possessive = TRUE;
3663 min = 1;
3664 max = INT_MAX;
3665 ecode++;
3666 goto REPEATNOTCHAR;
3667
3668 case OP_NOTPOSQUERY:
3669 case OP_NOTPOSQUERYI:
3670 possessive = TRUE;
3671 min = 0;
3672 max = 1;
3673 ecode++;
3674 goto REPEATNOTCHAR;
3675
3676 case OP_NOTPOSUPTO:
3677 case OP_NOTPOSUPTOI:
3678 possessive = TRUE;
3679 min = 0;
3680 max = GET2(ecode, 1);
3681 ecode += 1 + IMM2_SIZE;
3682 goto REPEATNOTCHAR;
3683
3684 case OP_NOTSTAR:
3685 case OP_NOTSTARI:
3686 case OP_NOTMINSTAR:
3687 case OP_NOTMINSTARI:
3688 case OP_NOTPLUS:
3689 case OP_NOTPLUSI:
3690 case OP_NOTMINPLUS:
3691 case OP_NOTMINPLUSI:
3692 case OP_NOTQUERY:
3693 case OP_NOTQUERYI:
3694 case OP_NOTMINQUERY:
3695 case OP_NOTMINQUERYI:
3696 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3697 minimize = (c & 1) != 0;
3698 min = rep_min[c]; /* Pick up values from tables; */
3699 max = rep_max[c]; /* zero for max => infinity */
3700 if (max == 0) max = INT_MAX;
3701
3702 /* Common code for all repeated single-byte matches. */
3703
3704 REPEATNOTCHAR:
3705 GETCHARINCTEST(fc, ecode);
3706
3707 /* The code is duplicated for the caseless and caseful cases, for speed,
3708 since matching characters is likely to be quite common. First, ensure the
3709 minimum number of matches are present. If min = max, continue at the same
3710 level without recursing. Otherwise, if minimizing, keep trying the rest of
3711 the expression and advancing one matching character if failing, up to the
3712 maximum. Alternatively, if maximizing, find the maximum number of
3713 characters and work backwards. */
3714
3715 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3716 max, (char *)eptr));
3717
3718 if (op >= OP_NOTSTARI) /* Caseless */
3719 {
3720 #ifdef SUPPORT_UTF
3721 #ifdef SUPPORT_UCP
3722 if (utf && fc > 127)
3723 foc = UCD_OTHERCASE(fc);
3724 #else
3725 if (utf && fc > 127)
3726 foc = fc;
3727 #endif /* SUPPORT_UCP */
3728 else
3729 #endif /* SUPPORT_UTF */
3730 foc = TABLE_GET(fc, md->fcc, fc);
3731
3732 #ifdef SUPPORT_UTF
3733 if (utf)
3734 {
3735 register unsigned int d;
3736 for (i = 1; i <= min; i++)
3737 {
3738 if (eptr >= md->end_subject)
3739 {
3740 SCHECK_PARTIAL();
3741 RRETURN(MATCH_NOMATCH);
3742 }
3743 GETCHARINC(d, eptr);
3744 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3745 }
3746 }
3747 else
3748 #endif
3749 /* Not UTF mode */
3750 {
3751 for (i = 1; i <= min; i++)
3752 {
3753 if (eptr >= md->end_subject)
3754 {
3755 SCHECK_PARTIAL();
3756 RRETURN(MATCH_NOMATCH);
3757 }
3758 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3759 eptr++;
3760 }
3761 }
3762
3763 if (min == max) continue;
3764
3765 if (minimize)
3766 {
3767 #ifdef SUPPORT_UTF
3768 if (utf)
3769 {
3770 register unsigned int d;
3771 for (fi = min;; fi++)
3772 {
3773 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3775 if (fi >= max) RRETURN(MATCH_NOMATCH);
3776 if (eptr >= md->end_subject)
3777 {
3778 SCHECK_PARTIAL();
3779 RRETURN(MATCH_NOMATCH);
3780 }
3781 GETCHARINC(d, eptr);
3782 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3783 }
3784 }
3785 else
3786 #endif
3787 /* Not UTF mode */
3788 {
3789 for (fi = min;; fi++)
3790 {
3791 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3793 if (fi >= max) RRETURN(MATCH_NOMATCH);
3794 if (eptr >= md->end_subject)
3795 {
3796 SCHECK_PARTIAL();
3797 RRETURN(MATCH_NOMATCH);
3798 }
3799 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3800 eptr++;
3801 }
3802 }
3803 /* Control never gets here */
3804 }
3805
3806 /* Maximize case */
3807
3808 else
3809 {
3810 pp = eptr;
3811
3812 #ifdef SUPPORT_UTF
3813 if (utf)
3814 {
3815 register unsigned int d;
3816 for (i = min; i < max; i++)
3817 {
3818 int len = 1;
3819 if (eptr >= md->end_subject)
3820 {
3821 SCHECK_PARTIAL();
3822 break;
3823 }
3824 GETCHARLEN(d, eptr, len);
3825 if (fc == d || (unsigned int)foc == d) break;
3826 eptr += len;
3827 }
3828 if (possessive) continue;
3829 for(;;)
3830 {
3831 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3833 if (eptr-- == pp) break; /* Stop if tried at original pos */
3834 BACKCHAR(eptr);
3835 }
3836 }
3837 else
3838 #endif
3839 /* Not UTF mode */
3840 {
3841 for (i = min; i < max; i++)
3842 {
3843 if (eptr >= md->end_subject)
3844 {
3845 SCHECK_PARTIAL();
3846 break;
3847 }
3848 if (fc == *eptr || foc == *eptr) break;
3849 eptr++;
3850 }
3851 if (possessive) continue;
3852 while (eptr >= pp)
3853 {
3854 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3855 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3856 eptr--;
3857 }
3858 }
3859
3860 RRETURN(MATCH_NOMATCH);
3861 }
3862 /* Control never gets here */
3863 }
3864
3865 /* Caseful comparisons */
3866
3867 else
3868 {
3869 #ifdef SUPPORT_UTF
3870 if (utf)
3871 {
3872 register unsigned int d;
3873 for (i = 1; i <= min; i++)
3874 {
3875 if (eptr >= md->end_subject)
3876 {
3877 SCHECK_PARTIAL();
3878 RRETURN(MATCH_NOMATCH);
3879 }
3880 GETCHARINC(d, eptr);
3881 if (fc == d) RRETURN(MATCH_NOMATCH);
3882 }
3883 }
3884 else
3885 #endif
3886 /* Not UTF mode */
3887 {
3888 for (i = 1; i <= min; i++)
3889 {
3890 if (eptr >= md->end_subject)
3891 {
3892 SCHECK_PARTIAL();
3893 RRETURN(MATCH_NOMATCH);
3894 }
3895 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3896 }
3897 }
3898
3899 if (min == max) continue;
3900
3901 if (minimize)
3902 {
3903 #ifdef SUPPORT_UTF
3904 if (utf)
3905 {
3906 register unsigned int d;
3907 for (fi = min;; fi++)
3908 {
3909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3911 if (fi >= max) RRETURN(MATCH_NOMATCH);
3912 if (eptr >= md->end_subject)
3913 {
3914 SCHECK_PARTIAL();
3915 RRETURN(MATCH_NOMATCH);
3916 }
3917 GETCHARINC(d, eptr);
3918 if (fc == d) RRETURN(MATCH_NOMATCH);
3919 }
3920 }
3921 else
3922 #endif
3923 /* Not UTF mode */
3924 {
3925 for (fi = min;; fi++)
3926 {
3927 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3928 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3929 if (fi >= max) RRETURN(MATCH_NOMATCH);
3930 if (eptr >= md->end_subject)
3931 {
3932 SCHECK_PARTIAL();
3933 RRETURN(MATCH_NOMATCH);
3934 }
3935 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3936 }
3937 }
3938 /* Control never gets here */
3939 }
3940
3941 /* Maximize case */
3942
3943 else
3944 {
3945 pp = eptr;
3946
3947 #ifdef SUPPORT_UTF
3948 if (utf)
3949 {
3950 register unsigned int d;
3951 for (i = min; i < max; i++)
3952 {
3953 int len = 1;
3954 if (eptr >= md->end_subject)
3955 {
3956 SCHECK_PARTIAL();
3957 break;
3958 }
3959 GETCHARLEN(d, eptr, len);
3960 if (fc == d) break;
3961 eptr += len;
3962 }
3963 if (possessive) continue;
3964 for(;;)
3965 {
3966 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3968 if (eptr-- == pp) break; /* Stop if tried at original pos */
3969 BACKCHAR(eptr);
3970 }
3971 }
3972 else
3973 #endif
3974 /* Not UTF mode */
3975 {
3976 for (i = min; i < max; i++)
3977 {
3978 if (eptr >= md->end_subject)
3979 {
3980 SCHECK_PARTIAL();
3981 break;
3982 }
3983 if (fc == *eptr) break;
3984 eptr++;
3985 }
3986 if (possessive) continue;
3987 while (eptr >= pp)
3988 {
3989 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3991 eptr--;
3992 }
3993 }
3994
3995 RRETURN(MATCH_NOMATCH);
3996 }
3997 }
3998 /* Control never gets here */
3999
4000 /* Match a single character type repeatedly; several different opcodes
4001 share code. This is very similar to the code for single characters, but we
4002 repeat it in the interests of efficiency. */
4003
4004 case OP_TYPEEXACT:
4005 min = max = GET2(ecode, 1);
4006 minimize = TRUE;
4007 ecode += 1 + IMM2_SIZE;
4008 goto REPEATTYPE;
4009
4010 case OP_TYPEUPTO:
4011 case OP_TYPEMINUPTO:
4012 min = 0;
4013 max = GET2(ecode, 1);
4014 minimize = *ecode == OP_TYPEMINUPTO;
4015 ecode += 1 + IMM2_SIZE;
4016 goto REPEATTYPE;
4017
4018 case OP_TYPEPOSSTAR:
4019 possessive = TRUE;
4020 min = 0;
4021 max = INT_MAX;
4022 ecode++;
4023 goto REPEATTYPE;
4024
4025 case OP_TYPEPOSPLUS:
4026 possessive = TRUE;
4027 min = 1;
4028 max = INT_MAX;
4029 ecode++;
4030 goto REPEATTYPE;
4031
4032 case OP_TYPEPOSQUERY:
4033 possessive = TRUE;
4034 min = 0;
4035 max = 1;
4036 ecode++;
4037 goto REPEATTYPE;
4038
4039 case OP_TYPEPOSUPTO:
4040 possessive = TRUE;
4041 min = 0;
4042 max = GET2(ecode, 1);
4043 ecode += 1 + IMM2_SIZE;
4044 goto REPEATTYPE;
4045
4046 case OP_TYPESTAR:
4047 case OP_TYPEMINSTAR:
4048 case OP_TYPEPLUS:
4049 case OP_TYPEMINPLUS:
4050 case OP_TYPEQUERY:
4051 case OP_TYPEMINQUERY:
4052 c = *ecode++ - OP_TYPESTAR;
4053 minimize = (c & 1) != 0;
4054 min = rep_min[c]; /* Pick up values from tables; */
4055 max = rep_max[c]; /* zero for max => infinity */
4056 if (max == 0) max = INT_MAX;
4057
4058 /* Common code for all repeated single character type matches. Note that
4059 in UTF-8 mode, '.' matches a character of any length, but for the other
4060 character types, the valid characters are all one-byte long. */
4061
4062 REPEATTYPE:
4063 ctype = *ecode++; /* Code for the character type */
4064
4065 #ifdef SUPPORT_UCP
4066 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4067 {
4068 prop_fail_result = ctype == OP_NOTPROP;
4069 prop_type = *ecode++;
4070 prop_value = *ecode++;
4071 }
4072 else prop_type = -1;
4073 #endif
4074
4075 /* First, ensure the minimum number of matches are present. Use inline
4076 code for maximizing the speed, and do the type test once at the start
4077 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4078 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4079 and single-bytes. */
4080
4081 if (min > 0)
4082 {
4083 #ifdef SUPPORT_UCP
4084 if (prop_type >= 0)
4085 {
4086 switch(prop_type)
4087 {
4088 case PT_ANY:
4089 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4090 for (i = 1; i <= min; i++)
4091 {
4092 if (eptr >= md->end_subject)
4093 {
4094 SCHECK_PARTIAL();
4095 RRETURN(MATCH_NOMATCH);
4096 }
4097 GETCHARINCTEST(c, eptr);
4098 }
4099 break;
4100
4101 case PT_LAMP:
4102 for (i = 1; i <= min; i++)
4103 {
4104 int chartype;
4105 if (eptr >= md->end_subject)
4106 {
4107 SCHECK_PARTIAL();
4108 RRETURN(MATCH_NOMATCH);
4109 }
4110 GETCHARINCTEST(c, eptr);
4111 chartype = UCD_CHARTYPE(c);
4112 if ((chartype == ucp_Lu ||
4113 chartype == ucp_Ll ||
4114 chartype == ucp_Lt) == prop_fail_result)
4115 RRETURN(MATCH_NOMATCH);
4116 }
4117 break;
4118
4119 case PT_GC:
4120 for (i = 1; i <= min; i++)
4121 {
4122 if (eptr >= md->end_subject)
4123 {
4124 SCHECK_PARTIAL();
4125 RRETURN(MATCH_NOMATCH);
4126 }
4127 GETCHARINCTEST(c, eptr);
4128 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4129 RRETURN(MATCH_NOMATCH);
4130 }
4131 break;
4132
4133 case PT_PC:
4134 for (i = 1; i <= min; i++)
4135 {
4136 if (eptr >= md->end_subject)
4137 {
4138 SCHECK_PARTIAL();
4139 RRETURN(MATCH_NOMATCH);
4140 }
4141 GETCHARINCTEST(c, eptr);
4142 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4143 RRETURN(MATCH_NOMATCH);
4144 }
4145 break;
4146
4147 case PT_SC:
4148 for (i = 1; i <= min; i++)
4149 {
4150 if (eptr >= md->end_subject)
4151 {
4152 SCHECK_PARTIAL();
4153 RRETURN(MATCH_NOMATCH);
4154 }
4155 GETCHARINCTEST(c, eptr);
4156 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4157 RRETURN(MATCH_NOMATCH);
4158 }
4159 break;
4160
4161 case PT_ALNUM:
4162 for (i = 1; i <= min; i++)
4163 {
4164 int category;
4165 if (eptr >= md->end_subject)
4166 {
4167 SCHECK_PARTIAL();
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 GETCHARINCTEST(c, eptr);
4171 category = UCD_CATEGORY(c);
4172 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4173 RRETURN(MATCH_NOMATCH);
4174 }
4175 break;
4176
4177 case PT_SPACE: /* Perl space */
4178 for (i = 1; i <= min; i++)
4179 {
4180 if (eptr >= md->end_subject)
4181 {
4182 SCHECK_PARTIAL();
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 GETCHARINCTEST(c, eptr);
4186 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4187 c == CHAR_FF || c == CHAR_CR)
4188 == prop_fail_result)
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 break;
4192
4193 case PT_PXSPACE: /* POSIX space */
4194 for (i = 1; i <= min; i++)
4195 {
4196 if (eptr >= md->end_subject)
4197 {
4198 SCHECK_PARTIAL();
4199 RRETURN(MATCH_NOMATCH);
4200 }
4201 GETCHARINCTEST(c, eptr);
4202 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4203 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4204 == prop_fail_result)
4205 RRETURN(MATCH_NOMATCH);
4206 }
4207 break;
4208
4209 case PT_WORD:
4210 for (i = 1; i <= min; i++)
4211 {
4212 int category;
4213 if (eptr >= md->end_subject)
4214 {
4215 SCHECK_PARTIAL();
4216 RRETURN(MATCH_NOMATCH);
4217 }
4218 GETCHARINCTEST(c, eptr);
4219 category = UCD_CATEGORY(c);
4220 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4221 == prop_fail_result)
4222 RRETURN(MATCH_NOMATCH);
4223 }
4224 break;
4225
4226 /* This should not occur */
4227
4228 default:
4229 RRETURN(PCRE_ERROR_INTERNAL);
4230 }
4231 }
4232
4233 /* Match extended Unicode sequences. We will get here only if the
4234 support is in the binary; otherwise a compile-time error occurs. */
4235
4236 else if (ctype == OP_EXTUNI)
4237 {
4238 for (i = 1; i <= min; i++)
4239 {
4240 if (eptr >= md->end_subject)
4241 {
4242 SCHECK_PARTIAL();
4243 RRETURN(MATCH_NOMATCH);
4244 }
4245 else
4246 {
4247 int lgb, rgb;
4248 GETCHARINCTEST(c, eptr);
4249 lgb = UCD_GRAPHBREAK(c);
4250 while (eptr < md->end_subject)
4251 {
4252 int len = 1;
4253 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4254 rgb = UCD_GRAPHBREAK(c);
4255 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4256 lgb = rgb;
4257 eptr += len;
4258 }
4259 }
4260 CHECK_PARTIAL();
4261 }
4262 }
4263
4264 else
4265 #endif /* SUPPORT_UCP */
4266
4267 /* Handle all other cases when the coding is UTF-8 */
4268
4269 #ifdef SUPPORT_UTF
4270 if (utf) switch(ctype)
4271 {
4272 case OP_ANY:
4273 for (i = 1; i <= min; i++)
4274 {
4275 if (eptr >= md->end_subject)
4276 {
4277 SCHECK_PARTIAL();
4278 RRETURN(MATCH_NOMATCH);
4279 }
4280 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4281 if (md->partial != 0 &&
4282 eptr + 1 >= md->end_subject &&
4283 NLBLOCK->nltype == NLTYPE_FIXED &&
4284 NLBLOCK->nllen == 2 &&
4285 *eptr == NLBLOCK->nl[0])
4286 {
4287 md->hitend = TRUE;
4288 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4289 }
4290 eptr++;
4291 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4292 }
4293 break;
4294
4295 case OP_ALLANY:
4296 for (i = 1; i <= min; i++)
4297 {
4298 if (eptr >= md->end_subject)
4299 {
4300 SCHECK_PARTIAL();
4301 RRETURN(MATCH_NOMATCH);
4302 }
4303 eptr++;
4304 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4305 }
4306 break;
4307
4308 case OP_ANYBYTE:
4309 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4310 eptr += min;
4311 break;
4312
4313 case OP_ANYNL:
4314 for (i = 1; i <= min; i++)
4315 {
4316 if (eptr >= md->end_subject)
4317 {
4318 SCHECK_PARTIAL();
4319 RRETURN(MATCH_NOMATCH);
4320 }
4321 GETCHARINC(c, eptr);
4322 switch(c)
4323 {
4324 default: RRETURN(MATCH_NOMATCH);
4325
4326 case CHAR_CR:
4327 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4328 break;
4329
4330 case CHAR_LF:
4331 break;
4332
4333 case CHAR_VT:
4334 case CHAR_FF:
4335 case CHAR_NEL:
4336 #ifndef EBCDIC
4337 case 0x2028:
4338 case 0x2029:
4339 #endif /* Not EBCDIC */
4340 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4341 break;
4342 }
4343 }
4344 break;
4345
4346 case OP_NOT_HSPACE:
4347 for (i = 1; i <= min; i++)
4348 {
4349 if (eptr >= md->end_subject)
4350 {
4351 SCHECK_PARTIAL();
4352 RRETURN(MATCH_NOMATCH);
4353 }
4354 GETCHARINC(c, eptr);
4355 switch(c)
4356 {
4357 default: break;
4358 case CHAR_HT:
4359 case CHAR_SPACE:
4360 #ifndef EBCDIC
4361 case 0xa0: /* NBSP */
4362 case 0x1680: /* OGHAM SPACE MARK */
4363 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4364 case 0x2000: /* EN QUAD */
4365 case 0x2001: /* EM QUAD */
4366 case 0x2002: /* EN SPACE */
4367 case 0x2003: /* EM SPACE */
4368 case 0x2004: /* THREE-PER-EM SPACE */
4369 case 0x2005: /* FOUR-PER-EM SPACE */
4370 case 0x2006: /* SIX-PER-EM SPACE */
4371 case 0x2007: /* FIGURE SPACE */
4372 case 0x2008: /* PUNCTUATION SPACE */
4373 case 0x2009: /* THIN SPACE */
4374 case 0x200A: /* HAIR SPACE */
4375 case 0x202f: /* NARROW NO-BREAK SPACE */
4376 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4377 case 0x3000: /* IDEOGRAPHIC SPACE */
4378 #endif /* Not EBCDIC */
4379 RRETURN(MATCH_NOMATCH);
4380 }
4381 }
4382 break;
4383
4384 case OP_HSPACE:
4385 for (i = 1; i <= min; i++)
4386 {
4387 if (eptr >= md->end_subject)
4388 {
4389 SCHECK_PARTIAL();
4390 RRETURN(MATCH_NOMATCH);
4391 }
4392 GETCHARINC(c, eptr);
4393 switch(c)
4394 {
4395 default: RRETURN(MATCH_NOMATCH);
4396 case CHAR_HT:
4397 case CHAR_SPACE:
4398 #ifndef EBCDIC
4399 case 0xa0: /* NBSP */
4400 case 0x1680: /* OGHAM SPACE MARK */
4401 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4402 case 0x2000: /* EN QUAD */
4403 case 0x2001: /* EM QUAD */
4404 case 0x2002: /* EN SPACE */
4405 case 0x2003: /* EM SPACE */
4406 case 0x2004: /* THREE-PER-EM SPACE */
4407 case 0x2005: /* FOUR-PER-EM SPACE */
4408 case 0x2006: /* SIX-PER-EM SPACE */
4409 case 0x2007: /* FIGURE SPACE */
4410 case 0x2008: /* PUNCTUATION SPACE */
4411 case 0x2009: /* THIN SPACE */
4412 case 0x200A: /* HAIR SPACE */
4413 case 0x202f: /* NARROW NO-BREAK SPACE */
4414 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4415 case 0x3000: /* IDEOGRAPHIC SPACE */
4416 #endif
4417 break;
4418 }
4419 }
4420 break;
4421
4422 case OP_NOT_VSPACE:
4423 for (i = 1; i <= min; i++)
4424 {
4425 if (eptr >= md->end_subject)
4426 {
4427 SCHECK_PARTIAL();
4428 RRETURN(MATCH_NOMATCH);
4429 }
4430 GETCHARINC(c, eptr);
4431 switch(c)
4432 {
4433 default: break;
4434 case CHAR_LF:
4435 case CHAR_VT:
4436 case CHAR_FF:
4437 case CHAR_CR:
4438 case CHAR_NEL:
4439 #ifndef EBCDIC
4440 case 0x2028: /* LINE SEPARATOR */
4441 case 0x2029: /* PARAGRAPH SEPARATOR */
4442 #endif
4443 RRETURN(MATCH_NOMATCH);
4444 }
4445 }
4446 break;
4447
4448 case OP_VSPACE:
4449 for (i = 1; i <= min; i++)
4450 {
4451 if (eptr >= md->end_subject)
4452 {
4453 SCHECK_PARTIAL();
4454 RRETURN(MATCH_NOMATCH);
4455 }
4456 GETCHARINC(c, eptr);
4457 switch(c)
4458 {
4459 default: RRETURN(MATCH_NOMATCH);
4460 case CHAR_LF:
4461 case CHAR_VT:
4462 case CHAR_FF:
4463 case CHAR_CR:
4464 case CHAR_NEL:
4465 #ifndef EBCDIC
4466 case 0x2028: /* LINE SEPARATOR */
4467 case 0x2029: /* PARAGRAPH SEPARATOR */
4468 #endif
4469 break;
4470 }
4471 }
4472 break;
4473
4474 case OP_NOT_DIGIT:
4475 for (i = 1; i <= min; i++)
4476 {
4477 if (eptr >= md->end_subject)
4478 {
4479 SCHECK_PARTIAL();
4480 RRETURN(MATCH_NOMATCH);
4481 }
4482 GETCHARINC(c, eptr);
4483 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4484 RRETURN(MATCH_NOMATCH);
4485 }
4486 break;
4487
4488 case OP_DIGIT:
4489 for (i = 1; i <= min; i++)
4490 {
4491 if (eptr >= md->end_subject)
4492 {
4493 SCHECK_PARTIAL();
4494 RRETURN(MATCH_NOMATCH);
4495 }
4496 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4497 RRETURN(MATCH_NOMATCH);
4498 eptr++;
4499 /* No need to skip more bytes - we know it's a 1-byte character */
4500 }
4501 break;
4502
4503 case OP_NOT_WHITESPACE:
4504 for (i = 1; i <= min; i++)
4505 {
4506 if (eptr >= md->end_subject)
4507 {
4508 SCHECK_PARTIAL();
4509 RRETURN(MATCH_NOMATCH);
4510 }
4511 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4512 RRETURN(MATCH_NOMATCH);
4513 eptr++;
4514 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4515 }
4516 break;
4517
4518 case OP_WHITESPACE:
4519 for (i = 1; i <= min; i++)
4520 {
4521 if (eptr >= md->end_subject)
4522 {
4523 SCHECK_PARTIAL();
4524 RRETURN(MATCH_NOMATCH);
4525 }
4526 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4527 RRETURN(MATCH_NOMATCH);
4528 eptr++;
4529 /* No need to skip more bytes - we know it's a 1-byte character */
4530 }
4531 break;
4532
4533 case OP_NOT_WORDCHAR:
4534 for (i = 1; i <= min; i++)
4535 {
4536 if (eptr >= md->end_subject)
4537 {
4538 SCHECK_PARTIAL();
4539 RRETURN(MATCH_NOMATCH);
4540 }
4541 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4542 RRETURN(MATCH_NOMATCH);
4543 eptr++;
4544 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4545 }
4546 break;
4547
4548 case OP_WORDCHAR:
4549 for (i = 1; i <= min; i++)
4550 {
4551 if (eptr >= md->end_subject)
4552 {
4553 SCHECK_PARTIAL();
4554 RRETURN(MATCH_NOMATCH);
4555 }
4556 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4557 RRETURN(MATCH_NOMATCH);
4558 eptr++;
4559 /* No need to skip more bytes - we know it's a 1-byte character */
4560 }
4561 break;
4562
4563 default:
4564 RRETURN(PCRE_ERROR_INTERNAL);
4565 } /* End switch(ctype) */
4566
4567 else
4568 #endif /* SUPPORT_UTF */
4569
4570 /* Code for the non-UTF-8 case for minimum matching of operators other
4571 than OP_PROP and OP_NOTPROP. */
4572
4573 switch(ctype)
4574 {
4575 case OP_ANY:
4576 for (i = 1; i <= min; i++)
4577 {
4578 if (eptr >= md->end_subject)
4579 {
4580 SCHECK_PARTIAL();
4581 RRETURN(MATCH_NOMATCH);
4582 }
4583 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4584 if (md->partial != 0 &&
4585 eptr + 1 >= md->end_subject &&
4586 NLBLOCK->nltype == NLTYPE_FIXED &&
4587 NLBLOCK->nllen == 2 &&
4588 *eptr == NLBLOCK->nl[0])
4589 {
4590 md->hitend = TRUE;
4591 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4592 }
4593 eptr++;
4594 }
4595 break;
4596
4597 case OP_ALLANY:
4598 if (eptr > md->end_subject - min)
4599 {
4600 SCHECK_PARTIAL();
4601 RRETURN(MATCH_NOMATCH);
4602 }
4603 eptr += min;
4604 break;
4605
4606 case OP_ANYBYTE:
4607 if (eptr > md->end_subject - min)
4608 {
4609 SCHECK_PARTIAL();
4610 RRETURN(MATCH_NOMATCH);
4611 }
4612 eptr += min;
4613 break;
4614
4615 case OP_ANYNL:
4616 for (i = 1; i <= min; i++)
4617 {
4618 if (eptr >= md->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 switch(*eptr++)
4624 {
4625 default: RRETURN(MATCH_NOMATCH);
4626
4627 case CHAR_CR:
4628 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4629 break;
4630
4631 case CHAR_LF:
4632 break;
4633
4634 case CHAR_VT:
4635 case CHAR_FF:
4636 case CHAR_NEL:
4637 #ifdef COMPILE_PCRE16
4638 case 0x2028:
4639 case 0x2029:
4640 #endif
4641 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4642 break;
4643 }
4644 }
4645 break;
4646
4647 case OP_NOT_HSPACE:
4648 for (i = 1; i <= min; i++)
4649 {
4650 if (eptr >= md->end_subject)
4651 {
4652 SCHECK_PARTIAL();
4653 RRETURN(MATCH_NOMATCH);
4654 }
4655 switch(*eptr++)
4656 {
4657 default: break;
4658 case CHAR_HT:
4659 case CHAR_SPACE:
4660 #ifndef EBCDIC
4661 case 0xa0: /* NBSP */
4662 #ifdef COMPILE_PCRE16
4663 case 0x1680: /* OGHAM SPACE MARK */
4664 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4665 case 0x2000: /* EN QUAD */
4666 case 0x2001: /* EM QUAD */
4667 case 0x2002: /* EN SPACE */
4668 case 0x2003: /* EM SPACE */
4669 case 0x2004: /* THREE-PER-EM SPACE */
4670 case 0x2005: /* FOUR-PER-EM SPACE */
4671 case 0x2006: /* SIX-PER-EM SPACE */
4672 case 0x2007: /* FIGURE SPACE */
4673 case 0x2008: /* PUNCTUATION SPACE */
4674 case 0x2009: /* THIN SPACE */
4675 case 0x200A: /* HAIR SPACE */
4676 case 0x202f: /* NARROW NO-BREAK SPACE */
4677 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4678 case 0x3000: /* IDEOGRAPHIC SPACE */
4679 #endif /* COMPILE_PCRE16 */
4680 #endif /* Not EBCDIC */
4681 RRETURN(MATCH_NOMATCH);
4682 }
4683 }
4684 break;
4685
4686 case OP_HSPACE:
4687 for (i = 1; i <= min; i++)
4688 {
4689 if (eptr >= md->end_subject)
4690 {
4691 SCHECK_PARTIAL();
4692 RRETURN(MATCH_NOMATCH);
4693 }
4694 switch(*eptr++)
4695 {
4696 default: RRETURN(MATCH_NOMATCH);
4697 case CHAR_HT:
4698 case CHAR_SPACE:
4699 #ifndef EBCDIC
4700 case 0xa0: /* NBSP */
4701 #ifdef COMPILE_PCRE16
4702 case 0x1680: /* OGHAM SPACE MARK */
4703 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4704 case 0x2000: /* EN QUAD */
4705 case 0x2001: /* EM QUAD */
4706 case 0x2002: /* EN SPACE */
4707 case 0x2003: /* EM SPACE */
4708 case 0x2004: /* THREE-PER-EM SPACE */
4709 case 0x2005: /* FOUR-PER-EM SPACE */
4710 case 0x2006: /* SIX-PER-EM SPACE */
4711 case 0x2007: /* FIGURE SPACE */
4712 case 0x2008: /* PUNCTUATION SPACE */
4713 case 0x2009: /* THIN SPACE */
4714 case 0x200A: /* HAIR SPACE */
4715 case 0x202f: /* NARROW NO-BREAK SPACE */
4716 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4717 case 0x3000: /* IDEOGRAPHIC SPACE */
4718 #endif /* COMPILE_PCRE16 */
4719 #endif /* Not EBCDIC */
4720 break;
4721 }
4722 }
4723 break;
4724
4725 case OP_NOT_VSPACE:
4726 for (i = 1; i <= min; i++)
4727 {
4728 if (eptr >= md->end_subject)
4729 {
4730 SCHECK_PARTIAL();
4731 RRETURN(MATCH_NOMATCH);
4732 }
4733 switch(*eptr++)
4734 {
4735 default: break;
4736 case CHAR_LF:
4737 case CHAR_VT:
4738 case CHAR_FF:
4739 case CHAR_CR:
4740 case CHAR_NEL:
4741 #ifdef COMPILE_PCRE16
4742 case 0x2028: /* LINE SEPARATOR */
4743 case 0x2029: /* PARAGRAPH SEPARATOR */
4744 #endif
4745 RRETURN(MATCH_NOMATCH);
4746 }
4747 }
4748 break;
4749
4750 case OP_VSPACE:
4751 for (i = 1; i <= min; i++)
4752 {
4753 if (eptr >= md->end_subject)
4754 {
4755 SCHECK_PARTIAL();
4756 RRETURN(MATCH_NOMATCH);
4757 }
4758 switch(*eptr++)
4759 {
4760 default: RRETURN(MATCH_NOMATCH);
4761 case CHAR_LF:
4762 case CHAR_VT:
4763 case CHAR_FF:
4764 case CHAR_CR:
4765 case CHAR_NEL:
4766 #ifdef COMPILE_PCRE16
4767 case 0x2028: /* LINE SEPARATOR */
4768 case 0x2029: /* PARAGRAPH SEPARATOR */
4769 #endif
4770 break;
4771 }
4772 }
4773 break;
4774
4775 case OP_NOT_DIGIT:
4776 for (i = 1; i <= min; i++)
4777 {
4778 if (eptr >= md->end_subject)
4779 {
4780 SCHECK_PARTIAL();
4781 RRETURN(MATCH_NOMATCH);
4782 }
4783 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4784 RRETURN(MATCH_NOMATCH);
4785 eptr++;
4786 }
4787 break;
4788
4789 case OP_DIGIT:
4790 for (i = 1; i <= min; i++)
4791 {
4792 if (eptr >= md->end_subject)
4793 {
4794 SCHECK_PARTIAL();
4795 RRETURN(MATCH_NOMATCH);
4796 }
4797 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4798 RRETURN(MATCH_NOMATCH);
4799 eptr++;
4800 }
4801 break;
4802
4803 case OP_NOT_WHITESPACE:
4804 for (i = 1; i <= min; i++)
4805 {
4806 if (eptr >= md->end_subject)
4807 {
4808 SCHECK_PARTIAL();
4809 RRETURN(MATCH_NOMATCH);
4810 }
4811 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4812 RRETURN(MATCH_NOMATCH);
4813 eptr++;
4814 }
4815 break;
4816
4817 case OP_WHITESPACE:
4818 for (i = 1; i <= min; i++)
4819 {
4820 if (eptr >= md->end_subject)
4821 {
4822 SCHECK_PARTIAL();
4823 RRETURN(MATCH_NOMATCH);
4824 }
4825 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4826 RRETURN(MATCH_NOMATCH);
4827 eptr++;
4828 }
4829 break;
4830
4831 case OP_NOT_WORDCHAR:
4832 for (i = 1; i <= min; i++)
4833 {
4834 if (eptr >= md->end_subject)
4835 {
4836 SCHECK_PARTIAL();
4837 RRETURN(MATCH_NOMATCH);
4838 }
4839 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4840 RRETURN(MATCH_NOMATCH);
4841 eptr++;
4842 }
4843 break;
4844
4845 case OP_WORDCHAR:
4846 for (i = 1; i <= min; i++)
4847 {
4848 if (eptr >= md->end_subject)
4849 {
4850 SCHECK_PARTIAL();
4851 RRETURN(MATCH_NOMATCH);
4852 }
4853 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4854 RRETURN(MATCH_NOMATCH);
4855 eptr++;
4856 }
4857 break;
4858
4859 default:
4860 RRETURN(PCRE_ERROR_INTERNAL);
4861 }
4862 }
4863
4864 /* If min = max, continue at the same level without recursing */
4865
4866 if (min == max) continue;
4867
4868 /* If minimizing, we have to test the rest of the pattern before each
4869 subsequent match. Again, separate the UTF-8 case for speed, and also
4870 separate the UCP cases. */
4871
4872 if (minimize)
4873 {
4874 #ifdef SUPPORT_UCP
4875 if (prop_type >= 0)
4876 {
4877 switch(prop_type)
4878 {
4879 case PT_ANY:
4880 for (fi = min;; fi++)
4881 {
4882 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4884 if (fi >= max) RRETURN(MATCH_NOMATCH);
4885 if (eptr >= md->end_subject)
4886 {
4887 SCHECK_PARTIAL();
4888 RRETURN(MATCH_NOMATCH);
4889 }
4890 GETCHARINCTEST(c, eptr);
4891 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4892 }
4893 /* Control never gets here */
4894
4895 case PT_LAMP:
4896 for (fi = min;; fi++)
4897 {
4898 int chartype;
4899 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4901 if (fi >= max) RRETURN(MATCH_NOMATCH);
4902 if (eptr >= md->end_subject)
4903 {
4904 SCHECK_PARTIAL();
4905 RRETURN(MATCH_NOMATCH);
4906 }
4907 GETCHARINCTEST(c, eptr);
4908 chartype = UCD_CHARTYPE(c);
4909 if ((chartype == ucp_Lu ||
4910 chartype == ucp_Ll ||
4911 chartype == ucp_Lt) == prop_fail_result)
4912 RRETURN(MATCH_NOMATCH);
4913 }
4914 /* Control never gets here */
4915
4916 case PT_GC:
4917 for (fi = min;; fi++)
4918 {
4919 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4921 if (fi >= max) RRETURN(MATCH_NOMATCH);
4922 if (eptr >= md->end_subject)
4923 {
4924 SCHECK_PARTIAL();
4925 RRETURN(MATCH_NOMATCH);
4926 }
4927 GETCHARINCTEST(c, eptr);
4928 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4929 RRETURN(MATCH_NOMATCH);
4930 }
4931 /* Control never gets here */
4932
4933 case PT_PC:
4934 for (fi = min;; fi++)
4935 {
4936 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4938 if (fi >= max) RRETURN(MATCH_NOMATCH);
4939 if (eptr >= md->end_subject)
4940 {
4941 SCHECK_PARTIAL();
4942 RRETURN(MATCH_NOMATCH);
4943 }
4944 GETCHARINCTEST(c, eptr);
4945 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4946 RRETURN(MATCH_NOMATCH);
4947 }
4948 /* Control never gets here */
4949
4950 case PT_SC:
4951 for (fi = min;; fi++)
4952 {
4953 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4955 if (fi >= max) RRETURN(MATCH_NOMATCH);
4956 if (eptr >= md->end_subject)
4957 {
4958 SCHECK_PARTIAL();
4959 RRETURN(MATCH_NOMATCH);
4960 }
4961 GETCHARINCTEST(c, eptr);
4962 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4963 RRETURN(MATCH_NOMATCH);
4964 }
4965 /* Control never gets here */
4966
4967 case PT_ALNUM:
4968 for (fi = min;; fi++)
4969 {
4970 int category;
4971 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4973 if (fi >= max) RRETURN(MATCH_NOMATCH);
4974 if (eptr >= md->end_subject)
4975 {
4976 SCHECK_PARTIAL();
4977 RRETURN(MATCH_NOMATCH);
4978 }
4979 GETCHARINCTEST(c, eptr);
4980 category = UCD_CATEGORY(c);
4981 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4982 RRETURN(MATCH_NOMATCH);
4983 }
4984 /* Control never gets here */
4985
4986 case PT_SPACE: /* Perl space */
4987 for (fi = min;; fi++)
4988 {
4989 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4991 if (fi >= max) RRETURN(MATCH_NOMATCH);
4992 if (eptr >= md->end_subject)
4993 {
4994 SCHECK_PARTIAL();
4995 RRETURN(MATCH_NOMATCH);
4996 }
4997 GETCHARINCTEST(c, eptr);
4998 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4999 c == CHAR_FF || c == CHAR_CR)
5000 == prop_fail_result)
5001 RRETURN(MATCH_NOMATCH);
5002 }
5003 /* Control never gets here */
5004
5005 case PT_PXSPACE: /* POSIX space */
5006 for (fi = min;; fi++)
5007 {
5008 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5010 if (fi >= max) RRETURN(MATCH_NOMATCH);
5011 if (eptr >= md->end_subject)
5012 {
5013 SCHECK_PARTIAL();
5014 RRETURN(MATCH_NOMATCH);
5015 }
5016 GETCHARINCTEST(c, eptr);
5017 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5018 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5019 == prop_fail_result)
5020 RRETURN(MATCH_NOMATCH);
5021 }
5022 /* Control never gets here */
5023
5024 case PT_WORD:
5025 for (fi = min;; fi++)
5026 {
5027 int category;
5028 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5030 if (fi >= max) RRETURN(MATCH_NOMATCH);
5031 if (eptr >= md->end_subject)
5032 {
5033 SCHECK_PARTIAL();
5034 RRETURN(MATCH_NOMATCH);
5035 }
5036 GETCHARINCTEST(c, eptr);
5037 category = UCD_CATEGORY(c);
5038 if ((category == ucp_L ||
5039 category == ucp_N ||
5040 c == CHAR_UNDERSCORE)
5041 == prop_fail_result)
5042 RRETURN(MATCH_NOMATCH);
5043 }
5044 /* Control never gets here */
5045
5046 /* This should never occur */
5047
5048 default:
5049 RRETURN(PCRE_ERROR_INTERNAL);
5050 }
5051 }
5052
5053 /* Match extended Unicode sequences. We will get here only if the
5054 support is in the binary; otherwise a compile-time error occurs. */
5055
5056 else if (ctype == OP_EXTUNI)
5057 {
5058 for (fi = min;; fi++)
5059 {
5060 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5062 if (fi >= max) RRETURN(MATCH_NOMATCH);
5063 if (eptr >= md->end_subject)
5064 {
5065 SCHECK_PARTIAL();
5066 RRETURN(MATCH_NOMATCH);
5067 }
5068 else
5069 {
5070 int lgb, rgb;
5071 GETCHARINCTEST(c, eptr);
5072 lgb = UCD_GRAPHBREAK(c);
5073 while (eptr < md->end_subject)
5074 {
5075 int len = 1;
5076 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5077 rgb = UCD_GRAPHBREAK(c);
5078 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5079 lgb = rgb;
5080 eptr += len;
5081 }
5082 }
5083 CHECK_PARTIAL();
5084 }
5085 }
5086 else
5087 #endif /* SUPPORT_UCP */
5088
5089 #ifdef SUPPORT_UTF
5090 if (utf)
5091 {
5092 for (fi = min;; fi++)
5093 {
5094 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5096 if (fi >= max) RRETURN(MATCH_NOMATCH);
5097 if (eptr >= md->end_subject)
5098 {
5099 SCHECK_PARTIAL();
5100 RRETURN(MATCH_NOMATCH);
5101 }
5102 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5103 RRETURN(MATCH_NOMATCH);
5104 GETCHARINC(c, eptr);
5105 switch(ctype)
5106 {
5107 case OP_ANY: /* This is the non-NL case */
5108 if (md->partial != 0 && /* Take care with CRLF partial */
5109 eptr >= md->end_subject &&
5110 NLBLOCK->nltype == NLTYPE_FIXED &&
5111 NLBLOCK->nllen == 2 &&
5112 c == NLBLOCK->nl[0])
5113 {
5114 md->hitend = TRUE;
5115 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5116 }
5117 break;
5118
5119 case OP_ALLANY:
5120 case OP_ANYBYTE:
5121 break;
5122
5123 case OP_ANYNL:
5124 switch(c)
5125 {
5126 default: RRETURN(MATCH_NOMATCH);
5127 case CHAR_CR:
5128 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5129 break;
5130
5131 case CHAR_LF:
5132 break;
5133
5134 case CHAR_VT:
5135 case CHAR_FF:
5136 case CHAR_NEL:
5137 #ifndef EBCDIC
5138 case 0x2028:
5139 case 0x2029:
5140 #endif /* Not EBCDIC */
5141 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5142 break;
5143 }
5144 break;
5145
5146 case OP_NOT_HSPACE:
5147 switch(c)
5148 {
5149 default: break;
5150 case CHAR_HT:
5151 case CHAR_SPACE:
5152 #ifndef EBCDIC
5153 case 0xa0: /* NBSP */
5154 case 0x1680: /* OGHAM SPACE MARK */
5155 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5156 case 0x2000: /* EN QUAD */
5157 case 0x2001: /* EM QUAD */
5158 case 0x2002: /* EN SPACE */
5159 case 0x2003: /* EM SPACE */
5160 case 0x2004: /* THREE-PER-EM SPACE */
5161 case 0x2005: /* FOUR-PER-EM SPACE */
5162 case 0x2006: /* SIX-PER-EM SPACE */
5163 case 0x2007: /* FIGURE SPACE */
5164 case 0x2008: /* PUNCTUATION SPACE */
5165 case 0x2009: /* THIN SPACE */
5166 case 0x200A: /* HAIR SPACE */
5167 case 0x202f: /* NARROW NO-BREAK SPACE */
5168 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5169 case 0x3000: /* IDEOGRAPHIC SPACE */
5170 #endif /* Not EBCDIC */
5171 RRETURN(MATCH_NOMATCH);
5172 }
5173 break;
5174
5175 case OP_HSPACE:
5176 switch(c)
5177 {
5178 default: RRETURN(MATCH_NOMATCH);
5179 case CHAR_HT:
5180 case CHAR_SPACE:
5181 #ifndef EBCDIC
5182 case 0xa0: /* NBSP */
5183 case 0x1680: /* OGHAM SPACE MARK */
5184 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5185 case 0x2000: /* EN QUAD */
5186 case 0x2001: /* EM QUAD */
5187 case 0x2002: /* EN SPACE */
5188 case 0x2003: /* EM SPACE */
5189 case 0x2004: /* THREE-PER-EM SPACE */
5190 case 0x2005: /* FOUR-PER-EM SPACE */
5191 case 0x2006: /* SIX-PER-EM SPACE */
5192 case 0x2007: /* FIGURE SPACE */
5193 case 0x2008: /* PUNCTUATION SPACE */
5194 case 0x2009: /* THIN SPACE */
5195 case 0x200A: /* HAIR SPACE */
5196 case 0x202f: /* NARROW NO-BREAK SPACE */
5197 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5198 case 0x3000: /* IDEOGRAPHIC SPACE */
5199 #endif /* Not EBCDIC */
5200 break;
5201 }
5202 break;
5203
5204 case OP_NOT_VSPACE:
5205 switch(c)
5206 {
5207 default: break;
5208 case CHAR_LF:
5209 case CHAR_VT:
5210 case CHAR_FF:
5211 case CHAR_CR:
5212 case CHAR_NEL:
5213 #ifndef EBCDIC
5214 case 0x2028: /* LINE SEPARATOR */
5215 case 0x2029: /* PARAGRAPH SEPARATOR */
5216 #endif /* Not EBCDIC */
5217 RRETURN(MATCH_NOMATCH);
5218 }
5219 break;
5220
5221 case OP_VSPACE:
5222 switch(c)
5223 {
5224 default: RRETURN(MATCH_NOMATCH);
5225 case CHAR_LF:
5226 case CHAR_VT:
5227 case CHAR_FF:
5228 case CHAR_CR:
5229 case CHAR_NEL:
5230 #ifndef EBCDIC
5231 case 0x2028: /* LINE SEPARATOR */
5232 case 0x2029: /* PARAGRAPH SEPARATOR */
5233 #endif /* Not EBCDIC */
5234 break;
5235 }
5236 break;
5237
5238 case OP_NOT_DIGIT:
5239 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5240 RRETURN(MATCH_NOMATCH);
5241 break;
5242
5243 case OP_DIGIT:
5244 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5245 RRETURN(MATCH_NOMATCH);
5246 break;
5247
5248 case OP_NOT_WHITESPACE:
5249 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5250 RRETURN(MATCH_NOMATCH);
5251 break;
5252
5253 case OP_WHITESPACE:
5254 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5255 RRETURN(MATCH_NOMATCH);
5256 break;
5257
5258 case OP_NOT_WORDCHAR:
5259 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5260 RRETURN(MATCH_NOMATCH);
5261 break;
5262
5263 case OP_WORDCHAR:
5264 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5265 RRETURN(MATCH_NOMATCH);
5266 break;
5267
5268 default:
5269 RRETURN(PCRE_ERROR_INTERNAL);
5270 }
5271 }
5272 }
5273 else
5274 #endif
5275 /* Not UTF mode */
5276 {
5277 for (fi = min;; fi++)
5278 {
5279 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5281 if (fi >= max) RRETURN(MATCH_NOMATCH);
5282 if (eptr >= md->end_subject)
5283 {
5284 SCHECK_PARTIAL();
5285 RRETURN(MATCH_NOMATCH);
5286 }
5287 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5288 RRETURN(MATCH_NOMATCH);
5289 c = *eptr++;
5290 switch(ctype)
5291 {
5292 case OP_ANY: /* This is the non-NL case */
5293 if (md->partial != 0 && /* Take care with CRLF partial */
5294 eptr >= md->end_subject &&
5295 NLBLOCK->nltype == NLTYPE_FIXED &&
5296 NLBLOCK->nllen == 2 &&
5297 c == NLBLOCK->nl[0])
5298 {
5299 md->hitend = TRUE;
5300 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5301 }
5302 break;
5303
5304 case OP_ALLANY:
5305 case OP_ANYBYTE:
5306 break;
5307
5308 case OP_ANYNL:
5309 switch(c)
5310 {
5311 default: RRETURN(MATCH_NOMATCH);
5312 case CHAR_CR:
5313 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5314 break;
5315
5316 case CHAR_LF:
5317 break;
5318
5319 case CHAR_VT:
5320 case CHAR_FF:
5321 case CHAR_NEL:
5322 #ifdef COMPILE_PCRE16
5323 case 0x2028:
5324 case 0x2029:
5325 #endif
5326 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5327 break;
5328 }
5329 break;
5330
5331 case OP_NOT_HSPACE:
5332 switch(c)
5333 {
5334 default: break;
5335 case CHAR_HT:
5336 case CHAR_SPACE:
5337 #ifndef EBCDIC
5338 case 0xa0: /* NBSP */
5339 #ifdef COMPILE_PCRE16
5340 case 0x1680: /* OGHAM SPACE MARK */
5341 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5342 case 0x2000: /* EN QUAD */
5343 case 0x2001: /* EM QUAD */
5344 case 0x2002: /* EN SPACE */
5345 case 0x2003: /* EM SPACE */
5346 case 0x2004: /* THREE-PER-EM SPACE */
5347 case 0x2005: /* FOUR-PER-EM SPACE */
5348 case 0x2006: /* SIX-PER-EM SPACE */
5349 case 0x2007: /* FIGURE SPACE */
5350 case 0x2008: /* PUNCTUATION SPACE */
5351 case 0x2009: /* THIN SPACE */
5352 case 0x200A: /* HAIR SPACE */
5353 case 0x202f: /* NARROW NO-BREAK SPACE */
5354 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5355 case 0x3000: /* IDEOGRAPHIC SPACE */
5356 #endif /* COMPILE_PCRE16 */
5357 #endif /* Not EBCDIC */
5358 RRETURN(MATCH_NOMATCH);
5359 }
5360 break;
5361
5362 case OP_HSPACE:
5363 switch(c)
5364 {
5365 default: RRETURN(MATCH_NOMATCH);
5366 case CHAR_HT:
5367 case CHAR_SPACE:
5368 #ifndef EBCDIC
5369 case 0xa0: /* NBSP */
5370 #ifdef COMPILE_PCRE16
5371 case 0x1680: /* OGHAM SPACE MARK */
5372 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5373 case 0x2000: /* EN QUAD */
5374 case 0x2001: /* EM QUAD */
5375 case 0x2002: /* EN SPACE */
5376 case 0x2003: /* EM SPACE */
5377 case 0x2004: /* THREE-PER-EM SPACE */
5378 case 0x2005: /* FOUR-PER-EM SPACE */
5379 case 0x2006: /* SIX-PER-EM SPACE */
5380 case 0x2007: /* FIGURE SPACE */
5381 case 0x2008: /* PUNCTUATION SPACE */
5382 case 0x2009: /* THIN SPACE */
5383 case 0x200A: /* HAIR SPACE */
5384 case 0x202f: /* NARROW NO-BREAK SPACE */
5385 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5386 case 0x3000: /* IDEOGRAPHIC SPACE */
5387 #endif /* COMPILE_PCRE16 */
5388 #endif /* Not EBCDIC */
5389 break;
5390 }
5391 break;
5392
5393 case OP_NOT_VSPACE:
5394 switch(c)
5395 {
5396 default: break;
5397 case CHAR_LF:
5398 case CHAR_VT:
5399 case CHAR_FF:
5400 case CHAR_CR:
5401 case CHAR_NEL:
5402 #ifdef COMPILE_PCRE16
5403 case 0x2028: /* LINE SEPARATOR */
5404 case 0x2029: /* PARAGRAPH SEPARATOR */
5405 #endif
5406 RRETURN(MATCH_NOMATCH);
5407 }
5408 break;
5409
5410 case OP_VSPACE:
5411 switch(c)
5412 {
5413 default: RRETURN(MATCH_NOMATCH);
5414 case CHAR_LF:
5415 case CHAR_VT:
5416 case CHAR_FF:
5417 case CHAR_CR:
5418 case CHAR_NEL:
5419 #ifdef COMPILE_PCRE16
5420 case 0x2028: /* LINE SEPARATOR */
5421 case 0x2029: /* PARAGRAPH SEPARATOR */
5422 #endif
5423 break;
5424 }
5425 break;
5426
5427 case OP_NOT_DIGIT:
5428 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5429 break;
5430
5431 case OP_DIGIT:
5432 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5433 break;
5434
5435 case OP_NOT_WHITESPACE:
5436 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5437 break;
5438
5439 case OP_WHITESPACE:
5440 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5441 break;
5442
5443 case OP_NOT_WORDCHAR:
5444 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5445 break;
5446
5447 case OP_WORDCHAR:
5448 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5449 break;
5450
5451 default:
5452 RRETURN(PCRE_ERROR_INTERNAL);
5453 }
5454 }
5455 }
5456 /* Control never gets here */
5457 }
5458
5459 /* If maximizing, it is worth using inline code for speed, doing the type
5460 test once at the start (i.e. keep it out of the loop). Again, keep the
5461 UTF-8 and UCP stuff separate. */
5462
5463 else
5464 {
5465 pp = eptr; /* Remember where we started */
5466
5467 #ifdef SUPPORT_UCP
5468 if (prop_type >= 0)
5469 {
5470 switch(prop_type)
5471 {
5472 case PT_ANY:
5473 for (i = min; i < max; i++)
5474 {
5475 int len = 1;
5476 if (eptr >= md->end_subject)
5477 {
5478 SCHECK_PARTIAL();
5479 break;
5480 }
5481 GETCHARLENTEST(c, eptr, len);
5482 if (prop_fail_result) break;
5483 eptr+= len;
5484 }
5485 break;
5486
5487 case PT_LAMP:
5488 for (i = min; i < max; i++)
5489 {
5490 int chartype;
5491 int len = 1;
5492 if (eptr >= md->end_subject)
5493 {
5494 SCHECK_PARTIAL();
5495 break;
5496 }
5497 GETCHARLENTEST(c, eptr, len);
5498 chartype = UCD_CHARTYPE(c);
5499 if ((chartype == ucp_Lu ||
5500 chartype == ucp_Ll ||
5501 chartype == ucp_Lt) == prop_fail_result)
5502 break;
5503 eptr+= len;
5504 }
5505 break;
5506
5507 case PT_GC:
5508 for (i = min; i < max; i++)
5509 {
5510 int len = 1;
5511 if (eptr >= md->end_subject)
5512 {
5513 SCHECK_PARTIAL();
5514 break;
5515 }
5516 GETCHARLENTEST(c, eptr, len);
5517 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5518 eptr+= len;
5519 }
5520 break;
5521
5522 case PT_PC:
5523 for (i = min; i < max; i++)
5524 {
5525 int len = 1;
5526 if (eptr >= md->end_subject)
5527 {
5528 SCHECK_PARTIAL();
5529 break;
5530 }
5531 GETCHARLENTEST(c, eptr, len);
5532 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5533 eptr+= len;
5534 }
5535 break;
5536
5537 case PT_SC:
5538 for (i = min; i < max; i++)
5539 {
5540 int len = 1;
5541 if (eptr >= md->end_subject)
5542 {
5543 SCHECK_PARTIAL();
5544 break;
5545 }
5546 GETCHARLENTEST(c, eptr, len);
5547 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5548 eptr+= len;
5549 }
5550 break;
5551
5552 case PT_ALNUM:
5553 for (i = min; i < max; i++)
5554 {
5555 int category;
5556 int len = 1;
5557 if (eptr >= md->end_subject)
5558 {
5559 SCHECK_PARTIAL();
5560 break;
5561 }
5562 GETCHARLENTEST(c, eptr, len);
5563 category = UCD_CATEGORY(c);
5564 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5565 break;
5566 eptr+= len;
5567 }
5568 break;
5569
5570 case PT_SPACE: /* Perl space */
5571 for (i = min; i < max; i++)
5572 {
5573 int len = 1;
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 GETCHARLENTEST(c, eptr, len);
5580 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5581 c == CHAR_FF || c == CHAR_CR)
5582 == prop_fail_result)
5583 break;
5584 eptr+= len;
5585 }
5586 break;
5587
5588 case PT_PXSPACE: /* POSIX space */
5589 for (i = min; i < max; i++)
5590 {
5591 int len = 1;
5592 if (eptr >= md->end_subject)
5593 {
5594 SCHECK_PARTIAL();
5595 break;
5596 }
5597 GETCHARLENTEST(c, eptr, len);
5598 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5599 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5600 == prop_fail_result)
5601 break;
5602 eptr+= len;
5603 }
5604 break;
5605
5606 case PT_WORD:
5607 for (i = min; i < max; i++)
5608 {
5609 int category;
5610 int len = 1;
5611 if (eptr >= md->end_subject)
5612 {
5613 SCHECK_PARTIAL();
5614 break;
5615 }
5616 GETCHARLENTEST(c, eptr, len);
5617 category = UCD_CATEGORY(c);
5618 if ((category == ucp_L || category == ucp_N ||
5619 c == CHAR_UNDERSCORE) == prop_fail_result)
5620 break;
5621 eptr+= len;
5622 }
5623 break;
5624
5625 default:
5626 RRETURN(PCRE_ERROR_INTERNAL);
5627 }
5628
5629 /* eptr is now past the end of the maximum run */
5630
5631 if (possessive) continue;
5632 for(;;)
5633 {
5634 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5636 if (eptr-- == pp) break; /* Stop if tried at original pos */
5637 if (utf) BACKCHAR(eptr);
5638 }
5639 }
5640
5641 /* Match extended Unicode sequences. We will get here only if the
5642 support is in the binary; otherwise a compile-time error occurs. */
5643
5644 else if (ctype == OP_EXTUNI)
5645 {
5646 for (i = min; i < max; i++)
5647 {
5648 if (eptr >= md->end_subject)
5649 {
5650 SCHECK_PARTIAL();
5651 break;
5652 }
5653 else
5654 {
5655 int lgb, rgb;
5656 GETCHARINCTEST(c, eptr);
5657 lgb = UCD_GRAPHBREAK(c);
5658 while (eptr < md->end_subject)
5659 {
5660 int len = 1;
5661 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5662 rgb = UCD_GRAPHBREAK(c);
5663 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5664 lgb = rgb;
5665 eptr += len;
5666 }
5667 }
5668 CHECK_PARTIAL();
5669 }
5670
5671 /* eptr is now past the end of the maximum run */
5672
5673 if (possessive) continue;
5674
5675 for(;;)
5676 {
5677 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5679 if (eptr-- == pp) break; /* Stop if tried at original pos */
5680 for (;;) /* Move back over one extended */
5681 {
5682 if (!utf) c = *eptr; else
5683 {
5684 BACKCHAR(eptr);
5685 GETCHAR(c, eptr);
5686 }
5687 if (UCD_CATEGORY(c) != ucp_M) break;
5688 eptr--;
5689 }
5690 }
5691 }
5692
5693 else
5694 #endif /* SUPPORT_UCP */
5695
5696 #ifdef SUPPORT_UTF
5697 if (utf)
5698 {
5699 switch(ctype)
5700 {
5701 case OP_ANY:
5702 if (max < INT_MAX)
5703 {
5704 for (i = min; i < max; i++)
5705 {
5706 if (eptr >= md->end_subject)
5707 {
5708 SCHECK_PARTIAL();
5709 break;
5710 }
5711 if (IS_NEWLINE(eptr)) break;
5712 if (md->partial != 0 && /* Take care with CRLF partial */
5713 eptr + 1 >= md->end_subject &&
5714 NLBLOCK->nltype == NLTYPE_FIXED &&
5715 NLBLOCK->nllen == 2 &&
5716 *eptr == NLBLOCK->nl[0])
5717 {
5718 md->hitend = TRUE;
5719 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5720 }
5721 eptr++;
5722 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5723 }
5724 }
5725
5726 /* Handle unlimited UTF-8 repeat */
5727
5728 else
5729 {
5730 for (i = min; i < max; i++)
5731 {
5732 if (eptr >= md->end_subject)
5733 {
5734 SCHECK_PARTIAL();
5735 break;
5736 }
5737 if (IS_NEWLINE(eptr)) break;
5738 if (md->partial != 0 && /* Take care with CRLF partial */
5739 eptr + 1 >= md->end_subject &&
5740 NLBLOCK->nltype == NLTYPE_FIXED &&
5741 NLBLOCK->nllen == 2 &&
5742 *eptr == NLBLOCK->nl[0])
5743 {
5744 md->hitend = TRUE;
5745 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5746 }
5747 eptr++;
5748 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5749 }
5750 }
5751 break;
5752
5753 case OP_ALLANY:
5754 if (max < INT_MAX)
5755 {
5756 for (i = min; i < max; i++)
5757 {
5758 if (eptr >= md->end_subject)
5759 {
5760 SCHECK_PARTIAL();
5761 break;
5762 }
5763 eptr++;
5764 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5765 }
5766 }
5767 else
5768 {
5769 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5770 SCHECK_PARTIAL();
5771 }
5772 break;
5773
5774 /* The byte case is the same as non-UTF8 */
5775
5776 case OP_ANYBYTE:
5777 c = max - min;
5778 if (c > (unsigned int)(md->end_subject - eptr))
5779 {
5780 eptr = md->end_subject;
5781 SCHECK_PARTIAL();
5782 }
5783 else eptr += c;
5784 break;
5785
5786 case OP_ANYNL:
5787 for (i = min; i < max; i++)
5788 {
5789 int len = 1;
5790 if (eptr >= md->end_subject)
5791 {
5792 SCHECK_PARTIAL();
5793 break;
5794 }
5795 GETCHARLEN(c, eptr, len);
5796 if (c == CHAR_CR)
5797 {
5798 if (++eptr >= md->end_subject) break;
5799 if (*eptr == CHAR_LF) eptr++;
5800 }
5801 else
5802 {
5803 if (c != CHAR_LF &&
5804 (md->bsr_anycrlf ||
5805 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5806 #ifndef EBCDIC
5807 && c != 0x2028 && c != 0x2029
5808 #endif /* Not EBCDIC */
5809 )))
5810 break;
5811 eptr += len;
5812 }
5813 }
5814 break;
5815
5816 case OP_NOT_HSPACE:
5817 case OP_HSPACE:
5818 for (i = min; i < max; i++)
5819 {
5820 BOOL gotspace;
5821 int len = 1;
5822 if (eptr >= md->end_subject)
5823 {
5824 SCHECK_PARTIAL();
5825 break;
5826 }
5827 GETCHARLEN(c, eptr, len);
5828 switch(c)
5829 {
5830 default: gotspace = FALSE; break;
5831 case CHAR_HT:
5832 case CHAR_SPACE:
5833 #ifndef EBCDIC
5834 case 0xa0: /* NBSP */
5835 case 0x1680: /* OGHAM SPACE MARK */
5836 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5837 case 0x2000: /* EN QUAD */
5838 case 0x2001: /* EM QUAD */
5839 case 0x2002: /* EN SPACE */
5840 case 0x2003: /* EM SPACE */
5841 case 0x2004: /* THREE-PER-EM SPACE */
5842 case 0x2005: /* FOUR-PER-EM SPACE */
5843 case 0x2006: /* SIX-PER-EM SPACE */
5844 case 0x2007: /* FIGURE SPACE */
5845 case 0x2008: /* PUNCTUATION SPACE */
5846 case 0x2009: /* THIN SPACE */
5847 case 0x200A: /* HAIR SPACE */
5848 case 0x202f: /* NARROW NO-BREAK SPACE */
5849 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5850 case 0x3000: /* IDEOGRAPHIC SPACE */
5851 #endif /* Not EBCDIC */
5852 gotspace = TRUE;
5853 break;
5854 }
5855 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5856 eptr += len;
5857 }
5858 break;
5859
5860 case OP_NOT_VSPACE:
5861 case OP_VSPACE:
5862 for (i = min; i < max; i++)
5863 {
5864 BOOL gotspace;
5865 int len = 1;
5866 if (eptr >= md->end_subject)
5867 {
5868 SCHECK_PARTIAL();
5869 break;
5870 }
5871 GETCHARLEN(c, eptr, len);
5872 switch(c)
5873 {
5874 default: gotspace = FALSE; break;
5875 case CHAR_LF:
5876 case CHAR_VT:
5877 case CHAR_FF:
5878 case CHAR_CR:
5879 case CHAR_NEL:
5880 #ifndef EBCDIC
5881 case 0x2028: /* LINE SEPARATOR */
5882 case 0x2029: /* PARAGRAPH SEPARATOR */
5883 #endif /* Not EBCDIC */
5884 gotspace = TRUE;
5885 break;
5886 }
5887 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5888 eptr += len;
5889 }
5890 break;
5891
5892 case OP_NOT_DIGIT:
5893 for (i = min; i < max; i++)
5894 {
5895 int len = 1;
5896 if (eptr >= md->end_subject)
5897 {
5898 SCHECK_PARTIAL();
5899 break;
5900 }
5901 GETCHARLEN(c, eptr, len);
5902 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5903 eptr+= len;
5904 }
5905 break;
5906
5907 case OP_DIGIT:
5908 for (i = min; i < max; i++)
5909 {
5910 int len = 1;
5911 if (eptr >= md->end_subject)
5912 {
5913 SCHECK_PARTIAL();
5914 break;
5915 }
5916 GETCHARLEN(c, eptr, len);
5917 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5918 eptr+= len;
5919 }
5920 break;
5921
5922 case OP_NOT_WHITESPACE:
5923 for (i = min; i < max; i++)
5924 {
5925 int len = 1;
5926 if (eptr >= md->end_subject)
5927 {
5928 SCHECK_PARTIAL();
5929 break;
5930 }
5931 GETCHARLEN(c, eptr, len);
5932 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5933 eptr+= len;
5934 }
5935 break;
5936
5937 case OP_WHITESPACE:
5938 for (i = min; i < max; i++)
5939 {
5940 int len = 1;
5941 if (eptr >= md->end_subject)
5942 {
5943 SCHECK_PARTIAL();
5944 break;
5945 }
5946 GETCHARLEN(c, eptr, len);
5947 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5948 eptr+= len;
5949 }
5950 break;
5951
5952 case OP_NOT_WORDCHAR:
5953 for (i = min; i < max; i++)
5954 {
5955 int len = 1;
5956 if (eptr >= md->end_subject)
5957 {
5958 SCHECK_PARTIAL();
5959 break;
5960 }
5961 GETCHARLEN(c, eptr, len);
5962 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5963 eptr+= len;
5964 }
5965 break;
5966
5967 case OP_WORDCHAR:
5968 for (i = min; i < max; i++)
5969 {
5970 int len = 1;
5971 if (eptr >= md->end_subject)
5972 {
5973 SCHECK_PARTIAL();
5974 break;
5975 }
5976 GETCHARLEN(c, eptr, len);
5977 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5978 eptr+= len;
5979 }
5980 break;
5981
5982 default:
5983 RRETURN(PCRE_ERROR_INTERNAL);
5984 }
5985
5986 /* eptr is now past the end of the maximum run. If possessive, we are
5987 done (no backing up). Otherwise, match at this position; anything other
5988 than no match is immediately returned. For nomatch, back up one
5989 character, unless we are matching \R and the last thing matched was
5990 \r\n, in which case, back up two bytes. */
5991
5992 if (possessive) continue;
5993 for(;;)
5994 {
5995 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5997 if (eptr-- == pp) break; /* Stop if tried at original pos */
5998 BACKCHAR(eptr);
5999 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_NL &&
6000 eptr[-1] == CHAR_CR) eptr--;
6001 }
6002 }
6003 else
6004 #endif /* SUPPORT_UTF */
6005 /* Not UTF mode */
6006 {
6007 switch(ctype)
6008 {
6009 case OP_ANY:
6010 for (i = min; i < max; i++)
6011 {
6012 if (eptr >= md->end_subject)
6013 {
6014 SCHECK_PARTIAL();
6015 break;
6016 }
6017 if (IS_NEWLINE(eptr)) break;
6018 if (md->partial != 0 && /* Take care with CRLF partial */
6019 eptr + 1 >= md->end_subject &&
6020 NLBLOCK->nltype == NLTYPE_FIXED &&
6021 NLBLOCK->nllen == 2 &&
6022 *eptr == NLBLOCK->nl[0])
6023 {
6024 md->hitend = TRUE;
6025 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
6026 }
6027 eptr++;
6028 }
6029 break;
6030
6031 case OP_ALLANY:
6032 case OP_ANYBYTE:
6033 c = max - min;
6034 if (c > (unsigned int)(md->end_subject - eptr))
6035 {
6036 eptr = md->end_subject;
6037 SCHECK_PARTIAL();
6038 }
6039 else eptr += c;
6040 break;
6041
6042 case OP_ANYNL:
6043 for (i = min; i < max; i++)
6044 {
6045 if (eptr >= md->end_subject)
6046 {
6047 SCHECK_PARTIAL();
6048 break;
6049 }
6050 c = *eptr;
6051 if (c == CHAR_CR)
6052 {
6053 if (++eptr >= md->end_subject) break;
6054 if (*eptr == CHAR_LF) eptr++;
6055 }
6056 else
6057 {
6058 if (c != CHAR_LF && (md->bsr_anycrlf ||
6059 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6060 #ifdef COMPILE_PCRE16
6061 && c != 0x2028 && c != 0x2029
6062 #endif
6063 ))) break;
6064 eptr++;
6065 }
6066 }
6067 break;
6068
6069 case OP_NOT_HSPACE:
6070 for (i = min; i < max; i++)
6071 {
6072 if (eptr >= md->end_subject)
6073 {
6074 SCHECK_PARTIAL();
6075 break;
6076 }
6077 c = *eptr;
6078 if (c == CHAR_HT || c == CHAR_SPACE
6079 #ifndef EBCDIC
6080 || c == 0xa0
6081 #ifdef COMPILE_PCRE16
6082 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6083 || c == 0x202f || c == 0x205f || c == 0x3000
6084 #endif /* COMPILE_PCRE16 */
6085 #endif /* Not EBCDIC */
6086 ) break;
6087 eptr++;
6088 }
6089 break;
6090
6091 case OP_HSPACE:
6092 for (i = min; i < max; i++)
6093 {
6094 if (eptr >= md->end_subject)
6095 {
6096 SCHECK_PARTIAL();
6097 break;
6098 }
6099 c = *eptr;
6100 if (c != CHAR_HT && c != CHAR_SPACE
6101 #ifndef EBCDIC
6102 && c != 0xa0
6103 #ifdef COMPILE_PCRE16
6104 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6105 && c != 0x202f && c != 0x205f && c != 0x3000
6106 #endif /* COMPILE_PCRE16 */
6107 #endif /* Not EBCDIC */
6108 ) break;
6109 eptr++;
6110 }
6111 break;
6112
6113 case OP_NOT_VSPACE:
6114 for (i = min; i < max; i++)
6115 {
6116 if (eptr >= md->end_subject)
6117 {
6118 SCHECK_PARTIAL();
6119 break;
6120 }
6121 c = *eptr;
6122 if (c == CHAR_LF || c == CHAR_VT || c == CHAR_FF ||
6123 c == CHAR_CR || c == CHAR_NEL
6124 #ifdef COMPILE_PCRE16
6125 || c == 0x2028 || c == 0x2029
6126 #endif
6127 ) break;
6128 eptr++;
6129 }
6130 break;
6131
6132 case OP_VSPACE:
6133 for (i = min; i < max; i++)
6134 {
6135 if (eptr >= md->end_subject)
6136 {
6137 SCHECK_PARTIAL();
6138 break;
6139 }
6140 c = *eptr;
6141 if (c != CHAR_LF && c != CHAR_VT && c != CHAR_FF &&
6142 c != CHAR_CR && c != CHAR_NEL
6143 #ifdef COMPILE_PCRE16
6144 && c != 0x2028 && c != 0x2029
6145 #endif
6146 ) break;
6147 eptr++;
6148 }
6149 break;
6150
6151 case OP_NOT_DIGIT:
6152 for (i = min; i < max; i++)
6153 {
6154 if (eptr >= md->end_subject)
6155 {
6156 SCHECK_PARTIAL();
6157 break;
6158 }
6159 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6160 eptr++;
6161 }
6162 break;
6163
6164 case OP_DIGIT:
6165 for (i = min; i < max; i++)
6166 {
6167 if (eptr >= md->end_subject)
6168 {
6169 SCHECK_PARTIAL();
6170 break;
6171 }
6172 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6173 eptr++;
6174 }
6175 break;
6176
6177 case OP_NOT_WHITESPACE:
6178 for (i = min; i < max; i++)
6179 {
6180 if (eptr >= md->end_subject)
6181 {
6182 SCHECK_PARTIAL();
6183 break;
6184 }
6185 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6186 eptr++;
6187 }
6188 break;
6189
6190 case OP_WHITESPACE:
6191 for (i = min; i < max; i++)
6192 {
6193 if (eptr >= md->end_subject)
6194 {
6195 SCHECK_PARTIAL();
6196 break;
6197 }
6198 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6199 eptr++;
6200 }
6201 break;
6202
6203 case OP_NOT_WORDCHAR:
6204 for (i = min; i < max; i++)
6205 {
6206 if (eptr >= md->end_subject)
6207 {
6208 SCHECK_PARTIAL();
6209 break;
6210 }
6211 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6212 eptr++;
6213 }
6214 break;
6215
6216 case OP_WORDCHAR:
6217 for (i = min; i < max; i++)
6218 {
6219 if (eptr >= md->end_subject)
6220 {
6221 SCHECK_PARTIAL();
6222 break;
6223 }
6224 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6225 eptr++;
6226 }
6227 break;
6228
6229 default:
6230 RRETURN(PCRE_ERROR_INTERNAL);
6231 }
6232
6233 /* eptr is now past the end of the maximum run. If possessive, we are
6234 done (no backing up). Otherwise, match at this position; anything other
6235 than no match is immediately returned. For nomatch, back up one
6236 character (byte), unless we are matching \R and the last thing matched
6237 was \r\n, in which case, back up two bytes. */
6238
6239 if (possessive) continue;
6240 while (eptr >= pp)
6241 {
6242 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6244 eptr--;
6245 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6246 eptr[-1] == CHAR_CR) eptr--;
6247 }
6248 }
6249
6250 /* Get here if we can't make it match with any permitted repetitions */
6251
6252 RRETURN(MATCH_NOMATCH);
6253 }
6254 /* Control never gets here */
6255
6256 /* There's been some horrible disaster. Arrival here can only mean there is
6257 something seriously wrong in the code above or the OP_xxx definitions. */
6258
6259 default:
6260 DPRINTF(("Unknown opcode %d\n", *ecode));
6261 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6262 }
6263
6264 /* Do not stick any code in here without much thought; it is assumed
6265 that "continue" in the code above comes out to here to repeat the main
6266 loop. */
6267
6268 } /* End of main loop */
6269 /* Control never reaches here */
6270
6271
6272 /* When compiling to use the heap rather than the stack for recursive calls to
6273 match(), the RRETURN() macro jumps here. The number that is saved in
6274 frame->Xwhere indicates which label we actually want to return to. */
6275
6276 #ifdef NO_RECURSE
6277 #define LBL(val) case val: goto L_RM##val;
6278 HEAP_RETURN:
6279 switch (frame->Xwhere)
6280 {
6281 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6282 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6283 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6284 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6285 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6286 LBL(65) LBL(66)
6287 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6288 LBL(21)
6289 #endif
6290 #ifdef SUPPORT_UTF
6291 LBL(16) LBL(18) LBL(20)
6292 LBL(22) LBL(23) LBL(28) LBL(30)
6293 LBL(32) LBL(34) LBL(42) LBL(46)
6294 #ifdef SUPPORT_UCP
6295 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6296 LBL(59) LBL(60) LBL(61) LBL(62)
6297 #endif /* SUPPORT_UCP */
6298 #endif /* SUPPORT_UTF */
6299 default:
6300 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6301
6302 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6303
6304 return PCRE_ERROR_INTERNAL;
6305 }
6306 #undef LBL
6307 #endif /* NO_RECURSE */
6308 }
6309
6310
6311 /***************************************************************************
6312 ****************************************************************************
6313 RECURSION IN THE match() FUNCTION
6314
6315 Undefine all the macros that were defined above to handle this. */
6316
6317 #ifdef NO_RECURSE
6318 #undef eptr
6319 #undef ecode
6320 #undef mstart
6321 #undef offset_top
6322 #undef eptrb
6323 #undef flags
6324
6325 #undef callpat
6326 #undef charptr
6327 #undef data
6328 #undef next
6329 #undef pp
6330 #undef prev
6331 #undef saved_eptr
6332
6333 #undef new_recursive
6334
6335 #undef cur_is_word
6336 #undef condition
6337 #undef prev_is_word
6338
6339 #undef ctype
6340 #undef length
6341 #undef max
6342 #undef min
6343 #undef number
6344 #undef offset
6345 #undef op
6346 #undef save_capture_last
6347 #undef save_offset1
6348 #undef save_offset2
6349 #undef save_offset3
6350 #undef stacksave
6351
6352 #undef newptrb
6353
6354 #endif
6355
6356 /* These two are defined as macros in both cases */
6357
6358 #undef fc
6359 #undef fi
6360
6361 /***************************************************************************
6362 ***************************************************************************/
6363
6364
6365 #ifdef NO_RECURSE
6366 /*************************************************
6367 * Release allocated heap frames *
6368 *************************************************/
6369
6370 /* This function releases all the allocated frames. The base frame is on the
6371 machine stack, and so must not be freed.
6372
6373 Argument: the address of the base frame
6374 Returns: nothing
6375 */
6376
6377 static void
6378 release_match_heapframes (heapframe *frame_base)
6379 {
6380 heapframe *nextframe = frame_base->Xnextframe;
6381 while (nextframe != NULL)
6382 {
6383 heapframe *oldframe = nextframe;
6384 nextframe = nextframe->Xnextframe;
6385 (PUBL(stack_free))(oldframe);
6386 }
6387 }
6388 #endif
6389
6390
6391 /*************************************************
6392 * Execute a Regular Expression *
6393 *************************************************/
6394
6395 /* This function applies a compiled re to a subject string and picks out
6396 portions of the string if it matches. Two elements in the vector are set for
6397 each substring: the offsets to the start and end of the substring.
6398
6399 Arguments:
6400 argument_re points to the compiled expression
6401 extra_data points to extra data or is NULL
6402 subject points to the subject string
6403 length length of subject string (may contain binary zeros)
6404 start_offset where to start in the subject string
6405 options option bits
6406 offsets points to a vector of ints to be filled in with offsets
6407 offsetcount the number of elements in the vector
6408
6409 Returns: > 0 => success; value is the number of elements filled in
6410 = 0 => success, but offsets is not big enough
6411 -1 => failed to match
6412 < -1 => some kind of unexpected problem
6413 */
6414
6415 #ifdef COMPILE_PCRE8
6416 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6417 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6418 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6419 int offsetcount)
6420 #else
6421 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6422 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6423 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6424 int offsetcount)
6425 #endif
6426 {
6427 int rc, ocount, arg_offset_max;
6428 int newline;
6429 BOOL using_temporary_offsets = FALSE;
6430 BOOL anchored;
6431 BOOL startline;
6432 BOOL firstline;
6433 BOOL utf;
6434 BOOL has_first_char = FALSE;
6435 BOOL has_req_char = FALSE;
6436 pcre_uchar first_char = 0;
6437 pcre_uchar first_char2 = 0;
6438 pcre_uchar req_char = 0;
6439 pcre_uchar req_char2 = 0;
6440 match_data match_block;
6441 match_data *md = &match_block;
6442 const pcre_uint8 *tables;
6443 const pcre_uint8 *start_bits = NULL;
6444 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6445 PCRE_PUCHAR end_subject;
6446 PCRE_PUCHAR start_partial = NULL;
6447 PCRE_PUCHAR req_char_ptr = start_match - 1;
6448
6449 const pcre_study_data *study;
6450 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6451
6452 #ifdef NO_RECURSE
6453 heapframe frame_zero;
6454 frame_zero.Xprevframe = NULL; /* Marks the top level */
6455 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6456 md->match_frames_base = &frame_zero;
6457 #endif
6458
6459 /* Check for the special magic call that measures the size of the stack used
6460 per recursive call of match(). Without the funny casting for sizeof, a Windows
6461 compiler gave this error: "unary minus operator applied to unsigned type,
6462 result still unsigned". Hopefully the cast fixes that. */
6463
6464 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6465 start_offset == -999)
6466 #ifdef NO_RECURSE
6467 return -((int)sizeof(heapframe));
6468 #else
6469 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6470 #endif
6471
6472 /* Plausibility checks */
6473
6474 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6475 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6476 return PCRE_ERROR_NULL;
6477 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6478 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6479
6480 /* Check that the first field in the block is the magic number. If it is not,
6481 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6482 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6483 means that the pattern is likely compiled with different endianness. */
6484
6485 if (re->magic_number != MAGIC_NUMBER)
6486 return re->magic_number == REVERSED_MAGIC_NUMBER?
6487 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6488 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6489
6490 /* These two settings are used in the code for checking a UTF-8 string that
6491 follows immediately afterwards. Other values in the md block are used only
6492 during "normal" pcre_exec() processing, not when the JIT support is in use,
6493 so they are set up later. */
6494
6495 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6496 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6497 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6498 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6499
6500 /* Check a UTF-8 string if required. Pass back the character offset and error
6501 code for an invalid string if a results vector is available. */
6502
6503 #ifdef SUPPORT_UTF
6504 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6505 {
6506 int erroroffset;
6507 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6508 if (errorcode != 0)
6509 {
6510 if (offsetcount >= 2)
6511 {
6512 offsets[0] = erroroffset;
6513 offsets[1] = errorcode;
6514 }
6515 #ifdef COMPILE_PCRE16
6516 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6517 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6518 #else
6519 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6520 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6521 #endif
6522 }
6523
6524 /* Check that a start_offset points to the start of a UTF character. */
6525 if (start_offset > 0 && start_offset < length &&
6526 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6527 return PCRE_ERROR_BADUTF8_OFFSET;
6528 }
6529 #endif
6530
6531 /* If the pattern was successfully studied with JIT support, run the JIT
6532 executable instead of the rest of this function. Most options must be set at
6533 compile time for the JIT code to be usable. Fallback to the normal code path if
6534 an unsupported flag is set. */
6535
6536 #ifdef SUPPORT_JIT
6537 if (extra_data != NULL
6538 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6539 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6540 && extra_data->executable_jit != NULL
6541 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6542 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6543 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6544 {
6545 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6546 start_offset, options, offsets, offsetcount);
6547
6548 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6549 mode is not compiled. In this case we simply fallback to interpreter. */
6550
6551 if (rc != PCRE_ERROR_NULL) return rc;
6552 }
6553 #endif
6554
6555 /* Carry on with non-JIT matching. This information is for finding all the
6556 numbers associated with a given name, for condition testing. */
6557
6558 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6559 md->name_count = re->name_count;
6560 md->name_entry_size = re->name_entry_size;
6561
6562 /* Fish out the optional data from the extra_data structure, first setting
6563 the default values. */
6564
6565 study = NULL;
6566 md->match_limit = MATCH_LIMIT;
6567 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6568 md->callout_data = NULL;
6569
6570 /* The table pointer is always in native byte order. */
6571
6572 tables = re->tables;
6573
6574 if (extra_data != NULL)
6575 {
6576 register unsigned int flags = extra_data->flags;
6577 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6578 study = (const pcre_study_data *)extra_data->study_data;
6579 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6580 md->match_limit = extra_data->match_limit;
6581 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6582 md->match_limit_recursion = extra_data->match_limit_recursion;
6583 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6584 md->callout_data = extra_data->callout_data;
6585 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6586 }
6587
6588 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6589 is a feature that makes it possible to save compiled regex and re-use them
6590 in other programs later. */
6591
6592 if (tables == NULL) tables = PRIV(default_tables);
6593
6594 /* Set up other data */
6595
6596 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6597 startline = (re->flags & PCRE_STARTLINE) != 0;
6598 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6599
6600 /* The code starts after the real_pcre block and the capture name table. */
6601
6602 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6603 re->name_count * re->name_entry_size;
6604
6605 md->start_subject = (PCRE_PUCHAR)subject;
6606 md->start_offset = start_offset;
6607 md->end_subject = md->start_subject + length;
6608 end_subject = md->end_subject;
6609
6610 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6611 md->use_ucp = (re->options & PCRE_UCP) != 0;
6612 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6613 md->ignore_skip_arg = FALSE;
6614
6615 /* Some options are unpacked into BOOL variables in the hope that testing
6616 them will be faster than individual option bits. */
6617
6618 md->notbol = (options & PCRE_NOTBOL) != 0;
6619 md->noteol = (options & PCRE_NOTEOL) != 0;
6620 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6621 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6622
6623 md->hitend = FALSE;
6624 md->mark = md->nomatch_mark = NULL; /* In case never set */
6625
6626 md->recursive = NULL; /* No recursion at top level */
6627 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6628
6629 md->lcc = tables + lcc_offset;
6630 md->fcc = tables + fcc_offset;
6631 md->ctypes = tables + ctypes_offset;
6632
6633 /* Handle different \R options. */
6634
6635 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6636 {
6637 case 0:
6638 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6639 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6640 else
6641 #ifdef BSR_ANYCRLF
6642 md->bsr_anycrlf = TRUE;
6643 #else
6644 md->bsr_anycrlf = FALSE;
6645 #endif
6646 break;
6647
6648 case PCRE_BSR_ANYCRLF:
6649 md->bsr_anycrlf = TRUE;
6650 break;
6651
6652 case PCRE_BSR_UNICODE:
6653 md->bsr_anycrlf = FALSE;
6654 break;
6655
6656 default: return PCRE_ERROR_BADNEWLINE;
6657 }
6658
6659 /* Handle different types of newline. The three bits give eight cases. If
6660 nothing is set at run time, whatever was used at compile time applies. */
6661
6662 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6663 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6664 {
6665 case 0: newline = NEWLINE; break; /* Compile-time default */
6666 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6667 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6668 case PCRE_NEWLINE_CR+
6669 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6670 case PCRE_NEWLINE_ANY: newline = -1; break;
6671 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6672 default: return PCRE_ERROR_BADNEWLINE;
6673 }
6674
6675 if (newline == -2)
6676 {
6677 md->nltype = NLTYPE_ANYCRLF;
6678 }
6679 else if (newline < 0)
6680 {
6681 md->nltype = NLTYPE_ANY;
6682 }
6683 else
6684 {
6685 md->nltype = NLTYPE_FIXED;
6686 if (newline > 255)
6687 {
6688 md->nllen = 2;
6689 md->nl[0] = (newline >> 8) & 255;
6690 md->nl[1] = newline & 255;
6691 }
6692 else
6693 {
6694 md->nllen = 1;
6695 md->nl[0] = newline;
6696 }
6697 }
6698
6699 /* Partial matching was originally supported only for a restricted set of
6700 regexes; from release 8.00 there are no restrictions, but the bits are still
6701 defined (though never set). So there's no harm in leaving this code. */
6702
6703 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6704 return PCRE_ERROR_BADPARTIAL;
6705
6706 /* If the expression has got more back references than the offsets supplied can
6707 hold, we get a temporary chunk of working store to use during the matching.
6708 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6709 of 3. */
6710
6711 ocount = offsetcount - (offsetcount % 3);
6712 arg_offset_max = (2*ocount)/3;
6713
6714 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6715 {
6716 ocount = re->top_backref * 3 + 3;
6717 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6718 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6719 using_temporary_offsets = TRUE;
6720 DPRINTF(("Got memory to hold back references\n"));
6721 }
6722 else md->offset_vector = offsets;
6723
6724 md->offset_end = ocount;
6725 md->offset_max = (2*ocount)/3;
6726 md->offset_overflow = FALSE;
6727 md->capture_last = -1;
6728
6729 /* Reset the working variable associated with each extraction. These should
6730 never be used unless previously set, but they get saved and restored, and so we
6731 initialize them to avoid reading uninitialized locations. Also, unset the
6732 offsets for the matched string. This is really just for tidiness with callouts,
6733 in case they inspect these fields. */
6734
6735 if (md->offset_vector != NULL)
6736 {
6737 register int *iptr = md->offset_vector + ocount;
6738 register int *iend = iptr - re->top_bracket;
6739 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6740 while (--iptr >= iend) *iptr = -1;
6741 md->offset_vector[0] = md->offset_vector[1] = -1;
6742 }
6743
6744 /* Set up the first character to match, if available. The first_char value is
6745 never set for an anchored regular expression, but the anchoring may be forced
6746 at run time, so we have to test for anchoring. The first char may be unset for
6747 an unanchored pattern, of course. If there's no first char and the pattern was
6748 studied, there may be a bitmap of possible first characters. */
6749
6750 if (!anchored)
6751 {
6752