/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1015 - (show annotations)
Sun Aug 26 16:07:14 2012 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 219248 byte(s)
Error occurred while calculating annotation data.
Improve extended grapheme clusters using a bit table.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
62
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
65
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
68
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
71
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
74
75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995)
80 #define MATCH_SKIP (-994)
81 #define MATCH_SKIP_ARG (-993)
82 #define MATCH_THEN (-992)
83
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
87
88 #define REC_STACK_SAVE_MAX 30
89
/* Lower and upper repeat counts for the common repeats, held as two parallel
six-entry tables. In rep_max a value of 0 stands for "no upper limit". The
entries come in three identical pairs: {0, unlimited}, {1, unlimited}, {0, 1}.
NOTE(review): the code that indexes these tables is outside this excerpt -
presumably each pair is the greedy and minimizing form of one quantifier;
confirm against the users in match(). */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
94
95
96
97 #ifdef PCRE_DEBUG
98 /*************************************************
99 * Debugging function to print chars *
100 *************************************************/
101
102 /* Print a sequence of chars in printable format, stopping at the end of the
103 subject if the requested.
104
105 Arguments:
106 p points to characters
107 length number to print
108 is_subject TRUE if printing from within md->start_subject
109 md pointer to matching data block, if is_subject is TRUE
110
111 Returns: nothing
112 */
113
114 static void
115 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
116 {
117 unsigned int c;
118 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
119 while (length-- > 0)
120 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121 }
122 #endif
123
124
125
126 /*************************************************
127 * Match a back-reference *
128 *************************************************/
129
130 /* Normally, if a back reference hasn't been set, the length that is passed is
131 negative, so the match always fails. However, in JavaScript compatibility mode,
132 the length passed is zero. Note that in caseless UTF-8 mode, the number of
133 subject bytes matched may be different to the number of reference bytes.
134
135 Arguments:
136 offset index into the offset vector
137 eptr pointer into the subject
138 length length of reference to be matched (number of bytes)
139 md points to match data block
140 caseless TRUE if caseless
141
142 Returns: >= 0 the number of subject bytes matched
143 -1 no match
144 -2 partial match; always given if at end subject
145 */
146
147 static int
148 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
149 BOOL caseless)
150 {
151 PCRE_PUCHAR eptr_start = eptr;
152 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if reference not set (and not JavaScript compatible - in that
168 case the length is passed as zero). */
169
170 if (length < 0) return -1;
171
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
175
176 if (caseless)
177 {
178 #ifdef SUPPORT_UTF
179 #ifdef SUPPORT_UCP
180 if (md->utf)
181 {
182 /* Match characters up to the end of the reference. NOTE: the number of
183 bytes matched may differ, because there are some characters whose upper and
184 lower case versions code as different numbers of bytes. For example, U+023A
185 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
186 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
187 the latter. It is important, therefore, to check the length along the
188 reference, not along the subject (earlier code did this wrong). */
189
190 PCRE_PUCHAR endptr = p + length;
191 while (p < endptr)
192 {
193 int c, d;
194 if (eptr >= md->end_subject) return -2; /* Partial match */
195 GETCHARINC(c, eptr);
196 GETCHARINC(d, p);
197 if (c != d && c != UCD_OTHERCASE(d)) return -1;
198 }
199 }
200 else
201 #endif
202 #endif
203
204 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
205 is no UCP support. */
206 {
207 while (length-- > 0)
208 {
209 if (eptr >= md->end_subject) return -2; /* Partial match */
210 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
211 p++;
212 eptr++;
213 }
214 }
215 }
216
217 /* In the caseful case, we can just compare the bytes, whether or not we
218 are in UTF-8 mode. */
219
220 else
221 {
222 while (length-- > 0)
223 {
224 if (eptr >= md->end_subject) return -2; /* Partial match */
225 if (*p++ != *eptr++) return -1;
226 }
227 }
228
229 return (int)(eptr - eptr_start);
230 }
231
232
233
234 /***************************************************************************
235 ****************************************************************************
236 RECURSION IN THE match() FUNCTION
237
238 The match() function is highly recursive, though not every recursive call
239 increases the recursive depth. Nevertheless, some regular expressions can cause
240 it to recurse to a great depth. I was writing for Unix, so I just let it call
241 itself recursively. This uses the stack for saving everything that has to be
242 saved for a recursive call. On Unix, the stack can be large, and this works
243 fine.
244
245 It turns out that on some non-Unix-like systems there are problems with
246 programs that use a lot of stack. (This despite the fact that every last chip
247 has oodles of memory these days, and techniques for extending the stack have
248 been known for decades.) So....
249
250 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
251 calls by keeping local variables that need to be preserved in blocks of memory
252 obtained from malloc() instead of on the stack. Macros are used to
253 achieve this so that the actual code doesn't look very different to what it
254 always used to.
255
256 The original heap-recursive code used longjmp(). However, it seems that this
257 can be very slow on some operating systems. Following a suggestion from Stan
258 Switzer, the use of longjmp() has been abolished, at the cost of having to
259 provide a unique number for each call to RMATCH. There is no way of generating
260 a sequence of numbers at compile time in C. I have given them names, to make
261 them stand out more clearly.
262
263 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
264 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
265 tests. Furthermore, not using longjmp() means that local dynamic variables
266 don't have indeterminate values; this has meant that the frame size can be
267 reduced because the result can be "passed back" by straight setting of the
268 variable instead of being passed in the frame.
269 ****************************************************************************
270 ***************************************************************************/
271
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever this list is altered, the code at HEAP_RETURN below has to be
brought back into step with it. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54,
  RM55,    RM56, RM57, RM58, RM59, RM60,
  RM61,    RM62, RM63, RM64, RM65, RM66
};
282
283 /* These versions of the macros use the stack, as normal. There are debugging
284 versions and production versions. Note that the "rw" argument of RMATCH isn't
285 actually used in this definition. */
286
287 #ifndef NO_RECURSE
288 #define REGISTER register
289
/* Stack-recursion forms of RMATCH and RRETURN. Both rely on names that must be
in scope at the point of expansion: rrc receives the result, and mstart and
rdepth are forwarded to the nested call.

RMATCH(ra,rb,rc,rd,re,rw) calls match(ra, rb, mstart, rc, rd, re, rdepth+1),
so ra/rb/rc/rd/re line up with match()'s eptr/ecode/offset_top/md/eptrb
parameters. rw is accepted but not used here.

Debug build: both macros expand to a brace block that also printf()s the source
line; note that RRETURN then evaluates its argument twice. */
290 #ifdef PCRE_DEBUG
291 #define RMATCH(ra,rb,rc,rd,re,rw) \
292 { \
293 printf("match() called in line %d\n", __LINE__); \
294 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
295 printf("to line %d\n", __LINE__); \
296 }
297 #define RRETURN(ra) \
298 { \
299 printf("match() returned %d from line %d ", ra, __LINE__); \
300 return ra; \
301 }
302 #else
/* Production build: RMATCH is a single assignment expression (no braces, no
trailing semicolon) and RRETURN is a plain return statement. */
303 #define RMATCH(ra,rb,rc,rd,re,rw) \
304 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
305 #define RRETURN(ra) return ra
306 #endif
307
308 #else
309
310
311 /* These versions of the macros manage a private stack on the heap. Note that
312 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
313 argument of match(), which never changes. */
314
315 #define REGISTER
316
/* Heap-frame RMATCH: simulate a recursive call to match() without using the
C stack. Steps, in order:
  - take the frame already linked at frame->Xnextframe, or obtain a new one
    through PUBL(stack_malloc) and link it in; if that allocation fails,
    leave via RRETURN(PCRE_ERROR_NOMEMORY);
  - record rw in the current frame's Xwhere;
  - fill in the new frame's argument fields from ra (Xeptr), rb (Xecode),
    the current mstart (Xmstart), rc (Xoffset_top) and re (Xeptrb), set its
    Xrdepth to one more than the current frame's, and point its Xprevframe
    back at the current frame;
  - make the new frame current and jump to HEAP_RECURSE.
The label L_<rw> pasted from the rw argument marks the point just after the
jump; each rw value must therefore be used at only one expansion within a
function. rd is not used. */
317 #define RMATCH(ra,rb,rc,rd,re,rw)\
318 {\
319 heapframe *newframe = frame->Xnextframe;\
320 if (newframe == NULL)\
321 {\
322 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
323 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
324 newframe->Xnextframe = NULL;\
325 frame->Xnextframe = newframe;\
326 }\
327 frame->Xwhere = rw;\
328 newframe->Xeptr = ra;\
329 newframe->Xecode = rb;\
330 newframe->Xmstart = mstart;\
331 newframe->Xoffset_top = rc;\
332 newframe->Xeptrb = re;\
333 newframe->Xrdepth = frame->Xrdepth + 1;\
334 newframe->Xprevframe = frame;\
335 frame = newframe;\
336 DPRINTF(("restarting from line %d\n", __LINE__));\
337 goto HEAP_RECURSE;\
338 L_##rw:\
339 DPRINTF(("jumped back to line %d\n", __LINE__));\
340 }
341
/* Heap-frame RRETURN: step back to the previous frame. If there is one, the
result is stored in rrc and control goes to HEAP_RETURN; if Xprevframe is NULL
the macro performs a real C return of ra. The frame being left is not freed
by this macro. */
342 #define RRETURN(ra)\
343 {\
344 heapframe *oldframe = frame;\
345 frame = oldframe->Xprevframe;\
346 if (frame != NULL)\
347 {\
348 rrc = ra;\
349 goto HEAP_RETURN;\
350 }\
351 return ra;\
352 }
353
354
355 /* Structure for remembering the local variables in a private frame */
356
357 typedef struct heapframe {
358 struct heapframe *Xprevframe;
359 struct heapframe *Xnextframe;
360
361 /* Function arguments that may change */
362
363 PCRE_PUCHAR Xeptr;
364 const pcre_uchar *Xecode;
365 PCRE_PUCHAR Xmstart;
366 int Xoffset_top;
367 eptrblock *Xeptrb;
368 unsigned int Xrdepth;
369
370 /* Function local variables */
371
372 PCRE_PUCHAR Xcallpat;
373 #ifdef SUPPORT_UTF
374 PCRE_PUCHAR Xcharptr;
375 #endif
376 PCRE_PUCHAR Xdata;
377 PCRE_PUCHAR Xnext;
378 PCRE_PUCHAR Xpp;
379 PCRE_PUCHAR Xprev;
380 PCRE_PUCHAR Xsaved_eptr;
381
382 recursion_info Xnew_recursive;
383
384 BOOL Xcur_is_word;
385 BOOL Xcondition;
386 BOOL Xprev_is_word;
387
388 #ifdef SUPPORT_UCP
389 int Xprop_type;
390 int Xprop_value;
391 int Xprop_fail_result;
392 int Xoclength;
393 pcre_uchar Xocchars[6];
394 #endif
395
396 int Xcodelink;
397 int Xctype;
398 unsigned int Xfc;
399 int Xfi;
400 int Xlength;
401 int Xmax;
402 int Xmin;
403 int Xnumber;
404 int Xoffset;
405 int Xop;
406 int Xsave_capture_last;
407 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
408 int Xstacksave[REC_STACK_SAVE_MAX];
409
410 eptrblock Xnewptrb;
411
412 /* Where to jump back to */
413
414 int Xwhere;
415
416 } heapframe;
417
418 #endif
419
420
421 /***************************************************************************
422 ***************************************************************************/
423
424
425
426 /*************************************************
427 * Match from current position *
428 *************************************************/
429
430 /* This function is called recursively in many circumstances. Whenever it
431 returns a negative (error) response, the outer incarnation must also return the
432 same response. */
433
434 /* These macros pack up tests that are used for partial matching, and which
435 appear several times in the code. We set the "hit end" flag if the pointer is
436 at the end of the subject and also past the start of the subject (i.e.
437 something has been matched). For hard partial matching, we then return
438 immediately. The second one is used when we already know we are past the end of
439 the subject. */
440
/* CHECK_PARTIAL(): expands to an if statement with a brace block. When
md->partial is non-zero, eptr is at or beyond md->end_subject, and eptr is
beyond md->start_used_ptr, it sets md->hitend to TRUE; if md->partial is also
greater than 1 it leaves via RRETURN(PCRE_ERROR_PARTIAL). Uses eptr and md from
the expansion site. */
441 #define CHECK_PARTIAL()\
442 if (md->partial != 0 && eptr >= md->end_subject && \
443 eptr > md->start_used_ptr) \
444 { \
445 md->hitend = TRUE; \
446 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
447 }
448
/* SCHECK_PARTIAL(): the same as CHECK_PARTIAL() but without the comparison
against md->end_subject, for call sites that have already established that
the end of the subject has been reached. */
449 #define SCHECK_PARTIAL()\
450 if (md->partial != 0 && eptr > md->start_used_ptr) \
451 { \
452 md->hitend = TRUE; \
453 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
454 }
455
456
457 /* Performance note: It might be tempting to extract commonly used fields from
458 the md structure (e.g. utf, end_subject) into individual variables to improve
459 performance. Tests using gcc on a SPARC disproved this; in the first case, it
460 made performance worse.
461
462 Arguments:
463 eptr pointer to current character in subject
464 ecode pointer to current position in compiled code
465 mstart pointer to the current match start position (can be modified
466 by encountering \K)
467 offset_top current top pointer
468 md pointer to "static" info for the match
469 eptrb pointer to chain of blocks containing eptr at start of
470 brackets - for testing for empty matches
471 rdepth the recursion depth
472
473 Returns: MATCH_MATCH if matched ) these values are >= 0
474 MATCH_NOMATCH if failed to match )
475 a negative MATCH_xxx value for PRUNE, SKIP, etc
476 a negative PCRE_ERROR_xxx value if aborted by an error condition
477 (e.g. stopped by repeated call or recursion limit)
478 */
479
480 static int
481 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
482 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
483 unsigned int rdepth)
484 {
485 /* These variables do not need to be preserved over recursion in this function,
486 so they can be ordinary variables in all cases. Mark some of them with
487 "register" because they are used a lot in loops. */
488
489 register int rrc; /* Returns from recursive calls */
490 register int i; /* Used for loops not involving calls to RMATCH() */
491 register unsigned int c; /* Character values not kept over RMATCH() calls */
492 register BOOL utf; /* Local copy of UTF flag for speed */
493
494 BOOL minimize, possessive; /* Quantifier options */
495 BOOL caseless;
496 int condcode;
497
498 /* When recursion is not being used, all "local" variables that have to be
499 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
500 frame on the stack here; subsequent instantiations are obtained from the heap
501 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
502 the top-level on the stack rather than malloc-ing them all gives a performance
503 boost in many cases where there is not much "recursion". */
504
505 #ifdef NO_RECURSE
506 heapframe *frame = (heapframe *)md->match_frames_base;
507
508 /* Copy in the original argument variables */
509
510 frame->Xeptr = eptr;
511 frame->Xecode = ecode;
512 frame->Xmstart = mstart;
513 frame->Xoffset_top = offset_top;
514 frame->Xeptrb = eptrb;
515 frame->Xrdepth = rdepth;
516
517 /* This is where control jumps back to in order to effect "recursion" */
518
519 HEAP_RECURSE:
520
521 /* Macros make the argument variables come from the current frame */
522
523 #define eptr frame->Xeptr
524 #define ecode frame->Xecode
525 #define mstart frame->Xmstart
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define oclength frame->Xoclength
554 #define occhars frame->Xocchars
555 #endif
556
557 #define ctype frame->Xctype
558 #define fc frame->Xfc
559 #define fi frame->Xfi
560 #define length frame->Xlength
561 #define max frame->Xmax
562 #define min frame->Xmin
563 #define number frame->Xnumber
564 #define offset frame->Xoffset
565 #define op frame->Xop
566 #define save_capture_last frame->Xsave_capture_last
567 #define save_offset1 frame->Xsave_offset1
568 #define save_offset2 frame->Xsave_offset2
569 #define save_offset3 frame->Xsave_offset3
570 #define stacksave frame->Xstacksave
571
572 #define newptrb frame->Xnewptrb
573
574 /* When recursion is being used, local variables are allocated on the stack and
575 get preserved during recursion in the normal way. In this environment, fi and
576 i, and fc and c, can be the same variables. */
577
578 #else /* NO_RECURSE not defined */
579 #define fi i
580 #define fc c
581
582 /* Many of the following variables are used only in small blocks of the code.
583 My normal style of coding would have declared them within each of those blocks.
584 However, in order to accommodate the version of this code that uses an external
585 "stack" implemented on the heap, it is easier to declare them all here, so the
586 declarations can be cut out in a block. The only declarations within blocks
587 below are for variables that do not have to be preserved over a recursive call
588 to RMATCH(). */
589
590 #ifdef SUPPORT_UTF
591 const pcre_uchar *charptr;
592 #endif
593 const pcre_uchar *callpat;
594 const pcre_uchar *data;
595 const pcre_uchar *next;
596 PCRE_PUCHAR pp;
597 const pcre_uchar *prev;
598 PCRE_PUCHAR saved_eptr;
599
600 recursion_info new_recursive;
601
602 BOOL cur_is_word;
603 BOOL condition;
604 BOOL prev_is_word;
605
606 #ifdef SUPPORT_UCP
607 int prop_type;
608 int prop_value;
609 int prop_fail_result;
610 int oclength;
611 pcre_uchar occhars[6];
612 #endif
613
614 int codelink;
615 int ctype;
616 int length;
617 int max;
618 int min;
619 int number;
620 int offset;
621 int op;
622 int save_capture_last;
623 int save_offset1, save_offset2, save_offset3;
624 int stacksave[REC_STACK_SAVE_MAX];
625
626 eptrblock newptrb;
627
628 /* There is a special fudge for calling match() in a way that causes it to
629 measure the size of its basic stack frame when the stack is being used for
630 recursion. The second argument (ecode) being NULL triggers this behaviour. It
631 cannot normally ever be NULL. The return is the negated value of the frame
632 size. */
633
634 if (ecode == NULL)
635 {
636 if (rdepth == 0)
637 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
638 else
639 {
640 int len = (char *)&rdepth - (char *)eptr;
641 return (len > 0)? -len : len;
642 }
643 }
644 #endif /* NO_RECURSE */
645
646 /* To save space on the stack and in the heap frame, I have doubled up on some
647 of the local variables that are used only in localised parts of the code, but
648 still need to be preserved over recursive calls of match(). These macros define
649 the alternative names that are used. */
650
651 #define allow_zero cur_is_word
652 #define cbegroup condition
653 #define code_offset codelink
654 #define condassert condition
655 #define matched_once prev_is_word
656 #define foc number
657 #define save_mark data
658
659 /* These statements are here to stop the compiler complaining about uninitialized
660 variables. */
661
662 #ifdef SUPPORT_UCP
663 prop_value = 0;
664 prop_fail_result = 0;
665 #endif
666
667
668 /* This label is used for tail recursion, which is used in a few cases even
669 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
670 used. Thanks to Ian Taylor for noticing this possibility and sending the
671 original patch. */
672
673 TAIL_RECURSE:
674
675 /* OK, now we can get on with the real code of the function. Recursive calls
676 are specified by the macro RMATCH and RRETURN is used to return. When
677 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
678 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
679 defined). However, RMATCH isn't like a function call because it's quite a
680 complicated macro. It has to be used in one particular way. This shouldn't,
681 however, impact performance when true recursion is being used. */
682
683 #ifdef SUPPORT_UTF
684 utf = md->utf; /* Local copy of the flag */
685 #else
686 utf = FALSE;
687 #endif
688
689 /* First check that we haven't called match() too many times, or that we
690 haven't exceeded the recursive call limit. */
691
692 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
693 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
694
695 /* At the start of a group with an unlimited repeat that may match an empty
696 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
697 done this way to save having to use another function argument, which would take
698 up space on the stack. See also MATCH_CONDASSERT below.
699
700 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
701 such remembered pointers, to be checked when we hit the closing ket, in order
702 to break infinite loops that match no characters. When match() is called in
703 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
704 NOT be used with tail recursion, because the memory block that is used is on
705 the stack, so a new one may be required for each match(). */
706
707 if (md->match_function_type == MATCH_CBEGROUP)
708 {
709 newptrb.epb_saved_eptr = eptr;
710 newptrb.epb_prev = eptrb;
711 eptrb = &newptrb;
712 md->match_function_type = 0;
713 }
714
715 /* Now start processing the opcodes. */
716
717 for (;;)
718 {
719 minimize = possessive = FALSE;
720 op = *ecode;
721
722 switch(op)
723 {
724 case OP_MARK:
725 md->nomatch_mark = ecode + 2;
726 md->mark = NULL; /* In case previously set by assertion */
727 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
728 eptrb, RM55);
729 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
730 md->mark == NULL) md->mark = ecode + 2;
731
732 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
733 argument, and we must check whether that argument matches this MARK's
734 argument. It is passed back in md->start_match_ptr (an overloading of that
735 variable). If it does match, we reset that variable to the current subject
736 position and return MATCH_SKIP. Otherwise, pass back the return code
737 unaltered. */
738
739 else if (rrc == MATCH_SKIP_ARG &&
740 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
741 {
742 md->start_match_ptr = eptr;
743 RRETURN(MATCH_SKIP);
744 }
745 RRETURN(rrc);
746
747 case OP_FAIL:
748 RRETURN(MATCH_NOMATCH);
749
750 /* COMMIT overrides PRUNE, SKIP, and THEN */
751
752 case OP_COMMIT:
753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 eptrb, RM52);
755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
756 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
757 rrc != MATCH_THEN)
758 RRETURN(rrc);
759 RRETURN(MATCH_COMMIT);
760
761 /* PRUNE overrides THEN */
762
763 case OP_PRUNE:
764 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
765 eptrb, RM51);
766 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
767 RRETURN(MATCH_PRUNE);
768
769 case OP_PRUNE_ARG:
770 md->nomatch_mark = ecode + 2;
771 md->mark = NULL; /* In case previously set by assertion */
772 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
773 eptrb, RM56);
774 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
775 md->mark == NULL) md->mark = ecode + 2;
776 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
777 RRETURN(MATCH_PRUNE);
778
779 /* SKIP overrides PRUNE and THEN */
780
781 case OP_SKIP:
782 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
783 eptrb, RM53);
784 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
785 RRETURN(rrc);
786 md->start_match_ptr = eptr; /* Pass back current position */
787 RRETURN(MATCH_SKIP);
788
789 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
790 nomatch_mark. There is a flag that disables this opcode when re-matching a
791 pattern that ended with a SKIP for which there was not a matching MARK. */
792
793 case OP_SKIP_ARG:
794 if (md->ignore_skip_arg)
795 {
796 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
797 break;
798 }
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM57);
801 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
802 RRETURN(rrc);
803
804 /* Pass back the current skip name by overloading md->start_match_ptr and
805 returning the special MATCH_SKIP_ARG return code. This will either be
806 caught by a matching MARK, or get to the top, where it causes a rematch
807 with the md->ignore_skip_arg flag set. */
808
809 md->start_match_ptr = ecode + 2;
810 RRETURN(MATCH_SKIP_ARG);
811
812 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
813 the branch in which it occurs can be determined. Overload the start of
814 match pointer to do this. */
815
816 case OP_THEN:
817 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
818 eptrb, RM54);
819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
820 md->start_match_ptr = ecode;
821 RRETURN(MATCH_THEN);
822
823 case OP_THEN_ARG:
824 md->nomatch_mark = ecode + 2;
825 md->mark = NULL; /* In case previously set by assertion */
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
827 md, eptrb, RM58);
828 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
829 md->mark == NULL) md->mark = ecode + 2;
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 md->start_match_ptr = ecode;
832 RRETURN(MATCH_THEN);
833
834 /* Handle an atomic group that does not contain any capturing parentheses.
835 This can be handled like an assertion. Prior to 8.13, all atomic groups
836 were handled this way. In 8.13, the code was changed as below for ONCE, so
837 that backups pass through the group and thereby reset captured values.
838 However, this uses a lot more stack, so in 8.20, atomic groups that do not
839 contain any captures generate OP_ONCE_NC, which can be handled in the old,
840 less stack intensive way.
841
842 Check the alternative branches in turn - the matching won't pass the KET
843 for this kind of subpattern. If any one branch matches, we carry on as at
844 the end of a normal bracket, leaving the subject pointer, but resetting
845 the start-of-match value in case it was changed by \K. */
846
847 case OP_ONCE_NC:
848 prev = ecode;
849 saved_eptr = eptr;
850 save_mark = md->mark;
851 do
852 {
853 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
854 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
855 {
856 mstart = md->start_match_ptr;
857 break;
858 }
859 if (rrc == MATCH_THEN)
860 {
861 next = ecode + GET(ecode,1);
862 if (md->start_match_ptr < next &&
863 (*ecode == OP_ALT || *next == OP_ALT))
864 rrc = MATCH_NOMATCH;
865 }
866
867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
868 ecode += GET(ecode,1);
869 md->mark = save_mark;
870 }
871 while (*ecode == OP_ALT);
872
873 /* If hit the end of the group (which could be repeated), fail */
874
875 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
876
877 /* Continue as from after the group, updating the offsets high water
878 mark, since extracts may have been taken. */
879
880 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
881
882 offset_top = md->end_offset_top;
883 eptr = md->end_match_ptr;
884
885 /* For a non-repeating ket, just continue at this level. This also
886 happens for a repeating ket if no characters were matched in the group.
887 This is the forcible breaking of infinite loops as implemented in Perl
888 5.005. */
889
890 if (*ecode == OP_KET || eptr == saved_eptr)
891 {
892 ecode += 1+LINK_SIZE;
893 break;
894 }
895
896 /* The repeating kets try the rest of the pattern or restart from the
897 preceding bracket, in the appropriate order. The second "call" of match()
898 uses tail recursion, to avoid using another stack frame. */
899
900 if (*ecode == OP_KETRMIN)
901 {
902 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
904 ecode = prev;
905 goto TAIL_RECURSE;
906 }
907 else /* OP_KETRMAX */
908 {
909 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
911 ecode += 1 + LINK_SIZE;
912 goto TAIL_RECURSE;
913 }
914 /* Control never gets here */
915
916 /* Handle a capturing bracket, other than those that are possessive with an
917 unlimited repeat. If there is space in the offset vector, save the current
918 subject position in the working slot at the top of the vector. We mustn't
919 change the current values of the data slot, because they may be set from a
920 previous iteration of this group, and be referred to by a reference inside
921 the group. A failure to match might occur after the group has succeeded,
922 if something later on doesn't match. For this reason, we need to restore
923 the working value and also the values of the final offsets, in case they
924 were set by a previous iteration of the same bracket.
925
926 If there isn't enough space in the offset vector, treat this as if it were
927 a non-capturing bracket. Don't worry about setting the flag for the error
928 case here; that is handled in the code for KET. */
929
930 case OP_CBRA:
931 case OP_SCBRA:
932 number = GET2(ecode, 1+LINK_SIZE);
933 offset = number << 1;
934
935 #ifdef PCRE_DEBUG
936 printf("start bracket %d\n", number);
937 printf("subject=");
938 pchars(eptr, 16, TRUE, md);
939 printf("\n");
940 #endif
941
942 if (offset < md->offset_max)
943 {
944 save_offset1 = md->offset_vector[offset];
945 save_offset2 = md->offset_vector[offset+1];
946 save_offset3 = md->offset_vector[md->offset_end - number];
947 save_capture_last = md->capture_last;
948 save_mark = md->mark;
949
950 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
951 md->offset_vector[md->offset_end - number] =
952 (int)(eptr - md->start_subject);
953
954 for (;;)
955 {
956 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
957 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
958 eptrb, RM1);
959 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
960
961 /* If we backed up to a THEN, check whether it is within the current
962 branch by comparing the address of the THEN that is passed back with
963 the end of the branch. If it is within the current branch, and the
964 branch is one of two or more alternatives (it either starts or ends
965 with OP_ALT), we have reached the limit of THEN's action, so convert
966 the return code to NOMATCH, which will cause normal backtracking to
967 happen from now on. Otherwise, THEN is passed back to an outer
968 alternative. This implements Perl's treatment of parenthesized groups,
969 where a group not containing | does not affect the current alternative,
970 that is, (X) is NOT the same as (X|(*F)). */
971
972 if (rrc == MATCH_THEN)
973 {
974 next = ecode + GET(ecode,1);
975 if (md->start_match_ptr < next &&
976 (*ecode == OP_ALT || *next == OP_ALT))
977 rrc = MATCH_NOMATCH;
978 }
979
980 /* Anything other than NOMATCH is passed back. */
981
982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
983 md->capture_last = save_capture_last;
984 ecode += GET(ecode, 1);
985 md->mark = save_mark;
986 if (*ecode != OP_ALT) break;
987 }
988
989 DPRINTF(("bracket %d failed\n", number));
990 md->offset_vector[offset] = save_offset1;
991 md->offset_vector[offset+1] = save_offset2;
992 md->offset_vector[md->offset_end - number] = save_offset3;
993
994 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
995
996 RRETURN(rrc);
997 }
998
999 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1000 as a non-capturing bracket. */
1001
1002 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1003 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1004
1005 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1006
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1009
1010 /* Non-capturing or atomic group, except for possessive with unlimited
1011 repeat and ONCE group with no captures. Loop for all the alternatives.
1012
1013 When we get to the final alternative within the brackets, we used to return
1014 the result of a recursive call to match() whatever happened so it was
1015 possible to reduce stack usage by turning this into a tail recursion,
1016 except in the case of a possibly empty group. However, now that there is
1017 the possibility of (*THEN) occurring in the final alternative, this
1018 optimization is no longer always possible.
1019
1020 We can optimize if we know there are no (*THEN)s in the pattern; at present
1021 this is the best that can be done.
1022
1023 MATCH_ONCE is returned when the end of an atomic group is successfully
1024 reached, but subsequent matching fails. It passes back up the tree (causing
1025 captured values to be reset) until the original atomic group level is
1026 reached. This is tested by comparing md->once_target with the start of the
1027 group. At this point, the return is converted into MATCH_NOMATCH so that
1028 previous backup points can be taken. */
1029
1030 case OP_ONCE:
1031 case OP_BRA:
1032 case OP_SBRA:
1033 DPRINTF(("start non-capturing bracket\n"));
1034
1035 for (;;)
1036 {
1037 if (op >= OP_SBRA || op == OP_ONCE)
1038 md->match_function_type = MATCH_CBEGROUP;
1039
1040 /* If this is not a possibly empty group, and there are no (*THEN)s in
1041 the pattern, and this is the final alternative, optimize as described
1042 above. */
1043
1044 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1045 {
1046 ecode += PRIV(OP_lengths)[*ecode];
1047 goto TAIL_RECURSE;
1048 }
1049
1050 /* In all other cases, we have to make another call to match(). */
1051
1052 save_mark = md->mark;
1053 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1054 RM2);
1055
1056 /* See comment in the code for capturing groups above about handling
1057 THEN. */
1058
1059 if (rrc == MATCH_THEN)
1060 {
1061 next = ecode + GET(ecode,1);
1062 if (md->start_match_ptr < next &&
1063 (*ecode == OP_ALT || *next == OP_ALT))
1064 rrc = MATCH_NOMATCH;
1065 }
1066
1067 if (rrc != MATCH_NOMATCH)
1068 {
1069 if (rrc == MATCH_ONCE)
1070 {
1071 const pcre_uchar *scode = ecode;
1072 if (*scode != OP_ONCE) /* If not at start, find it */
1073 {
1074 while (*scode == OP_ALT) scode += GET(scode, 1);
1075 scode -= GET(scode, 1);
1076 }
1077 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1078 }
1079 RRETURN(rrc);
1080 }
1081 ecode += GET(ecode, 1);
1082 md->mark = save_mark;
1083 if (*ecode != OP_ALT) break;
1084 }
1085
1086 RRETURN(MATCH_NOMATCH);
1087
1088 /* Handle possessive capturing brackets with an unlimited repeat. We come
1089 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1090 handled similarly to the normal case above. However, the matching is
1091 different. The end of these brackets will always be OP_KETRPOS, which
1092 returns MATCH_KETRPOS without going further in the pattern. By this means
1093 we can handle the group by iteration rather than recursion, thereby
1094 reducing the amount of stack needed. */
1095
1096 case OP_CBRAPOS:
1097 case OP_SCBRAPOS:
1098 allow_zero = FALSE;
1099
1100 POSSESSIVE_CAPTURE:
1101 number = GET2(ecode, 1+LINK_SIZE);
1102 offset = number << 1;
1103
1104 #ifdef PCRE_DEBUG
1105 printf("start possessive bracket %d\n", number);
1106 printf("subject=");
1107 pchars(eptr, 16, TRUE, md);
1108 printf("\n");
1109 #endif
1110
1111 if (offset < md->offset_max)
1112 {
1113 matched_once = FALSE;
1114 code_offset = (int)(ecode - md->start_code);
1115
1116 save_offset1 = md->offset_vector[offset];
1117 save_offset2 = md->offset_vector[offset+1];
1118 save_offset3 = md->offset_vector[md->offset_end - number];
1119 save_capture_last = md->capture_last;
1120
1121 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1122
1123 /* Each time round the loop, save the current subject position for use
1124 when the group matches. For MATCH_MATCH, the group has matched, so we
1125 restart it with a new subject starting position, remembering that we had
1126 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1127 usual. If we haven't matched any alternatives in any iteration, check to
1128 see if a previous iteration matched. If so, the group has matched;
1129 continue from afterwards. Otherwise it has failed; restore the previous
1130 capture values before returning NOMATCH. */
1131
1132 for (;;)
1133 {
1134 md->offset_vector[md->offset_end - number] =
1135 (int)(eptr - md->start_subject);
1136 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1137 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1138 eptrb, RM63);
1139 if (rrc == MATCH_KETRPOS)
1140 {
1141 offset_top = md->end_offset_top;
1142 eptr = md->end_match_ptr;
1143 ecode = md->start_code + code_offset;
1144 save_capture_last = md->capture_last;
1145 matched_once = TRUE;
1146 continue;
1147 }
1148
1149 /* See comment in the code for capturing groups above about handling
1150 THEN. */
1151
1152 if (rrc == MATCH_THEN)
1153 {
1154 next = ecode + GET(ecode,1);
1155 if (md->start_match_ptr < next &&
1156 (*ecode == OP_ALT || *next == OP_ALT))
1157 rrc = MATCH_NOMATCH;
1158 }
1159
1160 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1161 md->capture_last = save_capture_last;
1162 ecode += GET(ecode, 1);
1163 if (*ecode != OP_ALT) break;
1164 }
1165
1166 if (!matched_once)
1167 {
1168 md->offset_vector[offset] = save_offset1;
1169 md->offset_vector[offset+1] = save_offset2;
1170 md->offset_vector[md->offset_end - number] = save_offset3;
1171 }
1172
1173 if (allow_zero || matched_once)
1174 {
1175 ecode += 1 + LINK_SIZE;
1176 break;
1177 }
1178
1179 RRETURN(MATCH_NOMATCH);
1180 }
1181
1182 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1183 as a non-capturing bracket. */
1184
1185 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187
1188 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1189
1190 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1191 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1192
1193 /* Non-capturing possessive bracket with unlimited repeat. We come here
1194 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1195 without the capturing complication. It is written out separately for speed
1196 and cleanliness. */
1197
1198 case OP_BRAPOS:
1199 case OP_SBRAPOS:
1200 allow_zero = FALSE;
1201
1202 POSSESSIVE_NON_CAPTURE:
1203 matched_once = FALSE;
1204 code_offset = (int)(ecode - md->start_code);
1205
1206 for (;;)
1207 {
1208 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1209 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1210 eptrb, RM48);
1211 if (rrc == MATCH_KETRPOS)
1212 {
1213 offset_top = md->end_offset_top;
1214 eptr = md->end_match_ptr;
1215 ecode = md->start_code + code_offset;
1216 matched_once = TRUE;
1217 continue;
1218 }
1219
1220 /* See comment in the code for capturing groups above about handling
1221 THEN. */
1222
1223 if (rrc == MATCH_THEN)
1224 {
1225 next = ecode + GET(ecode,1);
1226 if (md->start_match_ptr < next &&
1227 (*ecode == OP_ALT || *next == OP_ALT))
1228 rrc = MATCH_NOMATCH;
1229 }
1230
1231 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1232 ecode += GET(ecode, 1);
1233 if (*ecode != OP_ALT) break;
1234 }
1235
1236 if (matched_once || allow_zero)
1237 {
1238 ecode += 1 + LINK_SIZE;
1239 break;
1240 }
1241 RRETURN(MATCH_NOMATCH);
1242
1243 /* Control never reaches here. */
1244
1245 /* Conditional group: compilation checked that there are no more than
1246 two branches. If the condition is false, skipping the first branch takes us
1247 past the end if there is only one branch, but that's OK because that is
1248 exactly what going to the ket would do. */
1249
1250 case OP_COND:
1251 case OP_SCOND:
1252 codelink = GET(ecode, 1);
1253
1254 /* Because of the way auto-callout works during compile, a callout item is
1255 inserted between OP_COND and an assertion condition. */
1256
1257 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1258 {
1259 if (PUBL(callout) != NULL)
1260 {
1261 PUBL(callout_block) cb;
1262 cb.version = 2; /* Version 2 of the callout block */
1263 cb.callout_number = ecode[LINK_SIZE+2];
1264 cb.offset_vector = md->offset_vector;
1265 #ifdef COMPILE_PCRE8
1266 cb.subject = (PCRE_SPTR)md->start_subject;
1267 #else
1268 cb.subject = (PCRE_SPTR16)md->start_subject;
1269 #endif
1270 cb.subject_length = (int)(md->end_subject - md->start_subject);
1271 cb.start_match = (int)(mstart - md->start_subject);
1272 cb.current_position = (int)(eptr - md->start_subject);
1273 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1274 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1275 cb.capture_top = offset_top/2;
1276 cb.capture_last = md->capture_last;
1277 cb.callout_data = md->callout_data;
1278 cb.mark = md->nomatch_mark;
1279 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1280 if (rrc < 0) RRETURN(rrc);
1281 }
1282 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1283 }
1284
1285 condcode = ecode[LINK_SIZE+1];
1286
1287 /* Now see what the actual condition is */
1288
1289 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1290 {
1291 if (md->recursive == NULL) /* Not recursing => FALSE */
1292 {
1293 condition = FALSE;
1294 ecode += GET(ecode, 1);
1295 }
1296 else
1297 {
1298 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1299 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1300
1301 /* If the test is for recursion into a specific subpattern, and it is
1302 false, but the test was set up by name, scan the table to see if the
1303 name refers to any other numbers, and test them. The condition is true
1304 if any one is set. */
1305
1306 if (!condition && condcode == OP_NRREF)
1307 {
1308 pcre_uchar *slotA = md->name_table;
1309 for (i = 0; i < md->name_count; i++)
1310 {
1311 if (GET2(slotA, 0) == recno) break;
1312 slotA += md->name_entry_size;
1313 }
1314
1315 /* Found a name for the number - there can be only one; duplicate
1316 names for different numbers are allowed, but not vice versa. First
1317 scan down for duplicates. */
1318
1319 if (i < md->name_count)
1320 {
1321 pcre_uchar *slotB = slotA;
1322 while (slotB > md->name_table)
1323 {
1324 slotB -= md->name_entry_size;
1325 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1326 {
1327 condition = GET2(slotB, 0) == md->recursive->group_num;
1328 if (condition) break;
1329 }
1330 else break;
1331 }
1332
1333 /* Scan up for duplicates */
1334
1335 if (!condition)
1336 {
1337 slotB = slotA;
1338 for (i++; i < md->name_count; i++)
1339 {
1340 slotB += md->name_entry_size;
1341 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1342 {
1343 condition = GET2(slotB, 0) == md->recursive->group_num;
1344 if (condition) break;
1345 }
1346 else break;
1347 }
1348 }
1349 }
1350 }
1351
1352 /* Choose branch according to the condition */
1353
1354 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1355 }
1356 }
1357
1358 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1359 {
1360 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1361 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1362
1363 /* If the numbered capture is unset, but the reference was by name,
1364 scan the table to see if the name refers to any other numbers, and test
1365 them. The condition is true if any one is set. This is tediously similar
1366 to the code above, but not close enough to try to amalgamate. */
1367
1368 if (!condition && condcode == OP_NCREF)
1369 {
1370 int refno = offset >> 1;
1371 pcre_uchar *slotA = md->name_table;
1372
1373 for (i = 0; i < md->name_count; i++)
1374 {
1375 if (GET2(slotA, 0) == refno) break;
1376 slotA += md->name_entry_size;
1377 }
1378
1379 /* Found a name for the number - there can be only one; duplicate names
1380 for different numbers are allowed, but not vice versa. First scan down
1381 for duplicates. */
1382
1383 if (i < md->name_count)
1384 {
1385 pcre_uchar *slotB = slotA;
1386 while (slotB > md->name_table)
1387 {
1388 slotB -= md->name_entry_size;
1389 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1390 {
1391 offset = GET2(slotB, 0) << 1;
1392 condition = offset < offset_top &&
1393 md->offset_vector[offset] >= 0;
1394 if (condition) break;
1395 }
1396 else break;
1397 }
1398
1399 /* Scan up for duplicates */
1400
1401 if (!condition)
1402 {
1403 slotB = slotA;
1404 for (i++; i < md->name_count; i++)
1405 {
1406 slotB += md->name_entry_size;
1407 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1408 {
1409 offset = GET2(slotB, 0) << 1;
1410 condition = offset < offset_top &&
1411 md->offset_vector[offset] >= 0;
1412 if (condition) break;
1413 }
1414 else break;
1415 }
1416 }
1417 }
1418 }
1419
1420 /* Choose branch according to the condition */
1421
1422 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1423 }
1424
1425 else if (condcode == OP_DEF) /* DEFINE - always false */
1426 {
1427 condition = FALSE;
1428 ecode += GET(ecode, 1);
1429 }
1430
1431 /* The condition is an assertion. Call match() to evaluate it - setting
1432 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1433 an assertion. */
1434
1435 else
1436 {
1437 md->match_function_type = MATCH_CONDASSERT;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1439 if (rrc == MATCH_MATCH)
1440 {
1441 if (md->end_offset_top > offset_top)
1442 offset_top = md->end_offset_top; /* Captures may have happened */
1443 condition = TRUE;
1444 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1445 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1446 }
1447
1448 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1449 assertion; it is therefore treated as NOMATCH. */
1450
1451 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1452 {
1453 RRETURN(rrc); /* Need braces because of following else */
1454 }
1455 else
1456 {
1457 condition = FALSE;
1458 ecode += codelink;
1459 }
1460 }
1461
1462 /* We are now at the branch that is to be obeyed. As there is only one, can
1463 use tail recursion to avoid using another stack frame, except when there is
1464 unlimited repeat of a possibly empty group. In the latter case, a recursive
1465 call to match() is always required, unless the second alternative doesn't
1466 exist, in which case we can just plough on. Note that, for compatibility
1467 with Perl, the | in a conditional group is NOT treated as creating two
1468 alternatives. If a THEN is encountered in the branch, it propagates out to
1469 the enclosing alternative (unless nested in a deeper set of alternatives,
1470 of course). */
1471
1472 if (condition || *ecode == OP_ALT)
1473 {
1474 if (op != OP_SCOND)
1475 {
1476 ecode += 1 + LINK_SIZE;
1477 goto TAIL_RECURSE;
1478 }
1479
1480 md->match_function_type = MATCH_CBEGROUP;
1481 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1482 RRETURN(rrc);
1483 }
1484
1485 /* Condition false & no alternative; continue after the group. */
1486
1487 else
1488 {
1489 ecode += 1 + LINK_SIZE;
1490 }
1491 break;
1492
1493
1494 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1495 to close any currently open capturing brackets. */
1496
1497 case OP_CLOSE:
1498 number = GET2(ecode, 1);
1499 offset = number << 1;
1500
1501 #ifdef PCRE_DEBUG
1502 printf("end bracket %d at *ACCEPT", number);
1503 printf("\n");
1504 #endif
1505
1506 md->capture_last = number;
1507 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1508 {
1509 md->offset_vector[offset] =
1510 md->offset_vector[md->offset_end - number];
1511 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1512 if (offset_top <= offset) offset_top = offset + 2;
1513 }
1514 ecode += 1 + IMM2_SIZE;
1515 break;
1516
1517
1518 /* End of the pattern, either real or forced. */
1519
1520 case OP_END:
1521 case OP_ACCEPT:
1522 case OP_ASSERT_ACCEPT:
1523
1524 /* If we have matched an empty string, fail if not in an assertion and not
1525 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1526 is set and we have matched at the start of the subject. In both cases,
1527 backtracking will then try other alternatives, if any. */
1528
1529 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1530 md->recursive == NULL &&
1531 (md->notempty ||
1532 (md->notempty_atstart &&
1533 mstart == md->start_subject + md->start_offset)))
1534 RRETURN(MATCH_NOMATCH);
1535
1536 /* Otherwise, we have a match. */
1537
1538 md->end_match_ptr = eptr; /* Record where we ended */
1539 md->end_offset_top = offset_top; /* and how many extracts were taken */
1540 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1541
1542 /* For some reason, the macros don't work properly if an expression is
1543 given as the argument to RRETURN when the heap is in use. */
1544
1545 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1546 RRETURN(rrc);
1547
1548 /* Assertion brackets. Check the alternative branches in turn - the
1549 matching won't pass the KET for an assertion. If any one branch matches,
1550 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1551 start of each branch to move the current point backwards, so the code at
1552 this level is identical to the lookahead case. When the assertion is part
1553 of a condition, we want to return immediately afterwards. The caller of
1554 this incarnation of the match() function will have set MATCH_CONDASSERT in
1555 md->match_function_type, and one of these opcodes will be the first opcode
1556 that is processed. We use a local variable that is preserved over calls to
1557 match() to remember this case. */
1558
1559 case OP_ASSERT:
1560 case OP_ASSERTBACK:
1561 save_mark = md->mark;
1562 if (md->match_function_type == MATCH_CONDASSERT)
1563 {
1564 condassert = TRUE;
1565 md->match_function_type = 0;
1566 }
1567 else condassert = FALSE;
1568
1569 do
1570 {
1571 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1572 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1573 {
1574 mstart = md->start_match_ptr; /* In case \K reset it */
1575 break;
1576 }
1577 md->mark = save_mark;
1578
1579 /* A COMMIT failure must fail the entire assertion, without trying any
1580 subsequent branches. */
1581
1582 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1583
1584 /* PCRE does not allow THEN to escape beyond an assertion; it
1585 is treated as NOMATCH. */
1586
1587 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1588 ecode += GET(ecode, 1);
1589 }
1590 while (*ecode == OP_ALT);
1591
1592 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1593
1594 /* If checking an assertion for a condition, return MATCH_MATCH. */
1595
1596 if (condassert) RRETURN(MATCH_MATCH);
1597
1598 /* Continue from after the assertion, updating the offsets high water
1599 mark, since extracts may have been taken during the assertion. */
1600
1601 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1602 ecode += 1 + LINK_SIZE;
1603 offset_top = md->end_offset_top;
1604 continue;
1605
1606 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1607 PRUNE, or COMMIT means we must assume failure without checking subsequent
1608 branches. */
1609
1610 case OP_ASSERT_NOT:
1611 case OP_ASSERTBACK_NOT:
1612 save_mark = md->mark;
1613 if (md->match_function_type == MATCH_CONDASSERT)
1614 {
1615 condassert = TRUE;
1616 md->match_function_type = 0;
1617 }
1618 else condassert = FALSE;
1619
1620 do
1621 {
1622 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1623 md->mark = save_mark;
1624 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1625 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1626 {
1627 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1628 break;
1629 }
1630
1631 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1632 as NOMATCH. */
1633
1634 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1635 ecode += GET(ecode,1);
1636 }
1637 while (*ecode == OP_ALT);
1638
1639 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1640
1641 ecode += 1 + LINK_SIZE;
1642 continue;
1643
1644 /* Move the subject pointer back. This occurs only at the start of
1645 each branch of a lookbehind assertion. If we are too close to the start to
1646 move back, this match function fails. When working with UTF-8 we move
1647 back a number of characters, not bytes. */
1648
1649 case OP_REVERSE:
1650 #ifdef SUPPORT_UTF
1651 if (utf)
1652 {
1653 i = GET(ecode, 1);
1654 while (i-- > 0)
1655 {
1656 eptr--;
1657 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1658 BACKCHAR(eptr);
1659 }
1660 }
1661 else
1662 #endif
1663
1664 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1665
1666 {
1667 eptr -= GET(ecode, 1);
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 }
1670
1671 /* Save the earliest consulted character, then skip to next op code */
1672
1673 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1674 ecode += 1 + LINK_SIZE;
1675 break;
1676
1677 /* The callout item calls an external function, if one is provided, passing
1678 details of the match so far. This is mainly for debugging, though the
1679 function is able to force a failure. */
1680
1681 case OP_CALLOUT:
1682 if (PUBL(callout) != NULL)
1683 {
1684 PUBL(callout_block) cb;
1685 cb.version = 2; /* Version 2 of the callout block */
1686 cb.callout_number = ecode[1];
1687 cb.offset_vector = md->offset_vector;
1688 #ifdef COMPILE_PCRE8
1689 cb.subject = (PCRE_SPTR)md->start_subject;
1690 #else
1691 cb.subject = (PCRE_SPTR16)md->start_subject;
1692 #endif
1693 cb.subject_length = (int)(md->end_subject - md->start_subject);
1694 cb.start_match = (int)(mstart - md->start_subject);
1695 cb.current_position = (int)(eptr - md->start_subject);
1696 cb.pattern_position = GET(ecode, 2);
1697 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1698 cb.capture_top = offset_top/2;
1699 cb.capture_last = md->capture_last;
1700 cb.callout_data = md->callout_data;
1701 cb.mark = md->nomatch_mark;
1702 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1703 if (rrc < 0) RRETURN(rrc);
1704 }
1705 ecode += 2 + 2*LINK_SIZE;
1706 break;
1707
1708 /* Recursion either matches the current regex, or some subexpression. The
1709 offset data is the offset to the starting bracket from the start of the
1710 whole pattern. (This is so that it works from duplicated subpatterns.)
1711
1712 The state of the capturing groups is preserved over recursion, and
1713 re-instated afterwards. We don't know how many are started and not yet
1714 finished (offset_top records the completed total) so we just have to save
1715 all the potential data. There may be up to 65535 such values, which is too
1716 large to put on the stack, but using malloc for small numbers seems
1717 expensive. As a compromise, the stack is used when there are no more than
1718 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1719
1720 There are also other values that have to be saved. We use a chained
1721 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1722 for the original version of this logic. It has, however, been hacked around
1723 a lot, so he is not to blame for the current way it works. */
1724
1725 case OP_RECURSE:
1726 {
1727 recursion_info *ri;
1728 int recno;
1729
1730 callpat = md->start_code + GET(ecode, 1);
1731 recno = (callpat == md->start_code)? 0 :
1732 GET2(callpat, 1 + LINK_SIZE);
1733
1734 /* Check for repeating a recursion without advancing the subject pointer.
1735 This should catch convoluted mutual recursions. (Some simple cases are
1736 caught at compile time.) */
1737
1738 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1739 if (recno == ri->group_num && eptr == ri->subject_position)
1740 RRETURN(PCRE_ERROR_RECURSELOOP);
1741
1742 /* Add to "recursing stack" */
1743
1744 new_recursive.group_num = recno;
1745 new_recursive.subject_position = eptr;
1746 new_recursive.prevrec = md->recursive;
1747 md->recursive = &new_recursive;
1748
1749 /* Where to continue from afterwards */
1750
1751 ecode += 1 + LINK_SIZE;
1752
1753 /* Now save the offset data */
1754
1755 new_recursive.saved_max = md->offset_end;
1756 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1757 new_recursive.offset_save = stacksave;
1758 else
1759 {
1760 new_recursive.offset_save =
1761 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1762 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1763 }
1764 memcpy(new_recursive.offset_save, md->offset_vector,
1765 new_recursive.saved_max * sizeof(int));
1766
1767 /* OK, now we can do the recursion. After processing each alternative,
1768 restore the offset data. If there were nested recursions, md->recursive
1769 might be changed, so reset it before looping. */
1770
1771 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1772 cbegroup = (*callpat >= OP_SBRA);
1773 do
1774 {
1775 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1776 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1777 md, eptrb, RM6);
1778 memcpy(md->offset_vector, new_recursive.offset_save,
1779 new_recursive.saved_max * sizeof(int));
1780 md->recursive = new_recursive.prevrec;
1781 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1782 {
1783 DPRINTF(("Recursion matched\n"));
1784 if (new_recursive.offset_save != stacksave)
1785 (PUBL(free))(new_recursive.offset_save);
1786
1787 /* Set where we got to in the subject, and reset the start in case
1788 it was changed by \K. This *is* propagated back out of a recursion,
1789 for Perl compatibility. */
1790
1791 eptr = md->end_match_ptr;
1792 mstart = md->start_match_ptr;
1793 goto RECURSION_MATCHED; /* Exit loop; end processing */
1794 }
1795
1796 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1797 is treated as NOMATCH. */
1798
1799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1800 rrc != MATCH_COMMIT)
1801 {
1802 DPRINTF(("Recursion gave error %d\n", rrc));
1803 if (new_recursive.offset_save != stacksave)
1804 (PUBL(free))(new_recursive.offset_save);
1805 RRETURN(rrc);
1806 }
1807
1808 md->recursive = &new_recursive;
1809 callpat += GET(callpat, 1);
1810 }
1811 while (*callpat == OP_ALT);
1812
1813 DPRINTF(("Recursion didn't match\n"));
1814 md->recursive = new_recursive.prevrec;
1815 if (new_recursive.offset_save != stacksave)
1816 (PUBL(free))(new_recursive.offset_save);
1817 RRETURN(MATCH_NOMATCH);
1818 }
1819
1820 RECURSION_MATCHED:
1821 break;
1822
1823 /* An alternation is the end of a branch; scan along to find the end of the
1824 bracketed group and go to there. */
1825
1826 case OP_ALT:
1827 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1828 break;
1829
1830 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1831 indicating that it may occur zero times. It may repeat infinitely, or not
1832 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1833 with fixed upper repeat limits are compiled as a number of copies, with the
1834 optional ones preceded by BRAZERO or BRAMINZERO. */
1835
1836 case OP_BRAZERO:
1837 next = ecode + 1;
1838 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840 do next += GET(next, 1); while (*next == OP_ALT);
1841 ecode = next + 1 + LINK_SIZE;
1842 break;
1843
1844 case OP_BRAMINZERO:
1845 next = ecode + 1;
1846 do next += GET(next, 1); while (*next == OP_ALT);
1847 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1849 ecode++;
1850 break;
1851
1852 case OP_SKIPZERO:
1853 next = ecode+1;
1854 do next += GET(next,1); while (*next == OP_ALT);
1855 ecode = next + 1 + LINK_SIZE;
1856 break;
1857
1858 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1859 here; just jump to the group, with allow_zero set TRUE. */
1860
1861 case OP_BRAPOSZERO:
1862 op = *(++ecode);
1863 allow_zero = TRUE;
1864 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1865 goto POSSESSIVE_NON_CAPTURE;
1866
1867 /* End of a group, repeated or non-repeating. */
1868
1869 case OP_KET:
1870 case OP_KETRMIN:
1871 case OP_KETRMAX:
1872 case OP_KETRPOS:
1873 prev = ecode - GET(ecode, 1);
1874
1875 /* If this was a group that remembered the subject start, in order to break
1876 infinite repeats of empty string matches, retrieve the subject start from
1877 the chain. Otherwise, set it NULL. */
1878
1879 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1880 {
1881 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1882 eptrb = eptrb->epb_prev; /* Backup to previous group */
1883 }
1884 else saved_eptr = NULL;
1885
1886 /* If we are at the end of an assertion group or a non-capturing atomic
1887 group, stop matching and return MATCH_MATCH, but record the current high
1888 water mark for use by positive assertions. We also need to record the match
1889 start in case it was changed by \K. */
1890
1891 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1892 *prev == OP_ONCE_NC)
1893 {
1894 md->end_match_ptr = eptr; /* For ONCE_NC */
1895 md->end_offset_top = offset_top;
1896 md->start_match_ptr = mstart;
1897 RRETURN(MATCH_MATCH); /* Sets md->mark */
1898 }
1899
1900 /* For capturing groups we have to check the group number back at the start
1901 and if necessary complete handling an extraction by setting the offsets and
1902 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1903 into group 0, so it won't be picked up here. Instead, we catch it when the
1904 OP_END is reached. Other recursion is handled here. We just have to record
1905 the current subject position and start match pointer and give a MATCH
1906 return. */
1907
1908 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1909 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1910 {
1911 number = GET2(prev, 1+LINK_SIZE);
1912 offset = number << 1;
1913
1914 #ifdef PCRE_DEBUG
1915 printf("end bracket %d", number);
1916 printf("\n");
1917 #endif
1918
1919 /* Handle a recursively called group. */
1920
1921 if (md->recursive != NULL && md->recursive->group_num == number)
1922 {
1923 md->end_match_ptr = eptr;
1924 md->start_match_ptr = mstart;
1925 RRETURN(MATCH_MATCH);
1926 }
1927
1928 /* Deal with capturing */
1929
1930 md->capture_last = number;
1931 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1932 {
1933 /* If offset is greater than offset_top, it means that we are
1934 "skipping" a capturing group, and that group's offsets must be marked
1935 unset. In earlier versions of PCRE, all the offsets were unset at the
1936 start of matching, but this doesn't work because atomic groups and
1937 assertions can cause a value to be set that should later be unset.
1938 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1939 part of the atomic group, but this is not on the final matching path,
1940 so must be unset when 2 is set. (If there is no group 2, there is no
1941 problem, because offset_top will then be 2, indicating no capture.) */
1942
1943 if (offset > offset_top)
1944 {
1945 register int *iptr = md->offset_vector + offset_top;
1946 register int *iend = md->offset_vector + offset;
1947 while (iptr < iend) *iptr++ = -1;
1948 }
1949
1950 /* Now make the extraction */
1951
1952 md->offset_vector[offset] =
1953 md->offset_vector[md->offset_end - number];
1954 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1955 if (offset_top <= offset) offset_top = offset + 2;
1956 }
1957 }
1958
1959 /* For an ordinary non-repeating ket, just continue at this level. This
1960 also happens for a repeating ket if no characters were matched in the
1961 group. This is the forcible breaking of infinite loops as implemented in
1962 Perl 5.005. For a non-repeating atomic group that includes captures,
1963 establish a backup point by processing the rest of the pattern at a lower
1964 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1965 original OP_ONCE level, thereby bypassing intermediate backup points, but
1966 resetting any captures that happened along the way. */
1967
1968 if (*ecode == OP_KET || eptr == saved_eptr)
1969 {
1970 if (*prev == OP_ONCE)
1971 {
1972 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1975 RRETURN(MATCH_ONCE);
1976 }
1977 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1978 break;
1979 }
1980
1981 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1982 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1983 at a time from the outer level, thus saving stack. */
1984
1985 if (*ecode == OP_KETRPOS)
1986 {
1987 md->end_match_ptr = eptr;
1988 md->end_offset_top = offset_top;
1989 RRETURN(MATCH_KETRPOS);
1990 }
1991
1992 /* The normal repeating kets try the rest of the pattern or restart from
1993 the preceding bracket, in the appropriate order. In the second case, we can
1994 use tail recursion to avoid using another stack frame, unless we have an
1995 atomic group or an unlimited repeat of a group that can match an empty
1996 string. */
1997
1998 if (*ecode == OP_KETRMIN)
1999 {
2000 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2002 if (*prev == OP_ONCE)
2003 {
2004 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2006 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2007 RRETURN(MATCH_ONCE);
2008 }
2009 if (*prev >= OP_SBRA) /* Could match an empty string */
2010 {
2011 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2012 RRETURN(rrc);
2013 }
2014 ecode = prev;
2015 goto TAIL_RECURSE;
2016 }
2017 else /* OP_KETRMAX */
2018 {
2019 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2020 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (*prev == OP_ONCE)
2023 {
2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev;
2027 RRETURN(MATCH_ONCE);
2028 }
2029 ecode += 1 + LINK_SIZE;
2030 goto TAIL_RECURSE;
2031 }
2032 /* Control never gets here */
2033
2034 /* Not multiline mode: start of subject assertion, unless notbol. */
2035
2036 case OP_CIRC:
2037 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2038
2039 /* Start of subject assertion */
2040
2041 case OP_SOD:
2042 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2043 ecode++;
2044 break;
2045
2046 /* Multiline mode: start of subject unless notbol, or after any newline. */
2047
2048 case OP_CIRCM:
2049 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2050 if (eptr != md->start_subject &&
2051 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2052 RRETURN(MATCH_NOMATCH);
2053 ecode++;
2054 break;
2055
2056 /* Start of match assertion */
2057
2058 case OP_SOM:
2059 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2060 ecode++;
2061 break;
2062
2063 /* Reset the start of match point */
2064
2065 case OP_SET_SOM:
2066 mstart = eptr;
2067 ecode++;
2068 break;
2069
2070 /* Multiline mode: assert before any newline, or before end of subject
2071 unless noteol is set. */
2072
2073 case OP_DOLLM:
2074 if (eptr < md->end_subject)
2075 {
2076 if (!IS_NEWLINE(eptr))
2077 {
2078 if (md->partial != 0 &&
2079 eptr + 1 >= md->end_subject &&
2080 NLBLOCK->nltype == NLTYPE_FIXED &&
2081 NLBLOCK->nllen == 2 &&
2082 *eptr == NLBLOCK->nl[0])
2083 {
2084 md->hitend = TRUE;
2085 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2086 }
2087 RRETURN(MATCH_NOMATCH);
2088 }
2089 }
2090 else
2091 {
2092 if (md->noteol) RRETURN(MATCH_NOMATCH);
2093 SCHECK_PARTIAL();
2094 }
2095 ecode++;
2096 break;
2097
2098 /* Not multiline mode: assert before a terminating newline or before end of
2099 subject unless noteol is set. */
2100
2101 case OP_DOLL:
2102 if (md->noteol) RRETURN(MATCH_NOMATCH);
2103 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2104
2105 /* ... else fall through for endonly */
2106
2107 /* End of subject assertion (\z) */
2108
2109 case OP_EOD:
2110 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2111 SCHECK_PARTIAL();
2112 ecode++;
2113 break;
2114
2115 /* End of subject or ending \n assertion (\Z) */
2116
2117 case OP_EODN:
2118 ASSERT_NL_OR_EOS:
2119 if (eptr < md->end_subject &&
2120 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2121 {
2122 if (md->partial != 0 &&
2123 eptr + 1 >= md->end_subject &&
2124 NLBLOCK->nltype == NLTYPE_FIXED &&
2125 NLBLOCK->nllen == 2 &&
2126 *eptr == NLBLOCK->nl[0])
2127 {
2128 md->hitend = TRUE;
2129 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2130 }
2131 RRETURN(MATCH_NOMATCH);
2132 }
2133
2134 /* Either at end of string or \n before end. */
2135
2136 SCHECK_PARTIAL();
2137 ecode++;
2138 break;
2139
2140 /* Word boundary assertions */
2141
2142 case OP_NOT_WORD_BOUNDARY:
2143 case OP_WORD_BOUNDARY:
2144 {
2145
2146 /* Find out if the previous and current characters are "word" characters.
2147 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2148 be "non-word" characters. Remember the earliest consulted character for
2149 partial matching. */
2150
2151 #ifdef SUPPORT_UTF
2152 if (utf)
2153 {
2154 /* Get status of previous character */
2155
2156 if (eptr == md->start_subject) prev_is_word = FALSE; else
2157 {
2158 PCRE_PUCHAR lastptr = eptr - 1;
2159 BACKCHAR(lastptr);
2160 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2161 GETCHAR(c, lastptr);
2162 #ifdef SUPPORT_UCP
2163 if (md->use_ucp)
2164 {
2165 if (c == '_') prev_is_word = TRUE; else
2166 {
2167 int cat = UCD_CATEGORY(c);
2168 prev_is_word = (cat == ucp_L || cat == ucp_N);
2169 }
2170 }
2171 else
2172 #endif
2173 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2174 }
2175
2176 /* Get status of next character */
2177
2178 if (eptr >= md->end_subject)
2179 {
2180 SCHECK_PARTIAL();
2181 cur_is_word = FALSE;
2182 }
2183 else
2184 {
2185 GETCHAR(c, eptr);
2186 #ifdef SUPPORT_UCP
2187 if (md->use_ucp)
2188 {
2189 if (c == '_') cur_is_word = TRUE; else
2190 {
2191 int cat = UCD_CATEGORY(c);
2192 cur_is_word = (cat == ucp_L || cat == ucp_N);
2193 }
2194 }
2195 else
2196 #endif
2197 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2198 }
2199 }
2200 else
2201 #endif
2202
2203 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2204 consistency with the behaviour of \w we do use it in this case. */
2205
2206 {
2207 /* Get status of previous character */
2208
2209 if (eptr == md->start_subject) prev_is_word = FALSE; else
2210 {
2211 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2212 #ifdef SUPPORT_UCP
2213 if (md->use_ucp)
2214 {
2215 c = eptr[-1];
2216 if (c == '_') prev_is_word = TRUE; else
2217 {
2218 int cat = UCD_CATEGORY(c);
2219 prev_is_word = (cat == ucp_L || cat == ucp_N);
2220 }
2221 }
2222 else
2223 #endif
2224 prev_is_word = MAX_255(eptr[-1])
2225 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2226 }
2227
2228 /* Get status of next character */
2229
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 cur_is_word = FALSE;
2234 }
2235 else
2236 #ifdef SUPPORT_UCP
2237 if (md->use_ucp)
2238 {
2239 c = *eptr;
2240 if (c == '_') cur_is_word = TRUE; else
2241 {
2242 int cat = UCD_CATEGORY(c);
2243 cur_is_word = (cat == ucp_L || cat == ucp_N);
2244 }
2245 }
2246 else
2247 #endif
2248 cur_is_word = MAX_255(*eptr)
2249 && ((md->ctypes[*eptr] & ctype_word) != 0);
2250 }
2251
2252 /* Now see if the situation is what we want */
2253
2254 if ((*ecode++ == OP_WORD_BOUNDARY)?
2255 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2256 RRETURN(MATCH_NOMATCH);
2257 }
2258 break;
2259
2260 /* Match any single character type except newline; have to take care with
2261 CRLF newlines and partial matching. */
2262
2263 case OP_ANY:
2264 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2265 if (md->partial != 0 &&
2266 eptr + 1 >= md->end_subject &&
2267 NLBLOCK->nltype == NLTYPE_FIXED &&
2268 NLBLOCK->nllen == 2 &&
2269 *eptr == NLBLOCK->nl[0])
2270 {
2271 md->hitend = TRUE;
2272 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2273 }
2274
2275 /* Fall through */
2276
2277 /* Match any single character whatsoever. */
2278
2279 case OP_ALLANY:
2280 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2281 { /* not be updated before SCHECK_PARTIAL. */
2282 SCHECK_PARTIAL();
2283 RRETURN(MATCH_NOMATCH);
2284 }
2285 eptr++;
2286 #ifdef SUPPORT_UTF
2287 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2288 #endif
2289 ecode++;
2290 break;
2291
2292 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2293 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2294
2295 case OP_ANYBYTE:
2296 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2297 { /* not be updated before SCHECK_PARTIAL. */
2298 SCHECK_PARTIAL();
2299 RRETURN(MATCH_NOMATCH);
2300 }
2301 eptr++;
2302 ecode++;
2303 break;
2304
2305 case OP_NOT_DIGIT:
2306 if (eptr >= md->end_subject)
2307 {
2308 SCHECK_PARTIAL();
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 GETCHARINCTEST(c, eptr);
2312 if (
2313 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2314 c < 256 &&
2315 #endif
2316 (md->ctypes[c] & ctype_digit) != 0
2317 )
2318 RRETURN(MATCH_NOMATCH);
2319 ecode++;
2320 break;
2321
2322 case OP_DIGIT:
2323 if (eptr >= md->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 RRETURN(MATCH_NOMATCH);
2327 }
2328 GETCHARINCTEST(c, eptr);
2329 if (
2330 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2331 c > 255 ||
2332 #endif
2333 (md->ctypes[c] & ctype_digit) == 0
2334 )
2335 RRETURN(MATCH_NOMATCH);
2336 ecode++;
2337 break;
2338
2339 case OP_NOT_WHITESPACE:
2340 if (eptr >= md->end_subject)
2341 {
2342 SCHECK_PARTIAL();
2343 RRETURN(MATCH_NOMATCH);
2344 }
2345 GETCHARINCTEST(c, eptr);
2346 if (
2347 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2348 c < 256 &&
2349 #endif
2350 (md->ctypes[c] & ctype_space) != 0
2351 )
2352 RRETURN(MATCH_NOMATCH);
2353 ecode++;
2354 break;
2355
2356 case OP_WHITESPACE:
2357 if (eptr >= md->end_subject)
2358 {
2359 SCHECK_PARTIAL();
2360 RRETURN(MATCH_NOMATCH);
2361 }
2362 GETCHARINCTEST(c, eptr);
2363 if (
2364 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2365 c > 255 ||
2366 #endif
2367 (md->ctypes[c] & ctype_space) == 0
2368 )
2369 RRETURN(MATCH_NOMATCH);
2370 ecode++;
2371 break;
2372
2373 case OP_NOT_WORDCHAR:
2374 if (eptr >= md->end_subject)
2375 {
2376 SCHECK_PARTIAL();
2377 RRETURN(MATCH_NOMATCH);
2378 }
2379 GETCHARINCTEST(c, eptr);
2380 if (
2381 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2382 c < 256 &&
2383 #endif
2384 (md->ctypes[c] & ctype_word) != 0
2385 )
2386 RRETURN(MATCH_NOMATCH);
2387 ecode++;
2388 break;
2389
2390 case OP_WORDCHAR:
2391 if (eptr >= md->end_subject)
2392 {
2393 SCHECK_PARTIAL();
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 GETCHARINCTEST(c, eptr);
2397 if (
2398 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2399 c > 255 ||
2400 #endif
2401 (md->ctypes[c] & ctype_word) == 0
2402 )
2403 RRETURN(MATCH_NOMATCH);
2404 ecode++;
2405 break;
2406
2407 case OP_ANYNL:
2408 if (eptr >= md->end_subject)
2409 {
2410 SCHECK_PARTIAL();
2411 RRETURN(MATCH_NOMATCH);
2412 }
2413 GETCHARINCTEST(c, eptr);
2414 switch(c)
2415 {
2416 default: RRETURN(MATCH_NOMATCH);
2417
2418 case 0x000d:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 }
2423 else if (*eptr == 0x0a) eptr++;
2424 break;
2425
2426 case 0x000a:
2427 break;
2428
2429 case 0x000b:
2430 case 0x000c:
2431 case 0x0085:
2432 case 0x2028:
2433 case 0x2029:
2434 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2435 break;
2436 }
2437 ecode++;
2438 break;
2439
2440 case OP_NOT_HSPACE:
2441 if (eptr >= md->end_subject)
2442 {
2443 SCHECK_PARTIAL();
2444 RRETURN(MATCH_NOMATCH);
2445 }
2446 GETCHARINCTEST(c, eptr);
2447 switch(c)
2448 {
2449 default: break;
2450 case 0x09: /* HT */
2451 case 0x20: /* SPACE */
2452 case 0xa0: /* NBSP */
2453 case 0x1680: /* OGHAM SPACE MARK */
2454 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2455 case 0x2000: /* EN QUAD */
2456 case 0x2001: /* EM QUAD */
2457 case 0x2002: /* EN SPACE */
2458 case 0x2003: /* EM SPACE */
2459 case 0x2004: /* THREE-PER-EM SPACE */
2460 case 0x2005: /* FOUR-PER-EM SPACE */
2461 case 0x2006: /* SIX-PER-EM SPACE */
2462 case 0x2007: /* FIGURE SPACE */
2463 case 0x2008: /* PUNCTUATION SPACE */
2464 case 0x2009: /* THIN SPACE */
2465 case 0x200A: /* HAIR SPACE */
2466 case 0x202f: /* NARROW NO-BREAK SPACE */
2467 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2468 case 0x3000: /* IDEOGRAPHIC SPACE */
2469 RRETURN(MATCH_NOMATCH);
2470 }
2471 ecode++;
2472 break;
2473
2474 case OP_HSPACE:
2475 if (eptr >= md->end_subject)
2476 {
2477 SCHECK_PARTIAL();
2478 RRETURN(MATCH_NOMATCH);
2479 }
2480 GETCHARINCTEST(c, eptr);
2481 switch(c)
2482 {
2483 default: RRETURN(MATCH_NOMATCH);
2484 case 0x09: /* HT */
2485 case 0x20: /* SPACE */
2486 case 0xa0: /* NBSP */
2487 case 0x1680: /* OGHAM SPACE MARK */
2488 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2489 case 0x2000: /* EN QUAD */
2490 case 0x2001: /* EM QUAD */
2491 case 0x2002: /* EN SPACE */
2492 case 0x2003: /* EM SPACE */
2493 case 0x2004: /* THREE-PER-EM SPACE */
2494 case 0x2005: /* FOUR-PER-EM SPACE */
2495 case 0x2006: /* SIX-PER-EM SPACE */
2496 case 0x2007: /* FIGURE SPACE */
2497 case 0x2008: /* PUNCTUATION SPACE */
2498 case 0x2009: /* THIN SPACE */
2499 case 0x200A: /* HAIR SPACE */
2500 case 0x202f: /* NARROW NO-BREAK SPACE */
2501 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2502 case 0x3000: /* IDEOGRAPHIC SPACE */
2503 break;
2504 }
2505 ecode++;
2506 break;
2507
2508 case OP_NOT_VSPACE:
2509 if (eptr >= md->end_subject)
2510 {
2511 SCHECK_PARTIAL();
2512 RRETURN(MATCH_NOMATCH);
2513 }
2514 GETCHARINCTEST(c, eptr);
2515 switch(c)
2516 {
2517 default: break;
2518 case 0x0a: /* LF */
2519 case 0x0b: /* VT */
2520 case 0x0c: /* FF */
2521 case 0x0d: /* CR */
2522 case 0x85: /* NEL */
2523 case 0x2028: /* LINE SEPARATOR */
2524 case 0x2029: /* PARAGRAPH SEPARATOR */
2525 RRETURN(MATCH_NOMATCH);
2526 }
2527 ecode++;
2528 break;
2529
2530 case OP_VSPACE:
2531 if (eptr >= md->end_subject)
2532 {
2533 SCHECK_PARTIAL();
2534 RRETURN(MATCH_NOMATCH);
2535 }
2536 GETCHARINCTEST(c, eptr);
2537 switch(c)
2538 {
2539 default: RRETURN(MATCH_NOMATCH);
2540 case 0x0a: /* LF */
2541 case 0x0b: /* VT */
2542 case 0x0c: /* FF */
2543 case 0x0d: /* CR */
2544 case 0x85: /* NEL */
2545 case 0x2028: /* LINE SEPARATOR */
2546 case 0x2029: /* PARAGRAPH SEPARATOR */
2547 break;
2548 }
2549 ecode++;
2550 break;
2551
2552 #ifdef SUPPORT_UCP
2553 /* Check the next character by Unicode property. We will get here only
2554 if the support is in the binary; otherwise a compile-time error occurs. */
2555
2556 case OP_PROP:
2557 case OP_NOTPROP:
2558 if (eptr >= md->end_subject)
2559 {
2560 SCHECK_PARTIAL();
2561 RRETURN(MATCH_NOMATCH);
2562 }
2563 GETCHARINCTEST(c, eptr);
2564 {
2565 const ucd_record *prop = GET_UCD(c);
2566
2567 switch(ecode[1])
2568 {
2569 case PT_ANY:
2570 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2571 break;
2572
2573 case PT_LAMP:
2574 if ((prop->chartype == ucp_Lu ||
2575 prop->chartype == ucp_Ll ||
2576 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2577 RRETURN(MATCH_NOMATCH);
2578 break;
2579
2580 case PT_GC:
2581 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2582 RRETURN(MATCH_NOMATCH);
2583 break;
2584
2585 case PT_PC:
2586 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2587 RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 case PT_SC:
2591 if ((ecode[2] != prop->script) == (op == OP_PROP))
2592 RRETURN(MATCH_NOMATCH);
2593 break;
2594
2595 /* These are specials */
2596
2597 case PT_ALNUM:
2598 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2599 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2600 RRETURN(MATCH_NOMATCH);
2601 break;
2602
2603 case PT_SPACE: /* Perl space */
2604 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2605 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2606 == (op == OP_NOTPROP))
2607 RRETURN(MATCH_NOMATCH);
2608 break;
2609
2610 case PT_PXSPACE: /* POSIX space */
2611 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2612 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2613 c == CHAR_FF || c == CHAR_CR)
2614 == (op == OP_NOTPROP))
2615 RRETURN(MATCH_NOMATCH);
2616 break;
2617
2618 case PT_WORD:
2619 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2620 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2621 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2622 RRETURN(MATCH_NOMATCH);
2623 break;
2624
2625 /* This should never occur */
2626
2627 default:
2628 RRETURN(PCRE_ERROR_INTERNAL);
2629 }
2630
2631 ecode += 3;
2632 }
2633 break;
2634
2635 /* Match an extended Unicode sequence. We will get here only if the support
2636 is in the binary; otherwise a compile-time error occurs. */
2637
2638 case OP_EXTUNI:
2639 if (eptr >= md->end_subject)
2640 {
2641 SCHECK_PARTIAL();
2642 RRETURN(MATCH_NOMATCH);
2643 }
2644 else
2645 {
2646 int lgb, rgb;
2647 GETCHARINCTEST(c, eptr);
2648 lgb = UCD_GRAPHBREAK(c);
2649 while (eptr < md->end_subject)
2650 {
2651 int len = 1;
2652 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2653 rgb = UCD_GRAPHBREAK(c);
2654 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2655 lgb = rgb;
2656 eptr += len;
2657 }
2658 }
2659 CHECK_PARTIAL();
2660 ecode++;
2661 break;
2662 #endif
2663
2664
2665 /* Match a back reference, possibly repeatedly. Look past the end of the
2666 item to see if there is repeat information following. The code is similar
2667 to that for character classes, but repeated for efficiency. Then obey
2668 similar code to character type repeats - written out again for speed.
2669 However, if the referenced string is the empty string, always treat
2670 it as matched, any number of times (otherwise there could be infinite
2671 loops). */
2672
2673 case OP_REF:
2674 case OP_REFI:
2675 caseless = op == OP_REFI;
2676 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2677 ecode += 1 + IMM2_SIZE;
2678
2679 /* If the reference is unset, there are two possibilities:
2680
2681 (a) In the default, Perl-compatible state, set the length negative;
2682 this ensures that every attempt at a match fails. We can't just fail
2683 here, because of the possibility of quantifiers with zero minima.
2684
2685 (b) If the JavaScript compatibility flag is set, set the length to zero
2686 so that the back reference matches an empty string.
2687
2688 Otherwise, set the length to the length of what was matched by the
2689 referenced subpattern. */
2690
2691 if (offset >= offset_top || md->offset_vector[offset] < 0)
2692 length = (md->jscript_compat)? 0 : -1;
2693 else
2694 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2695
2696 /* Set up for repetition, or handle the non-repeated case */
2697
2698 switch (*ecode)
2699 {
2700 case OP_CRSTAR:
2701 case OP_CRMINSTAR:
2702 case OP_CRPLUS:
2703 case OP_CRMINPLUS:
2704 case OP_CRQUERY:
2705 case OP_CRMINQUERY:
2706 c = *ecode++ - OP_CRSTAR;
2707 minimize = (c & 1) != 0;
2708 min = rep_min[c]; /* Pick up values from tables; */
2709 max = rep_max[c]; /* zero for max => infinity */
2710 if (max == 0) max = INT_MAX;
2711 break;
2712
2713 case OP_CRRANGE:
2714 case OP_CRMINRANGE:
2715 minimize = (*ecode == OP_CRMINRANGE);
2716 min = GET2(ecode, 1);
2717 max = GET2(ecode, 1 + IMM2_SIZE);
2718 if (max == 0) max = INT_MAX;
2719 ecode += 1 + 2 * IMM2_SIZE;
2720 break;
2721
2722 default: /* No repeat follows */
2723 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2724 {
2725 if (length == -2) eptr = md->end_subject; /* Partial match */
2726 CHECK_PARTIAL();
2727 RRETURN(MATCH_NOMATCH);
2728 }
2729 eptr += length;
2730 continue; /* With the main loop */
2731 }
2732
2733 /* Handle repeated back references. If the length of the reference is
2734 zero, just continue with the main loop. If the length is negative, it
2735 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2736 zero, we can continue at the same level without recursion. For any other
2737 minimum, carrying on will result in NOMATCH. */
2738
2739 if (length == 0) continue;
2740 if (length < 0 && min == 0) continue;
2741
2742 /* First, ensure the minimum number of matches are present. We get back
2743 the length of the reference string explicitly rather than passing the
2744 address of eptr, so that eptr can be a register variable. */
2745
2746 for (i = 1; i <= min; i++)
2747 {
2748 int slength;
2749 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2750 {
2751 if (slength == -2) eptr = md->end_subject; /* Partial match */
2752 CHECK_PARTIAL();
2753 RRETURN(MATCH_NOMATCH);
2754 }
2755 eptr += slength;
2756 }
2757
2758 /* If min = max, continue at the same level without recursion.
2759 They are not both allowed to be zero. */
2760
2761 if (min == max) continue;
2762
2763 /* If minimizing, keep trying and advancing the pointer */
2764
2765 if (minimize)
2766 {
2767 for (fi = min;; fi++)
2768 {
2769 int slength;
2770 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2771 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2772 if (fi >= max) RRETURN(MATCH_NOMATCH);
2773 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2774 {
2775 if (slength == -2) eptr = md->end_subject; /* Partial match */
2776 CHECK_PARTIAL();
2777 RRETURN(MATCH_NOMATCH);
2778 }
2779 eptr += slength;
2780 }
2781 /* Control never gets here */
2782 }
2783
2784 /* If maximizing, find the longest string and work backwards */
2785
2786 else
2787 {
2788 pp = eptr;
2789 for (i = min; i < max; i++)
2790 {
2791 int slength;
2792 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2793 {
2794 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2795 the soft partial matching case. */
2796
2797 if (slength == -2 && md->partial != 0 &&
2798 md->end_subject > md->start_used_ptr)
2799 {
2800 md->hitend = TRUE;
2801 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2802 }
2803 break;
2804 }
2805 eptr += slength;
2806 }
2807
2808 while (eptr >= pp)
2809 {
2810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2812 eptr -= length;
2813 }
2814 RRETURN(MATCH_NOMATCH);
2815 }
2816 /* Control never gets here */
2817
2818 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2819 used when all the characters in the class have values in the range 0-255,
2820 and either the matching is caseful, or the characters are in the range
2821 0-127 when UTF-8 processing is enabled. The only difference between
2822 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2823 encountered.
2824
2825 First, look past the end of the item to see if there is repeat information
2826 following. Then obey similar code to character type repeats - written out
2827 again for speed. */
2828
2829 case OP_NCLASS:
2830 case OP_CLASS:
2831 {
2832 /* The data variable is saved across frames, so the byte map needs to
2833 be stored there. */
2834 #define BYTE_MAP ((pcre_uint8 *)data)
2835 data = ecode + 1; /* Save for matching */
2836 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2837
2838 switch (*ecode)
2839 {
2840 case OP_CRSTAR:
2841 case OP_CRMINSTAR:
2842 case OP_CRPLUS:
2843 case OP_CRMINPLUS:
2844 case OP_CRQUERY:
2845 case OP_CRMINQUERY:
2846 c = *ecode++ - OP_CRSTAR;
2847 minimize = (c & 1) != 0;
2848 min = rep_min[c]; /* Pick up values from tables; */
2849 max = rep_max[c]; /* zero for max => infinity */
2850 if (max == 0) max = INT_MAX;
2851 break;
2852
2853 case OP_CRRANGE:
2854 case OP_CRMINRANGE:
2855 minimize = (*ecode == OP_CRMINRANGE);
2856 min = GET2(ecode, 1);
2857 max = GET2(ecode, 1 + IMM2_SIZE);
2858 if (max == 0) max = INT_MAX;
2859 ecode += 1 + 2 * IMM2_SIZE;
2860 break;
2861
2862 default: /* No repeat follows */
2863 min = max = 1;
2864 break;
2865 }
2866
2867 /* First, ensure the minimum number of matches are present. */
2868
2869 #ifdef SUPPORT_UTF
2870 if (utf)
2871 {
2872 for (i = 1; i <= min; i++)
2873 {
2874 if (eptr >= md->end_subject)
2875 {
2876 SCHECK_PARTIAL();
2877 RRETURN(MATCH_NOMATCH);
2878 }
2879 GETCHARINC(c, eptr);
2880 if (c > 255)
2881 {
2882 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2883 }
2884 else
2885 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2886 }
2887 }
2888 else
2889 #endif
2890 /* Not UTF mode */
2891 {
2892 for (i = 1; i <= min; i++)
2893 {
2894 if (eptr >= md->end_subject)
2895 {
2896 SCHECK_PARTIAL();
2897 RRETURN(MATCH_NOMATCH);
2898 }
2899 c = *eptr++;
2900 #ifndef COMPILE_PCRE8
2901 if (c > 255)
2902 {
2903 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2904 }
2905 else
2906 #endif
2907 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2908 }
2909 }
2910
2911 /* If max == min we can continue with the main loop without the
2912 need to recurse. */
2913
2914 if (min == max) continue;
2915
2916 /* If minimizing, keep testing the rest of the expression and advancing
2917 the pointer while it matches the class. */
2918
2919 if (minimize)
2920 {
2921 #ifdef SUPPORT_UTF
2922 if (utf)
2923 {
2924 for (fi = min;; fi++)
2925 {
2926 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2928 if (fi >= max) RRETURN(MATCH_NOMATCH);
2929 if (eptr >= md->end_subject)
2930 {
2931 SCHECK_PARTIAL();
2932 RRETURN(MATCH_NOMATCH);
2933 }
2934 GETCHARINC(c, eptr);
2935 if (c > 255)
2936 {
2937 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2938 }
2939 else
2940 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2941 }
2942 }
2943 else
2944 #endif
2945 /* Not UTF mode */
2946 {
2947 for (fi = min;; fi++)
2948 {
2949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2951 if (fi >= max) RRETURN(MATCH_NOMATCH);
2952 if (eptr >= md->end_subject)
2953 {
2954 SCHECK_PARTIAL();
2955 RRETURN(MATCH_NOMATCH);
2956 }
2957 c = *eptr++;
2958 #ifndef COMPILE_PCRE8
2959 if (c > 255)
2960 {
2961 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2962 }
2963 else
2964 #endif
2965 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2966 }
2967 }
2968 /* Control never gets here */
2969 }
2970
2971 /* If maximizing, find the longest possible run, then work backwards. */
2972
2973 else
2974 {
2975 pp = eptr;
2976
2977 #ifdef SUPPORT_UTF
2978 if (utf)
2979 {
2980 for (i = min; i < max; i++)
2981 {
2982 int len = 1;
2983 if (eptr >= md->end_subject)
2984 {
2985 SCHECK_PARTIAL();
2986 break;
2987 }
2988 GETCHARLEN(c, eptr, len);
2989 if (c > 255)
2990 {
2991 if (op == OP_CLASS) break;
2992 }
2993 else
2994 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2995 eptr += len;
2996 }
2997 for (;;)
2998 {
2999 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001 if (eptr-- == pp) break; /* Stop if tried at original pos */
3002 BACKCHAR(eptr);
3003 }
3004 }
3005 else
3006 #endif
3007 /* Not UTF mode */
3008 {
3009 for (i = min; i < max; i++)
3010 {
3011 if (eptr >= md->end_subject)
3012 {
3013 SCHECK_PARTIAL();
3014 break;
3015 }
3016 c = *eptr;
3017 #ifndef COMPILE_PCRE8
3018 if (c > 255)
3019 {
3020 if (op == OP_CLASS) break;
3021 }
3022 else
3023 #endif
3024 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3025 eptr++;
3026 }
3027 while (eptr >= pp)
3028 {
3029 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3030 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3031 eptr--;
3032 }
3033 }
3034
3035 RRETURN(MATCH_NOMATCH);
3036 }
3037 #undef BYTE_MAP
3038 }
3039 /* Control never gets here */
3040
3041
3042 /* Match an extended character class. This opcode is encountered only
3043 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3044 mode, because Unicode properties are supported in non-UTF-8 mode. */
3045
3046 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3047 case OP_XCLASS:
3048 {
3049 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3050 ecode += GET(ecode, 1); /* Advance past the item */
3051
3052 switch (*ecode)
3053 {
3054 case OP_CRSTAR:
3055 case OP_CRMINSTAR:
3056 case OP_CRPLUS:
3057 case OP_CRMINPLUS:
3058 case OP_CRQUERY:
3059 case OP_CRMINQUERY:
3060 c = *ecode++ - OP_CRSTAR;
3061 minimize = (c & 1) != 0;
3062 min = rep_min[c]; /* Pick up values from tables; */
3063 max = rep_max[c]; /* zero for max => infinity */
3064 if (max == 0) max = INT_MAX;
3065 break;
3066
3067 case OP_CRRANGE:
3068 case OP_CRMINRANGE:
3069 minimize = (*ecode == OP_CRMINRANGE);
3070 min = GET2(ecode, 1);
3071 max = GET2(ecode, 1 + IMM2_SIZE);
3072 if (max == 0) max = INT_MAX;
3073 ecode += 1 + 2 * IMM2_SIZE;
3074 break;
3075
3076 default: /* No repeat follows */
3077 min = max = 1;
3078 break;
3079 }
3080
3081 /* First, ensure the minimum number of matches are present. */
3082
3083 for (i = 1; i <= min; i++)
3084 {
3085 if (eptr >= md->end_subject)
3086 {
3087 SCHECK_PARTIAL();
3088 RRETURN(MATCH_NOMATCH);
3089 }
3090 GETCHARINCTEST(c, eptr);
3091 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3092 }
3093
3094 /* If max == min we can continue with the main loop without the
3095 need to recurse. */
3096
3097 if (min == max) continue;
3098
3099 /* If minimizing, keep testing the rest of the expression and advancing
3100 the pointer while it matches the class. */
3101
3102 if (minimize)
3103 {
3104 for (fi = min;; fi++)
3105 {
3106 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3107 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3108 if (fi >= max) RRETURN(MATCH_NOMATCH);
3109 if (eptr >= md->end_subject)
3110 {
3111 SCHECK_PARTIAL();
3112 RRETURN(MATCH_NOMATCH);
3113 }
3114 GETCHARINCTEST(c, eptr);
3115 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3116 }
3117 /* Control never gets here */
3118 }
3119
3120 /* If maximizing, find the longest possible run, then work backwards. */
3121
3122 else
3123 {
3124 pp = eptr;
3125 for (i = min; i < max; i++)
3126 {
3127 int len = 1;
3128 if (eptr >= md->end_subject)
3129 {
3130 SCHECK_PARTIAL();
3131 break;
3132 }
3133 #ifdef SUPPORT_UTF
3134 GETCHARLENTEST(c, eptr, len);
3135 #else
3136 c = *eptr;
3137 #endif
3138 if (!PRIV(xclass)(c, data, utf)) break;
3139 eptr += len;
3140 }
3141 for(;;)
3142 {
3143 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3144 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3145 if (eptr-- == pp) break; /* Stop if tried at original pos */
3146 #ifdef SUPPORT_UTF
3147 if (utf) BACKCHAR(eptr);
3148 #endif
3149 }
3150 RRETURN(MATCH_NOMATCH);
3151 }
3152
3153 /* Control never gets here */
3154 }
3155 #endif /* End of XCLASS */
3156
3157 /* Match a single character, casefully */
3158
3159 case OP_CHAR:
3160 #ifdef SUPPORT_UTF
3161 if (utf)
3162 {
3163 length = 1;
3164 ecode++;
3165 GETCHARLEN(fc, ecode, length);
3166 if (length > md->end_subject - eptr)
3167 {
3168 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3169 RRETURN(MATCH_NOMATCH);
3170 }
3171 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3172 }
3173 else
3174 #endif
3175 /* Not UTF mode */
3176 {
3177 if (md->end_subject - eptr < 1)
3178 {
3179 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3180 RRETURN(MATCH_NOMATCH);
3181 }
3182 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3183 ecode += 2;
3184 }
3185 break;
3186
3187 /* Match a single character, caselessly. If we are at the end of the
3188 subject, give up immediately. */
3189
3190 case OP_CHARI:
3191 if (eptr >= md->end_subject)
3192 {
3193 SCHECK_PARTIAL();
3194 RRETURN(MATCH_NOMATCH);
3195 }
3196
3197 #ifdef SUPPORT_UTF
3198 if (utf)
3199 {
3200 length = 1;
3201 ecode++;
3202 GETCHARLEN(fc, ecode, length);
3203
3204 /* If the pattern character's value is < 128, we have only one byte, and
3205 we know that its other case must also be one byte long, so we can use the
3206 fast lookup table. We know that there is at least one byte left in the
3207 subject. */
3208
3209 if (fc < 128)
3210 {
3211 if (md->lcc[fc]
3212 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3213 ecode++;
3214 eptr++;
3215 }
3216
3217 /* Otherwise we must pick up the subject character. Note that we cannot
3218 use the value of "length" to check for sufficient bytes left, because the
3219 other case of the character may have more or fewer bytes. */
3220
3221 else
3222 {
3223 unsigned int dc;
3224 GETCHARINC(dc, eptr);
3225 ecode += length;
3226
3227 /* If we have Unicode property support, we can use it to test the other
3228 case of the character, if there is one. */
3229
3230 if (fc != dc)
3231 {
3232 #ifdef SUPPORT_UCP
3233 if (dc != UCD_OTHERCASE(fc))
3234 #endif
3235 RRETURN(MATCH_NOMATCH);
3236 }
3237 }
3238 }
3239 else
3240 #endif /* SUPPORT_UTF */
3241
3242 /* Not UTF mode */
3243 {
3244 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3245 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3246 eptr++;
3247 ecode += 2;
3248 }
3249 break;
3250
3251 /* Match a single character repeatedly. */
3252
3253 case OP_EXACT:
3254 case OP_EXACTI:
3255 min = max = GET2(ecode, 1);
3256 ecode += 1 + IMM2_SIZE;
3257 goto REPEATCHAR;
3258
3259 case OP_POSUPTO:
3260 case OP_POSUPTOI:
3261 possessive = TRUE;
3262 /* Fall through */
3263
3264 case OP_UPTO:
3265 case OP_UPTOI:
3266 case OP_MINUPTO:
3267 case OP_MINUPTOI:
3268 min = 0;
3269 max = GET2(ecode, 1);
3270 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3271 ecode += 1 + IMM2_SIZE;
3272 goto REPEATCHAR;
3273
3274 case OP_POSSTAR:
3275 case OP_POSSTARI:
3276 possessive = TRUE;
3277 min = 0;
3278 max = INT_MAX;
3279 ecode++;
3280 goto REPEATCHAR;
3281
3282 case OP_POSPLUS:
3283 case OP_POSPLUSI:
3284 possessive = TRUE;
3285 min = 1;
3286 max = INT_MAX;
3287 ecode++;
3288 goto REPEATCHAR;
3289
3290 case OP_POSQUERY:
3291 case OP_POSQUERYI:
3292 possessive = TRUE;
3293 min = 0;
3294 max = 1;
3295 ecode++;
3296 goto REPEATCHAR;
3297
3298 case OP_STAR:
3299 case OP_STARI:
3300 case OP_MINSTAR:
3301 case OP_MINSTARI:
3302 case OP_PLUS:
3303 case OP_PLUSI:
3304 case OP_MINPLUS:
3305 case OP_MINPLUSI:
3306 case OP_QUERY:
3307 case OP_QUERYI:
3308 case OP_MINQUERY:
3309 case OP_MINQUERYI:
3310 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3311 minimize = (c & 1) != 0;
3312 min = rep_min[c]; /* Pick up values from tables; */
3313 max = rep_max[c]; /* zero for max => infinity */
3314 if (max == 0) max = INT_MAX;
3315
3316 /* Common code for all repeated single-character matches. */
3317
3318 REPEATCHAR:
3319 #ifdef SUPPORT_UTF
3320 if (utf)
3321 {
3322 length = 1;
3323 charptr = ecode;
3324 GETCHARLEN(fc, ecode, length);
3325 ecode += length;
3326
3327 /* Handle multibyte character matching specially here. There is
3328 support for caseless matching if UCP support is present. */
3329
3330 if (length > 1)
3331 {
3332 #ifdef SUPPORT_UCP
3333 unsigned int othercase;
3334 if (op >= OP_STARI && /* Caseless */
3335 (othercase = UCD_OTHERCASE(fc)) != fc)
3336 oclength = PRIV(ord2utf)(othercase, occhars);
3337 else oclength = 0;
3338 #endif /* SUPPORT_UCP */
3339
3340 for (i = 1; i <= min; i++)
3341 {
3342 if (eptr <= md->end_subject - length &&
3343 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3344 #ifdef SUPPORT_UCP
3345 else if (oclength > 0 &&
3346 eptr <= md->end_subject - oclength &&
3347 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3348 #endif /* SUPPORT_UCP */
3349 else
3350 {
3351 CHECK_PARTIAL();
3352 RRETURN(MATCH_NOMATCH);
3353 }
3354 }
3355
3356 if (min == max) continue;
3357
3358 if (minimize)
3359 {
3360 for (fi = min;; fi++)
3361 {
3362 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3363 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3364 if (fi >= max) RRETURN(MATCH_NOMATCH);
3365 if (eptr <= md->end_subject - length &&
3366 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3367 #ifdef SUPPORT_UCP
3368 else if (oclength > 0 &&
3369 eptr <= md->end_subject - oclength &&
3370 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3371 #endif /* SUPPORT_UCP */
3372 else
3373 {
3374 CHECK_PARTIAL();
3375 RRETURN(MATCH_NOMATCH);
3376 }
3377 }
3378 /* Control never gets here */
3379 }
3380
3381 else /* Maximize */
3382 {
3383 pp = eptr;
3384 for (i = min; i < max; i++)
3385 {
3386 if (eptr <= md->end_subject - length &&
3387 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3388 #ifdef SUPPORT_UCP
3389 else if (oclength > 0 &&
3390 eptr <= md->end_subject - oclength &&
3391 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3392 #endif /* SUPPORT_UCP */
3393 else
3394 {
3395 CHECK_PARTIAL();
3396 break;
3397 }
3398 }
3399
3400 if (possessive) continue;
3401
3402 for(;;)
3403 {
3404 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3405 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3406 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3407 #ifdef SUPPORT_UCP
3408 eptr--;
3409 BACKCHAR(eptr);
3410 #else /* without SUPPORT_UCP */
3411 eptr -= length;
3412 #endif /* SUPPORT_UCP */
3413 }
3414 }
3415 /* Control never gets here */
3416 }
3417
3418 /* If the length of a UTF-8 character is 1, we fall through here, and
3419 obey the code as for non-UTF-8 characters below, though in this case the
3420 value of fc will always be < 128. */
3421 }
3422 else
3423 #endif /* SUPPORT_UTF */
3424 /* When not in UTF-8 mode, load a single-byte character. */
3425 fc = *ecode++;
3426
3427 /* The value of fc at this point is always one character, though we may
3428 or may not be in UTF mode. The code is duplicated for the caseless and
3429 caseful cases, for speed, since matching characters is likely to be quite
3430 common. First, ensure the minimum number of matches are present. If min =
3431 max, continue at the same level without recursing. Otherwise, if
3432 minimizing, keep trying the rest of the expression and advancing one
3433 matching character if failing, up to the maximum. Alternatively, if
3434 maximizing, find the maximum number of characters and work backwards. */
3435
3436 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3437 max, (char *)eptr));
3438
3439 if (op >= OP_STARI) /* Caseless */
3440 {
3441 #ifdef COMPILE_PCRE8
3442 /* fc must be < 128 if UTF is enabled. */
3443 foc = md->fcc[fc];
3444 #else
3445 #ifdef SUPPORT_UTF
3446 #ifdef SUPPORT_UCP
3447 if (utf && fc > 127)
3448 foc = UCD_OTHERCASE(fc);
3449 #else
3450 if (utf && fc > 127)
3451 foc = fc;
3452 #endif /* SUPPORT_UCP */
3453 else
3454 #endif /* SUPPORT_UTF */
3455 foc = TABLE_GET(fc, md->fcc, fc);
3456 #endif /* COMPILE_PCRE8 */
3457
3458 for (i = 1; i <= min; i++)
3459 {
3460 if (eptr >= md->end_subject)
3461 {
3462 SCHECK_PARTIAL();
3463 RRETURN(MATCH_NOMATCH);
3464 }
3465 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3466 eptr++;
3467 }
3468 if (min == max) continue;
3469 if (minimize)
3470 {
3471 for (fi = min;; fi++)
3472 {
3473 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3474 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3475 if (fi >= max) RRETURN(MATCH_NOMATCH);
3476 if (eptr >= md->end_subject)
3477 {
3478 SCHECK_PARTIAL();
3479 RRETURN(MATCH_NOMATCH);
3480 }
3481 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3482 eptr++;
3483 }
3484 /* Control never gets here */
3485 }
3486 else /* Maximize */
3487 {
3488 pp = eptr;
3489 for (i = min; i < max; i++)
3490 {
3491 if (eptr >= md->end_subject)
3492 {
3493 SCHECK_PARTIAL();
3494 break;
3495 }
3496 if (fc != *eptr && foc != *eptr) break;
3497 eptr++;
3498 }
3499
3500 if (possessive) continue;
3501
3502 while (eptr >= pp)
3503 {
3504 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3505 eptr--;
3506 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3507 }
3508 RRETURN(MATCH_NOMATCH);
3509 }
3510 /* Control never gets here */
3511 }
3512
3513 /* Caseful comparisons (includes all multi-byte characters) */
3514
3515 else
3516 {
3517 for (i = 1; i <= min; i++)
3518 {
3519 if (eptr >= md->end_subject)
3520 {
3521 SCHECK_PARTIAL();
3522 RRETURN(MATCH_NOMATCH);
3523 }
3524 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3525 }
3526
3527 if (min == max) continue;
3528
3529 if (minimize)
3530 {
3531 for (fi = min;; fi++)
3532 {
3533 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3534 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3535 if (fi >= max) RRETURN(MATCH_NOMATCH);
3536 if (eptr >= md->end_subject)
3537 {
3538 SCHECK_PARTIAL();
3539 RRETURN(MATCH_NOMATCH);
3540 }
3541 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3542 }
3543 /* Control never gets here */
3544 }
3545 else /* Maximize */
3546 {
3547 pp = eptr;
3548 for (i = min; i < max; i++)
3549 {
3550 if (eptr >= md->end_subject)
3551 {
3552 SCHECK_PARTIAL();
3553 break;
3554 }
3555 if (fc != *eptr) break;
3556 eptr++;
3557 }
3558 if (possessive) continue;
3559
3560 while (eptr >= pp)
3561 {
3562 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3563 eptr--;
3564 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3565 }
3566 RRETURN(MATCH_NOMATCH);
3567 }
3568 }
3569 /* Control never gets here */
3570
3571 /* Match a negated single one-byte character. The character we are
3572 checking can be multibyte. */
3573
3574 case OP_NOT:
3575 case OP_NOTI:
3576 if (eptr >= md->end_subject)
3577 {
3578 SCHECK_PARTIAL();
3579 RRETURN(MATCH_NOMATCH);
3580 }
3581 #ifdef SUPPORT_UTF
3582 if (utf)
3583 {
3584 register unsigned int ch, och;
3585
3586 ecode++;
3587 GETCHARINC(ch, ecode);
3588 GETCHARINC(c, eptr);
3589
3590 if (op == OP_NOT)
3591 {
3592 if (ch == c) RRETURN(MATCH_NOMATCH);
3593 }
3594 else
3595 {
3596 #ifdef SUPPORT_UCP
3597 if (ch > 127)
3598 och = UCD_OTHERCASE(ch);
3599 #else
3600 if (ch > 127)
3601 och = ch;
3602 #endif /* SUPPORT_UCP */
3603 else
3604 och = TABLE_GET(ch, md->fcc, ch);
3605 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3606 }
3607 }
3608 else
3609 #endif
3610 {
3611 register unsigned int ch = ecode[1];
3612 c = *eptr++;
3613 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3614 RRETURN(MATCH_NOMATCH);
3615 ecode += 2;
3616 }
3617 break;
3618
3619 /* Match a negated single one-byte character repeatedly. This is almost a
3620 repeat of the code for a repeated single character, but I haven't found a
3621 nice way of commoning these up that doesn't require a test of the
3622 positive/negative option for each character match. Maybe that wouldn't add
3623 very much to the time taken, but character matching *is* what this is all
3624 about... */
3625
3626 case OP_NOTEXACT:
3627 case OP_NOTEXACTI:
3628 min = max = GET2(ecode, 1);
3629 ecode += 1 + IMM2_SIZE;
3630 goto REPEATNOTCHAR;
3631
3632 case OP_NOTUPTO:
3633 case OP_NOTUPTOI:
3634 case OP_NOTMINUPTO:
3635 case OP_NOTMINUPTOI:
3636 min = 0;
3637 max = GET2(ecode, 1);
3638 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3639 ecode += 1 + IMM2_SIZE;
3640 goto REPEATNOTCHAR;
3641
3642 case OP_NOTPOSSTAR:
3643 case OP_NOTPOSSTARI:
3644 possessive = TRUE;
3645 min = 0;
3646 max = INT_MAX;
3647 ecode++;
3648 goto REPEATNOTCHAR;
3649
3650 case OP_NOTPOSPLUS:
3651 case OP_NOTPOSPLUSI:
3652 possessive = TRUE;
3653 min = 1;
3654 max = INT_MAX;
3655 ecode++;
3656 goto REPEATNOTCHAR;
3657
3658 case OP_NOTPOSQUERY:
3659 case OP_NOTPOSQUERYI:
3660 possessive = TRUE;
3661 min = 0;
3662 max = 1;
3663 ecode++;
3664 goto REPEATNOTCHAR;
3665
3666 case OP_NOTPOSUPTO:
3667 case OP_NOTPOSUPTOI:
3668 possessive = TRUE;
3669 min = 0;
3670 max = GET2(ecode, 1);
3671 ecode += 1 + IMM2_SIZE;
3672 goto REPEATNOTCHAR;
3673
3674 case OP_NOTSTAR:
3675 case OP_NOTSTARI:
3676 case OP_NOTMINSTAR:
3677 case OP_NOTMINSTARI:
3678 case OP_NOTPLUS:
3679 case OP_NOTPLUSI:
3680 case OP_NOTMINPLUS:
3681 case OP_NOTMINPLUSI:
3682 case OP_NOTQUERY:
3683 case OP_NOTQUERYI:
3684 case OP_NOTMINQUERY:
3685 case OP_NOTMINQUERYI:
3686 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3687 minimize = (c & 1) != 0;
3688 min = rep_min[c]; /* Pick up values from tables; */
3689 max = rep_max[c]; /* zero for max => infinity */
3690 if (max == 0) max = INT_MAX;
3691
3692 /* Common code for all repeated single-byte matches. */
3693
3694 REPEATNOTCHAR:
3695 GETCHARINCTEST(fc, ecode);
3696
3697 /* The code is duplicated for the caseless and caseful cases, for speed,
3698 since matching characters is likely to be quite common. First, ensure the
3699 minimum number of matches are present. If min = max, continue at the same
3700 level without recursing. Otherwise, if minimizing, keep trying the rest of
3701 the expression and advancing one matching character if failing, up to the
3702 maximum. Alternatively, if maximizing, find the maximum number of
3703 characters and work backwards. */
3704
3705 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3706 max, (char *)eptr));
3707
3708 if (op >= OP_NOTSTARI) /* Caseless */
3709 {
3710 #ifdef SUPPORT_UTF
3711 #ifdef SUPPORT_UCP
3712 if (utf && fc > 127)
3713 foc = UCD_OTHERCASE(fc);
3714 #else
3715 if (utf && fc > 127)
3716 foc = fc;
3717 #endif /* SUPPORT_UCP */
3718 else
3719 #endif /* SUPPORT_UTF */
3720 foc = TABLE_GET(fc, md->fcc, fc);
3721
3722 #ifdef SUPPORT_UTF
3723 if (utf)
3724 {
3725 register unsigned int d;
3726 for (i = 1; i <= min; i++)
3727 {
3728 if (eptr >= md->end_subject)
3729 {
3730 SCHECK_PARTIAL();
3731 RRETURN(MATCH_NOMATCH);
3732 }
3733 GETCHARINC(d, eptr);
3734 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3735 }
3736 }
3737 else
3738 #endif
3739 /* Not UTF mode */
3740 {
3741 for (i = 1; i <= min; i++)
3742 {
3743 if (eptr >= md->end_subject)
3744 {
3745 SCHECK_PARTIAL();
3746 RRETURN(MATCH_NOMATCH);
3747 }
3748 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3749 eptr++;
3750 }
3751 }
3752
3753 if (min == max) continue;
3754
3755 if (minimize)
3756 {
3757 #ifdef SUPPORT_UTF
3758 if (utf)
3759 {
3760 register unsigned int d;
3761 for (fi = min;; fi++)
3762 {
3763 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3764 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3765 if (fi >= max) RRETURN(MATCH_NOMATCH);
3766 if (eptr >= md->end_subject)
3767 {
3768 SCHECK_PARTIAL();
3769 RRETURN(MATCH_NOMATCH);
3770 }
3771 GETCHARINC(d, eptr);
3772 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3773 }
3774 }
3775 else
3776 #endif
3777 /* Not UTF mode */
3778 {
3779 for (fi = min;; fi++)
3780 {
3781 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3783 if (fi >= max) RRETURN(MATCH_NOMATCH);
3784 if (eptr >= md->end_subject)
3785 {
3786 SCHECK_PARTIAL();
3787 RRETURN(MATCH_NOMATCH);
3788 }
3789 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3790 eptr++;
3791 }
3792 }
3793 /* Control never gets here */
3794 }
3795
3796 /* Maximize case */
3797
3798 else
3799 {
3800 pp = eptr;
3801
3802 #ifdef SUPPORT_UTF
3803 if (utf)
3804 {
3805 register unsigned int d;
3806 for (i = min; i < max; i++)
3807 {
3808 int len = 1;
3809 if (eptr >= md->end_subject)
3810 {
3811 SCHECK_PARTIAL();
3812 break;
3813 }
3814 GETCHARLEN(d, eptr, len);
3815 if (fc == d || (unsigned int)foc == d) break;
3816 eptr += len;
3817 }
3818 if (possessive) continue;
3819 for(;;)
3820 {
3821 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3823 if (eptr-- == pp) break; /* Stop if tried at original pos */
3824 BACKCHAR(eptr);
3825 }
3826 }
3827 else
3828 #endif
3829 /* Not UTF mode */
3830 {
3831 for (i = min; i < max; i++)
3832 {
3833 if (eptr >= md->end_subject)
3834 {
3835 SCHECK_PARTIAL();
3836 break;
3837 }
3838 if (fc == *eptr || foc == *eptr) break;
3839 eptr++;
3840 }
3841 if (possessive) continue;
3842 while (eptr >= pp)
3843 {
3844 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3846 eptr--;
3847 }
3848 }
3849
3850 RRETURN(MATCH_NOMATCH);
3851 }
3852 /* Control never gets here */
3853 }
3854
3855 /* Caseful comparisons */
3856
3857 else
3858 {
3859 #ifdef SUPPORT_UTF
3860 if (utf)
3861 {
3862 register unsigned int d;
3863 for (i = 1; i <= min; i++)
3864 {
3865 if (eptr >= md->end_subject)
3866 {
3867 SCHECK_PARTIAL();
3868 RRETURN(MATCH_NOMATCH);
3869 }
3870 GETCHARINC(d, eptr);
3871 if (fc == d) RRETURN(MATCH_NOMATCH);
3872 }
3873 }
3874 else
3875 #endif
3876 /* Not UTF mode */
3877 {
3878 for (i = 1; i <= min; i++)
3879 {
3880 if (eptr >= md->end_subject)
3881 {
3882 SCHECK_PARTIAL();
3883 RRETURN(MATCH_NOMATCH);
3884 }
3885 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3886 }
3887 }
3888
3889 if (min == max) continue;
3890
3891 if (minimize)
3892 {
3893 #ifdef SUPPORT_UTF
3894 if (utf)
3895 {
3896 register unsigned int d;
3897 for (fi = min;; fi++)
3898 {
3899 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3901 if (fi >= max) RRETURN(MATCH_NOMATCH);
3902 if (eptr >= md->end_subject)
3903 {
3904 SCHECK_PARTIAL();
3905 RRETURN(MATCH_NOMATCH);
3906 }
3907 GETCHARINC(d, eptr);
3908 if (fc == d) RRETURN(MATCH_NOMATCH);
3909 }
3910 }
3911 else
3912 #endif
3913 /* Not UTF mode */
3914 {
3915 for (fi = min;; fi++)
3916 {
3917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3919 if (fi >= max) RRETURN(MATCH_NOMATCH);
3920 if (eptr >= md->end_subject)
3921 {
3922 SCHECK_PARTIAL();
3923 RRETURN(MATCH_NOMATCH);
3924 }
3925 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3926 }
3927 }
3928 /* Control never gets here */
3929 }
3930
3931 /* Maximize case */
3932
3933 else
3934 {
3935 pp = eptr;
3936
3937 #ifdef SUPPORT_UTF
3938 if (utf)
3939 {
3940 register unsigned int d;
3941 for (i = min; i < max; i++)
3942 {
3943 int len = 1;
3944 if (eptr >= md->end_subject)
3945 {
3946 SCHECK_PARTIAL();
3947 break;
3948 }
3949 GETCHARLEN(d, eptr, len);
3950 if (fc == d) break;
3951 eptr += len;
3952 }
3953 if (possessive) continue;
3954 for(;;)
3955 {
3956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3958 if (eptr-- == pp) break; /* Stop if tried at original pos */
3959 BACKCHAR(eptr);
3960 }
3961 }
3962 else
3963 #endif
3964 /* Not UTF mode */
3965 {
3966 for (i = min; i < max; i++)
3967 {
3968 if (eptr >= md->end_subject)
3969 {
3970 SCHECK_PARTIAL();
3971 break;
3972 }
3973 if (fc == *eptr) break;
3974 eptr++;
3975 }
3976 if (possessive) continue;
3977 while (eptr >= pp)
3978 {
3979 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3981 eptr--;
3982 }
3983 }
3984
3985 RRETURN(MATCH_NOMATCH);
3986 }
3987 }
3988 /* Control never gets here */
3989
3990 /* Match a single character type repeatedly; several different opcodes
3991 share code. This is very similar to the code for single characters, but we
3992 repeat it in the interests of efficiency. */
3993
3994 case OP_TYPEEXACT:
3995 min = max = GET2(ecode, 1);
3996 minimize = TRUE;
3997 ecode += 1 + IMM2_SIZE;
3998 goto REPEATTYPE;
3999
4000 case OP_TYPEUPTO:
4001 case OP_TYPEMINUPTO:
4002 min = 0;
4003 max = GET2(ecode, 1);
4004 minimize = *ecode == OP_TYPEMINUPTO;
4005 ecode += 1 + IMM2_SIZE;
4006 goto REPEATTYPE;
4007
4008 case OP_TYPEPOSSTAR:
4009 possessive = TRUE;
4010 min = 0;
4011 max = INT_MAX;
4012 ecode++;
4013 goto REPEATTYPE;
4014
4015 case OP_TYPEPOSPLUS:
4016 possessive = TRUE;
4017 min = 1;
4018 max = INT_MAX;
4019 ecode++;
4020 goto REPEATTYPE;
4021
4022 case OP_TYPEPOSQUERY:
4023 possessive = TRUE;
4024 min = 0;
4025 max = 1;
4026 ecode++;
4027 goto REPEATTYPE;
4028
4029 case OP_TYPEPOSUPTO:
4030 possessive = TRUE;
4031 min = 0;
4032 max = GET2(ecode, 1);
4033 ecode += 1 + IMM2_SIZE;
4034 goto REPEATTYPE;
4035
4036 case OP_TYPESTAR:
4037 case OP_TYPEMINSTAR:
4038 case OP_TYPEPLUS:
4039 case OP_TYPEMINPLUS:
4040 case OP_TYPEQUERY:
4041 case OP_TYPEMINQUERY:
4042 c = *ecode++ - OP_TYPESTAR;
4043 minimize = (c & 1) != 0;
4044 min = rep_min[c]; /* Pick up values from tables; */
4045 max = rep_max[c]; /* zero for max => infinity */
4046 if (max == 0) max = INT_MAX;
4047
4048 /* Common code for all repeated single character type matches. Note that
4049 in UTF-8 mode, '.' matches a character of any length, but for the other
4050 character types, the valid characters are all one-byte long. */
4051
4052 REPEATTYPE:
4053 ctype = *ecode++; /* Code for the character type */
4054
4055 #ifdef SUPPORT_UCP
4056 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4057 {
4058 prop_fail_result = ctype == OP_NOTPROP;
4059 prop_type = *ecode++;
4060 prop_value = *ecode++;
4061 }
4062 else prop_type = -1;
4063 #endif
4064
4065 /* First, ensure the minimum number of matches are present. Use inline
4066 code for maximizing the speed, and do the type test once at the start
4067 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4068 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4069 and single-bytes. */
4070
4071 if (min > 0)
4072 {
4073 #ifdef SUPPORT_UCP
4074 if (prop_type >= 0)
4075 {
4076 switch(prop_type)
4077 {
4078 case PT_ANY:
4079 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4080 for (i = 1; i <= min; i++)
4081 {
4082 if (eptr >= md->end_subject)
4083 {
4084 SCHECK_PARTIAL();
4085 RRETURN(MATCH_NOMATCH);
4086 }
4087 GETCHARINCTEST(c, eptr);
4088 }
4089 break;
4090
4091 case PT_LAMP:
4092 for (i = 1; i <= min; i++)
4093 {
4094 int chartype;
4095 if (eptr >= md->end_subject)
4096 {
4097 SCHECK_PARTIAL();
4098 RRETURN(MATCH_NOMATCH);
4099 }
4100 GETCHARINCTEST(c, eptr);
4101 chartype = UCD_CHARTYPE(c);
4102 if ((chartype == ucp_Lu ||
4103 chartype == ucp_Ll ||
4104 chartype == ucp_Lt) == prop_fail_result)
4105 RRETURN(MATCH_NOMATCH);
4106 }
4107 break;
4108
4109 case PT_GC:
4110 for (i = 1; i <= min; i++)
4111 {
4112 if (eptr >= md->end_subject)
4113 {
4114 SCHECK_PARTIAL();
4115 RRETURN(MATCH_NOMATCH);
4116 }
4117 GETCHARINCTEST(c, eptr);
4118 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4119 RRETURN(MATCH_NOMATCH);
4120 }
4121 break;
4122
4123 case PT_PC:
4124 for (i = 1; i <= min; i++)
4125 {
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 RRETURN(MATCH_NOMATCH);
4130 }
4131 GETCHARINCTEST(c, eptr);
4132 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4133 RRETURN(MATCH_NOMATCH);
4134 }
4135 break;
4136
4137 case PT_SC:
4138 for (i = 1; i <= min; i++)
4139 {
4140 if (eptr >= md->end_subject)
4141 {
4142 SCHECK_PARTIAL();
4143 RRETURN(MATCH_NOMATCH);
4144 }
4145 GETCHARINCTEST(c, eptr);
4146 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4147 RRETURN(MATCH_NOMATCH);
4148 }
4149 break;
4150
4151 case PT_ALNUM:
4152 for (i = 1; i <= min; i++)
4153 {
4154 int category;
4155 if (eptr >= md->end_subject)
4156 {
4157 SCHECK_PARTIAL();
4158 RRETURN(MATCH_NOMATCH);
4159 }
4160 GETCHARINCTEST(c, eptr);
4161 category = UCD_CATEGORY(c);
4162 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4163 RRETURN(MATCH_NOMATCH);
4164 }
4165 break;
4166
4167 case PT_SPACE: /* Perl space */
4168 for (i = 1; i <= min; i++)
4169 {
4170 if (eptr >= md->end_subject)
4171 {
4172 SCHECK_PARTIAL();
4173 RRETURN(MATCH_NOMATCH);
4174 }
4175 GETCHARINCTEST(c, eptr);
4176 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4177 c == CHAR_FF || c == CHAR_CR)
4178 == prop_fail_result)
4179 RRETURN(MATCH_NOMATCH);
4180 }
4181 break;
4182
4183 case PT_PXSPACE: /* POSIX space */
4184 for (i = 1; i <= min; i++)
4185 {
4186 if (eptr >= md->end_subject)
4187 {
4188 SCHECK_PARTIAL();
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 GETCHARINCTEST(c, eptr);
4192 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4193 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4194 == prop_fail_result)
4195 RRETURN(MATCH_NOMATCH);
4196 }
4197 break;
4198
4199 case PT_WORD:
4200 for (i = 1; i <= min; i++)
4201 {
4202 int category;
4203 if (eptr >= md->end_subject)
4204 {
4205 SCHECK_PARTIAL();
4206 RRETURN(MATCH_NOMATCH);
4207 }
4208 GETCHARINCTEST(c, eptr);
4209 category = UCD_CATEGORY(c);
4210 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4211 == prop_fail_result)
4212 RRETURN(MATCH_NOMATCH);
4213 }
4214 break;
4215
4216 /* This should not occur */
4217
4218 default:
4219 RRETURN(PCRE_ERROR_INTERNAL);
4220 }
4221 }
4222
4223 /* Match extended Unicode sequences. We will get here only if the
4224 support is in the binary; otherwise a compile-time error occurs. */
4225
4226 else if (ctype == OP_EXTUNI)
4227 {
4228 for (i = 1; i <= min; i++)
4229 {
4230 if (eptr >= md->end_subject)
4231 {
4232 SCHECK_PARTIAL();
4233 RRETURN(MATCH_NOMATCH);
4234 }
4235 else
4236 {
4237 int lgb, rgb;
4238 GETCHARINCTEST(c, eptr);
4239 lgb = UCD_GRAPHBREAK(c);
4240 while (eptr < md->end_subject)
4241 {
4242 int len = 1;
4243 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4244 rgb = UCD_GRAPHBREAK(c);
4245 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4246 lgb = rgb;
4247 eptr += len;
4248 }
4249 }
4250 CHECK_PARTIAL();
4251 }
4252 }
4253
4254 else
4255 #endif /* SUPPORT_UCP */
4256
4257 /* Handle all other cases when the coding is UTF-8 */
4258
4259 #ifdef SUPPORT_UTF
4260 if (utf) switch(ctype)
4261 {
4262 case OP_ANY:
4263 for (i = 1; i <= min; i++)
4264 {
4265 if (eptr >= md->end_subject)
4266 {
4267 SCHECK_PARTIAL();
4268 RRETURN(MATCH_NOMATCH);
4269 }
4270 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4271 if (md->partial != 0 &&
4272 eptr + 1 >= md->end_subject &&
4273 NLBLOCK->nltype == NLTYPE_FIXED &&
4274 NLBLOCK->nllen == 2 &&
4275 *eptr == NLBLOCK->nl[0])
4276 {
4277 md->hitend = TRUE;
4278 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4279 }
4280 eptr++;
4281 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4282 }
4283 break;
4284
4285 case OP_ALLANY:
4286 for (i = 1; i <= min; i++)
4287 {
4288 if (eptr >= md->end_subject)
4289 {
4290 SCHECK_PARTIAL();
4291 RRETURN(MATCH_NOMATCH);
4292 }
4293 eptr++;
4294 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4295 }
4296 break;
4297
4298 case OP_ANYBYTE:
4299 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4300 eptr += min;
4301 break;
4302
4303 case OP_ANYNL:
4304 for (i = 1; i <= min; i++)
4305 {
4306 if (eptr >= md->end_subject)
4307 {
4308 SCHECK_PARTIAL();
4309 RRETURN(MATCH_NOMATCH);
4310 }
4311 GETCHARINC(c, eptr);
4312 switch(c)
4313 {
4314 default: RRETURN(MATCH_NOMATCH);
4315
4316 case 0x000d:
4317 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4318 break;
4319
4320 case 0x000a:
4321 break;
4322
4323 case 0x000b:
4324 case 0x000c:
4325 case 0x0085:
4326 case 0x2028:
4327 case 0x2029:
4328 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4329 break;
4330 }
4331 }
4332 break;
4333
4334 case OP_NOT_HSPACE:
4335 for (i = 1; i <= min; i++)
4336 {
4337 if (eptr >= md->end_subject)
4338 {
4339 SCHECK_PARTIAL();
4340 RRETURN(MATCH_NOMATCH);
4341 }
4342 GETCHARINC(c, eptr);
4343 switch(c)
4344 {
4345 default: break;
4346 case 0x09: /* HT */
4347 case 0x20: /* SPACE */
4348 case 0xa0: /* NBSP */
4349 case 0x1680: /* OGHAM SPACE MARK */
4350 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4351 case 0x2000: /* EN QUAD */
4352 case 0x2001: /* EM QUAD */
4353 case 0x2002: /* EN SPACE */
4354 case 0x2003: /* EM SPACE */
4355 case 0x2004: /* THREE-PER-EM SPACE */
4356 case 0x2005: /* FOUR-PER-EM SPACE */
4357 case 0x2006: /* SIX-PER-EM SPACE */
4358 case 0x2007: /* FIGURE SPACE */
4359 case 0x2008: /* PUNCTUATION SPACE */
4360 case 0x2009: /* THIN SPACE */
4361 case 0x200A: /* HAIR SPACE */
4362 case 0x202f: /* NARROW NO-BREAK SPACE */
4363 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4364 case 0x3000: /* IDEOGRAPHIC SPACE */
4365 RRETURN(MATCH_NOMATCH);
4366 }
4367 }
4368 break;
4369
4370 case OP_HSPACE:
4371 for (i = 1; i <= min; i++)
4372 {
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 RRETURN(MATCH_NOMATCH);
4377 }
4378 GETCHARINC(c, eptr);
4379 switch(c)
4380 {
4381 default: RRETURN(MATCH_NOMATCH);
4382 case 0x09: /* HT */
4383 case 0x20: /* SPACE */
4384 case 0xa0: /* NBSP */
4385 case 0x1680: /* OGHAM SPACE MARK */
4386 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4387 case 0x2000: /* EN QUAD */
4388 case 0x2001: /* EM QUAD */
4389 case 0x2002: /* EN SPACE */
4390 case 0x2003: /* EM SPACE */
4391 case 0x2004: /* THREE-PER-EM SPACE */
4392 case 0x2005: /* FOUR-PER-EM SPACE */
4393 case 0x2006: /* SIX-PER-EM SPACE */
4394 case 0x2007: /* FIGURE SPACE */
4395 case 0x2008: /* PUNCTUATION SPACE */
4396 case 0x2009: /* THIN SPACE */
4397 case 0x200A: /* HAIR SPACE */
4398 case 0x202f: /* NARROW NO-BREAK SPACE */
4399 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4400 case 0x3000: /* IDEOGRAPHIC SPACE */
4401 break;
4402 }
4403 }
4404 break;
4405
4406 case OP_NOT_VSPACE:
4407 for (i = 1; i <= min; i++)
4408 {
4409 if (eptr >= md->end_subject)
4410 {
4411 SCHECK_PARTIAL();
4412 RRETURN(MATCH_NOMATCH);
4413 }
4414 GETCHARINC(c, eptr);
4415 switch(c)
4416 {
4417 default: break;
4418 case 0x0a: /* LF */
4419 case 0x0b: /* VT */
4420 case 0x0c: /* FF */
4421 case 0x0d: /* CR */
4422 case 0x85: /* NEL */
4423 case 0x2028: /* LINE SEPARATOR */
4424 case 0x2029: /* PARAGRAPH SEPARATOR */
4425 RRETURN(MATCH_NOMATCH);
4426 }
4427 }
4428 break;
4429
4430 case OP_VSPACE:
4431 for (i = 1; i <= min; i++)
4432 {
4433 if (eptr >= md->end_subject)
4434 {
4435 SCHECK_PARTIAL();
4436 RRETURN(MATCH_NOMATCH);
4437 }
4438 GETCHARINC(c, eptr);
4439 switch(c)
4440 {
4441 default: RRETURN(MATCH_NOMATCH);
4442 case 0x0a: /* LF */
4443 case 0x0b: /* VT */
4444 case 0x0c: /* FF */
4445 case 0x0d: /* CR */
4446 case 0x85: /* NEL */
4447 case 0x2028: /* LINE SEPARATOR */
4448 case 0x2029: /* PARAGRAPH SEPARATOR */
4449 break;
4450 }
4451 }
4452 break;
4453
4454 case OP_NOT_DIGIT:
4455 for (i = 1; i <= min; i++)
4456 {
4457 if (eptr >= md->end_subject)
4458 {
4459 SCHECK_PARTIAL();
4460 RRETURN(MATCH_NOMATCH);
4461 }
4462 GETCHARINC(c, eptr);
4463 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4464 RRETURN(MATCH_NOMATCH);
4465 }
4466 break;
4467
4468 case OP_DIGIT:
4469 for (i = 1; i <= min; i++)
4470 {
4471 if (eptr >= md->end_subject)
4472 {
4473 SCHECK_PARTIAL();
4474 RRETURN(MATCH_NOMATCH);
4475 }
4476 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4477 RRETURN(MATCH_NOMATCH);
4478 eptr++;
4479 /* No need to skip more bytes - we know it's a 1-byte character */
4480 }
4481 break;
4482
4483 case OP_NOT_WHITESPACE:
4484 for (i = 1; i <= min; i++)
4485 {
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 RRETURN(MATCH_NOMATCH);
4490 }
4491 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4492 RRETURN(MATCH_NOMATCH);
4493 eptr++;
4494 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4495 }
4496 break;
4497
4498 case OP_WHITESPACE:
4499 for (i = 1; i <= min; i++)
4500 {
4501 if (eptr >= md->end_subject)
4502 {
4503 SCHECK_PARTIAL();
4504 RRETURN(MATCH_NOMATCH);
4505 }
4506 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4507 RRETURN(MATCH_NOMATCH);
4508 eptr++;
4509 /* No need to skip more bytes - we know it's a 1-byte character */
4510 }
4511 break;
4512
4513 case OP_NOT_WORDCHAR:
4514 for (i = 1; i <= min; i++)
4515 {
4516 if (eptr >= md->end_subject)
4517 {
4518 SCHECK_PARTIAL();
4519 RRETURN(MATCH_NOMATCH);
4520 }
4521 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4522 RRETURN(MATCH_NOMATCH);
4523 eptr++;
4524 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4525 }
4526 break;
4527
4528 case OP_WORDCHAR:
4529 for (i = 1; i <= min; i++)
4530 {
4531 if (eptr >= md->end_subject)
4532 {
4533 SCHECK_PARTIAL();
4534 RRETURN(MATCH_NOMATCH);
4535 }
4536 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4537 RRETURN(MATCH_NOMATCH);
4538 eptr++;
4539 /* No need to skip more bytes - we know it's a 1-byte character */
4540 }
4541 break;
4542
4543 default:
4544 RRETURN(PCRE_ERROR_INTERNAL);
4545 } /* End switch(ctype) */
4546
4547 else
4548 #endif /* SUPPORT_UTF */
4549
4550 /* Code for the non-UTF-8 case for minimum matching of operators other
4551 than OP_PROP and OP_NOTPROP. */
4552
4553 switch(ctype)
4554 {
4555 case OP_ANY:
4556 for (i = 1; i <= min; i++)
4557 {
4558 if (eptr >= md->end_subject)
4559 {
4560 SCHECK_PARTIAL();
4561 RRETURN(MATCH_NOMATCH);
4562 }
4563 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4564 if (md->partial != 0 &&
4565 eptr + 1 >= md->end_subject &&
4566 NLBLOCK->nltype == NLTYPE_FIXED &&
4567 NLBLOCK->nllen == 2 &&
4568 *eptr == NLBLOCK->nl[0])
4569 {
4570 md->hitend = TRUE;
4571 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4572 }
4573 eptr++;
4574 }
4575 break;
4576
4577 case OP_ALLANY:
4578 if (eptr > md->end_subject - min)
4579 {
4580 SCHECK_PARTIAL();
4581 RRETURN(MATCH_NOMATCH);
4582 }
4583 eptr += min;
4584 break;
4585
4586 case OP_ANYBYTE:
4587 if (eptr > md->end_subject - min)
4588 {
4589 SCHECK_PARTIAL();
4590 RRETURN(MATCH_NOMATCH);
4591 }
4592 eptr += min;
4593 break;
4594
4595 case OP_ANYNL:
4596 for (i = 1; i <= min; i++)
4597 {
4598 if (eptr >= md->end_subject)
4599 {
4600 SCHECK_PARTIAL();
4601 RRETURN(MATCH_NOMATCH);
4602 }
4603 switch(*eptr++)
4604 {
4605 default: RRETURN(MATCH_NOMATCH);
4606
4607 case 0x000d:
4608 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4609 break;
4610
4611 case 0x000a:
4612 break;
4613
4614 case 0x000b:
4615 case 0x000c:
4616 case 0x0085:
4617 #ifdef COMPILE_PCRE16
4618 case 0x2028:
4619 case 0x2029:
4620 #endif
4621 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4622 break;
4623 }
4624 }
4625 break;
4626
4627 case OP_NOT_HSPACE:
4628 for (i = 1; i <= min; i++)
4629 {
4630 if (eptr >= md->end_subject)
4631 {
4632 SCHECK_PARTIAL();
4633 RRETURN(MATCH_NOMATCH);
4634 }
4635 switch(*eptr++)
4636 {
4637 default: break;
4638 case 0x09: /* HT */
4639 case 0x20: /* SPACE */
4640 case 0xa0: /* NBSP */
4641 #ifdef COMPILE_PCRE16
4642 case 0x1680: /* OGHAM SPACE MARK */
4643 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4644 case 0x2000: /* EN QUAD */
4645 case 0x2001: /* EM QUAD */
4646 case 0x2002: /* EN SPACE */
4647 case 0x2003: /* EM SPACE */
4648 case 0x2004: /* THREE-PER-EM SPACE */
4649 case 0x2005: /* FOUR-PER-EM SPACE */
4650 case 0x2006: /* SIX-PER-EM SPACE */
4651 case 0x2007: /* FIGURE SPACE */
4652 case 0x2008: /* PUNCTUATION SPACE */
4653 case 0x2009: /* THIN SPACE */
4654 case 0x200A: /* HAIR SPACE */
4655 case 0x202f: /* NARROW NO-BREAK SPACE */
4656 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4657 case 0x3000: /* IDEOGRAPHIC SPACE */
4658 #endif
4659 RRETURN(MATCH_NOMATCH);
4660 }
4661 }
4662 break;
4663
4664 case OP_HSPACE:
4665 for (i = 1; i <= min; i++)
4666 {
4667 if (eptr >= md->end_subject)
4668 {
4669 SCHECK_PARTIAL();
4670 RRETURN(MATCH_NOMATCH);
4671 }
4672 switch(*eptr++)
4673 {
4674 default: RRETURN(MATCH_NOMATCH);
4675 case 0x09: /* HT */
4676 case 0x20: /* SPACE */
4677 case 0xa0: /* NBSP */
4678 #ifdef COMPILE_PCRE16
4679 case 0x1680: /* OGHAM SPACE MARK */
4680 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4681 case 0x2000: /* EN QUAD */
4682 case 0x2001: /* EM QUAD */
4683 case 0x2002: /* EN SPACE */
4684 case 0x2003: /* EM SPACE */
4685 case 0x2004: /* THREE-PER-EM SPACE */
4686 case 0x2005: /* FOUR-PER-EM SPACE */
4687 case 0x2006: /* SIX-PER-EM SPACE */
4688 case 0x2007: /* FIGURE SPACE */
4689 case 0x2008: /* PUNCTUATION SPACE */
4690 case 0x2009: /* THIN SPACE */
4691 case 0x200A: /* HAIR SPACE */
4692 case 0x202f: /* NARROW NO-BREAK SPACE */
4693 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4694 case 0x3000: /* IDEOGRAPHIC SPACE */
4695 #endif
4696 break;
4697 }
4698 }
4699 break;
4700
4701 case OP_NOT_VSPACE:
4702 for (i = 1; i <= min; i++)
4703 {
4704 if (eptr >= md->end_subject)
4705 {
4706 SCHECK_PARTIAL();
4707 RRETURN(MATCH_NOMATCH);
4708 }
4709 switch(*eptr++)
4710 {
4711 default: break;
4712 case 0x0a: /* LF */
4713 case 0x0b: /* VT */
4714 case 0x0c: /* FF */
4715 case 0x0d: /* CR */
4716 case 0x85: /* NEL */
4717 #ifdef COMPILE_PCRE16
4718 case 0x2028: /* LINE SEPARATOR */
4719 case 0x2029: /* PARAGRAPH SEPARATOR */
4720 #endif
4721 RRETURN(MATCH_NOMATCH);
4722 }
4723 }
4724 break;
4725
4726 case OP_VSPACE:
4727 for (i = 1; i <= min; i++)
4728 {
4729 if (eptr >= md->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4733 }
4734 switch(*eptr++)
4735 {
4736 default: RRETURN(MATCH_NOMATCH);
4737 case 0x0a: /* LF */
4738 case 0x0b: /* VT */
4739 case 0x0c: /* FF */
4740 case 0x0d: /* CR */
4741 case 0x85: /* NEL */
4742 #ifdef COMPILE_PCRE16
4743 case 0x2028: /* LINE SEPARATOR */
4744 case 0x2029: /* PARAGRAPH SEPARATOR */
4745 #endif
4746 break;
4747 }
4748 }
4749 break;
4750
4751 case OP_NOT_DIGIT:
4752 for (i = 1; i <= min; i++)
4753 {
4754 if (eptr >= md->end_subject)
4755 {
4756 SCHECK_PARTIAL();
4757 RRETURN(MATCH_NOMATCH);
4758 }
4759 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4760 RRETURN(MATCH_NOMATCH);
4761 eptr++;
4762 }
4763 break;
4764
4765 case OP_DIGIT:
4766 for (i = 1; i <= min; i++)
4767 {
4768 if (eptr >= md->end_subject)
4769 {
4770 SCHECK_PARTIAL();
4771 RRETURN(MATCH_NOMATCH);
4772 }
4773 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4774 RRETURN(MATCH_NOMATCH);
4775 eptr++;
4776 }
4777 break;
4778
4779 case OP_NOT_WHITESPACE:
4780 for (i = 1; i <= min; i++)
4781 {
4782 if (eptr >= md->end_subject)
4783 {
4784 SCHECK_PARTIAL();
4785 RRETURN(MATCH_NOMATCH);
4786 }
4787 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4788 RRETURN(MATCH_NOMATCH);
4789 eptr++;
4790 }
4791 break;
4792
4793 case OP_WHITESPACE:
4794 for (i = 1; i <= min; i++)
4795 {
4796 if (eptr >= md->end_subject)
4797 {
4798 SCHECK_PARTIAL();
4799 RRETURN(MATCH_NOMATCH);
4800 }
4801 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4802 RRETURN(MATCH_NOMATCH);
4803 eptr++;
4804 }
4805 break;
4806
4807 case OP_NOT_WORDCHAR:
4808 for (i = 1; i <= min; i++)
4809 {
4810 if (eptr >= md->end_subject)
4811 {
4812 SCHECK_PARTIAL();
4813 RRETURN(MATCH_NOMATCH);
4814 }
4815 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4816 RRETURN(MATCH_NOMATCH);
4817 eptr++;
4818 }
4819 break;
4820
4821 case OP_WORDCHAR:
4822 for (i = 1; i <= min; i++)
4823 {
4824 if (eptr >= md->end_subject)
4825 {
4826 SCHECK_PARTIAL();
4827 RRETURN(MATCH_NOMATCH);
4828 }
4829 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4830 RRETURN(MATCH_NOMATCH);
4831 eptr++;
4832 }
4833 break;
4834
4835 default:
4836 RRETURN(PCRE_ERROR_INTERNAL);
4837 }
4838 }
4839
4840 /* If min = max, continue at the same level without recursing */
4841
4842 if (min == max) continue;
4843
4844 /* If minimizing, we have to test the rest of the pattern before each
4845 subsequent match. Again, separate the UTF-8 case for speed, and also
4846 separate the UCP cases. */
4847
4848 if (minimize)
4849 {
4850 #ifdef SUPPORT_UCP
4851 if (prop_type >= 0)
4852 {
4853 switch(prop_type)
4854 {
4855 case PT_ANY:
4856 for (fi = min;; fi++)
4857 {
4858 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4860 if (fi >= max) RRETURN(MATCH_NOMATCH);
4861 if (eptr >= md->end_subject)
4862 {
4863 SCHECK_PARTIAL();
4864 RRETURN(MATCH_NOMATCH);
4865 }
4866 GETCHARINCTEST(c, eptr);
4867 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4868 }
4869 /* Control never gets here */
4870
4871 case PT_LAMP:
4872 for (fi = min;; fi++)
4873 {
4874 int chartype;
4875 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4877 if (fi >= max) RRETURN(MATCH_NOMATCH);
4878 if (eptr >= md->end_subject)
4879 {
4880 SCHECK_PARTIAL();
4881 RRETURN(MATCH_NOMATCH);
4882 }
4883 GETCHARINCTEST(c, eptr);
4884 chartype = UCD_CHARTYPE(c);
4885 if ((chartype == ucp_Lu ||
4886 chartype == ucp_Ll ||
4887 chartype == ucp_Lt) == prop_fail_result)
4888 RRETURN(MATCH_NOMATCH);
4889 }
4890 /* Control never gets here */
4891
4892 case PT_GC:
4893 for (fi = min;; fi++)
4894 {
4895 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4896 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4897 if (fi >= max) RRETURN(MATCH_NOMATCH);
4898 if (eptr >= md->end_subject)
4899 {
4900 SCHECK_PARTIAL();
4901 RRETURN(MATCH_NOMATCH);
4902 }
4903 GETCHARINCTEST(c, eptr);
4904 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4905 RRETURN(MATCH_NOMATCH);
4906 }
4907 /* Control never gets here */
4908
4909 case PT_PC:
4910 for (fi = min;; fi++)
4911 {
4912 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4914 if (fi >= max) RRETURN(MATCH_NOMATCH);
4915 if (eptr >= md->end_subject)
4916 {
4917 SCHECK_PARTIAL();
4918 RRETURN(MATCH_NOMATCH);
4919 }
4920 GETCHARINCTEST(c, eptr);
4921 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4922 RRETURN(MATCH_NOMATCH);
4923 }
4924 /* Control never gets here */
4925
4926 case PT_SC:
4927 for (fi = min;; fi++)
4928 {
4929 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4930 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4931 if (fi >= max) RRETURN(MATCH_NOMATCH);
4932 if (eptr >= md->end_subject)
4933 {
4934 SCHECK_PARTIAL();
4935 RRETURN(MATCH_NOMATCH);
4936 }
4937 GETCHARINCTEST(c, eptr);
4938 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4939 RRETURN(MATCH_NOMATCH);
4940 }
4941 /* Control never gets here */
4942
4943 case PT_ALNUM:
4944 for (fi = min;; fi++)
4945 {
4946 int category;
4947 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4949 if (fi >= max) RRETURN(MATCH_NOMATCH);
4950 if (eptr >= md->end_subject)
4951 {
4952 SCHECK_PARTIAL();
4953 RRETURN(MATCH_NOMATCH);
4954 }
4955 GETCHARINCTEST(c, eptr);
4956 category = UCD_CATEGORY(c);
4957 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4958 RRETURN(MATCH_NOMATCH);
4959 }
4960 /* Control never gets here */
4961
4962 case PT_SPACE: /* Perl space */
4963 for (fi = min;; fi++)
4964 {
4965 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4967 if (fi >= max) RRETURN(MATCH_NOMATCH);
4968 if (eptr >= md->end_subject)
4969 {
4970 SCHECK_PARTIAL();
4971 RRETURN(MATCH_NOMATCH);
4972 }
4973 GETCHARINCTEST(c, eptr);
4974 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4975 c == CHAR_FF || c == CHAR_CR)
4976 == prop_fail_result)
4977 RRETURN(MATCH_NOMATCH);
4978 }
4979 /* Control never gets here */
4980
4981 case PT_PXSPACE: /* POSIX space */
4982 for (fi = min;; fi++)
4983 {
4984 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4985 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4986 if (fi >= max) RRETURN(MATCH_NOMATCH);
4987 if (eptr >= md->end_subject)
4988 {
4989 SCHECK_PARTIAL();
4990 RRETURN(MATCH_NOMATCH);
4991 }
4992 GETCHARINCTEST(c, eptr);
4993 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4994 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4995 == prop_fail_result)
4996 RRETURN(MATCH_NOMATCH);
4997 }
4998 /* Control never gets here */
4999
5000 case PT_WORD:
5001 for (fi = min;; fi++)
5002 {
5003 int category;
5004 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5006 if (fi >= max) RRETURN(MATCH_NOMATCH);
5007 if (eptr >= md->end_subject)
5008 {
5009 SCHECK_PARTIAL();
5010 RRETURN(MATCH_NOMATCH);
5011 }
5012 GETCHARINCTEST(c, eptr);
5013 category = UCD_CATEGORY(c);
5014 if ((category == ucp_L ||
5015 category == ucp_N ||
5016 c == CHAR_UNDERSCORE)
5017 == prop_fail_result)
5018 RRETURN(MATCH_NOMATCH);
5019 }
5020 /* Control never gets here */
5021
5022 /* This should never occur */
5023
5024 default:
5025 RRETURN(PCRE_ERROR_INTERNAL);
5026 }
5027 }
5028
5029 /* Match extended Unicode sequences. We will get here only if the
5030 support is in the binary; otherwise a compile-time error occurs. */
5031
5032 else if (ctype == OP_EXTUNI)
5033 {
5034 for (fi = min;; fi++)
5035 {
5036 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5037 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5038 if (fi >= max) RRETURN(MATCH_NOMATCH);
5039 if (eptr >= md->end_subject)
5040 {
5041 SCHECK_PARTIAL();
5042 RRETURN(MATCH_NOMATCH);
5043 }
5044 else
5045 {
5046 int lgb, rgb;
5047 GETCHARINCTEST(c, eptr);
5048 lgb = UCD_GRAPHBREAK(c);
5049 while (eptr < md->end_subject)
5050 {
5051 int len = 1;
5052 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5053 rgb = UCD_GRAPHBREAK(c);
5054 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5055 lgb = rgb;
5056 eptr += len;
5057 }
5058 }
5059 CHECK_PARTIAL();
5060 }
5061 }
5062 else
5063 #endif /* SUPPORT_UCP */
5064
5065 #ifdef SUPPORT_UTF
5066 if (utf)
5067 {
5068 for (fi = min;; fi++)
5069 {
5070 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5072 if (fi >= max) RRETURN(MATCH_NOMATCH);
5073 if (eptr >= md->end_subject)
5074 {
5075 SCHECK_PARTIAL();
5076 RRETURN(MATCH_NOMATCH);
5077 }
5078 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5079 RRETURN(MATCH_NOMATCH);
5080 GETCHARINC(c, eptr);
5081 switch(ctype)
5082 {
5083 case OP_ANY: /* This is the non-NL case */
5084 if (md->partial != 0 && /* Take care with CRLF partial */
5085 eptr >= md->end_subject &&
5086 NLBLOCK->nltype == NLTYPE_FIXED &&
5087 NLBLOCK->nllen == 2 &&
5088 c == NLBLOCK->nl[0])
5089 {
5090 md->hitend = TRUE;
5091 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5092 }
5093 break;
5094
5095 case OP_ALLANY:
5096 case OP_ANYBYTE:
5097 break;
5098
5099 case OP_ANYNL:
5100 switch(c)
5101 {
5102 default: RRETURN(MATCH_NOMATCH);
5103 case 0x000d:
5104 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5105 break;
5106 case 0x000a:
5107 break;
5108
5109 case 0x000b:
5110 case 0x000c:
5111 case 0x0085:
5112 case 0x2028:
5113 case 0x2029:
5114 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5115 break;
5116 }
5117 break;
5118
5119 case OP_NOT_HSPACE:
5120 switch(c)
5121 {
5122 default: break;
5123 case 0x09: /* HT */
5124 case 0x20: /* SPACE */
5125 case 0xa0: /* NBSP */
5126 case 0x1680: /* OGHAM SPACE MARK */
5127 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5128 case 0x2000: /* EN QUAD */
5129 case 0x2001: /* EM QUAD */
5130 case 0x2002: /* EN SPACE */
5131 case 0x2003: /* EM SPACE */
5132 case 0x2004: /* THREE-PER-EM SPACE */
5133 case 0x2005: /* FOUR-PER-EM SPACE */
5134 case 0x2006: /* SIX-PER-EM SPACE */
5135 case 0x2007: /* FIGURE SPACE */
5136 case 0x2008: /* PUNCTUATION SPACE */
5137 case 0x2009: /* THIN SPACE */
5138 case 0x200A: /* HAIR SPACE */
5139 case 0x202f: /* NARROW NO-BREAK SPACE */
5140 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5141 case 0x3000: /* IDEOGRAPHIC SPACE */
5142 RRETURN(MATCH_NOMATCH);
5143 }
5144 break;
5145
5146 case OP_HSPACE:
5147 switch(c)
5148 {
5149 default: RRETURN(MATCH_NOMATCH);
5150 case 0x09: /* HT */
5151 case 0x20: /* SPACE */
5152 case 0xa0: /* NBSP */
5153 case 0x1680: /* OGHAM SPACE MARK */
5154 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5155 case 0x2000: /* EN QUAD */
5156 case 0x2001: /* EM QUAD */
5157 case 0x2002: /* EN SPACE */
5158 case 0x2003: /* EM SPACE */
5159 case 0x2004: /* THREE-PER-EM SPACE */
5160 case 0x2005: /* FOUR-PER-EM SPACE */
5161 case 0x2006: /* SIX-PER-EM SPACE */
5162 case 0x2007: /* FIGURE SPACE */
5163 case 0x2008: /* PUNCTUATION SPACE */
5164 case 0x2009: /* THIN SPACE */
5165 case 0x200A: /* HAIR SPACE */
5166 case 0x202f: /* NARROW NO-BREAK SPACE */
5167 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5168 case 0x3000: /* IDEOGRAPHIC SPACE */
5169 break;
5170 }
5171 break;
5172
5173 case OP_NOT_VSPACE:
5174 switch(c)
5175 {
5176 default: break;
5177 case 0x0a: /* LF */
5178 case 0x0b: /* VT */
5179 case 0x0c: /* FF */
5180 case 0x0d: /* CR */
5181 case 0x85: /* NEL */
5182 case 0x2028: /* LINE SEPARATOR */
5183 case 0x2029: /* PARAGRAPH SEPARATOR */
5184 RRETURN(MATCH_NOMATCH);
5185 }
5186 break;
5187
5188 case OP_VSPACE:
5189 switch(c)
5190 {
5191 default: RRETURN(MATCH_NOMATCH);
5192 case 0x0a: /* LF */
5193 case 0x0b: /* VT */
5194 case 0x0c: /* FF */
5195 case 0x0d: /* CR */
5196 case 0x85: /* NEL */
5197 case 0x2028: /* LINE SEPARATOR */
5198 case 0x2029: /* PARAGRAPH SEPARATOR */
5199 break;
5200 }
5201 break;
5202
5203 case OP_NOT_DIGIT:
5204 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5205 RRETURN(MATCH_NOMATCH);
5206 break;
5207
5208 case OP_DIGIT:
5209 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5210 RRETURN(MATCH_NOMATCH);
5211 break;
5212
5213 case OP_NOT_WHITESPACE:
5214 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5215 RRETURN(MATCH_NOMATCH);
5216 break;
5217
5218 case OP_WHITESPACE:
5219 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5220 RRETURN(MATCH_NOMATCH);
5221 break;
5222
5223 case OP_NOT_WORDCHAR:
5224 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5225 RRETURN(MATCH_NOMATCH);
5226 break;
5227
5228 case OP_WORDCHAR:
5229 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5230 RRETURN(MATCH_NOMATCH);
5231 break;
5232
5233 default:
5234 RRETURN(PCRE_ERROR_INTERNAL);
5235 }
5236 }
5237 }
5238 else
5239 #endif
5240 /* Not UTF mode */
5241 {
5242 for (fi = min;; fi++)
5243 {
5244 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5245 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5246 if (fi >= max) RRETURN(MATCH_NOMATCH);
5247 if (eptr >= md->end_subject)
5248 {
5249 SCHECK_PARTIAL();
5250 RRETURN(MATCH_NOMATCH);
5251 }
5252 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5253 RRETURN(MATCH_NOMATCH);
5254 c = *eptr++;
5255 switch(ctype)
5256 {
5257 case OP_ANY: /* This is the non-NL case */
5258 if (md->partial != 0 && /* Take care with CRLF partial */
5259 eptr >= md->end_subject &&
5260 NLBLOCK->nltype == NLTYPE_FIXED &&
5261 NLBLOCK->nllen == 2 &&
5262 c == NLBLOCK->nl[0])
5263 {
5264 md->hitend = TRUE;
5265 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5266 }
5267 break;
5268
5269 case OP_ALLANY:
5270 case OP_ANYBYTE:
5271 break;
5272
5273 case OP_ANYNL:
5274 switch(c)
5275 {
5276 default: RRETURN(MATCH_NOMATCH);
5277 case 0x000d:
5278 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5279 break;
5280
5281 case 0x000a:
5282 break;
5283
5284 case 0x000b:
5285 case 0x000c:
5286 case 0x0085:
5287 #ifdef COMPILE_PCRE16
5288 case 0x2028:
5289 case 0x2029:
5290 #endif
5291 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5292 break;
5293 }
5294 break;
5295
5296 case OP_NOT_HSPACE:
5297 switch(c)
5298 {
5299 default: break;
5300 case 0x09: /* HT */
5301 case 0x20: /* SPACE */
5302 case 0xa0: /* NBSP */
5303 #ifdef COMPILE_PCRE16
5304 case 0x1680: /* OGHAM SPACE MARK */
5305 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5306 case 0x2000: /* EN QUAD */
5307 case 0x2001: /* EM QUAD */
5308 case 0x2002: /* EN SPACE */
5309 case 0x2003: /* EM SPACE */
5310 case 0x2004: /* THREE-PER-EM SPACE */
5311 case 0x2005: /* FOUR-PER-EM SPACE */
5312 case 0x2006: /* SIX-PER-EM SPACE */
5313 case 0x2007: /* FIGURE SPACE */
5314 case 0x2008: /* PUNCTUATION SPACE */
5315 case 0x2009: /* THIN SPACE */
5316 case 0x200A: /* HAIR SPACE */
5317 case 0x202f: /* NARROW NO-BREAK SPACE */
5318 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5319 case 0x3000: /* IDEOGRAPHIC SPACE */
5320 #endif
5321 RRETURN(MATCH_NOMATCH);
5322 }
5323 break;
5324
5325 case OP_HSPACE:
5326 switch(c)
5327 {
5328 default: RRETURN(MATCH_NOMATCH);
5329 case 0x09: /* HT */
5330 case 0x20: /* SPACE */
5331 case 0xa0: /* NBSP */
5332 #ifdef COMPILE_PCRE16
5333 case 0x1680: /* OGHAM SPACE MARK */
5334 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5335 case 0x2000: /* EN QUAD */
5336 case 0x2001: /* EM QUAD */
5337 case 0x2002: /* EN SPACE */
5338 case 0x2003: /* EM SPACE */
5339 case 0x2004: /* THREE-PER-EM SPACE */
5340 case 0x2005: /* FOUR-PER-EM SPACE */
5341 case 0x2006: /* SIX-PER-EM SPACE */
5342 case 0x2007: /* FIGURE SPACE */
5343 case 0x2008: /* PUNCTUATION SPACE */
5344 case 0x2009: /* THIN SPACE */
5345 case 0x200A: /* HAIR SPACE */
5346 case 0x202f: /* NARROW NO-BREAK SPACE */
5347 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5348 case 0x3000: /* IDEOGRAPHIC SPACE */
5349 #endif
5350 break;
5351 }
5352 break;
5353
5354 case OP_NOT_VSPACE:
5355 switch(c)
5356 {
5357 default: break;
5358 case 0x0a: /* LF */
5359 case 0x0b: /* VT */
5360 case 0x0c: /* FF */
5361 case 0x0d: /* CR */
5362 case 0x85: /* NEL */
5363 #ifdef COMPILE_PCRE16
5364 case 0x2028: /* LINE SEPARATOR */
5365 case 0x2029: /* PARAGRAPH SEPARATOR */
5366 #endif
5367 RRETURN(MATCH_NOMATCH);
5368 }
5369 break;
5370
5371 case OP_VSPACE:
5372 switch(c)
5373 {
5374 default: RRETURN(MATCH_NOMATCH);
5375 case 0x0a: /* LF */
5376 case 0x0b: /* VT */
5377 case 0x0c: /* FF */
5378 case 0x0d: /* CR */
5379 case 0x85: /* NEL */
5380 #ifdef COMPILE_PCRE16
5381 case 0x2028: /* LINE SEPARATOR */
5382 case 0x2029: /* PARAGRAPH SEPARATOR */
5383 #endif
5384 break;
5385 }
5386 break;
5387
5388 case OP_NOT_DIGIT:
5389 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5390 break;
5391
5392 case OP_DIGIT:
5393 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5394 break;
5395
5396 case OP_NOT_WHITESPACE:
5397 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5398 break;
5399
5400 case OP_WHITESPACE:
5401 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5402 break;
5403
5404 case OP_NOT_WORDCHAR:
5405 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5406 break;
5407
5408 case OP_WORDCHAR:
5409 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5410 break;
5411
5412 default:
5413 RRETURN(PCRE_ERROR_INTERNAL);
5414 }
5415 }
5416 }
5417 /* Control never gets here */
5418 }
5419
5420 /* If maximizing, it is worth using inline code for speed, doing the type
5421 test once at the start (i.e. keep it out of the loop). Again, keep the
5422 UTF-8 and UCP stuff separate. */
5423
5424 else
5425 {
5426 pp = eptr; /* Remember where we started */
5427
5428 #ifdef SUPPORT_UCP
5429 if (prop_type >= 0)
5430 {
5431 switch(prop_type)
5432 {
5433 case PT_ANY:
5434 for (i = min; i < max; i++)
5435 {
5436 int len = 1;
5437 if (eptr >= md->end_subject)
5438 {
5439 SCHECK_PARTIAL();
5440 break;
5441 }
5442 GETCHARLENTEST(c, eptr, len);
5443 if (prop_fail_result) break;
5444 eptr+= len;
5445 }
5446 break;
5447
5448 case PT_LAMP:
5449 for (i = min; i < max; i++)
5450 {
5451 int chartype;
5452 int len = 1;
5453 if (eptr >= md->end_subject)
5454 {
5455 SCHECK_PARTIAL();
5456 break;
5457 }
5458 GETCHARLENTEST(c, eptr, len);
5459 chartype = UCD_CHARTYPE(c);
5460 if ((chartype == ucp_Lu ||
5461 chartype == ucp_Ll ||
5462 chartype == ucp_Lt) == prop_fail_result)
5463 break;
5464 eptr+= len;
5465 }
5466 break;
5467
5468 case PT_GC:
5469 for (i = min; i < max; i++)
5470 {
5471 int len = 1;
5472 if (eptr >= md->end_subject)
5473 {
5474 SCHECK_PARTIAL();
5475 break;
5476 }
5477 GETCHARLENTEST(c, eptr, len);
5478 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5479 eptr+= len;
5480 }
5481 break;
5482
5483 case PT_PC:
5484 for (i = min; i < max; i++)
5485 {
5486 int len = 1;
5487 if (eptr >= md->end_subject)
5488 {
5489 SCHECK_PARTIAL();
5490 break;
5491 }
5492 GETCHARLENTEST(c, eptr, len);
5493 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5494 eptr+= len;
5495 }
5496 break;
5497
5498 case PT_SC:
5499 for (i = min; i < max; i++)
5500 {
5501 int len = 1;
5502 if (eptr >= md->end_subject)
5503 {
5504 SCHECK_PARTIAL();
5505 break;
5506 }
5507 GETCHARLENTEST(c, eptr, len);
5508 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5509 eptr+= len;
5510 }
5511 break;
5512
5513 case PT_ALNUM:
5514 for (i = min; i < max; i++)
5515 {
5516 int category;
5517 int len = 1;
5518 if (eptr >= md->end_subject)
5519 {
5520 SCHECK_PARTIAL();
5521 break;
5522 }
5523 GETCHARLENTEST(c, eptr, len);
5524 category = UCD_CATEGORY(c);
5525 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5526 break;
5527 eptr+= len;
5528 }
5529 break;
5530
5531 case PT_SPACE: /* Perl space */
5532 for (i = min; i < max; i++)
5533 {
5534 int len = 1;
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 GETCHARLENTEST(c, eptr, len);
5541 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5542 c == CHAR_FF || c == CHAR_CR)
5543 == prop_fail_result)
5544 break;
5545 eptr+= len;
5546 }
5547 break;
5548
5549 case PT_PXSPACE: /* POSIX space */
5550 for (i = min; i < max; i++)
5551 {
5552 int len = 1;
5553 if (eptr >= md->end_subject)
5554 {
5555 SCHECK_PARTIAL();
5556 break;
5557 }
5558 GETCHARLENTEST(c, eptr, len);
5559 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5560 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5561 == prop_fail_result)
5562 break;
5563 eptr+= len;
5564 }
5565 break;
5566
5567 case PT_WORD:
5568 for (i = min; i < max; i++)
5569 {
5570 int category;
5571 int len = 1;
5572 if (eptr >= md->end_subject)
5573 {
5574 SCHECK_PARTIAL();
5575 break;
5576 }
5577 GETCHARLENTEST(c, eptr, len);
5578 category = UCD_CATEGORY(c);
5579 if ((category == ucp_L || category == ucp_N ||
5580 c == CHAR_UNDERSCORE) == prop_fail_result)
5581 break;
5582 eptr+= len;
5583 }
5584 break;
5585
5586 default:
5587 RRETURN(PCRE_ERROR_INTERNAL);
5588 }
5589
5590 /* eptr is now past the end of the maximum run */
5591
5592 if (possessive) continue;
5593 for(;;)
5594 {
5595 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5596 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5597 if (eptr-- == pp) break; /* Stop if tried at original pos */
5598 if (utf) BACKCHAR(eptr);
5599 }
5600 }
5601
5602 /* Match extended Unicode sequences. We will get here only if the
5603 support is in the binary; otherwise a compile-time error occurs. */
5604
5605 else if (ctype == OP_EXTUNI)
5606 {
5607 for (i = min; i < max; i++)
5608 {
5609 if (eptr >= md->end_subject)
5610 {
5611 SCHECK_PARTIAL();
5612 break;
5613 }
5614 else
5615 {
5616 int lgb, rgb;
5617 GETCHARINCTEST(c, eptr);
5618 lgb = UCD_GRAPHBREAK(c);
5619 while (eptr < md->end_subject)
5620 {
5621 int len = 1;
5622 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5623 rgb = UCD_GRAPHBREAK(c);
5624 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5625 lgb = rgb;
5626 eptr += len;
5627 }
5628 }
5629 CHECK_PARTIAL();
5630 }
5631
5632 /* eptr is now past the end of the maximum run */
5633
5634 if (possessive) continue;
5635
5636 for(;;)
5637 {
5638 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5640 if (eptr-- == pp) break; /* Stop if tried at original pos */
5641 for (;;) /* Move back over one extended */
5642 {
5643 if (!utf) c = *eptr; else
5644 {
5645 BACKCHAR(eptr);
5646 GETCHAR(c, eptr);
5647 }
5648 if (UCD_CATEGORY(c) != ucp_M) break;
5649 eptr--;
5650 }
5651 }
5652 }
5653
5654 else
5655 #endif /* SUPPORT_UCP */
5656
5657 #ifdef SUPPORT_UTF
5658 if (utf)
5659 {
5660 switch(ctype)
5661 {
5662 case OP_ANY:
5663 if (max < INT_MAX)
5664 {
5665 for (i = min; i < max; i++)
5666 {
5667 if (eptr >= md->end_subject)
5668 {
5669 SCHECK_PARTIAL();
5670 break;
5671 }
5672 if (IS_NEWLINE(eptr)) break;
5673 if (md->partial != 0 && /* Take care with CRLF partial */
5674 eptr + 1 >= md->end_subject &&
5675 NLBLOCK->nltype == NLTYPE_FIXED &&
5676 NLBLOCK->nllen == 2 &&
5677 *eptr == NLBLOCK->nl[0])
5678 {
5679 md->hitend = TRUE;
5680 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5681 }
5682 eptr++;
5683 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5684 }
5685 }
5686
5687 /* Handle unlimited UTF-8 repeat */
5688
5689 else
5690 {
5691 for (i = min; i < max; i++)
5692 {
5693 if (eptr >= md->end_subject)
5694 {
5695 SCHECK_PARTIAL();
5696 break;
5697 }
5698 if (IS_NEWLINE(eptr)) break;
5699 if (md->partial != 0 && /* Take care with CRLF partial */
5700 eptr + 1 >= md->end_subject &&
5701 NLBLOCK->nltype == NLTYPE_FIXED &&
5702 NLBLOCK->nllen == 2 &&
5703 *eptr == NLBLOCK->nl[0])
5704 {
5705 md->hitend = TRUE;
5706 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5707 }
5708 eptr++;
5709 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5710 }
5711 }
5712 break;
5713
5714 case OP_ALLANY:
5715 if (max < INT_MAX)
5716 {
5717 for (i = min; i < max; i++)
5718 {
5719 if (eptr >= md->end_subject)
5720 {
5721 SCHECK_PARTIAL();
5722 break;
5723 }
5724 eptr++;
5725 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5726 }
5727 }
5728 else
5729 {
5730 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5731 SCHECK_PARTIAL();
5732 }
5733 break;
5734
5735 /* The byte case is the same as non-UTF8 */
5736
5737 case OP_ANYBYTE:
5738 c = max - min;
5739 if (c > (unsigned int)(md->end_subject - eptr))
5740 {
5741 eptr = md->end_subject;
5742 SCHECK_PARTIAL();
5743 }
5744 else eptr += c;
5745 break;
5746
5747 case OP_ANYNL:
5748 for (i = min; i < max; i++)
5749 {
5750 int len = 1;
5751 if (eptr >= md->end_subject)
5752 {
5753 SCHECK_PARTIAL();
5754 break;
5755 }
5756 GETCHARLEN(c, eptr, len);
5757 if (c == 0x000d)
5758 {
5759 if (++eptr >= md->end_subject) break;
5760 if (*eptr == 0x000a) eptr++;
5761 }
5762 else
5763 {
5764 if (c != 0x000a &&
5765 (md->bsr_anycrlf ||
5766 (c != 0x000b && c != 0x000c &&
5767 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5768 break;
5769 eptr += len;
5770 }
5771 }
5772 break;
5773
5774 case OP_NOT_HSPACE:
5775 case OP_HSPACE:
5776 for (i = min; i < max; i++)
5777 {
5778 BOOL gotspace;
5779 int len = 1;
5780 if (eptr >= md->end_subject)
5781 {
5782 SCHECK_PARTIAL();
5783 break;
5784 }
5785 GETCHARLEN(c, eptr, len);
5786 switch(c)
5787 {
5788 default: gotspace = FALSE; break;
5789 case 0x09: /* HT */
5790 case 0x20: /* SPACE */
5791 case 0xa0: /* NBSP */
5792 case 0x1680: /* OGHAM SPACE MARK */
5793 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5794 case 0x2000: /* EN QUAD */
5795 case 0x2001: /* EM QUAD */
5796 case 0x2002: /* EN SPACE */
5797 case 0x2003: /* EM SPACE */
5798 case 0x2004: /* THREE-PER-EM SPACE */
5799 case 0x2005: /* FOUR-PER-EM SPACE */
5800 case 0x2006: /* SIX-PER-EM SPACE */
5801 case 0x2007: /* FIGURE SPACE */
5802 case 0x2008: /* PUNCTUATION SPACE */
5803 case 0x2009: /* THIN SPACE */
5804 case 0x200A: /* HAIR SPACE */
5805 case 0x202f: /* NARROW NO-BREAK SPACE */
5806 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5807 case 0x3000: /* IDEOGRAPHIC SPACE */
5808 gotspace = TRUE;
5809 break;
5810 }
5811 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5812 eptr += len;
5813 }
5814 break;
5815
5816 case OP_NOT_VSPACE:
5817 case OP_VSPACE:
5818 for (i = min; i < max; i++)
5819 {
5820 BOOL gotspace;
5821 int len = 1;
5822 if (eptr >= md->end_subject)
5823 {
5824 SCHECK_PARTIAL();
5825 break;
5826 }
5827 GETCHARLEN(c, eptr, len);
5828 switch(c)
5829 {
5830 default: gotspace = FALSE; break;
5831 case 0x0a: /* LF */
5832 case 0x0b: /* VT */
5833 case 0x0c: /* FF */
5834 case 0x0d: /* CR */
5835 case 0x85: /* NEL */
5836 case 0x2028: /* LINE SEPARATOR */
5837 case 0x2029: /* PARAGRAPH SEPARATOR */
5838 gotspace = TRUE;
5839 break;
5840 }
5841 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5842 eptr += len;
5843 }
5844 break;
5845
5846 case OP_NOT_DIGIT:
5847 for (i = min; i < max; i++)
5848 {
5849 int len = 1;
5850 if (eptr >= md->end_subject)
5851 {
5852 SCHECK_PARTIAL();
5853 break;
5854 }
5855 GETCHARLEN(c, eptr, len);
5856 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5857 eptr+= len;
5858 }
5859 break;
5860
5861 case OP_DIGIT:
5862 for (i = min; i < max; i++)
5863 {
5864 int len = 1;
5865 if (eptr >= md->end_subject)
5866 {
5867 SCHECK_PARTIAL();
5868 break;
5869 }
5870 GETCHARLEN(c, eptr, len);
5871 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5872 eptr+= len;
5873 }
5874 break;
5875
5876 case OP_NOT_WHITESPACE:
5877 for (i = min; i < max; i++)
5878 {
5879 int len = 1;
5880 if (eptr >= md->end_subject)
5881 {
5882 SCHECK_PARTIAL();
5883 break;
5884 }
5885 GETCHARLEN(c, eptr, len);
5886 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5887 eptr+= len;
5888 }
5889 break;
5890
5891 case OP_WHITESPACE:
5892 for (i = min; i < max; i++)
5893 {
5894 int len = 1;
5895 if (eptr >= md->end_subject)
5896 {
5897 SCHECK_PARTIAL();
5898 break;
5899 }
5900 GETCHARLEN(c, eptr, len);
5901 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5902 eptr+= len;
5903 }
5904 break;
5905
5906 case OP_NOT_WORDCHAR:
5907 for (i = min; i < max; i++)
5908 {
5909 int len = 1;
5910 if (eptr >= md->end_subject)
5911 {
5912 SCHECK_PARTIAL();
5913 break;
5914 }
5915 GETCHARLEN(c, eptr, len);
5916 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5917 eptr+= len;
5918 }
5919 break;
5920
5921 case OP_WORDCHAR:
5922 for (i = min; i < max; i++)
5923 {
5924 int len = 1;
5925 if (eptr >= md->end_subject)
5926 {
5927 SCHECK_PARTIAL();
5928 break;
5929 }
5930 GETCHARLEN(c, eptr, len);
5931 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5932 eptr+= len;
5933 }
5934 break;
5935
5936 default:
5937 RRETURN(PCRE_ERROR_INTERNAL);
5938 }
5939
5940 /* eptr is now past the end of the maximum run. If possessive, we are
5941 done (no backing up). Otherwise, match at this position; anything other
5942 than no match is immediately returned. For nomatch, back up one
5943 character, unless we are matching \R and the last thing matched was
5944 \r\n, in which case, back up two bytes. */
5945
5946 if (possessive) continue;
5947 for(;;)
5948 {
5949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5951 if (eptr-- == pp) break; /* Stop if tried at original pos */
5952 BACKCHAR(eptr);
5953 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5954 eptr[-1] == '\r') eptr--;
5955 }
5956 }
5957 else
5958 #endif /* SUPPORT_UTF */
5959 /* Not UTF mode */
5960 {
5961 switch(ctype)
5962 {
5963 case OP_ANY:
5964 for (i = min; i < max; i++)
5965 {
5966 if (eptr >= md->end_subject)
5967 {
5968 SCHECK_PARTIAL();
5969 break;
5970 }
5971 if (IS_NEWLINE(eptr)) break;
5972 if (md->partial != 0 && /* Take care with CRLF partial */
5973 eptr + 1 >= md->end_subject &&
5974 NLBLOCK->nltype == NLTYPE_FIXED &&
5975 NLBLOCK->nllen == 2 &&
5976 *eptr == NLBLOCK->nl[0])
5977 {
5978 md->hitend = TRUE;
5979 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5980 }
5981 eptr++;
5982 }
5983 break;
5984
5985 case OP_ALLANY:
5986 case OP_ANYBYTE:
5987 c = max - min;
5988 if (c > (unsigned int)(md->end_subject - eptr))
5989 {
5990 eptr = md->end_subject;
5991 SCHECK_PARTIAL();
5992 }
5993 else eptr += c;
5994 break;
5995
5996 case OP_ANYNL:
5997 for (i = min; i < max; i++)
5998 {
5999 if (eptr >= md->end_subject)
6000 {
6001 SCHECK_PARTIAL();
6002 break;
6003 }
6004 c = *eptr;
6005 if (c == 0x000d)
6006 {
6007 if (++eptr >= md->end_subject) break;
6008 if (*eptr == 0x000a) eptr++;
6009 }
6010 else
6011 {
6012 if (c != 0x000a && (md->bsr_anycrlf ||
6013 (c != 0x000b && c != 0x000c && c != 0x0085
6014 #ifdef COMPILE_PCRE16
6015 && c != 0x2028 && c != 0x2029
6016 #endif
6017 ))) break;
6018 eptr++;
6019 }
6020 }
6021 break;
6022
6023 case OP_NOT_HSPACE:
6024 for (i = min; i < max; i++)
6025 {
6026 if (eptr >= md->end_subject)
6027 {
6028 SCHECK_PARTIAL();
6029 break;
6030 }
6031 c = *eptr;
6032 if (c == 0x09 || c == 0x20 || c == 0xa0
6033 #ifdef COMPILE_PCRE16
6034 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6035 || c == 0x202f || c == 0x205f || c == 0x3000
6036 #endif
6037 ) break;
6038 eptr++;
6039 }
6040 break;
6041
6042 case OP_HSPACE:
6043 for (i = min; i < max; i++)
6044 {
6045 if (eptr >= md->end_subject)
6046 {
6047 SCHECK_PARTIAL();
6048 break;
6049 }
6050 c = *eptr;
6051 if (c != 0x09 && c != 0x20 && c != 0xa0
6052 #ifdef COMPILE_PCRE16
6053 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6054 && c != 0x202f && c != 0x205f && c != 0x3000
6055 #endif
6056 ) break;
6057 eptr++;
6058 }
6059 break;
6060
6061 case OP_NOT_VSPACE:
6062 for (i = min; i < max; i++)
6063 {
6064 if (eptr >= md->end_subject)
6065 {
6066 SCHECK_PARTIAL();
6067 break;
6068 }
6069 c = *eptr;
6070 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6071 #ifdef COMPILE_PCRE16
6072 || c == 0x2028 || c == 0x2029
6073 #endif
6074 ) break;
6075 eptr++;
6076 }
6077 break;
6078
6079 case OP_VSPACE:
6080 for (i = min; i < max; i++)
6081 {
6082 if (eptr >= md->end_subject)
6083 {
6084 SCHECK_PARTIAL();
6085 break;
6086 }
6087 c = *eptr;
6088 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6089 #ifdef COMPILE_PCRE16
6090 && c != 0x2028 && c != 0x2029
6091 #endif
6092 ) break;
6093 eptr++;
6094 }
6095 break;
6096
6097 case OP_NOT_DIGIT:
6098 for (i = min; i < max; i++)
6099 {
6100 if (eptr >= md->end_subject)
6101 {
6102 SCHECK_PARTIAL();
6103 break;
6104 }
6105 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6106 eptr++;
6107 }
6108 break;
6109
6110 case OP_DIGIT:
6111 for (i = min; i < max; i++)
6112 {
6113 if (eptr >= md->end_subject)
6114 {
6115 SCHECK_PARTIAL();
6116 break;
6117 }
6118 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6119 eptr++;
6120 }
6121 break;
6122
6123 case OP_NOT_WHITESPACE:
6124 for (i = min; i < max; i++)
6125 {
6126 if (eptr >= md->end_subject)
6127 {
6128 SCHECK_PARTIAL();
6129 break;
6130 }
6131 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6132 eptr++;
6133 }
6134 break;
6135
6136 case OP_WHITESPACE:
6137 for (i = min; i < max; i++)
6138 {
6139 if (eptr >= md->end_subject)
6140 {
6141 SCHECK_PARTIAL();
6142 break;
6143 }
6144 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6145 eptr++;
6146 }
6147 break;
6148
6149 case OP_NOT_WORDCHAR:
6150 for (i = min; i < max; i++)
6151 {
6152 if (eptr >= md->end_subject)
6153 {
6154 SCHECK_PARTIAL();
6155 break;
6156 }
6157 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6158 eptr++;
6159 }
6160 break;
6161
6162 case OP_WORDCHAR:
6163 for (i = min; i < max; i++)
6164 {
6165 if (eptr >= md->end_subject)
6166 {
6167 SCHECK_PARTIAL();
6168 break;
6169 }
6170 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6171 eptr++;
6172 }
6173 break;
6174
6175 default:
6176 RRETURN(PCRE_ERROR_INTERNAL);
6177 }
6178
6179 /* eptr is now past the end of the maximum run. If possessive, we are
6180 done (no backing up). Otherwise, match at this position; anything other
6181 than no match is immediately returned. For nomatch, back up one
6182 character (byte), unless we are matching \R and the last thing matched
6183 was \r\n, in which case, back up two bytes. */
6184
6185 if (possessive) continue;
6186 while (eptr >= pp)
6187 {
6188 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6190 eptr--;
6191 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6192 eptr[-1] == '\r') eptr--;
6193 }
6194 }
6195
6196 /* Get here if we can't make it match with any permitted repetitions */
6197
6198 RRETURN(MATCH_NOMATCH);
6199 }
6200 /* Control never gets here */
6201
6202 /* There's been some horrible disaster. Arrival here can only mean there is
6203 something seriously wrong in the code above or the OP_xxx definitions. */
6204
6205 default:
6206 DPRINTF(("Unknown opcode %d\n", *ecode));
6207 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6208 }
6209
6210 /* Do not stick any code in here without much thought; it is assumed
6211 that "continue" in the code above comes out to here to repeat the main
6212 loop. */
6213
6214 } /* End of main loop */
6215 /* Control never reaches here */
6216
6217
6218 /* When compiling to use the heap rather than the stack for recursive calls to
6219 match(), the RRETURN() macro jumps here. The number that is saved in
6220 frame->Xwhere indicates which label we actually want to return to. */
6221
6222 #ifdef NO_RECURSE
6223 #define LBL(val) case val: goto L_RM##val;
6224 HEAP_RETURN:
6225 switch (frame->Xwhere)
6226 {
6227 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6228 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6229 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6230 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6231 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6232 LBL(65) LBL(66)
6233 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6234 LBL(21)
6235 #endif
6236 #ifdef SUPPORT_UTF
6237 LBL(16) LBL(18) LBL(20)
6238 LBL(22) LBL(23) LBL(28) LBL(30)
6239 LBL(32) LBL(34) LBL(42) LBL(46)
6240 #ifdef SUPPORT_UCP
6241 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6242 LBL(59) LBL(60) LBL(61) LBL(62)
6243 #endif /* SUPPORT_UCP */
6244 #endif /* SUPPORT_UTF */
6245 default:
6246 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6247
6248 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6249
6250 return PCRE_ERROR_INTERNAL;
6251 }
6252 #undef LBL
6253 #endif /* NO_RECURSE */
6254 }
6255
6256
6257 /***************************************************************************
6258 ****************************************************************************
6259 RECURSION IN THE match() FUNCTION
6260
6261 Undefine all the macros that were defined above to handle this. */
6262
6263 #ifdef NO_RECURSE /* The names below are macros only when NO_RECURSE is set */
6264 #undef eptr
6265 #undef ecode
6266 #undef mstart
6267 #undef offset_top
6268 #undef eptrb
6269 #undef flags
6270
6271 #undef callpat
6272 #undef charptr
6273 #undef data
6274 #undef next
6275 #undef pp
6276 #undef prev
6277 #undef saved_eptr
6278
6279 #undef new_recursive
6280
6281 #undef cur_is_word
6282 #undef condition
6283 #undef prev_is_word
6284
6285 #undef ctype
6286 #undef length
6287 #undef max
6288 #undef min
6289 #undef number
6290 #undef offset
6291 #undef op
6292 #undef save_capture_last
6293 #undef save_offset1
6294 #undef save_offset2
6295 #undef save_offset3
6296 #undef stacksave
6297
6298 #undef newptrb
6299
6300 #endif /* NO_RECURSE */
6301
6302 /* These two are defined as macros in both cases (with or without NO_RECURSE) */
6303
6304 #undef fc
6305 #undef fi
6306
6307 /***************************************************************************
6308 ***************************************************************************/
6309
6310
6311 #ifdef NO_RECURSE
6312 /*************************************************
6313 * Release allocated heap frames *
6314 *************************************************/
6315
6316 /* Give back every heap frame that is chained after the base frame. The base
6317 frame itself is on the machine stack, so it is skipped and never freed here.
6318
6319 Argument: the address of the base frame
6320 Returns: nothing
6321 */
6322
6323 static void
6324 release_match_heapframes (heapframe *frame_base)
6325 {
6326 heapframe *doomed;     /* Frame being released on this pass */
6327 heapframe *successor;  /* Its link, fetched before the frame is freed */
6328 for (doomed = frame_base->Xnextframe; doomed != NULL; doomed = successor)
6329   {
6330   successor = doomed->Xnextframe;
6331   (PUBL(stack_free))(doomed);
6332   }
6333 }
6334 #endif
6335
6336
6337 /*************************************************
6338 * Execute a Regular Expression *
6339 *************************************************/
6340
6341 /* This function applies a compiled re to a subject string and picks out
6342 portions of the string if it matches. Two elements in the vector are set for
6343 each substring: the offsets to the start and end of the substring.
6344
6345 Arguments:
6346 argument_re points to the compiled expression
6347 extra_data points to extra data or is NULL
6348 subject points to the subject string
6349 length length of subject string (may contain binary zeros)
6350 start_offset where to start in the subject string
6351 options option bits
6352 offsets points to a vector of ints to be filled in with offsets
6353 offsetcount the number of elements in the vector
6354
6355 Returns: > 0 => success; value is the number of elements filled in
6356 = 0 => success, but offsets is not big enough
6357 -1 => failed to match
6358 < -1 => some kind of unexpected problem
6359 */
6360
6361 #ifdef COMPILE_PCRE8
6362 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6363 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6364 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6365 int offsetcount)
6366 #else
6367 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6368 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6369 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6370 int offsetcount)
6371 #endif
6372 {
6373 int rc, ocount, arg_offset_max;
6374 int newline;
6375 BOOL using_temporary_offsets = FALSE;
6376 BOOL anchored;
6377 BOOL startline;
6378 BOOL firstline;
6379 BOOL utf;
6380 BOOL has_first_char = FALSE;
6381 BOOL has_req_char = FALSE;
6382 pcre_uchar first_char = 0;
6383 pcre_uchar first_char2 = 0;
6384 pcre_uchar req_char = 0;
6385 pcre_uchar req_char2 = 0;
6386 match_data match_block;
6387 match_data *md = &match_block;
6388 const pcre_uint8 *tables;
6389 const pcre_uint8 *start_bits = NULL;
6390 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6391 PCRE_PUCHAR end_subject;
6392 PCRE_PUCHAR start_partial = NULL;
6393 PCRE_PUCHAR req_char_ptr = start_match - 1;
6394
6395 const pcre_study_data *study;
6396 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6397
6398 #ifdef NO_RECURSE
6399 heapframe frame_zero;
6400 frame_zero.Xprevframe = NULL; /* Marks the top level */
6401 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6402 md->match_frames_base = &frame_zero;
6403 #endif
6404
6405 /* Check for the special magic call that measures the size of the stack used
6406 per recursive call of match(). Without the funny casting for sizeof, a Windows
6407 compiler gave this error: "unary minus operator applied to unsigned type,
6408 result still unsigned". Hopefully the cast fixes that. */
6409
6410 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6411 start_offset == -999)
6412 #ifdef NO_RECURSE
6413 return -((int)sizeof(heapframe));
6414 #else
6415 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6416 #endif
6417
6418 /* Plausibility checks */
6419
6420 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6421 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6422 return PCRE_ERROR_NULL;
6423 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6424 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6425
6426 /* Check that the first field in the block is the magic number. If it is not,
6427 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6428 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6429 means that the pattern is likely compiled with different endianness. */
6430
6431 if (re->magic_number != MAGIC_NUMBER)
6432 return re->magic_number == REVERSED_MAGIC_NUMBER?
6433 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6434 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6435
6436 /* These two settings are used in the code for checking a UTF-8 string that
6437 follows immediately afterwards. Other values in the md block are used only
6438 during "normal" pcre_exec() processing, not when the JIT support is in use,
6439 so they are set up later. */
6440
6441 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6442 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6443 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6444 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6445
6446 /* Check a UTF-8 string if required. Pass back the character offset and error
6447 code for an invalid string if a results vector is available. */
6448
6449 #ifdef SUPPORT_UTF
6450 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6451 {
6452 int erroroffset;
6453 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6454 if (errorcode != 0)
6455 {
6456 if (offsetcount >= 2)
6457 {
6458 offsets[0] = erroroffset;
6459 offsets[1] = errorcode;
6460 }
6461 #ifdef COMPILE_PCRE16
6462 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6463 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6464 #else
6465 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6466 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6467 #endif
6468 }
6469
6470 /* Check that a start_offset points to the start of a UTF character. */
6471 if (start_offset > 0 && start_offset < length &&
6472 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6473 return PCRE_ERROR_BADUTF8_OFFSET;
6474 }
6475 #endif
6476
6477 /* If the pattern was successfully studied with JIT support, run the JIT
6478 executable instead of the rest of this function. Most options must be set at
6479 compile time for the JIT code to be usable. Fall back to the normal code path if
6480 an unsupported flag is set. */
6481
6482 #ifdef SUPPORT_JIT
6483 if (extra_data != NULL
6484 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6485 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6486 && extra_data->executable_jit != NULL
6487 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6488 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6489 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6490 {
6491 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6492 start_offset, options, offsets, offsetcount);
6493
6494 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6495 mode is not compiled. In this case we simply fall back to the interpreter. */
6496
6497 if (rc != PCRE_ERROR_NULL) return rc;
6498 }
6499 #endif
6500
6501 /* Carry on with non-JIT matching. This information is for finding all the
6502 numbers associated with a given name, for condition testing. */
6503
6504 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6505 md->name_count = re->name_count;
6506 md->name_entry_size = re->name_entry_size;
6507
6508 /* Fish out the optional data from the extra_data structure, first setting
6509 the default values. */
6510
6511 study = NULL;
6512 md->match_limit = MATCH_LIMIT;
6513 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6514 md->callout_data = NULL;
6515
6516 /* The table pointer is always in native byte order. */
6517
6518 tables = re->tables;
6519
6520 if (extra_data != NULL)
6521 {
6522 register unsigned int flags = extra_data->flags;
6523 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6524 study = (const pcre_study_data *)extra_data->study_data;
6525 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6526 md->match_limit = extra_data->match_limit;
6527 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6528 md->match_limit_recursion = extra_data->match_limit_recursion;
6529 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6530 md->callout_data = extra_data->callout_data;
6531 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6532 }
6533
6534 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6535 is a feature that makes it possible to save compiled regexes and re-use them
6536 in other programs later. */
6537
6538 if (tables == NULL) tables = PRIV(default_tables);
6539
6540 /* Set up other data */
6541
6542 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6543 startline = (re->flags & PCRE_STARTLINE) != 0;
6544 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6545
6546 /* The code starts after the real_pcre block and the capture name table. */
6547
6548 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6549 re->name_count * re->name_entry_size;
6550
6551 md->start_subject = (PCRE_PUCHAR)subject;
6552 md->start_offset = start_offset;
6553 md->end_subject = md->start_subject + length;
6554 end_subject = md->end_subject;
6555
6556 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6557 md->use_ucp = (re->options & PCRE_UCP) != 0;
6558 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6559 md->ignore_skip_arg = FALSE;
6560
6561 /* Some options are unpacked into BOOL variables in the hope that testing
6562 them will be faster than individual option bits. */
6563
6564 md->notbol = (options & PCRE_NOTBOL) != 0;
6565 md->noteol = (options & PCRE_NOTEOL) != 0;
6566 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6567 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6568
6569 md->hitend = FALSE;
6570 md->mark = md->nomatch_mark = NULL; /* In case never set */
6571
6572 md->recursive = NULL; /* No recursion at top level */
6573 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6574
6575 md->lcc = tables + lcc_offset;
6576 md->fcc = tables + fcc_offset;
6577 md->ctypes = tables + ctypes_offset;
6578
6579 /* Handle different \R options. */
6580
6581 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6582 {
6583 case 0:
6584 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6585 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6586 else
6587 #ifdef BSR_ANYCRLF
6588 md->bsr_anycrlf = TRUE;
6589 #else
6590 md->bsr_anycrlf = FALSE;
6591 #endif
6592 break;
6593
6594 case PCRE_BSR_ANYCRLF:
6595 md->bsr_anycrlf = TRUE;
6596 break;
6597
6598 case PCRE_BSR_UNICODE:
6599 md->bsr_anycrlf = FALSE;
6600 break;
6601
6602 default: return PCRE_ERROR_BADNEWLINE;
6603 }
6604
6605 /* Handle different types of newline. The three bits give eight cases. If
6606 nothing is set at run time, whatever was used at compile time applies. */
6607
6608 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6609 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6610 {
6611 case 0: newline = NEWLINE; break; /* Compile-time default */
6612 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6613 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6614 case PCRE_NEWLINE_CR+
6615 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6616 case PCRE_NEWLINE_ANY: newline = -1; break;
6617 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6618 default: return PCRE_ERROR_BADNEWLINE;
6619 }
6620
6621 if (newline == -2)
6622 {
6623 md->nltype = NLTYPE_ANYCRLF;
6624 }
6625 else if (newline < 0)
6626 {
6627 md->nltype = NLTYPE_ANY;
6628 }
6629 else
6630 {
6631 md->nltype = NLTYPE_FIXED;
6632 if (newline > 255)
6633 {
6634 md->nllen = 2;
6635 md->nl[0] = (newline >> 8) & 255;
6636 md->nl[1] = newline & 255;
6637 }
6638 else
6639 {
6640 md->nllen = 1;
6641 md->nl[0] = newline;
6642 }
6643 }
6644
6645 /* Partial matching was originally supported only for a restricted set of
6646 regexes; from release 8.00 there are no restrictions, but the bits are still
6647 defined (though never set). So there's no harm in leaving this code. */
6648
6649 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6650 return PCRE_ERROR_BADPARTIAL;
6651
6652 /* If the expression has got more back references than the offsets supplied can
6653 hold, we get a temporary chunk of working store to use during the matching.
6654 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6655 of 3. */
6656
6657 ocount = offsetcount - (offsetcount % 3);
6658 arg_offset_max = (2*ocount)/3;
6659
6660 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6661 {
6662 ocount = re->top_backref * 3 + 3;
6663 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6664 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6665 using_temporary_offsets = TRUE;
6666 DPRINTF(("Got memory to hold back references\n"));
6667 }
6668 else md->offset_vector = offsets;
6669
6670 md->offset_end = ocount;
6671 md->offset_max = (2*ocount)/3;
6672 md->offset_overflow = FALSE;
6673 md->capture_last = -1;
6674
6675 /* Reset the working variable associated with each extraction. These should
6676 never be used unless previously set, but they get saved and restored, and so we
6677 initialize them to avoid reading uninitialized locations. Also, unset the
6678 offsets for the matched string. This is really just for tidiness with callouts,
6679 in case they inspect these fields. */
6680
6681 if (md->offset_vector != NULL)
6682 {
6683 register int *iptr = md->offset_vector + ocount;
6684 register int *iend = iptr - re->top_bracket;
6685 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6686 while (--iptr >= iend) *iptr = -1;
6687 md->offset_vector[0] = md->offset_vector[1] = -1;
6688 }
6689
6690 /* Set up the first character to match, if available. The first_char value is
6691 never set for an anchored regular expression, but the anchoring may be forced
6692 at run time, so we have to test for anchoring. The first char may be unset for
6693 an unanchored pattern, of course. If there's no first char and the pattern was
6694 studied, there may be a bitmap of possible first characters. */
6695
6696 if (!anchored)
6697 {
6698 if ((re->flags & PCRE_FIRSTSET) != 0)
6699 {
6700 has_first_char = TRUE;
6701 first_char = first_char2 = (pcre_uchar)(re->first_char);
6702 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6703 {
6704 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6705 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6706 if (utf && first_char > 127)
6707 first_char2 = UCD_OTHERCASE(first_char);
6708 #endif
6709 }
6710 }
6711 else
6712 if (!startline && study != NULL &&
6713 (study->flags & PCRE_STUDY_MAPPED) != 0)
6714 start_bits = study->start_bits;
6715 }
6716
6717 /* For anchored or unanchored matches, there may be a "last known required
6718 character" set. */
6719
6720 if ((re->flags & PCRE_REQCHSET) != 0)
6721 {
6722 has_req_char = TRUE;
6723 req_char = req_char2 = (pcre_uchar)(re->req_char);
6724 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6725 {
6726 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6727 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6728 if (utf && req_char > 127)
6729 req_char2 = UCD_OTHERCASE(req_char);
6730 #endif
6731 }
6732 }
6733
6734
6735 /* ==========================================================================*/
6736
6737 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6738 the loop runs just once. */
6739
6740 for(;;)
6741 {
6742 PCRE_PUCHAR save_end_subject = end_subject;
6743 PCRE_PUCHAR new_start_match;
6744
6745 /* If firstline is TRUE, the start of the match is constrained to the first