/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 916 - (show annotations)
Wed Feb 15 09:50:53 2012 UTC (7 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 214824 byte(s)
Fix several partial matching bugs for backrefs, \R, \X, and CRLF line endings. 
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
/* Repeat bounds for the six common repeat forms, held as two parallel tables
that share one index. Entry by entry the (min, max) pairs are (0, unlimited),
(0, unlimited), (1, unlimited), (1, unlimited), (0, 1) and (0, 1); a stored
maximum of 0 stands for "no upper limit".
NOTE(review): the index is presumably derived from the repeat opcode - confirm
against the code that reads these tables. */

static const char rep_min[] = {
  0, 0,     /* entries 0-1 */
  1, 1,     /* entries 2-3 */
  0, 0      /* entries 4-5 */
};

static const char rep_max[] = {
  0, 0,     /* entries 0-1: unlimited */
  0, 0,     /* entries 2-3: unlimited */
  1, 1      /* entries 4-5 */
};
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: >= 0 the number of subject bytes matched
144 -1 no match
145 -2 partial match; always given if at end of subject
146 */
147
148 static int
149 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
150 BOOL caseless)
151 {
152 PCRE_PUCHAR eptr_start = eptr;
153 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
154
155 #ifdef PCRE_DEBUG
156 if (eptr >= md->end_subject)
157 printf("matching subject <null>");
158 else
159 {
160 printf("matching subject ");
161 pchars(eptr, length, TRUE, md);
162 }
163 printf(" against backref ");
164 pchars(p, length, FALSE, md);
165 printf("\n");
166 #endif
167
168 /* Always fail if reference not set (and not JavaScript compatible - in that
169 case the length is passed as zero). */
170
171 if (length < 0) return -1;
172
173 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
174 properly if Unicode properties are supported. Otherwise, we can check only
175 ASCII characters. */
176
177 if (caseless)
178 {
179 #ifdef SUPPORT_UTF
180 #ifdef SUPPORT_UCP
181 if (md->utf)
182 {
183 /* Match characters up to the end of the reference. NOTE: the number of
184 bytes matched may differ, because there are some characters whose upper and
185 lower case versions code as different numbers of bytes. For example, U+023A
186 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
187 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
188 the latter. It is important, therefore, to check the length along the
189 reference, not along the subject (earlier code did this wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 int c, d;
195 if (eptr >= md->end_subject) return -2; /* Partial match */
196 GETCHARINC(c, eptr);
197 GETCHARINC(d, p);
198 if (c != d && c != UCD_OTHERCASE(d)) return -1;
199 }
200 }
201 else
202 #endif
203 #endif
204
205 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
206 is no UCP support. */
207 {
208 while (length-- > 0)
209 {
210 if (eptr >= md->end_subject) return -2; /* Partial match */
211 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
212 p++;
213 eptr++;
214 }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 while (length-- > 0)
224 {
225 if (eptr >= md->end_subject) return -2; /* Partial match */
226 if (*p++ != *eptr++) return -1;
227 }
228 }
229
230 return (int)(eptr - eptr_start);
231 }
232
233
234
235 /***************************************************************************
236 ****************************************************************************
237 RECURSION IN THE match() FUNCTION
238
239 The match() function is highly recursive, though not every recursive call
240 increases the recursive depth. Nevertheless, some regular expressions can cause
241 it to recurse to a great depth. I was writing for Unix, so I just let it call
242 itself recursively. This uses the stack for saving everything that has to be
243 saved for a recursive call. On Unix, the stack can be large, and this works
244 fine.
245
246 It turns out that on some non-Unix-like systems there are problems with
247 programs that use a lot of stack. (This despite the fact that every last chip
248 has oodles of memory these days, and techniques for extending the stack have
249 been known for decades.) So....
250
251 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
252 calls by keeping local variables that need to be preserved in blocks of memory
253 obtained from malloc() instead of on the stack. Macros are used to
254 achieve this so that the actual code doesn't look very different to what it
255 always used to.
256
257 The original heap-recursive code used longjmp(). However, it seems that this
258 can be very slow on some operating systems. Following a suggestion from Stan
259 Switzer, the use of longjmp() has been abolished, at the cost of having to
260 provide a unique number for each call to RMATCH. There is no way of generating
261 a sequence of numbers at compile time in C. I have given them names, to make
262 them stand out more clearly.
263
264 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
265 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
266 tests. Furthermore, not using longjmp() means that local dynamic variables
267 don't have indeterminate values; this has meant that the frame size can be
268 reduced because the result can be "passed back" by straight setting of the
269 variable instead of being passed in the frame.
270 ****************************************************************************
271 ***************************************************************************/
272
273 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
274 below must be updated in sync. */
275
/* One identifier per RMATCH call site, numbered consecutively from 1. */
enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,  RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49, RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57, RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65, RM66
};
283
284 /* These versions of the macros use the stack, as normal. There are debugging
285 versions and production versions. Note that the "rw" argument of RMATCH isn't
286 actually used in this definition. */
287
288 #ifndef NO_RECURSE
289 #define REGISTER register
290
/* Stack-recursion forms of RMATCH and RRETURN. RMATCH calls match() directly,
passing the enclosing function's mstart and rdepth+1, and leaves the result in
rrc; RRETURN is an ordinary return. The rw argument is accepted but ignored by
these forms. The debugging variants additionally report the source line of
each call and return on stdout. */

#ifndef PCRE_DEBUG

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra

#else  /* PCRE_DEBUG */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif  /* PCRE_DEBUG */
308
309 #else
310
311
312 /* These versions of the macros manage a private stack on the heap. Note that
313 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
314 argument of match(), which never changes. */
315
316 #define REGISTER
317
/* Heap form of RMATCH. A new frame is obtained through the stack_malloc hook
(PCRE_ERROR_NOMEMORY is returned via RRETURN if that fails), the resume point
rw is recorded in the current frame, the argument values are copied into the
new frame, which is then linked in as the current one, and control jumps to
HEAP_RECURSE. The label L_<rw> planted at the end is the resume point for this
call. The rd argument is accepted but ignored. */

#define RMATCH(ra,rb,rc,rd,re,rw)                                             \
  {                                                                           \
  heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe)); \
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);                         \
  frame->Xwhere = rw;                                                         \
  newframe->Xeptr = ra;                                                       \
  newframe->Xecode = rb;                                                      \
  newframe->Xmstart = mstart;                                                 \
  newframe->Xoffset_top = rc;                                                 \
  newframe->Xeptrb = re;                                                      \
  newframe->Xrdepth = frame->Xrdepth + 1;                                     \
  newframe->Xprevframe = frame;                                               \
  frame = newframe;                                                           \
  DPRINTF(("restarting from line %d\n", __LINE__));                           \
  goto HEAP_RECURSE;                                                          \
  L_##rw:                                                                     \
  DPRINTF(("jumped back to line %d\n", __LINE__));                            \
  }

/* Heap form of RRETURN. The current frame is unlinked and, unless it is the
on-stack frame_zero, released through the stack_free hook. If a previous frame
exists, the result is left in rrc and control jumps to HEAP_RETURN; otherwise
this is the outermost level and the function really returns. */

#define RRETURN(ra)                                                           \
  {                                                                           \
  heapframe *oldframe = frame;                                                \
  frame = oldframe->Xprevframe;                                               \
  if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);                  \
  if (frame != NULL)                                                          \
    {                                                                         \
    rrc = ra;                                                                 \
    goto HEAP_RETURN;                                                         \
    }                                                                         \
  return ra;                                                                  \
  }
349
350
351 /* Structure for remembering the local variables in a private frame */
352
353 typedef struct heapframe {
354 struct heapframe *Xprevframe;
355
356 /* Function arguments that may change */
357
358 PCRE_PUCHAR Xeptr;
359 const pcre_uchar *Xecode;
360 PCRE_PUCHAR Xmstart;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 PCRE_PUCHAR Xcallpat;
368 #ifdef SUPPORT_UTF
369 PCRE_PUCHAR Xcharptr;
370 #endif
371 PCRE_PUCHAR Xdata;
372 PCRE_PUCHAR Xnext;
373 PCRE_PUCHAR Xpp;
374 PCRE_PUCHAR Xprev;
375 PCRE_PUCHAR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 pcre_uchar Xocchars[6];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
/* Partial-match test for use where the subject pointer may or may not be at
the end of the subject. If partial matching is enabled, eptr has reached
md->end_subject, and eptr is beyond md->start_used_ptr, the hit-end flag is
set; when md->partial is greater than 1 the function then returns
PCRE_ERROR_PARTIAL at once. */

#define CHECK_PARTIAL()                                   \
  if (md->partial != 0 && eptr >= md->end_subject &&      \
      eptr > md->start_used_ptr)                          \
    {                                                     \
    md->hitend = TRUE;                                    \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);     \
    }

/* The same test without the end-of-subject comparison, for places where the
pointer is already known to be past the end of the subject. */

#define SCHECK_PARTIAL()                                  \
  if (md->partial != 0 && eptr > md->start_used_ptr)      \
    {                                                     \
    md->hitend = TRUE;                                    \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);     \
    }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 offset_top current top pointer
463 md pointer to "static" info for the match
464 eptrb pointer to chain of blocks containing eptr at start of
465 brackets - for testing for empty matches
466 rdepth the recursion depth
467
468 Returns: MATCH_MATCH if matched ) these values are >= 0
469 MATCH_NOMATCH if failed to match )
470 a negative MATCH_xxx value for PRUNE, SKIP, etc
471 a negative PCRE_ERROR_xxx value if aborted by an error condition
472 (e.g. stopped by repeated call or recursion limit)
473 */
474
475 static int
476 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
477 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
478 unsigned int rdepth)
479 {
480 /* These variables do not need to be preserved over recursion in this function,
481 so they can be ordinary variables in all cases. Mark some of them with
482 "register" because they are used a lot in loops. */
483
484 register int rrc; /* Returns from recursive calls */
485 register int i; /* Used for loops not involving calls to RMATCH() */
486 register unsigned int c; /* Character values not kept over RMATCH() calls */
487 register BOOL utf; /* Local copy of UTF flag for speed */
488
489 BOOL minimize, possessive; /* Quantifier options */
490 BOOL caseless;
491 int condcode;
492
493 /* When recursion is not being used, all "local" variables that have to be
494 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
495 frame on the stack here; subsequent instantiations are obtained from the heap
496 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
497 the top-level on the stack rather than malloc-ing them all gives a performance
498 boost in many cases where there is not much "recursion". */
499
500 #ifdef NO_RECURSE
501 heapframe frame_zero;
502 heapframe *frame = &frame_zero;
503 frame->Xprevframe = NULL; /* Marks the top level */
504
505 /* Copy in the original argument variables */
506
507 frame->Xeptr = eptr;
508 frame->Xecode = ecode;
509 frame->Xmstart = mstart;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define offset_top frame->Xoffset_top
524 #define eptrb frame->Xeptrb
525 #define rdepth frame->Xrdepth
526
527 /* Ditto for the local variables */
528
529 #ifdef SUPPORT_UTF
530 #define charptr frame->Xcharptr
531 #endif
532 #define callpat frame->Xcallpat
533 #define codelink frame->Xcodelink
534 #define data frame->Xdata
535 #define next frame->Xnext
536 #define pp frame->Xpp
537 #define prev frame->Xprev
538 #define saved_eptr frame->Xsaved_eptr
539
540 #define new_recursive frame->Xnew_recursive
541
542 #define cur_is_word frame->Xcur_is_word
543 #define condition frame->Xcondition
544 #define prev_is_word frame->Xprev_is_word
545
546 #ifdef SUPPORT_UCP
547 #define prop_type frame->Xprop_type
548 #define prop_value frame->Xprop_value
549 #define prop_fail_result frame->Xprop_fail_result
550 #define oclength frame->Xoclength
551 #define occhars frame->Xocchars
552 #endif
553
554 #define ctype frame->Xctype
555 #define fc frame->Xfc
556 #define fi frame->Xfi
557 #define length frame->Xlength
558 #define max frame->Xmax
559 #define min frame->Xmin
560 #define number frame->Xnumber
561 #define offset frame->Xoffset
562 #define op frame->Xop
563 #define save_capture_last frame->Xsave_capture_last
564 #define save_offset1 frame->Xsave_offset1
565 #define save_offset2 frame->Xsave_offset2
566 #define save_offset3 frame->Xsave_offset3
567 #define stacksave frame->Xstacksave
568
569 #define newptrb frame->Xnewptrb
570
571 /* When recursion is being used, local variables are allocated on the stack and
572 get preserved during recursion in the normal way. In this environment, fi and
573 i, and fc and c, can be the same variables. */
574
575 #else /* NO_RECURSE not defined */
576 #define fi i
577 #define fc c
578
579 /* Many of the following variables are used only in small blocks of the code.
580 My normal style of coding would have declared them within each of those blocks.
581 However, in order to accommodate the version of this code that uses an external
582 "stack" implemented on the heap, it is easier to declare them all here, so the
583 declarations can be cut out in a block. The only declarations within blocks
584 below are for variables that do not have to be preserved over a recursive call
585 to RMATCH(). */
586
587 #ifdef SUPPORT_UTF
588 const pcre_uchar *charptr;
589 #endif
590 const pcre_uchar *callpat;
591 const pcre_uchar *data;
592 const pcre_uchar *next;
593 PCRE_PUCHAR pp;
594 const pcre_uchar *prev;
595 PCRE_PUCHAR saved_eptr;
596
597 recursion_info new_recursive;
598
599 BOOL cur_is_word;
600 BOOL condition;
601 BOOL prev_is_word;
602
603 #ifdef SUPPORT_UCP
604 int prop_type;
605 int prop_value;
606 int prop_fail_result;
607 int oclength;
608 pcre_uchar occhars[6];
609 #endif
610
611 int codelink;
612 int ctype;
613 int length;
614 int max;
615 int min;
616 int number;
617 int offset;
618 int op;
619 int save_capture_last;
620 int save_offset1, save_offset2, save_offset3;
621 int stacksave[REC_STACK_SAVE_MAX];
622
623 eptrblock newptrb;
624
625 /* There is a special fudge for calling match() in a way that causes it to
626 measure the size of its basic stack frame when the stack is being used for
627 recursion. The second argument (ecode) being NULL triggers this behaviour. It
628 cannot normally ever be NULL. The return is the negated value of the frame
629 size. */
630
631 if (ecode == NULL)
632 {
633 if (rdepth == 0)
634 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
635 else
636 {
637 int len = (char *)&rdepth - (char *)eptr;
638 return (len > 0)? -len : len;
639 }
640 }
641 #endif /* NO_RECURSE */
642
643 /* To save space on the stack and in the heap frame, I have doubled up on some
644 of the local variables that are used only in localised parts of the code, but
645 still need to be preserved over recursive calls of match(). These macros define
646 the alternative names that are used. */
647
648 #define allow_zero cur_is_word
649 #define cbegroup condition
650 #define code_offset codelink
651 #define condassert condition
652 #define matched_once prev_is_word
653 #define foc number
654 #define save_mark data
655
656 /* These statements are here to stop the compiler complaining about uninitialized
657 variables. */
658
659 #ifdef SUPPORT_UCP
660 prop_value = 0;
661 prop_fail_result = 0;
662 #endif
663
664
665 /* This label is used for tail recursion, which is used in a few cases even
666 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
667 used. Thanks to Ian Taylor for noticing this possibility and sending the
668 original patch. */
669
670 TAIL_RECURSE:
671
672 /* OK, now we can get on with the real code of the function. Recursive calls
673 are specified by the macro RMATCH and RRETURN is used to return. When
674 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
675 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
676 defined). However, RMATCH isn't like a function call because it's quite a
677 complicated macro. It has to be used in one particular way. This shouldn't,
678 however, impact performance when true recursion is being used. */
679
680 #ifdef SUPPORT_UTF
681 utf = md->utf; /* Local copy of the flag */
682 #else
683 utf = FALSE;
684 #endif
685
686 /* First check that we haven't called match() too many times, or that we
687 haven't exceeded the recursive call limit. */
688
689 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
690 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
691
692 /* At the start of a group with an unlimited repeat that may match an empty
693 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
694 done this way to save having to use another function argument, which would take
695 up space on the stack. See also MATCH_CONDASSERT below.
696
697 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
698 such remembered pointers, to be checked when we hit the closing ket, in order
699 to break infinite loops that match no characters. When match() is called in
700 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
701 NOT be used with tail recursion, because the memory block that is used is on
702 the stack, so a new one may be required for each match(). */
703
704 if (md->match_function_type == MATCH_CBEGROUP)
705 {
706 newptrb.epb_saved_eptr = eptr;
707 newptrb.epb_prev = eptrb;
708 eptrb = &newptrb;
709 md->match_function_type = 0;
710 }
711
712 /* Now start processing the opcodes. */
713
714 for (;;)
715 {
716 minimize = possessive = FALSE;
717 op = *ecode;
718
719 switch(op)
720 {
721 case OP_MARK:
722 md->nomatch_mark = ecode + 2;
723 md->mark = NULL; /* In case previously set by assertion */
724 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
725 eptrb, RM55);
726 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
727 md->mark == NULL) md->mark = ecode + 2;
728
729 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
730 argument, and we must check whether that argument matches this MARK's
731 argument. It is passed back in md->start_match_ptr (an overloading of that
732 variable). If it does match, we reset that variable to the current subject
733 position and return MATCH_SKIP. Otherwise, pass back the return code
734 unaltered. */
735
736 else if (rrc == MATCH_SKIP_ARG &&
737 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
738 {
739 md->start_match_ptr = eptr;
740 RRETURN(MATCH_SKIP);
741 }
742 RRETURN(rrc);
743
744 case OP_FAIL:
745 RRETURN(MATCH_NOMATCH);
746
747 /* COMMIT overrides PRUNE, SKIP, and THEN */
748
749 case OP_COMMIT:
750 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
751 eptrb, RM52);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
753 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
754 rrc != MATCH_THEN)
755 RRETURN(rrc);
756 RRETURN(MATCH_COMMIT);
757
758 /* PRUNE overrides THEN */
759
760 case OP_PRUNE:
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
762 eptrb, RM51);
763 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
764 RRETURN(MATCH_PRUNE);
765
766 case OP_PRUNE_ARG:
767 md->nomatch_mark = ecode + 2;
768 md->mark = NULL; /* In case previously set by assertion */
769 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
770 eptrb, RM56);
771 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
772 md->mark == NULL) md->mark = ecode + 2;
773 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
774 RRETURN(MATCH_PRUNE);
775
776 /* SKIP overrides PRUNE and THEN */
777
778 case OP_SKIP:
779 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
780 eptrb, RM53);
781 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
782 RRETURN(rrc);
783 md->start_match_ptr = eptr; /* Pass back current position */
784 RRETURN(MATCH_SKIP);
785
786 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
787 nomatch_mark. There is a flag that disables this opcode when re-matching a
788 pattern that ended with a SKIP for which there was not a matching MARK. */
789
790 case OP_SKIP_ARG:
791 if (md->ignore_skip_arg)
792 {
793 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
794 break;
795 }
796 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
797 eptrb, RM57);
798 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
799 RRETURN(rrc);
800
801 /* Pass back the current skip name by overloading md->start_match_ptr and
802 returning the special MATCH_SKIP_ARG return code. This will either be
803 caught by a matching MARK, or get to the top, where it causes a rematch
804 with the md->ignore_skip_arg flag set. */
805
806 md->start_match_ptr = ecode + 2;
807 RRETURN(MATCH_SKIP_ARG);
808
809 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
810 the branch in which it occurs can be determined. Overload the start of
811 match pointer to do this. */
812
813 case OP_THEN:
814 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
815 eptrb, RM54);
816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
817 md->start_match_ptr = ecode;
818 RRETURN(MATCH_THEN);
819
820 case OP_THEN_ARG:
821 md->nomatch_mark = ecode + 2;
822 md->mark = NULL; /* In case previously set by assertion */
823 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
824 md, eptrb, RM58);
825 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
826 md->mark == NULL) md->mark = ecode + 2;
827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
828 md->start_match_ptr = ecode;
829 RRETURN(MATCH_THEN);
830
831 /* Handle an atomic group that does not contain any capturing parentheses.
832 This can be handled like an assertion. Prior to 8.13, all atomic groups
833 were handled this way. In 8.13, the code was changed as below for ONCE, so
834 that backups pass through the group and thereby reset captured values.
835 However, this uses a lot more stack, so in 8.20, atomic groups that do not
836 contain any captures generate OP_ONCE_NC, which can be handled in the old,
837 less stack intensive way.
838
839 Check the alternative branches in turn - the matching won't pass the KET
840 for this kind of subpattern. If any one branch matches, we carry on as at
841 the end of a normal bracket, leaving the subject pointer, but resetting
842 the start-of-match value in case it was changed by \K. */
843
844 case OP_ONCE_NC:
845 prev = ecode;
846 saved_eptr = eptr;
847 save_mark = md->mark;
848 do
849 {
850 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
851 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
852 {
853 mstart = md->start_match_ptr;
854 break;
855 }
856 if (rrc == MATCH_THEN)
857 {
858 next = ecode + GET(ecode,1);
859 if (md->start_match_ptr < next &&
860 (*ecode == OP_ALT || *next == OP_ALT))
861 rrc = MATCH_NOMATCH;
862 }
863
864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865 ecode += GET(ecode,1);
866 md->mark = save_mark;
867 }
868 while (*ecode == OP_ALT);
869
870 /* If hit the end of the group (which could be repeated), fail */
871
872 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
873
874 /* Continue as from after the group, updating the offsets high water
875 mark, since extracts may have been taken. */
876
877 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
878
879 offset_top = md->end_offset_top;
880 eptr = md->end_match_ptr;
881
882 /* For a non-repeating ket, just continue at this level. This also
883 happens for a repeating ket if no characters were matched in the group.
884 This is the forcible breaking of infinite loops as implemented in Perl
885 5.005. */
886
887 if (*ecode == OP_KET || eptr == saved_eptr)
888 {
889 ecode += 1+LINK_SIZE;
890 break;
891 }
892
893 /* The repeating kets try the rest of the pattern or restart from the
894 preceding bracket, in the appropriate order. The second "call" of match()
895 uses tail recursion, to avoid using another stack frame. */
896
897 if (*ecode == OP_KETRMIN)
898 {
899 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
901 ecode = prev;
902 goto TAIL_RECURSE;
903 }
904 else /* OP_KETRMAX */
905 {
906 md->match_function_type = MATCH_CBEGROUP;
907 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
909 ecode += 1 + LINK_SIZE;
910 goto TAIL_RECURSE;
911 }
912 /* Control never gets here */
913
914 /* Handle a capturing bracket, other than those that are possessive with an
915 unlimited repeat. If there is space in the offset vector, save the current
916 subject position in the working slot at the top of the vector. We mustn't
917 change the current values of the data slot, because they may be set from a
918 previous iteration of this group, and be referred to by a reference inside
919 the group. A failure to match might occur after the group has succeeded,
920 if something later on doesn't match. For this reason, we need to restore
921 the working value and also the values of the final offsets, in case they
922 were set by a previous iteration of the same bracket.
923
924 If there isn't enough space in the offset vector, treat this as if it were
925 a non-capturing bracket. Don't worry about setting the flag for the error
926 case here; that is handled in the code for KET. */
927
928 case OP_CBRA:
929 case OP_SCBRA:
930 number = GET2(ecode, 1+LINK_SIZE);
931 offset = number << 1;
932
933 #ifdef PCRE_DEBUG
934 printf("start bracket %d\n", number);
935 printf("subject=");
936 pchars(eptr, 16, TRUE, md);
937 printf("\n");
938 #endif
939
940 if (offset < md->offset_max)
941 {
942 save_offset1 = md->offset_vector[offset];
943 save_offset2 = md->offset_vector[offset+1];
944 save_offset3 = md->offset_vector[md->offset_end - number];
945 save_capture_last = md->capture_last;
946 save_mark = md->mark;
947
948 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
949 md->offset_vector[md->offset_end - number] =
950 (int)(eptr - md->start_subject);
951
952 for (;;)
953 {
954 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
955 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
956 eptrb, RM1);
957 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
958
959 /* If we backed up to a THEN, check whether it is within the current
960 branch by comparing the address of the THEN that is passed back with
961 the end of the branch. If it is within the current branch, and the
962 branch is one of two or more alternatives (it either starts or ends
963 with OP_ALT), we have reached the limit of THEN's action, so convert
964 the return code to NOMATCH, which will cause normal backtracking to
965 happen from now on. Otherwise, THEN is passed back to an outer
966 alternative. This implements Perl's treatment of parenthesized groups,
967 where a group not containing | does not affect the current alternative,
968 that is, (X) is NOT the same as (X|(*F)). */
969
970 if (rrc == MATCH_THEN)
971 {
972 next = ecode + GET(ecode,1);
973 if (md->start_match_ptr < next &&
974 (*ecode == OP_ALT || *next == OP_ALT))
975 rrc = MATCH_NOMATCH;
976 }
977
978 /* Anything other than NOMATCH is passed back. */
979
980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 md->mark = save_mark;
984 if (*ecode != OP_ALT) break;
985 }
986
987 DPRINTF(("bracket %d failed\n", number));
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991
992 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
993
994 RRETURN(rrc);
995 }
996
997 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
998 as a non-capturing bracket. */
999
1000 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1001 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1002
1003 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1004
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007
1008 /* Non-capturing or atomic group, except for possessive with unlimited
1009 repeat and ONCE group with no captures. Loop for all the alternatives.
1010
1011 When we get to the final alternative within the brackets, we used to return
1012 the result of a recursive call to match() whatever happened so it was
1013 possible to reduce stack usage by turning this into a tail recursion,
1014 except in the case of a possibly empty group. However, now that there is
1015 the possibility of (*THEN) occurring in the final alternative, this
1016 optimization is no longer always possible.
1017
1018 We can optimize if we know there are no (*THEN)s in the pattern; at present
1019 this is the best that can be done.
1020
1021 MATCH_ONCE is returned when the end of an atomic group is successfully
1022 reached, but subsequent matching fails. It passes back up the tree (causing
1023 captured values to be reset) until the original atomic group level is
1024 reached. This is tested by comparing md->once_target with the start of the
1025 group. At this point, the return is converted into MATCH_NOMATCH so that
1026 previous backup points can be taken. */
1027
1028 case OP_ONCE:
1029 case OP_BRA:
1030 case OP_SBRA:
1031 DPRINTF(("start non-capturing bracket\n"));
1032
1033 for (;;)
1034 {
1035 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1036
1037 /* If this is not a possibly empty group, and there are no (*THEN)s in
1038 the pattern, and this is the final alternative, optimize as described
1039 above. */
1040
1041 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1042 {
1043 ecode += PRIV(OP_lengths)[*ecode];
1044 goto TAIL_RECURSE;
1045 }
1046
1047 /* In all other cases, we have to make another call to match(). */
1048
1049 save_mark = md->mark;
1050 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1051 RM2);
1052
1053 /* See comment in the code for capturing groups above about handling
1054 THEN. */
1055
1056 if (rrc == MATCH_THEN)
1057 {
1058 next = ecode + GET(ecode,1);
1059 if (md->start_match_ptr < next &&
1060 (*ecode == OP_ALT || *next == OP_ALT))
1061 rrc = MATCH_NOMATCH;
1062 }
1063
1064 if (rrc != MATCH_NOMATCH)
1065 {
1066 if (rrc == MATCH_ONCE)
1067 {
1068 const pcre_uchar *scode = ecode;
1069 if (*scode != OP_ONCE) /* If not at start, find it */
1070 {
1071 while (*scode == OP_ALT) scode += GET(scode, 1);
1072 scode -= GET(scode, 1);
1073 }
1074 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1075 }
1076 RRETURN(rrc);
1077 }
1078 ecode += GET(ecode, 1);
1079 md->mark = save_mark;
1080 if (*ecode != OP_ALT) break;
1081 }
1082
1083 RRETURN(MATCH_NOMATCH);
1084
1085 /* Handle possessive capturing brackets with an unlimited repeat. We come
1086 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1087 handled similarly to the normal case above. However, the matching is
1088 different. The end of these brackets will always be OP_KETRPOS, which
1089 returns MATCH_KETRPOS without going further in the pattern. By this means
1090 we can handle the group by iteration rather than recursion, thereby
1091 reducing the amount of stack needed. */
1092
1093 case OP_CBRAPOS:
1094 case OP_SCBRAPOS:
1095 allow_zero = FALSE;
1096
1097 POSSESSIVE_CAPTURE:
1098 number = GET2(ecode, 1+LINK_SIZE);
1099 offset = number << 1;
1100
1101 #ifdef PCRE_DEBUG
1102 printf("start possessive bracket %d\n", number);
1103 printf("subject=");
1104 pchars(eptr, 16, TRUE, md);
1105 printf("\n");
1106 #endif
1107
1108 if (offset < md->offset_max)
1109 {
1110 matched_once = FALSE;
1111 code_offset = (int)(ecode - md->start_code);
1112
1113 save_offset1 = md->offset_vector[offset];
1114 save_offset2 = md->offset_vector[offset+1];
1115 save_offset3 = md->offset_vector[md->offset_end - number];
1116 save_capture_last = md->capture_last;
1117
1118 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1119
1120 /* Each time round the loop, save the current subject position for use
1121 when the group matches. For MATCH_MATCH, the group has matched, so we
1122 restart it with a new subject starting position, remembering that we had
1123 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1124 usual. If we haven't matched any alternatives in any iteration, check to
1125 see if a previous iteration matched. If so, the group has matched;
1126 continue from afterwards. Otherwise it has failed; restore the previous
1127 capture values before returning NOMATCH. */
1128
1129 for (;;)
1130 {
1131 md->offset_vector[md->offset_end - number] =
1132 (int)(eptr - md->start_subject);
1133 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1134 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1135 eptrb, RM63);
1136 if (rrc == MATCH_KETRPOS)
1137 {
1138 offset_top = md->end_offset_top;
1139 eptr = md->end_match_ptr;
1140 ecode = md->start_code + code_offset;
1141 save_capture_last = md->capture_last;
1142 matched_once = TRUE;
1143 continue;
1144 }
1145
1146 /* See comment in the code for capturing groups above about handling
1147 THEN. */
1148
1149 if (rrc == MATCH_THEN)
1150 {
1151 next = ecode + GET(ecode,1);
1152 if (md->start_match_ptr < next &&
1153 (*ecode == OP_ALT || *next == OP_ALT))
1154 rrc = MATCH_NOMATCH;
1155 }
1156
1157 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1158 md->capture_last = save_capture_last;
1159 ecode += GET(ecode, 1);
1160 if (*ecode != OP_ALT) break;
1161 }
1162
1163 if (!matched_once)
1164 {
1165 md->offset_vector[offset] = save_offset1;
1166 md->offset_vector[offset+1] = save_offset2;
1167 md->offset_vector[md->offset_end - number] = save_offset3;
1168 }
1169
1170 if (allow_zero || matched_once)
1171 {
1172 ecode += 1 + LINK_SIZE;
1173 break;
1174 }
1175
1176 RRETURN(MATCH_NOMATCH);
1177 }
1178
1179 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1180 as a non-capturing bracket. */
1181
1182 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1183 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1184
1185 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1186
1187 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1189
1190 /* Non-capturing possessive bracket with unlimited repeat. We come here
1191 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1192 without the capturing complication. It is written out separately for speed
1193 and cleanliness. */
1194
1195 case OP_BRAPOS:
1196 case OP_SBRAPOS:
1197 allow_zero = FALSE;
1198
1199 POSSESSIVE_NON_CAPTURE:
1200 matched_once = FALSE;
1201 code_offset = (int)(ecode - md->start_code);
1202
1203 for (;;)
1204 {
1205 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1206 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1207 eptrb, RM48);
1208 if (rrc == MATCH_KETRPOS)
1209 {
1210 offset_top = md->end_offset_top;
1211 eptr = md->end_match_ptr;
1212 ecode = md->start_code + code_offset;
1213 matched_once = TRUE;
1214 continue;
1215 }
1216
1217 /* See comment in the code for capturing groups above about handling
1218 THEN. */
1219
1220 if (rrc == MATCH_THEN)
1221 {
1222 next = ecode + GET(ecode,1);
1223 if (md->start_match_ptr < next &&
1224 (*ecode == OP_ALT || *next == OP_ALT))
1225 rrc = MATCH_NOMATCH;
1226 }
1227
1228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1229 ecode += GET(ecode, 1);
1230 if (*ecode != OP_ALT) break;
1231 }
1232
1233 if (matched_once || allow_zero)
1234 {
1235 ecode += 1 + LINK_SIZE;
1236 break;
1237 }
1238 RRETURN(MATCH_NOMATCH);
1239
1240 /* Control never reaches here. */
1241
1242 /* Conditional group: compilation checked that there are no more than
1243 two branches. If the condition is false, skipping the first branch takes us
1244 past the end if there is only one branch, but that's OK because that is
1245 exactly what going to the ket would do. */
1246
1247 case OP_COND:
1248 case OP_SCOND:
1249 codelink = GET(ecode, 1);
1250
1251 /* Because of the way auto-callout works during compile, a callout item is
1252 inserted between OP_COND and an assertion condition. */
1253
1254 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1255 {
1256 if (PUBL(callout) != NULL)
1257 {
1258 PUBL(callout_block) cb;
1259 cb.version = 2; /* Version 2 of the callout block */
1260 cb.callout_number = ecode[LINK_SIZE+2];
1261 cb.offset_vector = md->offset_vector;
1262 #ifdef COMPILE_PCRE8
1263 cb.subject = (PCRE_SPTR)md->start_subject;
1264 #else
1265 cb.subject = (PCRE_SPTR16)md->start_subject;
1266 #endif
1267 cb.subject_length = (int)(md->end_subject - md->start_subject);
1268 cb.start_match = (int)(mstart - md->start_subject);
1269 cb.current_position = (int)(eptr - md->start_subject);
1270 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1271 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1272 cb.capture_top = offset_top/2;
1273 cb.capture_last = md->capture_last;
1274 cb.callout_data = md->callout_data;
1275 cb.mark = md->nomatch_mark;
1276 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1277 if (rrc < 0) RRETURN(rrc);
1278 }
1279 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1280 }
1281
1282 condcode = ecode[LINK_SIZE+1];
1283
1284 /* Now see what the actual condition is */
1285
1286 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1287 {
1288 if (md->recursive == NULL) /* Not recursing => FALSE */
1289 {
1290 condition = FALSE;
1291 ecode += GET(ecode, 1);
1292 }
1293 else
1294 {
1295 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1296 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1297
1298 /* If the test is for recursion into a specific subpattern, and it is
1299 false, but the test was set up by name, scan the table to see if the
1300 name refers to any other numbers, and test them. The condition is true
1301 if any one is set. */
1302
1303 if (!condition && condcode == OP_NRREF)
1304 {
1305 pcre_uchar *slotA = md->name_table;
1306 for (i = 0; i < md->name_count; i++)
1307 {
1308 if (GET2(slotA, 0) == recno) break;
1309 slotA += md->name_entry_size;
1310 }
1311
1312 /* Found a name for the number - there can be only one; duplicate
1313 names for different numbers are allowed, but not vice versa. First
1314 scan down for duplicates. */
1315
1316 if (i < md->name_count)
1317 {
1318 pcre_uchar *slotB = slotA;
1319 while (slotB > md->name_table)
1320 {
1321 slotB -= md->name_entry_size;
1322 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1323 {
1324 condition = GET2(slotB, 0) == md->recursive->group_num;
1325 if (condition) break;
1326 }
1327 else break;
1328 }
1329
1330 /* Scan up for duplicates */
1331
1332 if (!condition)
1333 {
1334 slotB = slotA;
1335 for (i++; i < md->name_count; i++)
1336 {
1337 slotB += md->name_entry_size;
1338 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1339 {
1340 condition = GET2(slotB, 0) == md->recursive->group_num;
1341 if (condition) break;
1342 }
1343 else break;
1344 }
1345 }
1346 }
1347 }
1348
1349 /* Choose branch according to the condition */
1350
1351 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1352 }
1353 }
1354
1355 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1356 {
1357 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1358 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1359
1360 /* If the numbered capture is unset, but the reference was by name,
1361 scan the table to see if the name refers to any other numbers, and test
1362 them. The condition is true if any one is set. This is tediously similar
1363 to the code above, but not close enough to try to amalgamate. */
1364
1365 if (!condition && condcode == OP_NCREF)
1366 {
1367 int refno = offset >> 1;
1368 pcre_uchar *slotA = md->name_table;
1369
1370 for (i = 0; i < md->name_count; i++)
1371 {
1372 if (GET2(slotA, 0) == refno) break;
1373 slotA += md->name_entry_size;
1374 }
1375
1376 /* Found a name for the number - there can be only one; duplicate names
1377 for different numbers are allowed, but not vice versa. First scan down
1378 for duplicates. */
1379
1380 if (i < md->name_count)
1381 {
1382 pcre_uchar *slotB = slotA;
1383 while (slotB > md->name_table)
1384 {
1385 slotB -= md->name_entry_size;
1386 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1387 {
1388 offset = GET2(slotB, 0) << 1;
1389 condition = offset < offset_top &&
1390 md->offset_vector[offset] >= 0;
1391 if (condition) break;
1392 }
1393 else break;
1394 }
1395
1396 /* Scan up for duplicates */
1397
1398 if (!condition)
1399 {
1400 slotB = slotA;
1401 for (i++; i < md->name_count; i++)
1402 {
1403 slotB += md->name_entry_size;
1404 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1405 {
1406 offset = GET2(slotB, 0) << 1;
1407 condition = offset < offset_top &&
1408 md->offset_vector[offset] >= 0;
1409 if (condition) break;
1410 }
1411 else break;
1412 }
1413 }
1414 }
1415 }
1416
1417 /* Choose branch according to the condition */
1418
1419 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1420 }
1421
1422 else if (condcode == OP_DEF) /* DEFINE - always false */
1423 {
1424 condition = FALSE;
1425 ecode += GET(ecode, 1);
1426 }
1427
1428 /* The condition is an assertion. Call match() to evaluate it - setting
1429 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1430 an assertion. */
1431
1432 else
1433 {
1434 md->match_function_type = MATCH_CONDASSERT;
1435 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1436 if (rrc == MATCH_MATCH)
1437 {
1438 if (md->end_offset_top > offset_top)
1439 offset_top = md->end_offset_top; /* Captures may have happened */
1440 condition = TRUE;
1441 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1442 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1443 }
1444
1445 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1446 assertion; it is therefore treated as NOMATCH. */
1447
1448 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1449 {
1450 RRETURN(rrc); /* Need braces because of following else */
1451 }
1452 else
1453 {
1454 condition = FALSE;
1455 ecode += codelink;
1456 }
1457 }
1458
1459 /* We are now at the branch that is to be obeyed. As there is only one, can
1460 use tail recursion to avoid using another stack frame, except when there is
1461 unlimited repeat of a possibly empty group. In the latter case, a recursive
1462 call to match() is always required, unless the second alternative doesn't
1463 exist, in which case we can just plough on. Note that, for compatibility
1464 with Perl, the | in a conditional group is NOT treated as creating two
1465 alternatives. If a THEN is encountered in the branch, it propagates out to
1466 the enclosing alternative (unless nested in a deeper set of alternatives,
1467 of course). */
1468
1469 if (condition || *ecode == OP_ALT)
1470 {
1471 if (op != OP_SCOND)
1472 {
1473 ecode += 1 + LINK_SIZE;
1474 goto TAIL_RECURSE;
1475 }
1476
1477 md->match_function_type = MATCH_CBEGROUP;
1478 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1479 RRETURN(rrc);
1480 }
1481
1482 /* Condition false & no alternative; continue after the group. */
1483
1484 else
1485 {
1486 ecode += 1 + LINK_SIZE;
1487 }
1488 break;
1489
1490
1491 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1492 to close any currently open capturing brackets. */
1493
1494 case OP_CLOSE:
1495 number = GET2(ecode, 1);
1496 offset = number << 1;
1497
1498 #ifdef PCRE_DEBUG
1499 printf("end bracket %d at *ACCEPT", number);
1500 printf("\n");
1501 #endif
1502
1503 md->capture_last = number;
1504 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1505 {
1506 md->offset_vector[offset] =
1507 md->offset_vector[md->offset_end - number];
1508 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1509 if (offset_top <= offset) offset_top = offset + 2;
1510 }
1511 ecode += 1 + IMM2_SIZE;
1512 break;
1513
1514
1515 /* End of the pattern, either real or forced. */
1516
1517 case OP_END:
1518 case OP_ACCEPT:
1519 case OP_ASSERT_ACCEPT:
1520
1521 /* If we have matched an empty string, fail if not in an assertion and not
1522 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1523 is set and we have matched at the start of the subject. In both cases,
1524 backtracking will then try other alternatives, if any. */
1525
1526 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1527 md->recursive == NULL &&
1528 (md->notempty ||
1529 (md->notempty_atstart &&
1530 mstart == md->start_subject + md->start_offset)))
1531 RRETURN(MATCH_NOMATCH);
1532
1533 /* Otherwise, we have a match. */
1534
1535 md->end_match_ptr = eptr; /* Record where we ended */
1536 md->end_offset_top = offset_top; /* and how many extracts were taken */
1537 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1538
1539 /* For some reason, the macros don't work properly if an expression is
1540 given as the argument to RRETURN when the heap is in use. */
1541
1542 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1543 RRETURN(rrc);
1544
1545 /* Assertion brackets. Check the alternative branches in turn - the
1546 matching won't pass the KET for an assertion. If any one branch matches,
1547 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1548 start of each branch to move the current point backwards, so the code at
1549 this level is identical to the lookahead case. When the assertion is part
1550 of a condition, we want to return immediately afterwards. The caller of
1551 this incarnation of the match() function will have set MATCH_CONDASSERT in
1552 md->match_function_type, and one of these opcodes will be the first opcode
1553 that is processed. We use a local variable that is preserved over calls to
1554 match() to remember this case. */
1555
1556 case OP_ASSERT:
1557 case OP_ASSERTBACK:
1558 save_mark = md->mark;
1559 if (md->match_function_type == MATCH_CONDASSERT)
1560 {
1561 condassert = TRUE;
1562 md->match_function_type = 0;
1563 }
1564 else condassert = FALSE;
1565
1566 do
1567 {
1568 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1569 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1570 {
1571 mstart = md->start_match_ptr; /* In case \K reset it */
1572 break;
1573 }
1574
1575 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1576 as NOMATCH. */
1577
1578 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1579 ecode += GET(ecode, 1);
1580 md->mark = save_mark;
1581 }
1582 while (*ecode == OP_ALT);
1583
1584 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1585
1586 /* If checking an assertion for a condition, return MATCH_MATCH. */
1587
1588 if (condassert) RRETURN(MATCH_MATCH);
1589
1590 /* Continue from after the assertion, updating the offsets high water
1591 mark, since extracts may have been taken during the assertion. */
1592
1593 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1594 ecode += 1 + LINK_SIZE;
1595 offset_top = md->end_offset_top;
1596 continue;
1597
1598 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1599 PRUNE, or COMMIT means we must assume failure without checking subsequent
1600 branches. */
1601
1602 case OP_ASSERT_NOT:
1603 case OP_ASSERTBACK_NOT:
1604 save_mark = md->mark;
1605 if (md->match_function_type == MATCH_CONDASSERT)
1606 {
1607 condassert = TRUE;
1608 md->match_function_type = 0;
1609 }
1610 else condassert = FALSE;
1611
1612 do
1613 {
1614 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1615 md->mark = save_mark;
1616 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1617 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1618 {
1619 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1620 break;
1621 }
1622
1623 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1624 as NOMATCH. */
1625
1626 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1627 ecode += GET(ecode,1);
1628 }
1629 while (*ecode == OP_ALT);
1630
1631 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1632
1633 ecode += 1 + LINK_SIZE;
1634 continue;
1635
1636 /* Move the subject pointer back. This occurs only at the start of
1637 each branch of a lookbehind assertion. If we are too close to the start to
1638 move back, this match function fails. When working with UTF-8 we move
1639 back a number of characters, not bytes. */
1640
1641 case OP_REVERSE:
1642 #ifdef SUPPORT_UTF
1643 if (utf)
1644 {
1645 i = GET(ecode, 1);
1646 while (i-- > 0)
1647 {
1648 eptr--;
1649 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1650 BACKCHAR(eptr);
1651 }
1652 }
1653 else
1654 #endif
1655
1656 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1657
1658 {
1659 eptr -= GET(ecode, 1);
1660 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1661 }
1662
1663 /* Save the earliest consulted character, then skip to next op code */
1664
1665 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1666 ecode += 1 + LINK_SIZE;
1667 break;
1668
1669 /* The callout item calls an external function, if one is provided, passing
1670 details of the match so far. This is mainly for debugging, though the
1671 function is able to force a failure. */
1672
1673 case OP_CALLOUT:
1674 if (PUBL(callout) != NULL)
1675 {
1676 PUBL(callout_block) cb;
1677 cb.version = 2; /* Version 2 of the callout block */
1678 cb.callout_number = ecode[1];
1679 cb.offset_vector = md->offset_vector;
1680 #ifdef COMPILE_PCRE8
1681 cb.subject = (PCRE_SPTR)md->start_subject;
1682 #else
1683 cb.subject = (PCRE_SPTR16)md->start_subject;
1684 #endif
1685 cb.subject_length = (int)(md->end_subject - md->start_subject);
1686 cb.start_match = (int)(mstart - md->start_subject);
1687 cb.current_position = (int)(eptr - md->start_subject);
1688 cb.pattern_position = GET(ecode, 2);
1689 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1690 cb.capture_top = offset_top/2;
1691 cb.capture_last = md->capture_last;
1692 cb.callout_data = md->callout_data;
1693 cb.mark = md->nomatch_mark;
1694 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1695 if (rrc < 0) RRETURN(rrc);
1696 }
1697 ecode += 2 + 2*LINK_SIZE;
1698 break;
1699
1700 /* Recursion either matches the current regex, or some subexpression. The
1701 offset data is the offset to the starting bracket from the start of the
1702 whole pattern. (This is so that it works from duplicated subpatterns.)
1703
1704 The state of the capturing groups is preserved over recursion, and
1705 re-instated afterwards. We don't know how many are started and not yet
1706 finished (offset_top records the completed total) so we just have to save
1707 all the potential data. There may be up to 65535 such values, which is too
1708 large to put on the stack, but using malloc for small numbers seems
1709 expensive. As a compromise, the stack is used when there are no more than
1710 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1711
1712 There are also other values that have to be saved. We use a chained
1713 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1714 for the original version of this logic. It has, however, been hacked around
1715 a lot, so he is not to blame for the current way it works. */
1716
1717 case OP_RECURSE:
1718 {
1719 recursion_info *ri;
1720 int recno;
1721
1722 callpat = md->start_code + GET(ecode, 1);
1723 recno = (callpat == md->start_code)? 0 :
1724 GET2(callpat, 1 + LINK_SIZE);
1725
1726 /* Check for repeating a recursion without advancing the subject pointer.
1727 This should catch convoluted mutual recursions. (Some simple cases are
1728 caught at compile time.) */
1729
1730 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1731 if (recno == ri->group_num && eptr == ri->subject_position)
1732 RRETURN(PCRE_ERROR_RECURSELOOP);
1733
1734 /* Add to "recursing stack" */
1735
1736 new_recursive.group_num = recno;
1737 new_recursive.subject_position = eptr;
1738 new_recursive.prevrec = md->recursive;
1739 md->recursive = &new_recursive;
1740
1741 /* Where to continue from afterwards */
1742
1743 ecode += 1 + LINK_SIZE;
1744
1745 /* Now save the offset data */
1746
1747 new_recursive.saved_max = md->offset_end;
1748 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1749 new_recursive.offset_save = stacksave;
1750 else
1751 {
1752 new_recursive.offset_save =
1753 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1754 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1755 }
1756 memcpy(new_recursive.offset_save, md->offset_vector,
1757 new_recursive.saved_max * sizeof(int));
1758
1759 /* OK, now we can do the recursion. After processing each alternative,
1760 restore the offset data. If there were nested recursions, md->recursive
1761 might be changed, so reset it before looping. */
1762
1763 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1764 cbegroup = (*callpat >= OP_SBRA);
1765 do
1766 {
1767 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1768 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1769 md, eptrb, RM6);
1770 memcpy(md->offset_vector, new_recursive.offset_save,
1771 new_recursive.saved_max * sizeof(int));
1772 md->recursive = new_recursive.prevrec;
1773 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1774 {
1775 DPRINTF(("Recursion matched\n"));
1776 if (new_recursive.offset_save != stacksave)
1777 (PUBL(free))(new_recursive.offset_save);
1778
1779 /* Set where we got to in the subject, and reset the start in case
1780 it was changed by \K. This *is* propagated back out of a recursion,
1781 for Perl compatibility. */
1782
1783 eptr = md->end_match_ptr;
1784 mstart = md->start_match_ptr;
1785 goto RECURSION_MATCHED; /* Exit loop; end processing */
1786 }
1787
1788 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1789 as NOMATCH. */
1790
1791 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1792 {
1793 DPRINTF(("Recursion gave error %d\n", rrc));
1794 if (new_recursive.offset_save != stacksave)
1795 (PUBL(free))(new_recursive.offset_save);
1796 RRETURN(rrc);
1797 }
1798
1799 md->recursive = &new_recursive;
1800 callpat += GET(callpat, 1);
1801 }
1802 while (*callpat == OP_ALT);
1803
1804 DPRINTF(("Recursion didn't match\n"));
1805 md->recursive = new_recursive.prevrec;
1806 if (new_recursive.offset_save != stacksave)
1807 (PUBL(free))(new_recursive.offset_save);
1808 RRETURN(MATCH_NOMATCH);
1809 }
1810
1811 RECURSION_MATCHED:
1812 break;
1813
1814 /* An alternation is the end of a branch; scan along to find the end of the
1815 bracketed group and go to there. */
1816
1817 case OP_ALT:
1818 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1819 break;
1820
1821 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1822 indicating that it may occur zero times. It may repeat infinitely, or not
1823 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1824 with fixed upper repeat limits are compiled as a number of copies, with the
1825 optional ones preceded by BRAZERO or BRAMINZERO. */
1826
1827 case OP_BRAZERO:
1828 next = ecode + 1;
1829 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1831 do next += GET(next, 1); while (*next == OP_ALT);
1832 ecode = next + 1 + LINK_SIZE;
1833 break;
1834
1835 case OP_BRAMINZERO:
1836 next = ecode + 1;
1837 do next += GET(next, 1); while (*next == OP_ALT);
1838 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840 ecode++;
1841 break;
1842
1843 case OP_SKIPZERO:
1844 next = ecode+1;
1845 do next += GET(next,1); while (*next == OP_ALT);
1846 ecode = next + 1 + LINK_SIZE;
1847 break;
1848
1849 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1850 here; just jump to the group, with allow_zero set TRUE. */
1851
1852 case OP_BRAPOSZERO:
1853 op = *(++ecode);
1854 allow_zero = TRUE;
1855 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1856 goto POSSESSIVE_NON_CAPTURE;
1857
1858 /* End of a group, repeated or non-repeating. */
1859
1860 case OP_KET:
1861 case OP_KETRMIN:
1862 case OP_KETRMAX:
1863 case OP_KETRPOS:
1864 prev = ecode - GET(ecode, 1);
1865
1866 /* If this was a group that remembered the subject start, in order to break
1867 infinite repeats of empty string matches, retrieve the subject start from
1868 the chain. Otherwise, set it NULL. */
1869
1870 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1871 {
1872 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1873 eptrb = eptrb->epb_prev; /* Backup to previous group */
1874 }
1875 else saved_eptr = NULL;
1876
1877 /* If we are at the end of an assertion group or a non-capturing atomic
1878 group, stop matching and return MATCH_MATCH, but record the current high
1879 water mark for use by positive assertions. We also need to record the match
1880 start in case it was changed by \K. */
1881
1882 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1883 *prev == OP_ONCE_NC)
1884 {
1885 md->end_match_ptr = eptr; /* For ONCE_NC */
1886 md->end_offset_top = offset_top;
1887 md->start_match_ptr = mstart;
1888 RRETURN(MATCH_MATCH); /* Sets md->mark */
1889 }
1890
1891 /* For capturing groups we have to check the group number back at the start
1892 and if necessary complete handling an extraction by setting the offsets and
1893 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1894 into group 0, so it won't be picked up here. Instead, we catch it when the
1895 OP_END is reached. Other recursion is handled here. We just have to record
1896 the current subject position and start match pointer and give a MATCH
1897 return. */
1898
1899 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1900 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1901 {
1902 number = GET2(prev, 1+LINK_SIZE);
1903 offset = number << 1;
1904
1905 #ifdef PCRE_DEBUG
1906 printf("end bracket %d", number);
1907 printf("\n");
1908 #endif
1909
1910 /* Handle a recursively called group. */
1911
1912 if (md->recursive != NULL && md->recursive->group_num == number)
1913 {
1914 md->end_match_ptr = eptr;
1915 md->start_match_ptr = mstart;
1916 RRETURN(MATCH_MATCH);
1917 }
1918
1919 /* Deal with capturing */
1920
1921 md->capture_last = number;
1922 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1923 {
1924 /* If offset is greater than offset_top, it means that we are
1925 "skipping" a capturing group, and that group's offsets must be marked
1926 unset. In earlier versions of PCRE, all the offsets were unset at the
1927 start of matching, but this doesn't work because atomic groups and
1928 assertions can cause a value to be set that should later be unset.
1929 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1930 part of the atomic group, but this is not on the final matching path,
1931 so must be unset when 2 is set. (If there is no group 2, there is no
1932 problem, because offset_top will then be 2, indicating no capture.) */
1933
1934 if (offset > offset_top)
1935 {
1936 register int *iptr = md->offset_vector + offset_top;
1937 register int *iend = md->offset_vector + offset;
1938 while (iptr < iend) *iptr++ = -1;
1939 }
1940
1941 /* Now make the extraction */
1942
1943 md->offset_vector[offset] =
1944 md->offset_vector[md->offset_end - number];
1945 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1946 if (offset_top <= offset) offset_top = offset + 2;
1947 }
1948 }
1949
1950 /* For an ordinary non-repeating ket, just continue at this level. This
1951 also happens for a repeating ket if no characters were matched in the
1952 group. This is the forcible breaking of infinite loops as implemented in
1953 Perl 5.005. For a non-repeating atomic group that includes captures,
1954 establish a backup point by processing the rest of the pattern at a lower
1955 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1956 original OP_ONCE level, thereby bypassing intermediate backup points, but
1957 resetting any captures that happened along the way. */
1958
1959 if (*ecode == OP_KET || eptr == saved_eptr)
1960 {
1961 if (*prev == OP_ONCE)
1962 {
1963 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1965 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1966 RRETURN(MATCH_ONCE);
1967 }
1968 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1969 break;
1970 }
1971
1972 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1973 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1974 at a time from the outer level, thus saving stack. */
1975
1976 if (*ecode == OP_KETRPOS)
1977 {
1978 md->end_match_ptr = eptr;
1979 md->end_offset_top = offset_top;
1980 RRETURN(MATCH_KETRPOS);
1981 }
1982
1983 /* The normal repeating kets try the rest of the pattern or restart from
1984 the preceding bracket, in the appropriate order. In the second case, we can
1985 use tail recursion to avoid using another stack frame, unless we have an
1986 atomic group or an unlimited repeat of a group that can match an empty
1987 string. */
1988
1989 if (*ecode == OP_KETRMIN)
1990 {
1991 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1992 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1993 if (*prev == OP_ONCE)
1994 {
1995 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1998 RRETURN(MATCH_ONCE);
1999 }
2000 if (*prev >= OP_SBRA) /* Could match an empty string */
2001 {
2002 md->match_function_type = MATCH_CBEGROUP;
2003 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2004 RRETURN(rrc);
2005 }
2006 ecode = prev;
2007 goto TAIL_RECURSE;
2008 }
2009 else /* OP_KETRMAX */
2010 {
2011 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2012 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2013 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2015 if (*prev == OP_ONCE)
2016 {
2017 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 md->once_target = prev;
2020 RRETURN(MATCH_ONCE);
2021 }
2022 ecode += 1 + LINK_SIZE;
2023 goto TAIL_RECURSE;
2024 }
2025 /* Control never gets here */
2026
2027 /* Not multiline mode: start of subject assertion, unless notbol. */
2028
2029 case OP_CIRC:
2030 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2031
2032 /* Start of subject assertion */
2033
2034 case OP_SOD:
2035 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2036 ecode++;
2037 break;
2038
2039 /* Multiline mode: start of subject unless notbol, or after any newline. */
2040
2041 case OP_CIRCM:
2042 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2043 if (eptr != md->start_subject &&
2044 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2045 RRETURN(MATCH_NOMATCH);
2046 ecode++;
2047 break;
2048
2049 /* Start of match assertion */
2050
2051 case OP_SOM:
2052 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2053 ecode++;
2054 break;
2055
2056 /* Reset the start of match point */
2057
2058 case OP_SET_SOM:
2059 mstart = eptr;
2060 ecode++;
2061 break;
2062
2063 /* Multiline mode: assert before any newline, or before end of subject
2064 unless noteol is set. */
2065
2066 case OP_DOLLM:
2067 if (eptr < md->end_subject)
2068 {
2069 if (!IS_NEWLINE(eptr))
2070 {
2071 if (eptr + 1 >= md->end_subject &&
2072 md->partial != 0 &&
2073 NLBLOCK->nltype == NLTYPE_FIXED &&
2074 NLBLOCK->nllen == 2 &&
2075 *eptr == NLBLOCK->nl[0])
2076 {
2077 md->hitend = TRUE;
2078 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2079 }
2080 RRETURN(MATCH_NOMATCH);
2081 }
2082 }
2083 else
2084 {
2085 if (md->noteol) RRETURN(MATCH_NOMATCH);
2086 SCHECK_PARTIAL();
2087 }
2088 ecode++;
2089 break;
2090
2091 /* Not multiline mode: assert before a terminating newline or before end of
2092 subject unless noteol is set. */
2093
2094 case OP_DOLL:
2095 if (md->noteol) RRETURN(MATCH_NOMATCH);
2096 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2097
2098 /* ... else fall through for endonly */
2099
2100 /* End of subject assertion (\z) */
2101
2102 case OP_EOD:
2103 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2104 SCHECK_PARTIAL();
2105 ecode++;
2106 break;
2107
2108 /* End of subject or ending \n assertion (\Z) */
2109
2110 case OP_EODN:
2111 ASSERT_NL_OR_EOS:
2112 if (eptr < md->end_subject &&
2113 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2114 {
2115 if (eptr + 1 >= md->end_subject &&
2116 md->partial != 0 &&
2117 NLBLOCK->nltype == NLTYPE_FIXED &&
2118 NLBLOCK->nllen == 2 &&
2119 *eptr == NLBLOCK->nl[0])
2120 {
2121 md->hitend = TRUE;
2122 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2123 }
2124 RRETURN(MATCH_NOMATCH);
2125 }
2126
2127 /* Either at end of string or \n before end. */
2128
2129 SCHECK_PARTIAL();
2130 ecode++;
2131 break;
2132
2133 /* Word boundary assertions */
2134
2135 case OP_NOT_WORD_BOUNDARY:
2136 case OP_WORD_BOUNDARY:
2137 {
2138
2139 /* Find out if the previous and current characters are "word" characters.
2140 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2141 be "non-word" characters. Remember the earliest consulted character for
2142 partial matching. */
2143
2144 #ifdef SUPPORT_UTF
2145 if (utf)
2146 {
2147 /* Get status of previous character */
2148
2149 if (eptr == md->start_subject) prev_is_word = FALSE; else
2150 {
2151 PCRE_PUCHAR lastptr = eptr - 1;
2152 BACKCHAR(lastptr);
2153 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2154 GETCHAR(c, lastptr);
2155 #ifdef SUPPORT_UCP
2156 if (md->use_ucp)
2157 {
2158 if (c == '_') prev_is_word = TRUE; else
2159 {
2160 int cat = UCD_CATEGORY(c);
2161 prev_is_word = (cat == ucp_L || cat == ucp_N);
2162 }
2163 }
2164 else
2165 #endif
2166 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2167 }
2168
2169 /* Get status of next character */
2170
2171 if (eptr >= md->end_subject)
2172 {
2173 SCHECK_PARTIAL();
2174 cur_is_word = FALSE;
2175 }
2176 else
2177 {
2178 GETCHAR(c, eptr);
2179 #ifdef SUPPORT_UCP
2180 if (md->use_ucp)
2181 {
2182 if (c == '_') cur_is_word = TRUE; else
2183 {
2184 int cat = UCD_CATEGORY(c);
2185 cur_is_word = (cat == ucp_L || cat == ucp_N);
2186 }
2187 }
2188 else
2189 #endif
2190 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2191 }
2192 }
2193 else
2194 #endif
2195
2196 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2197 consistency with the behaviour of \w we do use it in this case. */
2198
2199 {
2200 /* Get status of previous character */
2201
2202 if (eptr == md->start_subject) prev_is_word = FALSE; else
2203 {
2204 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2205 #ifdef SUPPORT_UCP
2206 if (md->use_ucp)
2207 {
2208 c = eptr[-1];
2209 if (c == '_') prev_is_word = TRUE; else
2210 {
2211 int cat = UCD_CATEGORY(c);
2212 prev_is_word = (cat == ucp_L || cat == ucp_N);
2213 }
2214 }
2215 else
2216 #endif
2217 prev_is_word = MAX_255(eptr[-1])
2218 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2219 }
2220
2221 /* Get status of next character */
2222
2223 if (eptr >= md->end_subject)
2224 {
2225 SCHECK_PARTIAL();
2226 cur_is_word = FALSE;
2227 }
2228 else
2229 #ifdef SUPPORT_UCP
2230 if (md->use_ucp)
2231 {
2232 c = *eptr;
2233 if (c == '_') cur_is_word = TRUE; else
2234 {
2235 int cat = UCD_CATEGORY(c);
2236 cur_is_word = (cat == ucp_L || cat == ucp_N);
2237 }
2238 }
2239 else
2240 #endif
2241 cur_is_word = MAX_255(*eptr)
2242 && ((md->ctypes[*eptr] & ctype_word) != 0);
2243 }
2244
2245 /* Now see if the situation is what we want */
2246
2247 if ((*ecode++ == OP_WORD_BOUNDARY)?
2248 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2249 RRETURN(MATCH_NOMATCH);
2250 }
2251 break;
2252
2253 /* Match a single character type; inline for speed */
2254
2255 case OP_ANY:
2256 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2257 /* Fall through */
2258
2259 case OP_ALLANY:
2260 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2261 { /* not be updated before SCHECK_PARTIAL. */
2262 SCHECK_PARTIAL();
2263 RRETURN(MATCH_NOMATCH);
2264 }
2265 eptr++;
2266 #ifdef SUPPORT_UTF
2267 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2268 #endif
2269 ecode++;
2270 break;
2271
2272 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2273 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2274
2275 case OP_ANYBYTE:
2276 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2277 { /* not be updated before SCHECK_PARTIAL. */
2278 SCHECK_PARTIAL();
2279 RRETURN(MATCH_NOMATCH);
2280 }
2281 eptr++;
2282 ecode++;
2283 break;
2284
2285 case OP_NOT_DIGIT:
2286 if (eptr >= md->end_subject)
2287 {
2288 SCHECK_PARTIAL();
2289 RRETURN(MATCH_NOMATCH);
2290 }
2291 GETCHARINCTEST(c, eptr);
2292 if (
2293 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2294 c < 256 &&
2295 #endif
2296 (md->ctypes[c] & ctype_digit) != 0
2297 )
2298 RRETURN(MATCH_NOMATCH);
2299 ecode++;
2300 break;
2301
2302 case OP_DIGIT:
2303 if (eptr >= md->end_subject)
2304 {
2305 SCHECK_PARTIAL();
2306 RRETURN(MATCH_NOMATCH);
2307 }
2308 GETCHARINCTEST(c, eptr);
2309 if (
2310 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2311 c > 255 ||
2312 #endif
2313 (md->ctypes[c] & ctype_digit) == 0
2314 )
2315 RRETURN(MATCH_NOMATCH);
2316 ecode++;
2317 break;
2318
2319 case OP_NOT_WHITESPACE:
2320 if (eptr >= md->end_subject)
2321 {
2322 SCHECK_PARTIAL();
2323 RRETURN(MATCH_NOMATCH);
2324 }
2325 GETCHARINCTEST(c, eptr);
2326 if (
2327 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2328 c < 256 &&
2329 #endif
2330 (md->ctypes[c] & ctype_space) != 0
2331 )
2332 RRETURN(MATCH_NOMATCH);
2333 ecode++;
2334 break;
2335
2336 case OP_WHITESPACE:
2337 if (eptr >= md->end_subject)
2338 {
2339 SCHECK_PARTIAL();
2340 RRETURN(MATCH_NOMATCH);
2341 }
2342 GETCHARINCTEST(c, eptr);
2343 if (
2344 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2345 c > 255 ||
2346 #endif
2347 (md->ctypes[c] & ctype_space) == 0
2348 )
2349 RRETURN(MATCH_NOMATCH);
2350 ecode++;
2351 break;
2352
2353 case OP_NOT_WORDCHAR:
2354 if (eptr >= md->end_subject)
2355 {
2356 SCHECK_PARTIAL();
2357 RRETURN(MATCH_NOMATCH);
2358 }
2359 GETCHARINCTEST(c, eptr);
2360 if (
2361 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2362 c < 256 &&
2363 #endif
2364 (md->ctypes[c] & ctype_word) != 0
2365 )
2366 RRETURN(MATCH_NOMATCH);
2367 ecode++;
2368 break;
2369
2370 case OP_WORDCHAR:
2371 if (eptr >= md->end_subject)
2372 {
2373 SCHECK_PARTIAL();
2374 RRETURN(MATCH_NOMATCH);
2375 }
2376 GETCHARINCTEST(c, eptr);
2377 if (
2378 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2379 c > 255 ||
2380 #endif
2381 (md->ctypes[c] & ctype_word) == 0
2382 )
2383 RRETURN(MATCH_NOMATCH);
2384 ecode++;
2385 break;
2386
2387 case OP_ANYNL:
2388 if (eptr >= md->end_subject)
2389 {
2390 SCHECK_PARTIAL();
2391 RRETURN(MATCH_NOMATCH);
2392 }
2393 GETCHARINCTEST(c, eptr);
2394 switch(c)
2395 {
2396 default: RRETURN(MATCH_NOMATCH);
2397
2398 case 0x000d:
2399 if (eptr >= md->end_subject)
2400 {
2401 SCHECK_PARTIAL();
2402 }
2403 else if (*eptr == 0x0a) eptr++;
2404 break;
2405
2406 case 0x000a:
2407 break;
2408
2409 case 0x000b:
2410 case 0x000c:
2411 case 0x0085:
2412 case 0x2028:
2413 case 0x2029:
2414 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2415 break;
2416 }
2417 ecode++;
2418 break;
2419
2420 case OP_NOT_HSPACE:
2421 if (eptr >= md->end_subject)
2422 {
2423 SCHECK_PARTIAL();
2424 RRETURN(MATCH_NOMATCH);
2425 }
2426 GETCHARINCTEST(c, eptr);
2427 switch(c)
2428 {
2429 default: break;
2430 case 0x09: /* HT */
2431 case 0x20: /* SPACE */
2432 case 0xa0: /* NBSP */
2433 case 0x1680: /* OGHAM SPACE MARK */
2434 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2435 case 0x2000: /* EN QUAD */
2436 case 0x2001: /* EM QUAD */
2437 case 0x2002: /* EN SPACE */
2438 case 0x2003: /* EM SPACE */
2439 case 0x2004: /* THREE-PER-EM SPACE */
2440 case 0x2005: /* FOUR-PER-EM SPACE */
2441 case 0x2006: /* SIX-PER-EM SPACE */
2442 case 0x2007: /* FIGURE SPACE */
2443 case 0x2008: /* PUNCTUATION SPACE */
2444 case 0x2009: /* THIN SPACE */
2445 case 0x200A: /* HAIR SPACE */
2446 case 0x202f: /* NARROW NO-BREAK SPACE */
2447 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2448 case 0x3000: /* IDEOGRAPHIC SPACE */
2449 RRETURN(MATCH_NOMATCH);
2450 }
2451 ecode++;
2452 break;
2453
2454 case OP_HSPACE:
2455 if (eptr >= md->end_subject)
2456 {
2457 SCHECK_PARTIAL();
2458 RRETURN(MATCH_NOMATCH);
2459 }
2460 GETCHARINCTEST(c, eptr);
2461 switch(c)
2462 {
2463 default: RRETURN(MATCH_NOMATCH);
2464 case 0x09: /* HT */
2465 case 0x20: /* SPACE */
2466 case 0xa0: /* NBSP */
2467 case 0x1680: /* OGHAM SPACE MARK */
2468 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2469 case 0x2000: /* EN QUAD */
2470 case 0x2001: /* EM QUAD */
2471 case 0x2002: /* EN SPACE */
2472 case 0x2003: /* EM SPACE */
2473 case 0x2004: /* THREE-PER-EM SPACE */
2474 case 0x2005: /* FOUR-PER-EM SPACE */
2475 case 0x2006: /* SIX-PER-EM SPACE */
2476 case 0x2007: /* FIGURE SPACE */
2477 case 0x2008: /* PUNCTUATION SPACE */
2478 case 0x2009: /* THIN SPACE */
2479 case 0x200A: /* HAIR SPACE */
2480 case 0x202f: /* NARROW NO-BREAK SPACE */
2481 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2482 case 0x3000: /* IDEOGRAPHIC SPACE */
2483 break;
2484 }
2485 ecode++;
2486 break;
2487
2488 case OP_NOT_VSPACE:
2489 if (eptr >= md->end_subject)
2490 {
2491 SCHECK_PARTIAL();
2492 RRETURN(MATCH_NOMATCH);
2493 }
2494 GETCHARINCTEST(c, eptr);
2495 switch(c)
2496 {
2497 default: break;
2498 case 0x0a: /* LF */
2499 case 0x0b: /* VT */
2500 case 0x0c: /* FF */
2501 case 0x0d: /* CR */
2502 case 0x85: /* NEL */
2503 case 0x2028: /* LINE SEPARATOR */
2504 case 0x2029: /* PARAGRAPH SEPARATOR */
2505 RRETURN(MATCH_NOMATCH);
2506 }
2507 ecode++;
2508 break;
2509
2510 case OP_VSPACE:
2511 if (eptr >= md->end_subject)
2512 {
2513 SCHECK_PARTIAL();
2514 RRETURN(MATCH_NOMATCH);
2515 }
2516 GETCHARINCTEST(c, eptr);
2517 switch(c)
2518 {
2519 default: RRETURN(MATCH_NOMATCH);
2520 case 0x0a: /* LF */
2521 case 0x0b: /* VT */
2522 case 0x0c: /* FF */
2523 case 0x0d: /* CR */
2524 case 0x85: /* NEL */
2525 case 0x2028: /* LINE SEPARATOR */
2526 case 0x2029: /* PARAGRAPH SEPARATOR */
2527 break;
2528 }
2529 ecode++;
2530 break;
2531
2532 #ifdef SUPPORT_UCP
2533 /* Check the next character by Unicode property. We will get here only
2534 if the support is in the binary; otherwise a compile-time error occurs. */
2535
2536 case OP_PROP:
2537 case OP_NOTPROP:
2538 if (eptr >= md->end_subject)
2539 {
2540 SCHECK_PARTIAL();
2541 RRETURN(MATCH_NOMATCH);
2542 }
2543 GETCHARINCTEST(c, eptr);
2544 {
2545 const ucd_record *prop = GET_UCD(c);
2546
2547 switch(ecode[1])
2548 {
2549 case PT_ANY:
2550 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2551 break;
2552
2553 case PT_LAMP:
2554 if ((prop->chartype == ucp_Lu ||
2555 prop->chartype == ucp_Ll ||
2556 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2557 RRETURN(MATCH_NOMATCH);
2558 break;
2559
2560 case PT_GC:
2561 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2562 RRETURN(MATCH_NOMATCH);
2563 break;
2564
2565 case PT_PC:
2566 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2567 RRETURN(MATCH_NOMATCH);
2568 break;
2569
2570 case PT_SC:
2571 if ((ecode[2] != prop->script) == (op == OP_PROP))
2572 RRETURN(MATCH_NOMATCH);
2573 break;
2574
2575 /* These are specials */
2576
2577 case PT_ALNUM:
2578 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2579 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2580 RRETURN(MATCH_NOMATCH);
2581 break;
2582
2583 case PT_SPACE: /* Perl space */
2584 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2585 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2586 == (op == OP_NOTPROP))
2587 RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 case PT_PXSPACE: /* POSIX space */
2591 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2592 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2593 c == CHAR_FF || c == CHAR_CR)
2594 == (op == OP_NOTPROP))
2595 RRETURN(MATCH_NOMATCH);
2596 break;
2597
2598 case PT_WORD:
2599 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2600 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2601 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2604
2605 /* This should never occur */
2606
2607 default:
2608 RRETURN(PCRE_ERROR_INTERNAL);
2609 }
2610
2611 ecode += 3;
2612 }
2613 break;
2614
2615 /* Match an extended Unicode sequence. We will get here only if the support
2616 is in the binary; otherwise a compile-time error occurs. */
2617
2618 case OP_EXTUNI:
2619 if (eptr >= md->end_subject)
2620 {
2621 SCHECK_PARTIAL();
2622 RRETURN(MATCH_NOMATCH);
2623 }
2624 GETCHARINCTEST(c, eptr);
2625 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2626 while (eptr < md->end_subject)
2627 {
2628 int len = 1;
2629 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2630 if (UCD_CATEGORY(c) != ucp_M) break;
2631 eptr += len;
2632 }
2633 CHECK_PARTIAL();
2634 ecode++;
2635 break;
2636 #endif
2637
2638
2639 /* Match a back reference, possibly repeatedly. Look past the end of the
2640 item to see if there is repeat information following. The code is similar
2641 to that for character classes, but repeated for efficiency. Then obey
2642 similar code to character type repeats - written out again for speed.
2643 However, if the referenced string is the empty string, always treat
2644 it as matched, any number of times (otherwise there could be infinite
2645 loops). */
2646
2647 case OP_REF:
2648 case OP_REFI:
2649 caseless = op == OP_REFI;
2650 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2651 ecode += 1 + IMM2_SIZE;
2652
2653 /* If the reference is unset, there are two possibilities:
2654
2655 (a) In the default, Perl-compatible state, set the length negative;
2656 this ensures that every attempt at a match fails. We can't just fail
2657 here, because of the possibility of quantifiers with zero minima.
2658
2659 (b) If the JavaScript compatibility flag is set, set the length to zero
2660 so that the back reference matches an empty string.
2661
2662 Otherwise, set the length to the length of what was matched by the
2663 referenced subpattern. */
2664
2665 if (offset >= offset_top || md->offset_vector[offset] < 0)
2666 length = (md->jscript_compat)? 0 : -1;
2667 else
2668 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2669
2670 /* Set up for repetition, or handle the non-repeated case */
2671
2672 switch (*ecode)
2673 {
2674 case OP_CRSTAR:
2675 case OP_CRMINSTAR:
2676 case OP_CRPLUS:
2677 case OP_CRMINPLUS:
2678 case OP_CRQUERY:
2679 case OP_CRMINQUERY:
2680 c = *ecode++ - OP_CRSTAR;
2681 minimize = (c & 1) != 0;
2682 min = rep_min[c]; /* Pick up values from tables; */
2683 max = rep_max[c]; /* zero for max => infinity */
2684 if (max == 0) max = INT_MAX;
2685 break;
2686
2687 case OP_CRRANGE:
2688 case OP_CRMINRANGE:
2689 minimize = (*ecode == OP_CRMINRANGE);
2690 min = GET2(ecode, 1);
2691 max = GET2(ecode, 1 + IMM2_SIZE);
2692 if (max == 0) max = INT_MAX;
2693 ecode += 1 + 2 * IMM2_SIZE;
2694 break;
2695
2696 default: /* No repeat follows */
2697 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2698 {
2699 if (length == -2) eptr = md->end_subject; /* Partial match */
2700 CHECK_PARTIAL();
2701 RRETURN(MATCH_NOMATCH);
2702 }
2703 eptr += length;
2704 continue; /* With the main loop */
2705 }
2706
2707 /* Handle repeated back references. If the length of the reference is
2708 zero, just continue with the main loop. If the length is negative, it
2709 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2710 zero, we can continue at the same level without recursion. For any other
2711 minimum, carrying on will result in NOMATCH. */
2712
2713 if (length == 0) continue;
2714 if (length < 0 && min == 0) continue;
2715
2716 /* First, ensure the minimum number of matches are present. We get back
2717 the length of the reference string explicitly rather than passing the
2718 address of eptr, so that eptr can be a register variable. */
2719
2720 for (i = 1; i <= min; i++)
2721 {
2722 int slength;
2723 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2724 {
2725 if (slength == -2) eptr = md->end_subject; /* Partial match */
2726 CHECK_PARTIAL();
2727 RRETURN(MATCH_NOMATCH);
2728 }
2729 eptr += slength;
2730 }
2731
2732 /* If min = max, continue at the same level without recursion.
2733 They are not both allowed to be zero. */
2734
2735 if (min == max) continue;
2736
2737 /* If minimizing, keep trying and advancing the pointer */
2738
2739 if (minimize)
2740 {
2741 for (fi = min;; fi++)
2742 {
2743 int slength;
2744 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2746 if (fi >= max) RRETURN(MATCH_NOMATCH);
2747 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2748 {
2749 if (slength == -2) eptr = md->end_subject; /* Partial match */
2750 CHECK_PARTIAL();
2751 RRETURN(MATCH_NOMATCH);
2752 }
2753 eptr += slength;
2754 }
2755 /* Control never gets here */
2756 }
2757
2758 /* If maximizing, find the longest string and work backwards */
2759
2760 else
2761 {
2762 pp = eptr;
2763 for (i = min; i < max; i++)
2764 {
2765 int slength;
2766 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2767 {
2768 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2769 the soft partial matching case. */
2770
2771 if (slength == -2 && md->partial != 0 &&
2772 md->end_subject > md->start_used_ptr)
2773 {
2774 md->hitend = TRUE;
2775 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2776 }
2777 break;
2778 }
2779 eptr += slength;
2780 }
2781
2782 while (eptr >= pp)
2783 {
2784 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2786 eptr -= length;
2787 }
2788 RRETURN(MATCH_NOMATCH);
2789 }
2790 /* Control never gets here */
2791
2792 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2793 used when all the characters in the class have values in the range 0-255,
2794 and either the matching is caseful, or the characters are in the range
2795 0-127 when UTF-8 processing is enabled. The only difference between
2796 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2797 encountered.
2798
2799 First, look past the end of the item to see if there is repeat information
2800 following. Then obey similar code to character type repeats - written out
2801 again for speed. */
2802
2803 case OP_NCLASS:
2804 case OP_CLASS:
2805 {
2806 /* The data variable is saved across frames, so the byte map needs to
2807 be stored there. */
2808 #define BYTE_MAP ((pcre_uint8 *)data)
2809 data = ecode + 1; /* Save for matching */
2810 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2811
2812 switch (*ecode)
2813 {
2814 case OP_CRSTAR:
2815 case OP_CRMINSTAR:
2816 case OP_CRPLUS:
2817 case OP_CRMINPLUS:
2818 case OP_CRQUERY:
2819 case OP_CRMINQUERY:
2820 c = *ecode++ - OP_CRSTAR;
2821 minimize = (c & 1) != 0;
2822 min = rep_min[c]; /* Pick up values from tables; */
2823 max = rep_max[c]; /* zero for max => infinity */
2824 if (max == 0) max = INT_MAX;
2825 break;
2826
2827 case OP_CRRANGE:
2828 case OP_CRMINRANGE:
2829 minimize = (*ecode == OP_CRMINRANGE);
2830 min = GET2(ecode, 1);
2831 max = GET2(ecode, 1 + IMM2_SIZE);
2832 if (max == 0) max = INT_MAX;
2833 ecode += 1 + 2 * IMM2_SIZE;
2834 break;
2835
2836 default: /* No repeat follows */
2837 min = max = 1;
2838 break;
2839 }
2840
2841 /* First, ensure the minimum number of matches are present. */
2842
2843 #ifdef SUPPORT_UTF
2844 if (utf)
2845 {
2846 for (i = 1; i <= min; i++)
2847 {
2848 if (eptr >= md->end_subject)
2849 {
2850 SCHECK_PARTIAL();
2851 RRETURN(MATCH_NOMATCH);
2852 }
2853 GETCHARINC(c, eptr);
2854 if (c > 255)
2855 {
2856 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2857 }
2858 else
2859 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2860 }
2861 }
2862 else
2863 #endif
2864 /* Not UTF mode */
2865 {
2866 for (i = 1; i <= min; i++)
2867 {
2868 if (eptr >= md->end_subject)
2869 {
2870 SCHECK_PARTIAL();
2871 RRETURN(MATCH_NOMATCH);
2872 }
2873 c = *eptr++;
2874 #ifndef COMPILE_PCRE8
2875 if (c > 255)
2876 {
2877 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2878 }
2879 else
2880 #endif
2881 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2882 }
2883 }
2884
2885 /* If max == min we can continue with the main loop without the
2886 need to recurse. */
2887
2888 if (min == max) continue;
2889
2890 /* If minimizing, keep testing the rest of the expression and advancing
2891 the pointer while it matches the class. */
2892
2893 if (minimize)
2894 {
2895 #ifdef SUPPORT_UTF
2896 if (utf)
2897 {
2898 for (fi = min;; fi++)
2899 {
2900 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2901 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2902 if (fi >= max) RRETURN(MATCH_NOMATCH);
2903 if (eptr >= md->end_subject)
2904 {
2905 SCHECK_PARTIAL();
2906 RRETURN(MATCH_NOMATCH);
2907 }
2908 GETCHARINC(c, eptr);
2909 if (c > 255)
2910 {
2911 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2912 }
2913 else
2914 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2915 }
2916 }
2917 else
2918 #endif
2919 /* Not UTF mode */
2920 {
2921 for (fi = min;; fi++)
2922 {
2923 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2925 if (fi >= max) RRETURN(MATCH_NOMATCH);
2926 if (eptr >= md->end_subject)
2927 {
2928 SCHECK_PARTIAL();
2929 RRETURN(MATCH_NOMATCH);
2930 }
2931 c = *eptr++;
2932 #ifndef COMPILE_PCRE8
2933 if (c > 255)
2934 {
2935 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2936 }
2937 else
2938 #endif
2939 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2940 }
2941 }
2942 /* Control never gets here */
2943 }
2944
2945 /* If maximizing, find the longest possible run, then work backwards. */
2946
2947 else
2948 {
2949 pp = eptr;
2950
2951 #ifdef SUPPORT_UTF
2952 if (utf)
2953 {
2954 for (i = min; i < max; i++)
2955 {
2956 int len = 1;
2957 if (eptr >= md->end_subject)
2958 {
2959 SCHECK_PARTIAL();
2960 break;
2961 }
2962 GETCHARLEN(c, eptr, len);
2963 if (c > 255)
2964 {
2965 if (op == OP_CLASS) break;
2966 }
2967 else
2968 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2969 eptr += len;
2970 }
2971 for (;;)
2972 {
2973 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2975 if (eptr-- == pp) break; /* Stop if tried at original pos */
2976 BACKCHAR(eptr);
2977 }
2978 }
2979 else
2980 #endif
2981 /* Not UTF mode */
2982 {
2983 for (i = min; i < max; i++)
2984 {
2985 if (eptr >= md->end_subject)
2986 {
2987 SCHECK_PARTIAL();
2988 break;
2989 }
2990 c = *eptr;
2991 #ifndef COMPILE_PCRE8
2992 if (c > 255)
2993 {
2994 if (op == OP_CLASS) break;
2995 }
2996 else
2997 #endif
2998 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2999 eptr++;
3000 }
3001 while (eptr >= pp)
3002 {
3003 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3004 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3005 eptr--;
3006 }
3007 }
3008
3009 RRETURN(MATCH_NOMATCH);
3010 }
3011 #undef BYTE_MAP
3012 }
3013 /* Control never gets here */
3014
3015
3016 /* Match an extended character class. This opcode is encountered only
3017 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3018 mode, because Unicode properties are supported in non-UTF-8 mode. */
3019
3020 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3021 case OP_XCLASS:
3022 {
3023 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3024 ecode += GET(ecode, 1); /* Advance past the item */
3025
3026 switch (*ecode)
3027 {
3028 case OP_CRSTAR:
3029 case OP_CRMINSTAR:
3030 case OP_CRPLUS:
3031 case OP_CRMINPLUS:
3032 case OP_CRQUERY:
3033 case OP_CRMINQUERY:
3034 c = *ecode++ - OP_CRSTAR;
3035 minimize = (c & 1) != 0;
3036 min = rep_min[c]; /* Pick up values from tables; */
3037 max = rep_max[c]; /* zero for max => infinity */
3038 if (max == 0) max = INT_MAX;
3039 break;
3040
3041 case OP_CRRANGE:
3042 case OP_CRMINRANGE:
3043 minimize = (*ecode == OP_CRMINRANGE);
3044 min = GET2(ecode, 1);
3045 max = GET2(ecode, 1 + IMM2_SIZE);
3046 if (max == 0) max = INT_MAX;
3047 ecode += 1 + 2 * IMM2_SIZE;
3048 break;
3049
3050 default: /* No repeat follows */
3051 min = max = 1;
3052 break;
3053 }
3054
3055 /* First, ensure the minimum number of matches are present. */
3056
3057 for (i = 1; i <= min; i++)
3058 {
3059 if (eptr >= md->end_subject)
3060 {
3061 SCHECK_PARTIAL();
3062 RRETURN(MATCH_NOMATCH);
3063 }
3064 GETCHARINCTEST(c, eptr);
3065 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3066 }
3067
3068 /* If max == min we can continue with the main loop without the
3069 need to recurse. */
3070
3071 if (min == max) continue;
3072
3073 /* If minimizing, keep testing the rest of the expression and advancing
3074 the pointer while it matches the class. */
3075
3076 if (minimize)
3077 {
3078 for (fi = min;; fi++)
3079 {
3080 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3081 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3082 if (fi >= max) RRETURN(MATCH_NOMATCH);
3083 if (eptr >= md->end_subject)
3084 {
3085 SCHECK_PARTIAL();
3086 RRETURN(MATCH_NOMATCH);
3087 }
3088 GETCHARINCTEST(c, eptr);
3089 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3090 }
3091 /* Control never gets here */
3092 }
3093
3094 /* If maximizing, find the longest possible run, then work backwards. */
3095
3096 else
3097 {
3098 pp = eptr;
3099 for (i = min; i < max; i++)
3100 {
3101 int len = 1;
3102 if (eptr >= md->end_subject)
3103 {
3104 SCHECK_PARTIAL();
3105 break;
3106 }
3107 #ifdef SUPPORT_UTF
3108 GETCHARLENTEST(c, eptr, len);
3109 #else
3110 c = *eptr;
3111 #endif
3112 if (!PRIV(xclass)(c, data, utf)) break;
3113 eptr += len;
3114 }
3115 for(;;)
3116 {
3117 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3118 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3119 if (eptr-- == pp) break; /* Stop if tried at original pos */
3120 #ifdef SUPPORT_UTF
3121 if (utf) BACKCHAR(eptr);
3122 #endif
3123 }
3124 RRETURN(MATCH_NOMATCH);
3125 }
3126
3127 /* Control never gets here */
3128 }
3129 #endif /* End of XCLASS */
3130
3131 /* Match a single character, casefully */
3132
3133 case OP_CHAR:
3134 #ifdef SUPPORT_UTF
3135 if (utf)
3136 {
3137 length = 1;
3138 ecode++;
3139 GETCHARLEN(fc, ecode, length);
3140 if (length > md->end_subject - eptr)
3141 {
3142 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3143 RRETURN(MATCH_NOMATCH);
3144 }
3145 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3146 }
3147 else
3148 #endif
3149 /* Not UTF mode */
3150 {
3151 if (md->end_subject - eptr < 1)
3152 {
3153 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3154 RRETURN(MATCH_NOMATCH);
3155 }
3156 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3157 ecode += 2;
3158 }
3159 break;
3160
3161 /* Match a single character, caselessly. If we are at the end of the
3162 subject, give up immediately. */
3163
3164 case OP_CHARI:
3165 if (eptr >= md->end_subject)
3166 {
3167 SCHECK_PARTIAL();
3168 RRETURN(MATCH_NOMATCH);
3169 }
3170
3171 #ifdef SUPPORT_UTF
3172 if (utf)
3173 {
3174 length = 1;
3175 ecode++;
3176 GETCHARLEN(fc, ecode, length);
3177
3178 /* If the pattern character's value is < 128, we have only one byte, and
3179 we know that its other case must also be one byte long, so we can use the
3180 fast lookup table. We know that there is at least one byte left in the
3181 subject. */
3182
3183 if (fc < 128)
3184 {
3185 if (md->lcc[fc]
3186 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3187 ecode++;
3188 eptr++;
3189 }
3190
3191 /* Otherwise we must pick up the subject character. Note that we cannot
3192 use the value of "length" to check for sufficient bytes left, because the
3193 other case of the character may have more or fewer bytes. */
3194
3195 else
3196 {
3197 unsigned int dc;
3198 GETCHARINC(dc, eptr);
3199 ecode += length;
3200
3201 /* If we have Unicode property support, we can use it to test the other
3202 case of the character, if there is one. */
3203
3204 if (fc != dc)
3205 {
3206 #ifdef SUPPORT_UCP
3207 if (dc != UCD_OTHERCASE(fc))
3208 #endif
3209 RRETURN(MATCH_NOMATCH);
3210 }
3211 }
3212 }
3213 else
3214 #endif /* SUPPORT_UTF */
3215
3216 /* Not UTF mode */
3217 {
3218 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3219 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3220 eptr++;
3221 ecode += 2;
3222 }
3223 break;
3224
3225 /* Match a single character repeatedly. */
3226
3227 case OP_EXACT:
3228 case OP_EXACTI:
3229 min = max = GET2(ecode, 1);
3230 ecode += 1 + IMM2_SIZE;
3231 goto REPEATCHAR;
3232
3233 case OP_POSUPTO:
3234 case OP_POSUPTOI:
3235 possessive = TRUE;
3236 /* Fall through */
3237
3238 case OP_UPTO:
3239 case OP_UPTOI:
3240 case OP_MINUPTO:
3241 case OP_MINUPTOI:
3242 min = 0;
3243 max = GET2(ecode, 1);
3244 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3245 ecode += 1 + IMM2_SIZE;
3246 goto REPEATCHAR;
3247
3248 case OP_POSSTAR:
3249 case OP_POSSTARI:
3250 possessive = TRUE;
3251 min = 0;
3252 max = INT_MAX;
3253 ecode++;
3254 goto REPEATCHAR;
3255
3256 case OP_POSPLUS:
3257 case OP_POSPLUSI:
3258 possessive = TRUE;
3259 min = 1;
3260 max = INT_MAX;
3261 ecode++;
3262 goto REPEATCHAR;
3263
3264 case OP_POSQUERY:
3265 case OP_POSQUERYI:
3266 possessive = TRUE;
3267 min = 0;
3268 max = 1;
3269 ecode++;
3270 goto REPEATCHAR;
3271
3272 case OP_STAR:
3273 case OP_STARI:
3274 case OP_MINSTAR:
3275 case OP_MINSTARI:
3276 case OP_PLUS:
3277 case OP_PLUSI:
3278 case OP_MINPLUS:
3279 case OP_MINPLUSI:
3280 case OP_QUERY:
3281 case OP_QUERYI:
3282 case OP_MINQUERY:
3283 case OP_MINQUERYI:
3284 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3285 minimize = (c & 1) != 0;
3286 min = rep_min[c]; /* Pick up values from tables; */
3287 max = rep_max[c]; /* zero for max => infinity */
3288 if (max == 0) max = INT_MAX;
3289
3290 /* Common code for all repeated single-character matches. */
3291
3292 REPEATCHAR:
3293 #ifdef SUPPORT_UTF
3294 if (utf)
3295 {
3296 length = 1;
3297 charptr = ecode;
3298 GETCHARLEN(fc, ecode, length);
3299 ecode += length;
3300
3301 /* Handle multibyte character matching specially here. There is
3302 support for caseless matching if UCP support is present. */
3303
3304 if (length > 1)
3305 {
3306 #ifdef SUPPORT_UCP
3307 unsigned int othercase;
3308 if (op >= OP_STARI && /* Caseless */
3309 (othercase = UCD_OTHERCASE(fc)) != fc)
3310 oclength = PRIV(ord2utf)(othercase, occhars);
3311 else oclength = 0;
3312 #endif /* SUPPORT_UCP */
3313
3314 for (i = 1; i <= min; i++)
3315 {
3316 if (eptr <= md->end_subject - length &&
3317 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3318 #ifdef SUPPORT_UCP
3319 else if (oclength > 0 &&
3320 eptr <= md->end_subject - oclength &&
3321 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3322 #endif /* SUPPORT_UCP */
3323 else
3324 {
3325 CHECK_PARTIAL();
3326 RRETURN(MATCH_NOMATCH);
3327 }
3328 }
3329
3330 if (min == max) continue;
3331
3332 if (minimize)
3333 {
3334 for (fi = min;; fi++)
3335 {
3336 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3338 if (fi >= max) RRETURN(MATCH_NOMATCH);
3339 if (eptr <= md->end_subject - length &&
3340 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3341 #ifdef SUPPORT_UCP
3342 else if (oclength > 0 &&
3343 eptr <= md->end_subject - oclength &&
3344 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3345 #endif /* SUPPORT_UCP */
3346 else
3347 {
3348 CHECK_PARTIAL();
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 }
3352 /* Control never gets here */
3353 }
3354
3355 else /* Maximize */
3356 {
3357 pp = eptr;
3358 for (i = min; i < max; i++)
3359 {
3360 if (eptr <= md->end_subject - length &&
3361 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3362 #ifdef SUPPORT_UCP
3363 else if (oclength > 0 &&
3364 eptr <= md->end_subject - oclength &&
3365 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3366 #endif /* SUPPORT_UCP */
3367 else
3368 {
3369 CHECK_PARTIAL();
3370 break;
3371 }
3372 }
3373
3374 if (possessive) continue;
3375
3376 for(;;)
3377 {
3378 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3379 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3380 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3381 #ifdef SUPPORT_UCP
3382 eptr--;
3383 BACKCHAR(eptr);
3384 #else /* without SUPPORT_UCP */
3385 eptr -= length;
3386 #endif /* SUPPORT_UCP */
3387 }
3388 }
3389 /* Control never gets here */
3390 }
3391
3392 /* If the length of a UTF-8 character is 1, we fall through here, and
3393 obey the code as for non-UTF-8 characters below, though in this case the
3394 value of fc will always be < 128. */
3395 }
3396 else
3397 #endif /* SUPPORT_UTF */
3398 /* When not in UTF-8 mode, load a single-byte character. */
3399 fc = *ecode++;
3400
3401 /* The value of fc at this point is always one character, though we may
3402 or may not be in UTF mode. The code is duplicated for the caseless and
3403 caseful cases, for speed, since matching characters is likely to be quite
3404 common. First, ensure the minimum number of matches are present. If min =
3405 max, continue at the same level without recursing. Otherwise, if
3406 minimizing, keep trying the rest of the expression and advancing one
3407 matching character if failing, up to the maximum. Alternatively, if
3408 maximizing, find the maximum number of characters and work backwards. */
3409
3410 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3411 max, eptr));
3412
3413 if (op >= OP_STARI) /* Caseless */
3414 {
3415 #ifdef COMPILE_PCRE8
3416 /* fc must be < 128 if UTF is enabled. */
3417 foc = md->fcc[fc];
3418 #else
3419 #ifdef SUPPORT_UTF
3420 #ifdef SUPPORT_UCP
3421 if (utf && fc > 127)
3422 foc = UCD_OTHERCASE(fc);
3423 #else
3424 if (utf && fc > 127)
3425 foc = fc;
3426 #endif /* SUPPORT_UCP */
3427 else
3428 #endif /* SUPPORT_UTF */
3429 foc = TABLE_GET(fc, md->fcc, fc);
3430 #endif /* COMPILE_PCRE8 */
3431
3432 for (i = 1; i <= min; i++)
3433 {
3434 if (eptr >= md->end_subject)
3435 {
3436 SCHECK_PARTIAL();
3437 RRETURN(MATCH_NOMATCH);
3438 }
3439 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3440 eptr++;
3441 }
3442 if (min == max) continue;
3443 if (minimize)
3444 {
3445 for (fi = min;; fi++)
3446 {
3447 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3449 if (fi >= max) RRETURN(MATCH_NOMATCH);
3450 if (eptr >= md->end_subject)
3451 {
3452 SCHECK_PARTIAL();
3453 RRETURN(MATCH_NOMATCH);
3454 }
3455 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3456 eptr++;
3457 }
3458 /* Control never gets here */
3459 }
3460 else /* Maximize */
3461 {
3462 pp = eptr;
3463 for (i = min; i < max; i++)
3464 {
3465 if (eptr >= md->end_subject)
3466 {
3467 SCHECK_PARTIAL();
3468 break;
3469 }
3470 if (fc != *eptr && foc != *eptr) break;
3471 eptr++;
3472 }
3473
3474 if (possessive) continue;
3475
3476 while (eptr >= pp)
3477 {
3478 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3479 eptr--;
3480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3481 }
3482 RRETURN(MATCH_NOMATCH);
3483 }
3484 /* Control never gets here */
3485 }
3486
3487 /* Caseful comparisons (includes all multi-byte characters) */
3488
3489 else
3490 {
3491 for (i = 1; i <= min; i++)
3492 {
3493 if (eptr >= md->end_subject)
3494 {
3495 SCHECK_PARTIAL();
3496 RRETURN(MATCH_NOMATCH);
3497 }
3498 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3499 }
3500
3501 if (min == max) continue;
3502
3503 if (minimize)
3504 {
3505 for (fi = min;; fi++)
3506 {
3507 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3508 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3509 if (fi >= max) RRETURN(MATCH_NOMATCH);
3510 if (eptr >= md->end_subject)
3511 {
3512 SCHECK_PARTIAL();
3513 RRETURN(MATCH_NOMATCH);
3514 }
3515 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3516 }
3517 /* Control never gets here */
3518 }
3519 else /* Maximize */
3520 {
3521 pp = eptr;
3522 for (i = min; i < max; i++)
3523 {
3524 if (eptr >= md->end_subject)
3525 {
3526 SCHECK_PARTIAL();
3527 break;
3528 }
3529 if (fc != *eptr) break;
3530 eptr++;
3531 }
3532 if (possessive) continue;
3533
3534 while (eptr >= pp)
3535 {
3536 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3537 eptr--;
3538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3539 }
3540 RRETURN(MATCH_NOMATCH);
3541 }
3542 }
3543 /* Control never gets here */
3544
3545 /* Match a negated single one-byte character. The character we are
3546 checking can be multibyte. */
3547
3548 case OP_NOT:
3549 case OP_NOTI:
3550 if (eptr >= md->end_subject)
3551 {
3552 SCHECK_PARTIAL();
3553 RRETURN(MATCH_NOMATCH);
3554 }
3555 ecode++;
3556 GETCHARINCTEST(c, eptr);
3557 if (op == OP_NOTI) /* The caseless case */
3558 {
3559 register unsigned int ch, och;
3560 ch = *ecode++;
3561 #ifdef COMPILE_PCRE8
3562 /* ch must be < 128 if UTF is enabled. */
3563 och = md->fcc[ch];
3564 #else
3565 #ifdef SUPPORT_UTF
3566 #ifdef SUPPORT_UCP
3567 if (utf && ch > 127)
3568 och = UCD_OTHERCASE(ch);
3569 #else
3570 if (utf && ch > 127)
3571 och = ch;
3572 #endif /* SUPPORT_UCP */
3573 else
3574 #endif /* SUPPORT_UTF */
3575 och = TABLE_GET(ch, md->fcc, ch);
3576 #endif /* COMPILE_PCRE8 */
3577 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3578 }
3579 else /* Caseful */
3580 {
3581 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3582 }
3583 break;
3584
3585 /* Match a negated single one-byte character repeatedly. This is almost a
3586 repeat of the code for a repeated single character, but I haven't found a
3587 nice way of commoning these up that doesn't require a test of the
3588 positive/negative option for each character match. Maybe that wouldn't add
3589 very much to the time taken, but character matching *is* what this is all
3590 about... */
3591
3592 case OP_NOTEXACT:
3593 case OP_NOTEXACTI:
3594 min = max = GET2(ecode, 1);
3595 ecode += 1 + IMM2_SIZE;
3596 goto REPEATNOTCHAR;
3597
3598 case OP_NOTUPTO:
3599 case OP_NOTUPTOI:
3600 case OP_NOTMINUPTO:
3601 case OP_NOTMINUPTOI:
3602 min = 0;
3603 max = GET2(ecode, 1);
3604 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3605 ecode += 1 + IMM2_SIZE;
3606 goto REPEATNOTCHAR;
3607
3608 case OP_NOTPOSSTAR:
3609 case OP_NOTPOSSTARI:
3610 possessive = TRUE;
3611 min = 0;
3612 max = INT_MAX;
3613 ecode++;
3614 goto REPEATNOTCHAR;
3615
3616 case OP_NOTPOSPLUS:
3617 case OP_NOTPOSPLUSI:
3618 possessive = TRUE;
3619 min = 1;
3620 max = INT_MAX;
3621 ecode++;
3622 goto REPEATNOTCHAR;
3623
3624 case OP_NOTPOSQUERY:
3625 case OP_NOTPOSQUERYI:
3626 possessive = TRUE;
3627 min = 0;
3628 max = 1;
3629 ecode++;
3630 goto REPEATNOTCHAR;
3631
3632 case OP_NOTPOSUPTO:
3633 case OP_NOTPOSUPTOI:
3634 possessive = TRUE;
3635 min = 0;
3636 max = GET2(ecode, 1);
3637 ecode += 1 + IMM2_SIZE;
3638 goto REPEATNOTCHAR;
3639
3640 case OP_NOTSTAR:
3641 case OP_NOTSTARI:
3642 case OP_NOTMINSTAR:
3643 case OP_NOTMINSTARI:
3644 case OP_NOTPLUS:
3645 case OP_NOTPLUSI:
3646 case OP_NOTMINPLUS:
3647 case OP_NOTMINPLUSI:
3648 case OP_NOTQUERY:
3649 case OP_NOTQUERYI:
3650 case OP_NOTMINQUERY:
3651 case OP_NOTMINQUERYI:
3652 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3653 minimize = (c & 1) != 0;
3654 min = rep_min[c]; /* Pick up values from tables; */
3655 max = rep_max[c]; /* zero for max => infinity */
3656 if (max == 0) max = INT_MAX;
3657
3658 /* Common code for all repeated single-byte matches. */
3659
3660 REPEATNOTCHAR:
3661 fc = *ecode++;
3662
3663 /* The code is duplicated for the caseless and caseful cases, for speed,
3664 since matching characters is likely to be quite common. First, ensure the
3665 minimum number of matches are present. If min = max, continue at the same
3666 level without recursing. Otherwise, if minimizing, keep trying the rest of
3667 the expression and advancing one matching character if failing, up to the
3668 maximum. Alternatively, if maximizing, find the maximum number of
3669 characters and work backwards. */
3670
3671 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3672 max, eptr));
3673
3674 if (op >= OP_NOTSTARI) /* Caseless */
3675 {
3676 #ifdef COMPILE_PCRE8
3677 /* fc must be < 128 if UTF is enabled. */
3678 foc = md->fcc[fc];
3679 #else
3680 #ifdef SUPPORT_UTF
3681 #ifdef SUPPORT_UCP
3682 if (utf && fc > 127)
3683 foc = UCD_OTHERCASE(fc);
3684 #else
3685 if (utf && fc > 127)
3686 foc = fc;
3687 #endif /* SUPPORT_UCP */
3688 else
3689 #endif /* SUPPORT_UTF */
3690 foc = TABLE_GET(fc, md->fcc, fc);
3691 #endif /* COMPILE_PCRE8 */
3692
3693 #ifdef SUPPORT_UTF
3694 if (utf)
3695 {
3696 register unsigned int d;
3697 for (i = 1; i <= min; i++)
3698 {
3699 if (eptr >= md->end_subject)
3700 {
3701 SCHECK_PARTIAL();
3702 RRETURN(MATCH_NOMATCH);
3703 }
3704 GETCHARINC(d, eptr);
3705 if (fc == d || (unsigned int) foc == d) RRETURN(MATCH_NOMATCH);
3706 }
3707 }
3708 else
3709 #endif
3710 /* Not UTF mode */
3711 {
3712 for (i = 1; i <= min; i++)
3713 {
3714 if (eptr >= md->end_subject)
3715 {
3716 SCHECK_PARTIAL();
3717 RRETURN(MATCH_NOMATCH);
3718 }
3719 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3720 eptr++;
3721 }
3722 }
3723
3724 if (min == max) continue;
3725
3726 if (minimize)
3727 {
3728 #ifdef SUPPORT_UTF
3729 if (utf)
3730 {
3731 register unsigned int d;
3732 for (fi = min;; fi++)
3733 {
3734 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3735 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3736 if (fi >= max) RRETURN(MATCH_NOMATCH);
3737 if (eptr >= md->end_subject)
3738 {
3739 SCHECK_PARTIAL();
3740 RRETURN(MATCH_NOMATCH);
3741 }
3742 GETCHARINC(d, eptr);
3743 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3744 }
3745 }
3746 else
3747 #endif
3748 /* Not UTF mode */
3749 {
3750 for (fi = min;; fi++)
3751 {
3752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3754 if (fi >= max) RRETURN(MATCH_NOMATCH);
3755 if (eptr >= md->end_subject)
3756 {
3757 SCHECK_PARTIAL();
3758 RRETURN(MATCH_NOMATCH);
3759 }
3760 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3761 eptr++;
3762 }
3763 }
3764 /* Control never gets here */
3765 }
3766
3767 /* Maximize case */
3768
3769 else
3770 {
3771 pp = eptr;
3772
3773 #ifdef SUPPORT_UTF
3774 if (utf)
3775 {
3776 register unsigned int d;
3777 for (i = min; i < max; i++)
3778 {
3779 int len = 1;
3780 if (eptr >= md->end_subject)
3781 {
3782 SCHECK_PARTIAL();
3783 break;
3784 }
3785 GETCHARLEN(d, eptr, len);
3786 if (fc == d || (unsigned int)foc == d) break;
3787 eptr += len;
3788 }
3789 if (possessive) continue;
3790 for(;;)
3791 {
3792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3794 if (eptr-- == pp) break; /* Stop if tried at original pos */
3795 BACKCHAR(eptr);
3796 }
3797 }
3798 else
3799 #endif
3800 /* Not UTF mode */
3801 {
3802 for (i = min; i < max; i++)
3803 {
3804 if (eptr >= md->end_subject)
3805 {
3806 SCHECK_PARTIAL();
3807 break;
3808 }
3809 if (fc == *eptr || foc == *eptr) break;
3810 eptr++;
3811 }
3812 if (possessive) continue;
3813 while (eptr >= pp)
3814 {
3815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3817 eptr--;
3818 }
3819 }
3820
3821 RRETURN(MATCH_NOMATCH);
3822 }
3823 /* Control never gets here */
3824 }
3825
3826 /* Caseful comparisons */
3827
3828 else
3829 {
3830 #ifdef SUPPORT_UTF
3831 if (utf)
3832 {
3833 register unsigned int d;
3834 for (i = 1; i <= min; i++)
3835 {
3836 if (eptr >= md->end_subject)
3837 {
3838 SCHECK_PARTIAL();
3839 RRETURN(MATCH_NOMATCH);
3840 }
3841 GETCHARINC(d, eptr);
3842 if (fc == d) RRETURN(MATCH_NOMATCH);
3843 }
3844 }
3845 else
3846 #endif
3847 /* Not UTF mode */
3848 {
3849 for (i = 1; i <= min; i++)
3850 {
3851 if (eptr >= md->end_subject)
3852 {
3853 SCHECK_PARTIAL();
3854 RRETURN(MATCH_NOMATCH);
3855 }
3856 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3857 }
3858 }
3859
3860 if (min == max) continue;
3861
3862 if (minimize)
3863 {
3864 #ifdef SUPPORT_UTF
3865 if (utf)
3866 {
3867 register unsigned int d;
3868 for (fi = min;; fi++)
3869 {
3870 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3872 if (fi >= max) RRETURN(MATCH_NOMATCH);
3873 if (eptr >= md->end_subject)
3874 {
3875 SCHECK_PARTIAL();
3876 RRETURN(MATCH_NOMATCH);
3877 }
3878 GETCHARINC(d, eptr);
3879 if (fc == d) RRETURN(MATCH_NOMATCH);
3880 }
3881 }
3882 else
3883 #endif
3884 /* Not UTF mode */
3885 {
3886 for (fi = min;; fi++)
3887 {
3888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3890 if (fi >= max) RRETURN(MATCH_NOMATCH);
3891 if (eptr >= md->end_subject)
3892 {
3893 SCHECK_PARTIAL();
3894 RRETURN(MATCH_NOMATCH);
3895 }
3896 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3897 }
3898 }
3899 /* Control never gets here */
3900 }
3901
3902 /* Maximize case */
3903
3904 else
3905 {
3906 pp = eptr;
3907
3908 #ifdef SUPPORT_UTF
3909 if (utf)
3910 {
3911 register unsigned int d;
3912 for (i = min; i < max; i++)
3913 {
3914 int len = 1;
3915 if (eptr >= md->end_subject)
3916 {
3917 SCHECK_PARTIAL();
3918 break;
3919 }
3920 GETCHARLEN(d, eptr, len);
3921 if (fc == d) break;
3922 eptr += len;
3923 }
3924 if (possessive) continue;
3925 for(;;)
3926 {
3927 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3928 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3929 if (eptr-- == pp) break; /* Stop if tried at original pos */
3930 BACKCHAR(eptr);
3931 }
3932 }
3933 else
3934 #endif
3935 /* Not UTF mode */
3936 {
3937 for (i = min; i < max; i++)
3938 {
3939 if (eptr >= md->end_subject)
3940 {
3941 SCHECK_PARTIAL();
3942 break;
3943 }
3944 if (fc == *eptr) break;
3945 eptr++;
3946 }
3947 if (possessive) continue;
3948 while (eptr >= pp)
3949 {
3950 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3952 eptr--;
3953 }
3954 }
3955
3956 RRETURN(MATCH_NOMATCH);
3957 }
3958 }
3959 /* Control never gets here */
3960
3961 /* Match a single character type repeatedly; several different opcodes
3962 share code. This is very similar to the code for single characters, but we
3963 repeat it in the interests of efficiency. */
3964
3965 case OP_TYPEEXACT:
3966 min = max = GET2(ecode, 1);
3967 minimize = TRUE;
3968 ecode += 1 + IMM2_SIZE;
3969 goto REPEATTYPE;
3970
3971 case OP_TYPEUPTO:
3972 case OP_TYPEMINUPTO:
3973 min = 0;
3974 max = GET2(ecode, 1);
3975 minimize = *ecode == OP_TYPEMINUPTO;
3976 ecode += 1 + IMM2_SIZE;
3977 goto REPEATTYPE;
3978
3979 case OP_TYPEPOSSTAR:
3980 possessive = TRUE;
3981 min = 0;
3982 max = INT_MAX;
3983 ecode++;
3984 goto REPEATTYPE;
3985
3986 case OP_TYPEPOSPLUS:
3987 possessive = TRUE;
3988 min = 1;
3989 max = INT_MAX;
3990 ecode++;
3991 goto REPEATTYPE;
3992
3993 case OP_TYPEPOSQUERY:
3994 possessive = TRUE;
3995 min = 0;
3996 max = 1;
3997 ecode++;
3998 goto REPEATTYPE;
3999
4000 case OP_TYPEPOSUPTO:
4001 possessive = TRUE;
4002 min = 0;
4003 max = GET2(ecode, 1);
4004 ecode += 1 + IMM2_SIZE;
4005 goto REPEATTYPE;
4006
4007 case OP_TYPESTAR:
4008 case OP_TYPEMINSTAR:
4009 case OP_TYPEPLUS:
4010 case OP_TYPEMINPLUS:
4011 case OP_TYPEQUERY:
4012 case OP_TYPEMINQUERY:
4013 c = *ecode++ - OP_TYPESTAR;
4014 minimize = (c & 1) != 0;
4015 min = rep_min[c]; /* Pick up values from tables; */
4016 max = rep_max[c]; /* zero for max => infinity */
4017 if (max == 0) max = INT_MAX;
4018
4019 /* Common code for all repeated single character type matches. Note that
4020 in UTF-8 mode, '.' matches a character of any length, but for the other
4021 character types, the valid characters are all one-byte long. */
4022
4023 REPEATTYPE:
4024 ctype = *ecode++; /* Code for the character type */
4025
4026 #ifdef SUPPORT_UCP
4027 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4028 {
4029 prop_fail_result = ctype == OP_NOTPROP;
4030 prop_type = *ecode++;
4031 prop_value = *ecode++;
4032 }
4033 else prop_type = -1;
4034 #endif
4035
4036 /* First, ensure the minimum number of matches are present. Use inline
4037 code for maximizing the speed, and do the type test once at the start
4038 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4039 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4040 and single-bytes. */
4041
4042 if (min > 0)
4043 {
4044 #ifdef SUPPORT_UCP
4045 if (prop_type >= 0)
4046 {
4047 switch(prop_type)
4048 {
4049 case PT_ANY:
4050 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4051 for (i = 1; i <= min; i++)
4052 {
4053 if (eptr >= md->end_subject)
4054 {
4055 SCHECK_PARTIAL();
4056 RRETURN(MATCH_NOMATCH);
4057 }
4058 GETCHARINCTEST(c, eptr);
4059 }
4060 break;
4061
4062 case PT_LAMP:
4063 for (i = 1; i <= min; i++)
4064 {
4065 int chartype;
4066 if (eptr >= md->end_subject)
4067 {
4068 SCHECK_PARTIAL();
4069 RRETURN(MATCH_NOMATCH);
4070 }
4071 GETCHARINCTEST(c, eptr);
4072 chartype = UCD_CHARTYPE(c);
4073 if ((chartype == ucp_Lu ||
4074 chartype == ucp_Ll ||
4075 chartype == ucp_Lt) == prop_fail_result)
4076 RRETURN(MATCH_NOMATCH);
4077 }
4078 break;
4079
4080 case PT_GC:
4081 for (i = 1; i <= min; i++)
4082 {
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 RRETURN(MATCH_NOMATCH);
4087 }
4088 GETCHARINCTEST(c, eptr);
4089 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4090 RRETURN(MATCH_NOMATCH);
4091 }
4092 break;
4093
4094 case PT_PC:
4095 for (i = 1; i <= min; i++)
4096 {
4097 if (eptr >= md->end_subject)
4098 {
4099 SCHECK_PARTIAL();
4100 RRETURN(MATCH_NOMATCH);
4101 }
4102 GETCHARINCTEST(c, eptr);
4103 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4104 RRETURN(MATCH_NOMATCH);
4105 }
4106 break;
4107
4108 case PT_SC:
4109 for (i = 1; i <= min; i++)
4110 {
4111 if (eptr >= md->end_subject)
4112 {
4113 SCHECK_PARTIAL();
4114 RRETURN(MATCH_NOMATCH);
4115 }
4116 GETCHARINCTEST(c, eptr);
4117 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 break;
4121
4122 case PT_ALNUM:
4123 for (i = 1; i <= min; i++)
4124 {
4125 int category;
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 RRETURN(MATCH_NOMATCH);
4130 }
4131 GETCHARINCTEST(c, eptr);
4132 category = UCD_CATEGORY(c);
4133 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4134 RRETURN(MATCH_NOMATCH);
4135 }
4136 break;
4137
4138 case PT_SPACE: /* Perl space */
4139 for (i = 1; i <= min; i++)
4140 {
4141 if (eptr >= md->end_subject)
4142 {
4143 SCHECK_PARTIAL();
4144 RRETURN(MATCH_NOMATCH);
4145 }
4146 GETCHARINCTEST(c, eptr);
4147 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4148 c == CHAR_FF || c == CHAR_CR)
4149 == prop_fail_result)
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 break;
4153
4154 case PT_PXSPACE: /* POSIX space */
4155 for (i = 1; i <= min; i++)
4156 {
4157 if (eptr >= md->end_subject)
4158 {
4159 SCHECK_PARTIAL();
4160 RRETURN(MATCH_NOMATCH);
4161 }
4162 GETCHARINCTEST(c, eptr);
4163 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4164 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4165 == prop_fail_result)
4166 RRETURN(MATCH_NOMATCH);
4167 }
4168 break;
4169
4170 case PT_WORD:
4171 for (i = 1; i <= min; i++)
4172 {
4173 int category;
4174 if (eptr >= md->end_subject)
4175 {
4176 SCHECK_PARTIAL();
4177 RRETURN(MATCH_NOMATCH);
4178 }
4179 GETCHARINCTEST(c, eptr);
4180 category = UCD_CATEGORY(c);
4181 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4182 == prop_fail_result)
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 break;
4186
4187 /* This should not occur */
4188
4189 default:
4190 RRETURN(PCRE_ERROR_INTERNAL);
4191 }
4192 }
4193
4194 /* Match extended Unicode sequences. We will get here only if the
4195 support is in the binary; otherwise a compile-time error occurs. */
4196
4197 else if (ctype == OP_EXTUNI)
4198 {
4199 for (i = 1; i <= min; i++)
4200 {
4201 if (eptr >= md->end_subject)
4202 {
4203 SCHECK_PARTIAL();
4204 RRETURN(MATCH_NOMATCH);
4205 }
4206 GETCHARINCTEST(c, eptr);
4207 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4208 while (eptr < md->end_subject)
4209 {
4210 int len = 1;
4211 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4212 if (UCD_CATEGORY(c) != ucp_M) break;
4213 eptr += len;
4214 }
4215 CHECK_PARTIAL();
4216 }
4217 }
4218
4219 else
4220 #endif /* SUPPORT_UCP */
4221
4222 /* Handle all other cases when the coding is UTF-8 */
4223
4224 #ifdef SUPPORT_UTF
4225 if (utf) switch(ctype)
4226 {
4227 case OP_ANY:
4228 for (i = 1; i <= min; i++)
4229 {
4230 if (eptr >= md->end_subject)
4231 {
4232 SCHECK_PARTIAL();
4233 RRETURN(MATCH_NOMATCH);
4234 }
4235 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4236 eptr++;
4237 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4238 }
4239 break;
4240
4241 case OP_ALLANY:
4242 for (i = 1; i <= min; i++)
4243 {
4244 if (eptr >= md->end_subject)
4245 {
4246 SCHECK_PARTIAL();
4247 RRETURN(MATCH_NOMATCH);
4248 }
4249 eptr++;
4250 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4251 }
4252 break;
4253
4254 case OP_ANYBYTE:
4255 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4256 eptr += min;
4257 break;
4258
4259 case OP_ANYNL:
4260 for (i = 1; i <= min; i++)
4261 {
4262 if (eptr >= md->end_subject)
4263 {
4264 SCHECK_PARTIAL();
4265 RRETURN(MATCH_NOMATCH);
4266 }
4267 GETCHARINC(c, eptr);
4268 switch(c)
4269 {
4270 default: RRETURN(MATCH_NOMATCH);
4271
4272 case 0x000d:
4273 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4274 break;
4275
4276 case 0x000a:
4277 break;
4278
4279 case 0x000b:
4280 case 0x000c:
4281 case 0x0085:
4282 case 0x2028:
4283 case 0x2029:
4284 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4285 break;
4286 }
4287 }
4288 break;
4289
4290 case OP_NOT_HSPACE:
4291 for (i = 1; i <= min; i++)
4292 {
4293 if (eptr >= md->end_subject)
4294 {
4295 SCHECK_PARTIAL();
4296 RRETURN(MATCH_NOMATCH);
4297 }
4298 GETCHARINC(c, eptr);
4299 switch(c)
4300 {
4301 default: break;
4302 case 0x09: /* HT */
4303 case 0x20: /* SPACE */
4304 case 0xa0: /* NBSP */
4305 case 0x1680: /* OGHAM SPACE MARK */
4306 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4307 case 0x2000: /* EN QUAD */
4308 case 0x2001: /* EM QUAD */
4309 case 0x2002: /* EN SPACE */
4310 case 0x2003: /* EM SPACE */
4311 case 0x2004: /* THREE-PER-EM SPACE */
4312 case 0x2005: /* FOUR-PER-EM SPACE */
4313 case 0x2006: /* SIX-PER-EM SPACE */
4314 case 0x2007: /* FIGURE SPACE */
4315 case 0x2008: /* PUNCTUATION SPACE */
4316 case 0x2009: /* THIN SPACE */
4317 case 0x200A: /* HAIR SPACE */
4318 case 0x202f: /* NARROW NO-BREAK SPACE */
4319 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4320 case 0x3000: /* IDEOGRAPHIC SPACE */
4321 RRETURN(MATCH_NOMATCH);
4322 }
4323 }
4324 break;
4325
4326 case OP_HSPACE:
4327 for (i = 1; i <= min; i++)
4328 {
4329 if (eptr >= md->end_subject)
4330 {
4331 SCHECK_PARTIAL();
4332 RRETURN(MATCH_NOMATCH);
4333 }
4334 GETCHARINC(c, eptr);
4335 switch(c)
4336 {
4337 default: RRETURN(MATCH_NOMATCH);
4338 case 0x09: /* HT */
4339 case 0x20: /* SPACE */
4340 case 0xa0: /* NBSP */
4341 case 0x1680: /* OGHAM SPACE MARK */
4342 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4343 case 0x2000: /* EN QUAD */
4344 case 0x2001: /* EM QUAD */
4345 case 0x2002: /* EN SPACE */
4346 case 0x2003: /* EM SPACE */
4347 case 0x2004: /* THREE-PER-EM SPACE */
4348 case 0x2005: /* FOUR-PER-EM SPACE */
4349 case 0x2006: /* SIX-PER-EM SPACE */
4350 case 0x2007: /* FIGURE SPACE */
4351 case 0x2008: /* PUNCTUATION SPACE */
4352 case 0x2009: /* THIN SPACE */
4353 case 0x200A: /* HAIR SPACE */
4354 case 0x202f: /* NARROW NO-BREAK SPACE */
4355 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4356 case 0x3000: /* IDEOGRAPHIC SPACE */
4357 break;
4358 }
4359 }
4360 break;
4361
4362 case OP_NOT_VSPACE:
4363 for (i = 1; i <= min; i++)
4364 {
4365 if (eptr >= md->end_subject)
4366 {
4367 SCHECK_PARTIAL();
4368 RRETURN(MATCH_NOMATCH);
4369 }
4370 GETCHARINC(c, eptr);
4371 switch(c)
4372 {
4373 default: break;
4374 case 0x0a: /* LF */
4375 case 0x0b: /* VT */
4376 case 0x0c: /* FF */
4377 case 0x0d: /* CR */
4378 case 0x85: /* NEL */
4379 case 0x2028: /* LINE SEPARATOR */
4380 case 0x2029: /* PARAGRAPH SEPARATOR */
4381 RRETURN(MATCH_NOMATCH);
4382 }
4383 }
4384 break;
4385
4386 case OP_VSPACE:
4387 for (i = 1; i <= min; i++)
4388 {
4389 if (eptr >= md->end_subject)
4390 {
4391 SCHECK_PARTIAL();
4392 RRETURN(MATCH_NOMATCH);
4393 }
4394 GETCHARINC(c, eptr);
4395 switch(c)
4396 {
4397 default: RRETURN(MATCH_NOMATCH);
4398 case 0x0a: /* LF */
4399 case 0x0b: /* VT */
4400 case 0x0c: /* FF */
4401 case 0x0d: /* CR */
4402 case 0x85: /* NEL */
4403 case 0x2028: /* LINE SEPARATOR */
4404 case 0x2029: /* PARAGRAPH SEPARATOR */
4405 break;
4406 }
4407 }
4408 break;
4409
4410 case OP_NOT_DIGIT:
4411 for (i = 1; i <= min; i++)
4412 {
4413 if (eptr >= md->end_subject)
4414 {
4415 SCHECK_PARTIAL();
4416 RRETURN(MATCH_NOMATCH);
4417 }
4418 GETCHARINC(c, eptr);
4419 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4420 RRETURN(MATCH_NOMATCH);
4421 }
4422 break;
4423
4424 case OP_DIGIT:
4425 for (i = 1; i <= min; i++)
4426 {
4427 if (eptr >= md->end_subject)
4428 {
4429 SCHECK_PARTIAL();
4430 RRETURN(MATCH_NOMATCH);
4431 }
4432 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4433 RRETURN(MATCH_NOMATCH);
4434 eptr++;
4435 /* No need to skip more bytes - we know it's a 1-byte character */
4436 }
4437 break;
4438
4439 case OP_NOT_WHITESPACE:
4440 for (i = 1; i <= min; i++)
4441 {
4442 if (eptr >= md->end_subject)
4443 {
4444 SCHECK_PARTIAL();
4445 RRETURN(MATCH_NOMATCH);
4446 }
4447 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4448 RRETURN(MATCH_NOMATCH);
4449 eptr++;
4450 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4451 }
4452 break;
4453
4454 case OP_WHITESPACE:
4455 for (i = 1; i <= min; i++)
4456 {
4457 if (eptr >= md->end_subject)
4458 {
4459 SCHECK_PARTIAL();
4460 RRETURN(MATCH_NOMATCH);
4461 }
4462 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4463 RRETURN(MATCH_NOMATCH);
4464 eptr++;
4465 /* No need to skip more bytes - we know it's a 1-byte character */
4466 }
4467 break;
4468
4469 case OP_NOT_WORDCHAR:
4470 for (i = 1; i <= min; i++)
4471 {
4472 if (eptr >= md->end_subject)
4473 {
4474 SCHECK_PARTIAL();
4475 RRETURN(MATCH_NOMATCH);
4476 }
4477 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4478 RRETURN(MATCH_NOMATCH);
4479 eptr++;
4480 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4481 }
4482 break;
4483
4484 case OP_WORDCHAR:
4485 for (i = 1; i <= min; i++)
4486 {
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 RRETURN(MATCH_NOMATCH);
4491 }
4492 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4493 RRETURN(MATCH_NOMATCH);
4494 eptr++;
4495 /* No need to skip more bytes - we know it's a 1-byte character */
4496 }
4497 break;
4498
4499 default:
4500 RRETURN(PCRE_ERROR_INTERNAL);
4501 } /* End switch(ctype) */
4502
4503 else
4504 #endif /* SUPPORT_UTF */
4505
4506 /* Code for the non-UTF-8 case for minimum matching of operators other
4507 than OP_PROP and OP_NOTPROP. */
4508
4509 switch(ctype)
4510 {
4511 case OP_ANY:
4512 for (i = 1; i <= min; i++)
4513 {
4514 if (eptr >= md->end_subject)
4515 {
4516 SCHECK_PARTIAL();
4517 RRETURN(MATCH_NOMATCH);
4518 }
4519 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4520 eptr++;
4521 }
4522 break;
4523
4524 case OP_ALLANY:
4525 if (eptr > md->end_subject - min)
4526 {
4527 SCHECK_PARTIAL();
4528 RRETURN(MATCH_NOMATCH);
4529 }
4530 eptr += min;
4531 break;
4532
4533 case OP_ANYBYTE:
4534 if (eptr > md->end_subject - min)
4535 {
4536 SCHECK_PARTIAL();
4537 RRETURN(MATCH_NOMATCH);
4538 }
4539 eptr += min;
4540 break;
4541
4542 case OP_ANYNL:
4543 for (i = 1; i <= min; i++)
4544 {
4545 if (eptr >= md->end_subject)
4546 {
4547 SCHECK_PARTIAL();
4548 RRETURN(MATCH_NOMATCH);
4549 }
4550 switch(*eptr++)
4551 {
4552 default: RRETURN(MATCH_NOMATCH);
4553
4554 case 0x000d:
4555 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4556 break;
4557
4558 case 0x000a:
4559 break;
4560
4561 case 0x000b:
4562 case 0x000c:
4563 case 0x0085:
4564 #ifdef COMPILE_PCRE16
4565 case 0x2028:
4566 case 0x2029:
4567 #endif
4568 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4569 break;
4570 }
4571 }
4572 break;
4573
4574 case OP_NOT_HSPACE:
4575 for (i = 1; i <= min; i++)
4576 {
4577 if (eptr >= md->end_subject)
4578 {
4579 SCHECK_PARTIAL();
4580 RRETURN(MATCH_NOMATCH);
4581 }
4582 switch(*eptr++)
4583 {
4584 default: break;
4585 case 0x09: /* HT */
4586 case 0x20: /* SPACE */
4587 case 0xa0: /* NBSP */
4588 #ifdef COMPILE_PCRE16
4589 case 0x1680: /* OGHAM SPACE MARK */
4590 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4591 case 0x2000: /* EN QUAD */
4592 case 0x2001: /* EM QUAD */
4593 case 0x2002: /* EN SPACE */
4594 case 0x2003: /* EM SPACE */
4595 case 0x2004: /* THREE-PER-EM SPACE */
4596 case 0x2005: /* FOUR-PER-EM SPACE */
4597 case 0x2006: /* SIX-PER-EM SPACE */
4598 case 0x2007: /* FIGURE SPACE */
4599 case 0x2008: /* PUNCTUATION SPACE */
4600 case 0x2009: /* THIN SPACE */
4601 case 0x200A: /* HAIR SPACE */
4602 case 0x202f: /* NARROW NO-BREAK SPACE */
4603 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4604 case 0x3000: /* IDEOGRAPHIC SPACE */
4605 #endif
4606 RRETURN(MATCH_NOMATCH);
4607 }
4608 }
4609 break;
4610
4611 case OP_HSPACE:
4612 for (i = 1; i <= min; i++)
4613 {
4614 if (eptr >= md->end_subject)
4615 {
4616 SCHECK_PARTIAL();
4617 RRETURN(MATCH_NOMATCH);
4618 }
4619 switch(*eptr++)
4620 {
4621 default: RRETURN(MATCH_NOMATCH);
4622 case 0x09: /* HT */
4623 case 0x20: /* SPACE */
4624 case 0xa0: /* NBSP */
4625 #ifdef COMPILE_PCRE16
4626 case 0x1680: /* OGHAM SPACE MARK */
4627 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4628 case 0x2000: /* EN QUAD */
4629 case 0x2001: /* EM QUAD */
4630 case 0x2002: /* EN SPACE */
4631 case 0x2003: /* EM SPACE */
4632 case 0x2004: /* THREE-PER-EM SPACE */
4633 case 0x2005: /* FOUR-PER-EM SPACE */
4634 case 0x2006: /* SIX-PER-EM SPACE */
4635 case 0x2007: /* FIGURE SPACE */
4636 case 0x2008: /* PUNCTUATION SPACE */
4637 case 0x2009: /* THIN SPACE */
4638 case 0x200A: /* HAIR SPACE */
4639 case 0x202f: /* NARROW NO-BREAK SPACE */
4640 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4641 case 0x3000: /* IDEOGRAPHIC SPACE */
4642 #endif
4643 break;
4644 }
4645 }
4646 break;
4647
4648 case OP_NOT_VSPACE:
4649 for (i = 1; i <= min; i++)
4650 {
4651 if (eptr >= md->end_subject)
4652 {
4653 SCHECK_PARTIAL();
4654 RRETURN(MATCH_NOMATCH);
4655 }
4656 switch(*eptr++)
4657 {
4658 default: break;
4659 case 0x0a: /* LF */
4660 case 0x0b: /* VT */
4661 case 0x0c: /* FF */
4662 case 0x0d: /* CR */
4663 case 0x85: /* NEL */
4664 #ifdef COMPILE_PCRE16
4665 case 0x2028: /* LINE SEPARATOR */
4666 case 0x2029: /* PARAGRAPH SEPARATOR */
4667 #endif
4668 RRETURN(MATCH_NOMATCH);
4669 }
4670 }
4671 break;
4672
4673 case OP_VSPACE:
4674 for (i = 1; i <= min; i++)
4675 {
4676 if (eptr >= md->end_subject)
4677 {
4678 SCHECK_PARTIAL();
4679 RRETURN(MATCH_NOMATCH);
4680 }
4681 switch(*eptr++)
4682 {
4683 default: RRETURN(MATCH_NOMATCH);
4684 case 0x0a: /* LF */
4685 case 0x0b: /* VT */
4686 case 0x0c: /* FF */
4687 case 0x0d: /* CR */
4688 case 0x85: /* NEL */
4689 #ifdef COMPILE_PCRE16
4690 case 0x2028: /* LINE SEPARATOR */
4691 case 0x2029: /* PARAGRAPH SEPARATOR */
4692 #endif
4693 break;
4694 }
4695 }
4696 break;
4697
4698 case OP_NOT_DIGIT:
4699 for (i = 1; i <= min; i++)
4700 {
4701 if (eptr >= md->end_subject)
4702 {
4703 SCHECK_PARTIAL();
4704 RRETURN(MATCH_NOMATCH);
4705 }
4706 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4707 RRETURN(MATCH_NOMATCH);
4708 eptr++;
4709 }
4710 break;
4711
4712 case OP_DIGIT:
4713 for (i = 1; i <= min; i++)
4714 {
4715 if (eptr >= md->end_subject)
4716 {
4717 SCHECK_PARTIAL();
4718 RRETURN(MATCH_NOMATCH);
4719 }
4720 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4721 RRETURN(MATCH_NOMATCH);
4722 eptr++;
4723 }
4724 break;
4725
4726 case OP_NOT_WHITESPACE:
4727 for (i = 1; i <= min; i++)
4728 {
4729 if (eptr >= md->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4733 }
4734 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4735 RRETURN(MATCH_NOMATCH);
4736 eptr++;
4737 }
4738 break;
4739
4740 case OP_WHITESPACE:
4741 for (i = 1; i <= min; i++)
4742 {
4743 if (eptr >= md->end_subject)
4744 {
4745 SCHECK_PARTIAL();
4746 RRETURN(MATCH_NOMATCH);
4747 }
4748 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4749 RRETURN(MATCH_NOMATCH);
4750 eptr++;
4751 }
4752 break;
4753
4754 case OP_NOT_WORDCHAR:
4755 for (i = 1; i <= min; i++)
4756 {
4757 if (eptr >= md->end_subject)
4758 {
4759 SCHECK_PARTIAL();
4760 RRETURN(MATCH_NOMATCH);
4761 }
4762 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4763 RRETURN(MATCH_NOMATCH);
4764 eptr++;
4765 }
4766 break;
4767
4768 case OP_WORDCHAR:
4769 for (i = 1; i <= min; i++)
4770 {
4771 if (eptr >= md->end_subject)
4772 {
4773 SCHECK_PARTIAL();
4774 RRETURN(MATCH_NOMATCH);
4775 }
4776 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4777 RRETURN(MATCH_NOMATCH);
4778 eptr++;
4779 }
4780 break;
4781
4782 default:
4783 RRETURN(PCRE_ERROR_INTERNAL);
4784 }
4785 }
4786
4787 /* If min = max, continue at the same level without recursing */
4788
4789 if (min == max) continue;
4790
4791 /* If minimizing, we have to test the rest of the pattern before each
4792 subsequent match. Again, separate the UTF-8 case for speed, and also
4793 separate the UCP cases. */
4794
4795 if (minimize)
4796 {
4797 #ifdef SUPPORT_UCP
4798 if (prop_type >= 0)
4799 {
4800 switch(prop_type)
4801 {
4802 case PT_ANY:
4803 for (fi = min;; fi++)
4804 {
4805 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4807 if (fi >= max) RRETURN(MATCH_NOMATCH);
4808 if (eptr >= md->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 RRETURN(MATCH_NOMATCH);
4812 }
4813 GETCHARINCTEST(c, eptr);
4814 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4815 }
4816 /* Control never gets here */
4817
4818 case PT_LAMP:
4819 for (fi = min;; fi++)
4820 {
4821 int chartype;
4822 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4824 if (fi >= max) RRETURN(MATCH_NOMATCH);
4825 if (eptr >= md->end_subject)
4826 {
4827 SCHECK_PARTIAL();
4828 RRETURN(MATCH_NOMATCH);
4829 }
4830 GETCHARINCTEST(c, eptr);
4831 chartype = UCD_CHARTYPE(c);
4832 if ((chartype == ucp_Lu ||
4833 chartype == ucp_Ll ||
4834 chartype == ucp_Lt) == prop_fail_result)
4835 RRETURN(MATCH_NOMATCH);
4836 }
4837 /* Control never gets here */
4838
4839 case PT_GC:
4840 for (fi = min;; fi++)
4841 {
4842 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4844 if (fi >= max) RRETURN(MATCH_NOMATCH);
4845 if (eptr >= md->end_subject)
4846 {
4847 SCHECK_PARTIAL();
4848 RRETURN(MATCH_NOMATCH);
4849 }
4850 GETCHARINCTEST(c, eptr);
4851 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4852 RRETURN(MATCH_NOMATCH);
4853 }
4854 /* Control never gets here */
4855
4856 case PT_PC:
4857 for (fi = min;; fi++)
4858 {
4859 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4861 if (fi >= max) RRETURN(MATCH_NOMATCH);
4862 if (eptr >= md->end_subject)
4863 {
4864 SCHECK_PARTIAL();
4865 RRETURN(MATCH_NOMATCH);
4866 }
4867 GETCHARINCTEST(c, eptr);
4868 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4869 RRETURN(MATCH_NOMATCH);
4870 }
4871 /* Control never gets here */
4872
4873 case PT_SC:
4874 for (fi = min;; fi++)
4875 {
4876 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4878 if (fi >= max) RRETURN(MATCH_NOMATCH);
4879 if (eptr >= md->end_subject)
4880 {
4881 SCHECK_PARTIAL();
4882 RRETURN(MATCH_NOMATCH);
4883 }
4884 GETCHARINCTEST(c, eptr);
4885 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4886 RRETURN(MATCH_NOMATCH);
4887 }
4888 /* Control never gets here */
4889
4890 case PT_ALNUM:
4891 for (fi = min;; fi++)
4892 {
4893 int category;
4894 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4896 if (fi >= max) RRETURN(MATCH_NOMATCH);
4897 if (eptr >= md->end_subject)
4898 {
4899 SCHECK_PARTIAL();
4900 RRETURN(MATCH_NOMATCH);
4901 }
4902 GETCHARINCTEST(c, eptr);
4903 category = UCD_CATEGORY(c);
4904 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4905 RRETURN(MATCH_NOMATCH);
4906 }
4907 /* Control never gets here */
4908
4909 case PT_SPACE: /* Perl space */
4910 for (fi = min;; fi++)
4911 {
4912 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4913 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4914 if (fi >= max) RRETURN(MATCH_NOMATCH);
4915 if (eptr >= md->end_subject)
4916 {
4917 SCHECK_PARTIAL();
4918 RRETURN(MATCH_NOMATCH);
4919 }
4920 GETCHARINCTEST(c, eptr);
4921 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4922 c == CHAR_FF || c == CHAR_CR)
4923 == prop_fail_result)
4924 RRETURN(MATCH_NOMATCH);
4925 }
4926 /* Control never gets here */
4927
4928 case PT_PXSPACE: /* POSIX space */
4929 for (fi = min;; fi++)
4930 {
4931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4933 if (fi >= max) RRETURN(MATCH_NOMATCH);
4934 if (eptr >= md->end_subject)
4935 {
4936 SCHECK_PARTIAL();
4937 RRETURN(MATCH_NOMATCH);
4938 }
4939 GETCHARINCTEST(c, eptr);
4940 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4941 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4942 == prop_fail_result)
4943 RRETURN(MATCH_NOMATCH);
4944 }
4945 /* Control never gets here */
4946
4947 case PT_WORD:
4948 for (fi = min;; fi++)
4949 {
4950 int category;
4951 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4953 if (fi >= max) RRETURN(MATCH_NOMATCH);
4954 if (eptr >= md->end_subject)
4955 {
4956 SCHECK_PARTIAL();
4957 RRETURN(MATCH_NOMATCH);
4958 }
4959 GETCHARINCTEST(c, eptr);
4960 category = UCD_CATEGORY(c);
4961 if ((category == ucp_L ||
4962 category == ucp_N ||
4963 c == CHAR_UNDERSCORE)
4964 == prop_fail_result)
4965 RRETURN(MATCH_NOMATCH);
4966 }
4967 /* Control never gets here */
4968
4969 /* This should never occur */
4970
4971 default:
4972 RRETURN(PCRE_ERROR_INTERNAL);
4973 }
4974 }
4975
4976 /* Match extended Unicode sequences. We will get here only if the
4977 support is in the binary; otherwise a compile-time error occurs. */
4978
4979 else if (ctype == OP_EXTUNI)
4980 {
4981 for (fi = min;; fi++)
4982 {
4983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4985 if (fi >= max) RRETURN(MATCH_NOMATCH);
4986 if (eptr >= md->end_subject)
4987 {
4988 SCHECK_PARTIAL();
4989 RRETURN(MATCH_NOMATCH);
4990 }
4991 GETCHARINCTEST(c, eptr);
4992 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4993 while (eptr < md->end_subject)
4994 {
4995 int len = 1;
4996 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4997 if (UCD_CATEGORY(c) != ucp_M) break;
4998 eptr += len;
4999 }
5000 CHECK_PARTIAL();
5001 }
5002 }
5003 else
5004 #endif /* SUPPORT_UCP */
5005
5006 #ifdef SUPPORT_UTF
5007 if (utf)
5008 {
5009 for (fi = min;; fi++)
5010 {
5011 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5013 if (fi >= max) RRETURN(MATCH_NOMATCH);
5014 if (eptr >= md->end_subject)
5015 {
5016 SCHECK_PARTIAL();
5017 RRETURN(MATCH_NOMATCH);
5018 }
5019 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5020 RRETURN(MATCH_NOMATCH);
5021 GETCHARINC(c, eptr);
5022 switch(ctype)
5023 {
5024 case OP_ANY: /* This is the non-NL case */
5025 case OP_ALLANY:
5026 case OP_ANYBYTE:
5027 break;
5028
5029 case OP_ANYNL:
5030 switch(c)
5031 {
5032 default: RRETURN(MATCH_NOMATCH);
5033 case 0x000d:
5034 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5035 break;
5036 case 0x000a:
5037 break;
5038
5039 case 0x000b:
5040 case 0x000c:
5041 case 0x0085:
5042 case 0x2028:
5043 case 0x2029:
5044 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5045 break;
5046 }
5047 break;
5048
5049 case OP_NOT_HSPACE:
5050 switch(c)
5051 {
5052 default: break;
5053 case 0x09: /* HT */
5054 case 0x20: /* SPACE */
5055 case 0xa0: /* NBSP */
5056 case 0x1680: /* OGHAM SPACE MARK */
5057 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5058 case 0x2000: /* EN QUAD */
5059 case 0x2001: /* EM QUAD */
5060 case 0x2002: /* EN SPACE */
5061 case 0x2003: /* EM SPACE */
5062 case 0x2004: /* THREE-PER-EM SPACE */
5063 case 0x2005: /* FOUR-PER-EM SPACE */
5064 case 0x2006: /* SIX-PER-EM SPACE */
5065 case 0x2007: /* FIGURE SPACE */
5066 case 0x2008: /* PUNCTUATION SPACE */
5067 case 0x2009: /* THIN SPACE */
5068 case 0x200A: /* HAIR SPACE */
5069 case 0x202f: /* NARROW NO-BREAK SPACE */
5070 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5071 case 0x3000: /* IDEOGRAPHIC SPACE */
5072 RRETURN(MATCH_NOMATCH);
5073 }
5074 break;
5075
5076 case OP_HSPACE:
5077 switch(c)
5078 {
5079 default: RRETURN(MATCH_NOMATCH);
5080 case 0x09: /* HT */
5081 case 0x20: /* SPACE */
5082 case 0xa0: /* NBSP */
5083 case 0x1680: /* OGHAM SPACE MARK */
5084 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5085 case 0x2000: /* EN QUAD */
5086 case 0x2001: /* EM QUAD */
5087 case 0x2002: /* EN SPACE */
5088 case 0x2003: /* EM SPACE */
5089 case 0x2004: /* THREE-PER-EM SPACE */
5090 case 0x2005: /* FOUR-PER-EM SPACE */
5091 case 0x2006: /* SIX-PER-EM SPACE */
5092 case 0x2007: /* FIGURE SPACE */
5093 case 0x2008: /* PUNCTUATION SPACE */
5094 case 0x2009: /* THIN SPACE */
5095 case 0x200A: /* HAIR SPACE */
5096 case 0x202f: /* NARROW NO-BREAK SPACE */
5097 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5098 case 0x3000: /* IDEOGRAPHIC SPACE */
5099 break;
5100 }
5101 break;
5102
5103 case OP_NOT_VSPACE:
5104 switch(c)
5105 {
5106 default: break;
5107 case 0x0a: /* LF */
5108 case 0x0b: /* VT */
5109 case 0x0c: /* FF */
5110 case 0x0d: /* CR */
5111 case 0x85: /* NEL */
5112 case 0x2028: /* LINE SEPARATOR */
5113 case 0x2029: /* PARAGRAPH SEPARATOR */
5114 RRETURN(MATCH_NOMATCH);
5115 }
5116 break;
5117
5118 case OP_VSPACE:
5119 switch(c)
5120 {
5121 default: RRETURN(MATCH_NOMATCH);
5122 case 0x0a: /* LF */
5123 case 0x0b: /* VT */
5124 case 0x0c: /* FF */
5125 case 0x0d: /* CR */
5126 case 0x85: /* NEL */
5127 case 0x2028: /* LINE SEPARATOR */
5128 case 0x2029: /* PARAGRAPH SEPARATOR */
5129 break;
5130 }
5131 break;
5132
5133 case OP_NOT_DIGIT:
5134 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5135 RRETURN(MATCH_NOMATCH);
5136 break;
5137
5138 case OP_DIGIT:
5139 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5140 RRETURN(MATCH_NOMATCH);
5141 break;
5142
5143 case OP_NOT_WHITESPACE:
5144 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5145 RRETURN(MATCH_NOMATCH);
5146 break;
5147
5148 case OP_WHITESPACE:
5149 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5150 RRETURN(MATCH_NOMATCH);
5151 break;
5152
5153 case OP_NOT_WORDCHAR:
5154 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5155 RRETURN(MATCH_NOMATCH);
5156 break;
5157
5158 case OP_WORDCHAR:
5159 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5160 RRETURN(MATCH_NOMATCH);
5161 break;
5162
5163 default:
5164 RRETURN(PCRE_ERROR_INTERNAL);
5165 }
5166 }
5167 }
5168 else
5169 #endif
5170 /* Not UTF mode */
5171 {
5172 for (fi = min;; fi++)
5173 {
5174 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5175 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5176 if (fi >= max) RRETURN(MATCH_NOMATCH);
5177 if (eptr >= md->end_subject)
5178 {
5179 SCHECK_PARTIAL();
5180 RRETURN(MATCH_NOMATCH);
5181 }
5182 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5183 RRETURN(MATCH_NOMATCH);
5184 c = *eptr++;
5185 switch(ctype)
5186 {
5187 case OP_ANY: /* This is the non-NL case */
5188 case OP_ALLANY:
5189 case OP_ANYBYTE:
5190 break;
5191
5192 case OP_ANYNL:
5193 switch(c)
5194 {
5195 default: RRETURN(MATCH_NOMATCH);
5196 case 0x000d:
5197 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5198 break;
5199
5200 case 0x000a:
5201 break;
5202
5203 case 0x000b:
5204 case 0x000c:
5205 case 0x0085:
5206 #ifdef COMPILE_PCRE16
5207 case 0x2028:
5208 case 0x2029:
5209 #endif
5210 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5211 break;
5212 }
5213 break;
5214
5215 case OP_NOT_HSPACE:
5216 switch(c)
5217 {
5218 default: break;
5219 case 0x09: /* HT */
5220 case 0x20: /* SPACE */
5221 case 0xa0: /* NBSP */
5222 #ifdef COMPILE_PCRE16
5223 case 0x1680: /* OGHAM SPACE MARK */
5224 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5225 case 0x2000: /* EN QUAD */
5226 case 0x2001: /* EM QUAD */
5227 case 0x2002: /* EN SPACE */
5228 case 0x2003: /* EM SPACE */
5229 case 0x2004: /* THREE-PER-EM SPACE */
5230 case 0x2005: /* FOUR-PER-EM SPACE */
5231 case 0x2006: /* SIX-PER-EM SPACE */
5232 case 0x2007: /* FIGURE SPACE */
5233 case 0x2008: /* PUNCTUATION SPACE */
5234 case 0x2009: /* THIN SPACE */
5235 case 0x200A: /* HAIR SPACE */
5236 case 0x202f: /* NARROW NO-BREAK SPACE */
5237 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5238 case 0x3000: /* IDEOGRAPHIC SPACE */
5239 #endif
5240 RRETURN(MATCH_NOMATCH);
5241 }
5242 break;
5243
5244 case OP_HSPACE:
5245 switch(c)
5246 {
5247 default: RRETURN(MATCH_NOMATCH);
5248 case 0x09: /* HT */
5249 case 0x20: /* SPACE */
5250 case 0xa0: /* NBSP */
5251 #ifdef COMPILE_PCRE16
5252 case 0x1680: /* OGHAM SPACE MARK */
5253 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5254 case 0x2000: /* EN QUAD */
5255 case 0x2001: /* EM QUAD */
5256 case 0x2002: /* EN SPACE */
5257 case 0x2003: /* EM SPACE */
5258 case 0x2004: /* THREE-PER-EM SPACE */
5259 case 0x2005: /* FOUR-PER-EM SPACE */
5260 case 0x2006: /* SIX-PER-EM SPACE */
5261 case 0x2007: /* FIGURE SPACE */
5262 case 0x2008: /* PUNCTUATION SPACE */
5263 case 0x2009: /* THIN SPACE */
5264 case 0x200A: /* HAIR SPACE */
5265 case 0x202f: /* NARROW NO-BREAK SPACE */
5266 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5267 case 0x3000: /* IDEOGRAPHIC SPACE */
5268 #endif
5269 break;
5270 }
5271 break;
5272
5273 case OP_NOT_VSPACE:
5274 switch(c)
5275 {
5276 default: break;
5277 case 0x0a: /* LF */
5278 case 0x0b: /* VT */
5279 case 0x0c: /* FF */
5280 case 0x0d: /* CR */
5281 case 0x85: /* NEL */
5282 #ifdef COMPILE_PCRE16
5283 case 0x2028: /* LINE SEPARATOR */
5284 case 0x2029: /* PARAGRAPH SEPARATOR */
5285 #endif
5286 RRETURN(MATCH_NOMATCH);
5287 }
5288 break;
5289
5290 case OP_VSPACE:
5291 switch(c)
5292 {
5293 default: RRETURN(MATCH_NOMATCH);
5294 case 0x0a: /* LF */
5295 case 0x0b: /* VT */
5296 case 0x0c: /* FF */
5297 case 0x0d: /* CR */
5298 case 0x85: /* NEL */
5299 #ifdef COMPILE_PCRE16
5300 case 0x2028: /* LINE SEPARATOR */
5301 case 0x2029: /* PARAGRAPH SEPARATOR */
5302 #endif
5303 break;
5304 }
5305 break;
5306
5307 case OP_NOT_DIGIT:
5308 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5309 break;
5310
5311 case OP_DIGIT:
5312 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5313 break;
5314
5315 case OP_NOT_WHITESPACE:
5316 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5317 break;
5318
5319 case OP_WHITESPACE:
5320 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5321 break;
5322
5323 case OP_NOT_WORDCHAR:
5324 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5325 break;
5326
5327 case OP_WORDCHAR:
5328 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5329 break;
5330
5331 default:
5332 RRETURN(PCRE_ERROR_INTERNAL);
5333 }
5334 }
5335 }
5336 /* Control never gets here */
5337 }
5338
5339 /* If maximizing, it is worth using inline code for speed, doing the type
5340 test once at the start (i.e. keep it out of the loop). Again, keep the
5341 UTF-8 and UCP stuff separate. */
5342
5343 else
5344 {
5345 pp = eptr; /* Remember where we started */
5346
5347 #ifdef SUPPORT_UCP
5348 if (prop_type >= 0)
5349 {
5350 switch(prop_type)
5351 {
5352 case PT_ANY:
5353 for (i = min; i < max; i++)
5354 {
5355 int len = 1;
5356 if (eptr >= md->end_subject)
5357 {
5358 SCHECK_PARTIAL();
5359 break;
5360 }
5361 GETCHARLENTEST(c, eptr, len);
5362 if (prop_fail_result) break;
5363 eptr+= len;
5364 }
5365 break;
5366
5367 case PT_LAMP:
5368 for (i = min; i < max; i++)
5369 {
5370 int chartype;
5371 int len = 1;
5372 if (eptr >= md->end_subject)
5373 {
5374 SCHECK_PARTIAL();
5375 break;
5376 }
5377 GETCHARLENTEST(c, eptr, len);
5378 chartype = UCD_CHARTYPE(c);
5379 if ((chartype == ucp_Lu ||
5380 chartype == ucp_Ll ||
5381 chartype == ucp_Lt) == prop_fail_result)
5382 break;
5383 eptr+= len;
5384 }
5385 break;
5386
5387 case PT_GC:
5388 for (i = min; i < max; i++)
5389 {
5390 int len = 1;
5391 if (eptr >= md->end_subject)
5392 {
5393 SCHECK_PARTIAL();
5394 break;
5395 }
5396 GETCHARLENTEST(c, eptr, len);
5397 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5398 eptr+= len;
5399 }
5400 break;
5401
5402 case PT_PC:
5403 for (i = min; i < max; i++)
5404 {
5405 int len = 1;
5406 if (eptr >= md->end_subject)
5407 {
5408 SCHECK_PARTIAL();
5409 break;
5410 }
5411 GETCHARLENTEST(c, eptr, len);
5412 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5413 eptr+= len;
5414 }
5415 break;
5416
5417 case PT_SC:
5418 for (i = min; i < max; i++)
5419 {
5420 int len = 1;
5421 if (eptr >= md->end_subject)
5422 {
5423 SCHECK_PARTIAL();
5424 break;
5425 }
5426 GETCHARLENTEST(c, eptr, len);
5427 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5428 eptr+= len;
5429 }
5430 break;
5431
5432 case PT_ALNUM:
5433 for (i = min; i < max; i++)
5434 {
5435 int category;
5436 int len = 1;
5437 if (eptr >= md->end_subject)
5438 {
5439 SCHECK_PARTIAL();
5440 break;
5441 }
5442 GETCHARLENTEST(c, eptr, len);
5443 category = UCD_CATEGORY(c);
5444 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5445 break;
5446 eptr+= len;
5447 }
5448 break;
5449
5450 case PT_SPACE: /* Perl space */
5451 for (i = min; i < max; i++)
5452 {
5453 int len = 1;
5454 if (eptr >= md->end_subject)
5455 {
5456 SCHECK_PARTIAL();
5457 break;
5458 }
5459 GETCHARLENTEST(c, eptr, len);
5460 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5461 c == CHAR_FF || c == CHAR_CR)
5462 == prop_fail_result)
5463 break;
5464 eptr+= len;
5465 }
5466 break;
5467
5468 case PT_PXSPACE: /* POSIX space */
5469 for (i = min; i < max; i++)
5470 {
5471 int len = 1;
5472 if (eptr >= md->end_subject)
5473 {
5474 SCHECK_PARTIAL();
5475 break;
5476 }
5477 GETCHARLENTEST(c, eptr, len);
5478 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5479 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5480 == prop_fail_result)
5481 break;
5482 eptr+= len;
5483 }
5484 break;
5485
5486 case PT_WORD:
5487 for (i = min; i < max; i++)
5488 {
5489 int category;
5490 int len = 1;
5491 if (eptr >= md->end_subject)
5492 {
5493 SCHECK_PARTIAL();
5494 break;
5495 }
5496 GETCHARLENTEST(c, eptr, len);
5497 category = UCD_CATEGORY(c);
5498 if ((category == ucp_L || category == ucp_N ||
5499 c == CHAR_UNDERSCORE) == prop_fail_result)
5500 break;
5501 eptr+= len;
5502 }
5503 break;
5504
5505 default:
5506 RRETURN(PCRE_ERROR_INTERNAL);
5507 }
5508
5509 /* eptr is now past the end of the maximum run */
5510
5511 if (possessive) continue;
5512 for(;;)
5513 {
5514 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5516 if (eptr-- == pp) break; /* Stop if tried at original pos */
5517 if (utf) BACKCHAR(eptr);
5518 }
5519 }
5520
5521 /* Match extended Unicode sequences. We will get here only if the
5522 support is in the binary; otherwise a compile-time error occurs. */
5523
5524 else if (ctype == OP_EXTUNI)
5525 {
5526 for (i = min; i < max; i++)
5527 {
5528 int len = 1;
5529 if (eptr >= md->end_subject)
5530 {
5531 SCHECK_PARTIAL();
5532 break;
5533 }
5534 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5535 if (UCD_CATEGORY(c) == ucp_M) break;
5536 eptr += len;
5537 while (eptr < md->end_subject)
5538 {
5539 len = 1;
5540 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5541 if (UCD_CATEGORY(c) != ucp_M) break;
5542 eptr += len;
5543 }
5544 CHECK_PARTIAL();
5545 }
5546
5547 /* eptr is now past the end of the maximum run */
5548
5549 if (possessive) continue;
5550
5551 for(;;)
5552 {
5553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5554 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5555 if (eptr-- == pp) break; /* Stop if tried at original pos */
5556 for (;;) /* Move back over one extended */
5557 {
5558 if (!utf) c = *eptr; else
5559 {
5560 BACKCHAR(eptr);
5561 GETCHAR(c, eptr);
5562 }
5563 if (UCD_CATEGORY(c) != ucp_M) break;
5564 eptr--;
5565 }
5566 }
5567 }
5568
5569 else
5570 #endif /* SUPPORT_UCP */
5571
5572 #ifdef SUPPORT_UTF
5573 if (utf)
5574 {
5575 switch(ctype)
5576 {
5577 case OP_ANY:
5578 if (max < INT_MAX)
5579 {
5580 for (i = min; i < max; i++)
5581 {
5582 if (eptr >= md->end_subject)
5583 {
5584 SCHECK_PARTIAL();
5585 break;
5586 }
5587 if (IS_NEWLINE(eptr)) break;
5588 eptr++;
5589 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5590 }
5591 }
5592
5593 /* Handle unlimited UTF-8 repeat */
5594
5595 else
5596 {
5597 for (i = min; i < max; i++)
5598 {
5599 if (eptr >= md->end_subject)
5600 {
5601 SCHECK_PARTIAL();
5602 break;
5603 }
5604 if (IS_NEWLINE(eptr)) break;
5605 eptr++;
5606 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5607 }
5608 }
5609 break;
5610
5611 case OP_ALLANY:
5612 if (max < INT_MAX)
5613 {
5614 for (i = min; i < max; i++)
5615 {
5616 if (eptr >= md->end_subject)
5617 {
5618 SCHECK_PARTIAL();
5619 break;
5620 }
5621 eptr++;
5622 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5623 }
5624 }
5625 else
5626 {
5627 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5628 SCHECK_PARTIAL();
5629 }
5630 break;
5631
5632 /* The byte case is the same as non-UTF8 */
5633
5634 case OP_ANYBYTE:
5635 c = max - min;
5636 if (c > (unsigned int)(md->end_subject - eptr))
5637 {
5638 eptr = md->end_subject;
5639 SCHECK_PARTIAL();
5640 }
5641 else eptr += c;
5642 break;
5643
5644 case OP_ANYNL:
5645 for (i = min; i < max; i++)
5646 {
5647 int len = 1;
5648 if (eptr >= md->end_subject)
5649 {
5650 SCHECK_PARTIAL();
5651 break;
5652 }
5653 GETCHARLEN(c, eptr, len);
5654 if (c == 0x000d)
5655 {
5656 if (++eptr >= md->end_subject) break;
5657 if (*eptr == 0x000a) eptr++;
5658 }
5659 else
5660 {
5661 if (c != 0x000a &&
5662 (md->bsr_anycrlf ||
5663 (c != 0x000b && c != 0x000c &&
5664 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5665 break;
5666 eptr += len;
5667 }
5668 }
5669 break;
5670
5671 case OP_NOT_HSPACE:
5672 case OP_HSPACE:
5673 for (i = min; i < max; i++)
5674 {
5675 BOOL gotspace;
5676 int len = 1;
5677 if (eptr >= md->end_subject)
5678 {
5679 SCHECK_PARTIAL();
5680 break;
5681 }
5682 GETCHARLEN(c, eptr, len);
5683 switch(c)
5684 {
5685 default: gotspace = FALSE; break;
5686 case 0x09: /* HT */
5687 case 0x20: /* SPACE */
5688 case 0xa0: /* NBSP */
5689 case 0x1680: /* OGHAM SPACE MARK */
5690 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5691 case 0x2000: /* EN QUAD */
5692 case 0x2001: /* EM QUAD */
5693 case 0x2002: /* EN SPACE */
5694 case 0x2003: /* EM SPACE */
5695 case 0x2004: /* THREE-PER-EM SPACE */
5696 case 0x2005: /* FOUR-PER-EM SPACE */
5697 case 0x2006: /* SIX-PER-EM SPACE */
5698 case 0x2007: /* FIGURE SPACE */
5699 case 0x2008: /* PUNCTUATION SPACE */
5700 case 0x2009: /* THIN SPACE */
5701 case 0x200A: /* HAIR SPACE */
5702 case 0x202f: /* NARROW NO-BREAK SPACE */
5703 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5704 case 0x3000: /* IDEOGRAPHIC SPACE */
5705 gotspace = TRUE;
5706 break;
5707 }
5708 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5709 eptr += len;
5710 }
5711 break;
5712
5713 case OP_NOT_VSPACE:
5714 case OP_VSPACE:
5715 for (i = min; i < max; i++)
5716 {
5717 BOOL gotspace;
5718 int len = 1;
5719 if (eptr >= md->end_subject)
5720 {
5721 SCHECK_PARTIAL();
5722 break;
5723 }
5724 GETCHARLEN(c, eptr, len);
5725 switch(c)
5726 {
5727 default: gotspace = FALSE; break;
5728 case 0x0a: /* LF */
5729 case 0x0b: /* VT */
5730 case 0x0c: /* FF */
5731 case 0x0d: /* CR */
5732 case 0x85: /* NEL */
5733 case 0x2028: /* LINE SEPARATOR */
5734 case 0x2029: /* PARAGRAPH SEPARATOR */
5735 gotspace = TRUE;
5736 break;
5737 }
5738 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5739 eptr += len;
5740 }
5741 break;
5742
5743 case OP_NOT_DIGIT:
5744 for (i = min; i < max; i++)
5745 {
5746 int len = 1;
5747 if (eptr >= md->end_subject)
5748 {
5749 SCHECK_PARTIAL();
5750 break;
5751 }
5752 GETCHARLEN(c, eptr, len);
5753 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5754 eptr+= len;
5755 }
5756 break;
5757
5758 case OP_DIGIT:
5759 for (i = min; i < max; i++)
5760 {
5761 int len = 1;
5762 if (eptr >= md->end_subject)
5763 {
5764 SCHECK_PARTIAL();
5765 break;
5766 }
5767 GETCHARLEN(c, eptr, len);
5768 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5769 eptr+= len;
5770 }
5771 break;
5772
5773 case OP_NOT_WHITESPACE:
5774 for (i = min; i < max; i++)
5775 {
5776 int len = 1;
5777 if (eptr >= md->end_subject)
5778 {
5779 SCHECK_PARTIAL();
5780 break;
5781 }
5782 GETCHARLEN(c, eptr, len);
5783 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5784 eptr+= len;
5785 }
5786 break;
5787
5788 case OP_WHITESPACE:
5789 for (i = min; i < max; i++)
5790 {
5791 int len = 1;
5792 if (eptr >= md->end_subject)
5793 {
5794 SCHECK_PARTIAL();
5795 break;
5796 }
5797 GETCHARLEN(c, eptr, len);
5798 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5799 eptr+= len;
5800 }
5801 break;
5802
5803 case OP_NOT_WORDCHAR:
5804 for (i = min; i < max; i++)
5805 {
5806 int len = 1;
5807 if (eptr >= md->end_subject)
5808 {
5809 SCHECK_PARTIAL();
5810 break;
5811 }
5812 GETCHARLEN(c, eptr, len);
5813 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5814 eptr+= len;
5815 }
5816 break;
5817
5818 case OP_WORDCHAR:
5819 for (i = min; i < max; i++)
5820 {
5821 int len = 1;
5822 if (eptr >= md->end_subject)
5823 {
5824 SCHECK_PARTIAL();
5825 break;
5826 }
5827 GETCHARLEN(c, eptr, len);
5828 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5829 eptr+= len;
5830 }
5831 break;
5832
5833 default:
5834 RRETURN(PCRE_ERROR_INTERNAL);
5835 }
5836
5837 /* eptr is now past the end of the maximum run. If possessive, we are
5838 done (no backing up). Otherwise, match at this position; anything other
5839 than no match is immediately returned. For nomatch, back up one
5840 character, unless we are matching \R and the last thing matched was
5841 \r\n, in which case, back up two bytes. */
5842
5843 if (possessive) continue;
5844 for(;;)
5845 {
5846 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5848 if (eptr-- == pp) break; /* Stop if tried at original pos */
5849 BACKCHAR(eptr);
5850 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5851 eptr[-1] == '\r') eptr--;
5852 }
5853 }
5854 else
5855 #endif /* SUPPORT_UTF */
5856 /* Not UTF mode */
5857 {
5858 switch(ctype)
5859 {
5860 case OP_ANY:
5861 for (i = min; i < max; i++)
5862 {
5863 if (eptr >= md->end_subject)
5864 {
5865 SCHECK_PARTIAL();
5866 break;
5867 }
5868 if (IS_NEWLINE(eptr)) break;
5869 eptr++;
5870 }
5871 break;
5872
5873 case OP_ALLANY:
5874 case OP_ANYBYTE:
5875 c = max - min;
5876 if (c > (unsigned int)(md->end_subject - eptr))
5877 {
5878 eptr = md->end_subject;
5879 SCHECK_PARTIAL();
5880 }
5881 else eptr += c;
5882 break;
5883
5884 case OP_ANYNL:
5885 for (i = min; i < max; i++)
5886 {
5887 if (eptr >= md->end_subject)
5888 {
5889 SCHECK_PARTIAL();
5890 break;
5891 }
5892 c = *eptr;
5893 if (c == 0x000d)
5894 {
5895 if (++eptr >= md->end_subject) break;
5896 if (*eptr == 0x000a) eptr++;
5897 }
5898 else
5899 {
5900 if (c != 0x000a && (md->bsr_anycrlf ||
5901 (c != 0x000b && c != 0x000c && c != 0x0085
5902 #ifdef COMPILE_PCRE16
5903 && c != 0x2028 && c != 0x2029
5904 #endif
5905 ))) break;
5906 eptr++;
5907 }
5908 }
5909 break;
5910
5911 case OP_NOT_HSPACE:
5912 for (i = min; i < max; i++)
5913 {
5914 if (eptr >= md->end_subject)
5915 {
5916 SCHECK_PARTIAL();
5917 break;
5918 }
5919 c = *eptr;
5920 if (c == 0x09 || c == 0x20 || c == 0xa0
5921 #ifdef COMPILE_PCRE16
5922 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5923 || c == 0x202f || c == 0x205f || c == 0x3000
5924 #endif
5925 ) break;
5926 eptr++;
5927 }
5928 break;
5929
5930 case OP_HSPACE:
5931 for (i = min; i < max; i++)
5932 {
5933 if (eptr >= md->end_subject)
5934 {
5935 SCHECK_PARTIAL();
5936 break;
5937 }
5938 c = *eptr;
5939 if (c != 0x09 && c != 0x20 && c != 0xa0
5940 #ifdef COMPILE_PCRE16
5941 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5942 && c != 0x202f && c != 0x205f && c != 0x3000
5943 #endif
5944 ) break;
5945 eptr++;
5946 }
5947 break;
5948
5949 case OP_NOT_VSPACE:
5950 for (i = min; i < max; i++)
5951 {
5952 if (eptr >= md->end_subject)
5953 {
5954 SCHECK_PARTIAL();
5955 break;
5956 }
5957 c = *eptr;
5958 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5959 #ifdef COMPILE_PCRE16
5960 || c == 0x2028 || c == 0x2029
5961 #endif
5962 ) break;
5963 eptr++;
5964 }
5965 break;
5966
5967 case OP_VSPACE:
5968 for (i = min; i < max; i++)
5969 {
5970 if (eptr >= md->end_subject)
5971 {
5972 SCHECK_PARTIAL();
5973 break;
5974 }
5975 c = *eptr;
5976 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5977 #ifdef COMPILE_PCRE16
5978 && c != 0x2028 && c != 0x2029
5979 #endif
5980 ) break;
5981 eptr++;
5982 }
5983 break;
5984
5985 case OP_NOT_DIGIT:
5986 for (i = min; i < max; i++)
5987 {
5988 if (eptr >= md->end_subject)
5989 {
5990 SCHECK_PARTIAL();
5991 break;
5992 }
5993 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5994 eptr++;
5995 }
5996 break;
5997
5998 case OP_DIGIT:
5999 for (i = min; i < max; i++)
6000 {
6001 if (eptr >= md->end_subject)
6002 {
6003 SCHECK_PARTIAL();
6004 break;
6005 }
6006 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6007 eptr++;
6008 }
6009 break;
6010
6011 case OP_NOT_WHITESPACE:
6012 for (i = min; i < max; i++)
6013 {
6014 if (eptr >= md->end_subject)
6015 {
6016 SCHECK_PARTIAL();
6017 break;
6018 }
6019 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6020 eptr++;
6021 }
6022 break;
6023
6024 case OP_WHITESPACE:
6025 for (i = min; i < max; i++)
6026 {
6027 if (eptr >= md->end_subject)
6028 {
6029 SCHECK_PARTIAL();
6030 break;
6031 }
6032 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6033 eptr++;
6034 }
6035 break;
6036
6037 case OP_NOT_WORDCHAR:
6038 for (i = min; i < max; i++)
6039 {
6040 if (eptr >= md->end_subject)
6041 {
6042 SCHECK_PARTIAL();
6043 break;
6044 }
6045 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6046 eptr++;
6047 }
6048 break;
6049
6050 case OP_WORDCHAR:
6051 for (i = min; i < max; i++)
6052 {
6053 if (eptr >= md->end_subject)
6054 {
6055 SCHECK_PARTIAL();
6056 break;
6057 }
6058 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6059 eptr++;
6060 }
6061 break;
6062
6063 default:
6064 RRETURN(PCRE_ERROR_INTERNAL);
6065 }
6066
6067 /* eptr is now past the end of the maximum run. If possessive, we are
6068 done (no backing up). Otherwise, match at this position; anything other
6069 than no match is immediately returned. For nomatch, back up one
6070 character (byte), unless we are matching \R and the last thing matched
6071 was \r\n, in which case, back up two bytes. */
6072
6073 if (possessive) continue;
6074 while (eptr >= pp)
6075 {
6076 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6077 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6078 eptr--;
6079 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6080 eptr[-1] == '\r') eptr--;
6081 }
6082 }
6083
6084 /* Get here if we can't make it match with any permitted repetitions */
6085
6086 RRETURN(MATCH_NOMATCH);
6087 }
6088 /* Control never gets here */
6089
6090 /* There's been some horrible disaster. Arrival here can only mean there is
6091 something seriously wrong in the code above or the OP_xxx definitions. */
6092
6093 default:
6094 DPRINTF(("Unknown opcode %d\n", *ecode));
6095 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6096 }
6097
6098 /* Do not stick any code in here without much thought; it is assumed
6099 that "continue" in the code above comes out to here to repeat the main
6100 loop. */
6101
6102 } /* End of main loop */
6103 /* Control never reaches here */
6104
6105
6106 /* When compiling to use the heap rather than the stack for recursive calls to
6107 match(), the RRETURN() macro jumps here. The number that is saved in
6108 frame->Xwhere indicates which label we actually want to return to. */
6109
6110 #ifdef NO_RECURSE
6111 #define LBL(val) case val: goto L_RM##val;
6112 HEAP_RETURN:
6113 switch (frame->Xwhere)
6114 {
6115 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6116 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6117 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6118 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6119 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6120 LBL(65) LBL(66)
6121 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6122 LBL(21)
6123 #endif
6124 #ifdef SUPPORT_UTF
6125 LBL(16) LBL(18) LBL(20)
6126 LBL(22) LBL(23) LBL(28) LBL(30)
6127 LBL(32) LBL(34) LBL(42) LBL(46)
6128 #ifdef SUPPORT_UCP
6129 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6130 LBL(59) LBL(60) LBL(61) LBL(62)
6131 #endif /* SUPPORT_UCP */
6132 #endif /* SUPPORT_UTF */
6133 default:
6134 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6135
6136 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6137
6138 return PCRE_ERROR_INTERNAL;
6139 }
6140 #undef LBL
6141 #endif /* NO_RECURSE */
6142 }
6143
6144
6145 /***************************************************************************
6146 ****************************************************************************
6147 RECURSION IN THE match() FUNCTION
6148
6149 Undefine all the macros that were defined above to handle this. */
6150
6151 #ifdef NO_RECURSE /* heap-based recursion build: the names below are macros only in this configuration */
6152 #undef eptr
6153 #undef ecode
6154 #undef mstart
6155 #undef offset_top
6156 #undef eptrb
6157 #undef flags /* reused as a local variable name in pcre_exec() below */
6158
6159 #undef callpat
6160 #undef charptr
6161 #undef data
6162 #undef next
6163 #undef pp
6164 #undef prev
6165 #undef saved_eptr
6166
6167 #undef new_recursive
6168
6169 #undef cur_is_word
6170 #undef condition
6171 #undef prev_is_word
6172
6173 #undef ctype
6174 #undef length /* pcre_exec() below has a parameter with this name */
6175 #undef max
6176 #undef min
6177 #undef number
6178 #undef offset
6179 #undef op
6180 #undef save_capture_last
6181 #undef save_offset1
6182 #undef save_offset2
6183 #undef save_offset3
6184 #undef stacksave
6185
6186 #undef newptrb
6187
6188 #endif /* NO_RECURSE */
6189
6190 /* These two are defined as macros in both cases (whether or not NO_RECURSE is
   defined), so they are undefined unconditionally. */
6191
6192 #undef fc
6193 #undef fi
6194
6195 /***************************************************************************
6196 ***************************************************************************/
6197
6198
6199
6200 /*************************************************
6201 * Execute a Regular Expression *
6202 *************************************************/
6203
6204 /* This function applies a compiled re to a subject string and picks out
6205 portions of the string if it matches. Two elements in the vector are set for
6206 each substring: the offsets to the start and end of the substring.
6207
6208 Arguments:
6209 argument_re points to the compiled expression
6210 extra_data points to extra data or is NULL
6211 subject points to the subject string
6212 length length of subject string (may contain binary zeros)
6213 start_offset where to start in the subject string
6214 options option bits
6215 offsets points to a vector of ints to be filled in with offsets
6216 offsetcount the number of elements in the vector
6217
6218 Returns: > 0 => success; value is the number of elements filled in
6219 = 0 => success, but offsets is not big enough
6220 -1 => failed to match
6221 < -1 => some kind of unexpected problem
6222 */
6223
6224 #ifdef COMPILE_PCRE8
6225 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6226 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6227 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6228 int offsetcount)
6229 #else
6230 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6231 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6232 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6233 int offsetcount)
6234 #endif
6235 {
6236 int rc, ocount, arg_offset_max;
6237 int newline;
6238 BOOL using_temporary_offsets = FALSE;
6239 BOOL anchored;
6240 BOOL startline;
6241 BOOL firstline;
6242 BOOL utf;
6243 BOOL has_first_char = FALSE;
6244 BOOL has_req_char = FALSE;
6245 pcre_uchar first_char = 0;
6246 pcre_uchar first_char2 = 0;
6247 pcre_uchar req_char = 0;
6248 pcre_uchar req_char2 = 0;
6249 match_data match_block;
6250 match_data *md = &match_block;
6251 const pcre_uint8 *tables;
6252 const pcre_uint8 *start_bits = NULL;
6253 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6254 PCRE_PUCHAR end_subject;
6255 PCRE_PUCHAR start_partial = NULL;
6256 PCRE_PUCHAR req_char_ptr = start_match - 1;
6257
6258 const pcre_study_data *study;
6259 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6260
6261 /* Check for the special magic call that measures the size of the stack used
6262 per recursive call of match(). Without the funny casting for sizeof, a Windows
6263 compiler gave this error: "unary minus operator applied to unsigned type,
6264 result still unsigned". Hopefully the cast fixes that. */
6265
6266 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6267 start_offset == -999)
6268 #ifdef NO_RECURSE
6269 return -((int)sizeof(heapframe));
6270 #else
6271 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6272 #endif
6273
6274 /* Plausibility checks */
6275
6276 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6277 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6278 return PCRE_ERROR_NULL;
6279 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6280 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6281
6282 /* Check that the first field in the block is the magic number. If it is not,
6283 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6284 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6285 means that the pattern is likely compiled with different endianness. */
6286
6287 if (re->magic_number != MAGIC_NUMBER)
6288 return re->magic_number == REVERSED_MAGIC_NUMBER?
6289 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6290 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6291
6292 /* These two settings are used in the code for checking a UTF-8 string that
6293 follows immediately afterwards. Other values in the md block are used only
6294 during "normal" pcre_exec() processing, not when the JIT support is in use,
6295 so they are set up later. */
6296
6297 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6298 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6299 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6300 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6301
6302 /* Check a UTF-8 string if required. Pass back the character offset and error
6303 code for an invalid string if a results vector is available. */
6304
6305 #ifdef SUPPORT_UTF
6306 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6307 {
6308 int erroroffset;
6309 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6310 if (errorcode != 0)
6311 {
6312 if (offsetcount >= 2)
6313 {
6314 offsets[0] = erroroffset;
6315 offsets[1] = errorcode;
6316 }
6317 #ifdef COMPILE_PCRE16
6318 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6319 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6320 #else
6321 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6322 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6323 #endif
6324 }
6325
6326 /* Check that a start_offset points to the start of a UTF character. */
6327 if (start_offset > 0 && start_offset < length &&
6328 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6329 return PCRE_ERROR_BADUTF8_OFFSET;
6330 }
6331 #endif
6332
6333 /* If the pattern was successfully studied with JIT support, run the JIT
6334 executable instead of the rest of this function. Most options must be set at
6335 compile time for the JIT code to be usable. Fall back to the normal code path if
6336 an unsupported flag is set. */
6337
6338 #ifdef SUPPORT_JIT
6339 if (extra_data != NULL
6340 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6341 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6342 && extra_data->executable_jit != NULL
6343 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6344 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6345 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6346 {
6347 rc = PRIV(jit_exec)(re, extra_data->executable_jit,
6348 (const pcre_uchar *)subject, length, start_offset, options,
6349 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6350 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6351
6352 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6353 mode is not compiled. In this case we simply fall back to the interpreter. */
6354
6355 if (rc != PCRE_ERROR_NULL) return rc;
6356 }
6357 #endif
6358
6359 /* Carry on with non-JIT matching. This information is for finding all the
6360 numbers associated with a given name, for condition testing. */
6361
6362 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6363 md->name_count = re->name_count;
6364 md->name_entry_size = re->name_entry_size;
6365
6366 /* Fish out the optional data from the extra_data structure, first setting
6367 the default values. */
6368
6369 study = NULL;
6370 md->match_limit = MATCH_LIMIT;
6371 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6372 md->callout_data = NULL;
6373
6374 /* The table pointer is always in native byte order. */
6375
6376 tables = re->tables;
6377
6378 if (extra_data != NULL)
6379 {
6380 register unsigned int flags = extra_data->flags;
6381 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6382 study = (const pcre_study_data *)extra_data->study_data;
6383 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6384 md->match_limit = extra_data->match_limit;
6385 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6386 md->match_limit_recursion = extra_data->match_limit_recursion;
6387 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6388 md->callout_data = extra_data->callout_data;
6389 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6390 }
6391
6392 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6393 is a feature that makes it possible to save compiled regex and re-use them
6394 in other programs later. */
6395
6396 if (tables == NULL) tables = PRIV(default_tables);
6397
6398 /* Set up other data */
6399
6400 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6401 startline = (re->flags & PCRE_STARTLINE) != 0;
6402 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6403
6404 /* The code starts after the real_pcre block and the capture name table. */
6405
6406 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6407 re->name_count * re->name_entry_size;
6408
6409 md->start_subject = (PCRE_PUCHAR)subject;
6410 md->start_offset = start_offset;
6411 md->end_subject = md->start_subject + length;
6412 end_subject = md->end_subject;
6413
6414 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6415 md->use_ucp = (re->options & PCRE_UCP) != 0;
6416 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6417 md->ignore_skip_arg = FALSE;
6418
6419 /* Some options are unpacked into BOOL variables in the hope that testing
6420 them will be faster than individual option bits. */
6421
6422 md->notbol = (options & PCRE_NOTBOL) != 0;
6423 md->noteol = (options & PCRE_NOTEOL) != 0;
6424 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6425 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6426
6427 md->hitend = FALSE;
6428 md->mark = md->nomatch_mark = NULL; /* In case never set */
6429
6430 md->recursive = NULL; /* No recursion at top level */
6431 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6432
6433 md->lcc = tables + lcc_offset;
6434 md->fcc = tables + fcc_offset;
6435 md->ctypes = tables + ctypes_offset;
6436
6437 /* Handle different \R options. */
6438
6439 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6440 {
6441 case 0:
6442 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6443 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6444 else
6445 #ifdef BSR_ANYCRLF
6446 md->bsr_anycrlf = TRUE;
6447 #else
6448 md->bsr_anycrlf = FALSE;
6449 #endif
6450 break;
6451
6452 case PCRE_BSR_ANYCRLF:
6453 md->bsr_anycrlf = TRUE;
6454 break;
6455
6456 case PCRE_BSR_UNICODE:
6457 md->bsr_anycrlf = FALSE;
6458 break;
6459
6460 default: return PCRE_ERROR_BADNEWLINE;
6461 }
6462
6463 /* Handle different types of newline. The three bits give eight cases. If
6464 nothing is set at run time, whatever was used at compile time applies. */
6465
6466 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6467 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6468 {
6469 case 0: newline = NEWLINE; break; /* Compile-time default */
6470 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6471 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6472 case PCRE_NEWLINE_CR+
6473 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6474 case PCRE_NEWLINE_ANY: newline = -1; break;
6475 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6476 default: return PCRE_ERROR_BADNEWLINE;
6477 }
6478
6479 if (newline == -2)
6480 {
6481 md->nltype = NLTYPE_ANYCRLF;
6482 }
6483 else if (newline < 0)
6484 {
6485 md->nltype = NLTYPE_ANY;
6486 }
6487 else
6488 {
6489 md->nltype = NLTYPE_FIXED;
6490 if (newline > 255)
6491 {
6492 md->nllen = 2;
6493 md->nl[0] = (newline >> 8) & 255;
6494 md->nl[1] = newline & 255;
6495 }
6496 else
6497 {
6498 md->nllen = 1;
6499 md->nl[0] = newline;
6500 }
6501 }
6502
6503 /* Partial matching was originally supported only for a restricted set of
6504 regexes; from release 8.00 there are no restrictions, but the bits are still
6505 defined (though never set). So there's no harm in leaving this code. */
6506
6507 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6508 return PCRE_ERROR_BADPARTIAL;
6509
6510 /* If the expression has got more back references than the offsets supplied can
6511 hold, we get a temporary chunk of working store to use during the matching.
6512 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6513 of 3. */
6514
6515 ocount = offsetcount - (offsetcount % 3);
6516 arg_offset_max = (2*ocount)/3;
6517
6518 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6519 {
6520 ocount = re->top_backref * 3 + 3;
6521 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6522 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6523 using_temporary_offsets = TRUE;
6524 DPRINTF(("Got memory to hold back references\n"));
6525 }
6526 else md->offset_vector = offsets;
6527
6528 md->offset_end = ocount;
6529 md->offset_max = (2*ocount)/3;
6530 md->offset_overflow = FALSE;
6531 md->capture_last = -1;
6532
6533 /* Reset the working variable associated with each extraction. These should
6534 never be used unless previously set, but they get saved and restored, and so we
6535 initialize them to avoid reading uninitialized locations. Also, unset the
6536 offsets for the matched string. This is really just for tidiness with callouts,
6537 in case they inspect these fields. */
6538
6539 if (md->offset_vector != NULL)
6540 {
6541 register int *iptr = md->offset_vector + ocount;
6542 register int *iend = iptr - re->top_bracket;
6543 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6544 while (--iptr >= iend) *iptr = -1;
6545 md->offset_vector[0] = md->offset_vector[1] = -1;
6546 }
6547
6548 /* Set up the first character to match, if available. The first_char value is
6549 never set for an anchored regular expression, but the anchoring may be forced
6550 at run time, so we have to test for anchoring. The first char may be unset for
6551 an unanchored pattern, of course. If there's no first char and the pattern was
6552 studied, there may be a bitmap of possible first characters. */
6553
6554 if (!anchored)
6555 {
6556 if ((re->flags & PCRE_FIRSTSET) != 0)
6557 {
6558 has_first_char = TRUE;
6559 first_char = first_char2 = (pcre_uchar)(re->first_char);
6560 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6561 {
6562 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6563 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6564 if (utf && first_char > 127)
6565 first_char2 = UCD_OTHERCASE(first_char);
6566 #endif
6567 }
6568 }
6569 else
6570 if (!startline && study != NULL &&
6571 (study->flags & PCRE_STUDY_MAPPED) != 0)
6572 start_bits = study->start_bits;
6573 }
6574
6575 /* For anchored or unanchored matches, there may be a "last known required
6576 character" set. */
6577
6578 if ((re->flags & PCRE_REQCHSET) != 0)
6579 {
6580 has_req_char = TRUE;
6581 req_char = req_char2 = (pcre_uchar)(re->req_char);
6582 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6583 {
6584 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6585 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6586 if (utf && req_char > 127)
6587 req_char2 = UCD_OTHERCASE(req_char);
6588 #endif
6589 }
6590 }
6591
6592
6593 /* ==========================================================================*/
6594
6595 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6596 the loop runs just once. */
6597
6598 for(;;)
6599 {
6600 PCRE_PUCHAR save_end_subject = end_subject;
6601 PCRE_PUCHAR new_start_match;
6602
6603 /* If firstline is TRUE, the start of the match is constrained to the first
6604 line of a multiline string. That is, the match must be before or at the first
6605 newline. Implement this by temporarily adjusting end_subject so that we stop
6606 scanning at a newline. If the match fails at the newline, later code breaks
6607 this loop. */
6608
6609 if (firstline)
6610 {
6611 PCRE_PUCHAR t = start_match;
6612 #ifdef SUPPORT_UTF
6613 if (utf)
6614 {
6615 while (t < md->end_subject && !IS_NEWLINE(t))
6616 {
6617 t++;
6618 ACROSSCHAR(t < end_subject, *t, t++);
6619 }
6620 }
6621 else
6622 #endif
6623 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6624 end_subject = t;
6625 }
6626
6627 /* There are some optimizations that avoid running the match if a known
6628 starting point is not found, or if a known later character is not present.
6629 However, there is an option that disables these, for testing and for ensuring
6630 that all callouts do actually occur. The option can be set in the regex by
6631 (*NO_START_OPT) or passed in match-time options. */
6632
6633 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6634 {
6635 /* Advance to a unique first char if there is one. */
6636
6637 if (has_first_char)
6638 {
6639 if (first_char != first_char2)
6640 while (start_match < end_subject &&
6641 *start_match != first_char && *start_match != first_char2)
6642 start_match++;
6643 else
6644 while (start_match < end_subject && *start_match != first_char)
6645 start_match++;
6646 }
6647
6648 /* Or to just after a linebreak for a multiline match */
6649
6650 else if (startline)
6651 {
6652 if (start_match > md->start_subject + start_offset)
6653 {
6654 #ifdef SUPPORT_UTF
6655 if (utf)
6656 {
6657 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6658 {
6659 start_match++;
6660 ACROSSCHAR(start_match < end_subject, *start_match,
6661 start_match++);
6662 }
6663 }
6664 else
6665 #endif
6666 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6667 start_match++;
6668
6669 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6670 and we are now at a LF, advance the match position by one more character.
6671 */
6672
6673 if (start_match[-1] == CHAR_CR &&
6674 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6675 start_match < end_subject &&
6676 *start_match == CHAR_NL)
6677 start_match++;
6678 }
6679 }
6680
6681 /* Or to a non-unique first byte after study */
6682
6683 else if (start_bits != NULL)
6684 {
6685 while (start_match < end_subject)
6686 {
6687 register unsigned int c = *start_match;
6688 #ifndef COMPILE_PCRE8
6689 if (c > 255) c = 255;
6690 #endif
6691 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6692 {
6693 start_match++;
6694 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6695 /* In non 8-bit mode, the iteration will stop for
6696 characters > 255 at the beginning or not stop at all. */
6697 if (utf)
6698 ACROSSCHAR(start_match < end_subject, *start_match,
6699 start_match++);
6700 #endif
6701 }
6702 else break;
6703 }
6704 }
6705 } /* Starting optimizations */
6706
6707 /* Restore fudged end_subject */
6708
6709 end_subject = save_end_subject;
6710
6711 /* The following two optimizations are disabled for partial matching or if
6712 disabling is explicitly requested. */
6713
6714 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6715 {
6716 /* If the pattern was studied, a minimum subject length may be set. This is
6717 a lower bound; it is possible that no string of exactly that length matches the
6718 pattern. Although the value is, strictly, in characters, we treat it as
6719 bytes to avoid spending too much time in this optimization. */
6720
6721 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6722 (pcre_uint32)(end_subject - start_match) < study->minlength)
6723 {
6724 rc = MATCH_NOMATCH;
6725 break;
6726 }
6727
6728 /* If req_char is set, we know that that character must appear in the
6729 subject for the match to succeed. If the first character is set, req_char
6730 must be later in the subject; otherwise the test starts at the match point.
6731 This optimization can save a huge amount of backtracking in patterns with
6732 nested unlimited repeats that aren't going to match. Writing separate code
6733 for cased/caseless versions makes it go faster, as does using an
6734 autoincrement and backing off on a match.
6735
6736 HOWEVER: when the subject string is very, very long, searching to its end
6737 can take a long time, and give bad performance on quite ordinary patterns.
6738 This showed up when somebody was matching something like /^\d+C/ on a
6739 32-megabyte string... so we don't do this when the string is sufficiently
6740 long. */
6741
6742 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6743 {