/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 922 - (show annotations)
Mon Feb 20 18:44:42 2012 UTC (7 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 218052 byte(s)
Error occurred while calculating annotation data.
Set PCRE_EXTRA_USED_JIT when JIT was actually used at runtime. Add /S++ and
-s++ to pcretest to show whether JIT was used or not. 
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Values for setting in md->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Min and max values for the common repeats; for the maxima, 0 => infinity */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int c;

/* When printing from the subject, never run past its end. */

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = *(p++);

  /* isprint() is only defined for arguments representable as unsigned char
  (or EOF). The range test guards against a pcre_uchar that is wider than 8
  bits; any such value is shown as a hex escape instead. */

  if (c < 256 && isprint(c)) printf("%c", (int)c);
    else printf("\\x%02x", c);
  }
}
#endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: >= 0 the number of subject bytes matched
144 -1 no match
145 -2 partial match; always given if at end subject
146 */
147
148 static int
149 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
150 BOOL caseless)
151 {
152 PCRE_PUCHAR eptr_start = eptr;
153 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
154
155 #ifdef PCRE_DEBUG
156 if (eptr >= md->end_subject)
157 printf("matching subject <null>");
158 else
159 {
160 printf("matching subject ");
161 pchars(eptr, length, TRUE, md);
162 }
163 printf(" against backref ");
164 pchars(p, length, FALSE, md);
165 printf("\n");
166 #endif
167
168 /* Always fail if reference not set (and not JavaScript compatible - in that
169 case the length is passed as zero). */
170
171 if (length < 0) return -1;
172
173 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
174 properly if Unicode properties are supported. Otherwise, we can check only
175 ASCII characters. */
176
177 if (caseless)
178 {
179 #ifdef SUPPORT_UTF
180 #ifdef SUPPORT_UCP
181 if (md->utf)
182 {
183 /* Match characters up to the end of the reference. NOTE: the number of
184 bytes matched may differ, because there are some characters whose upper and
185 lower case versions code as different numbers of bytes. For example, U+023A
186 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
187 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
188 the latter. It is important, therefore, to check the length along the
189 reference, not along the subject (earlier code did this wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 int c, d;
195 if (eptr >= md->end_subject) return -2; /* Partial match */
196 GETCHARINC(c, eptr);
197 GETCHARINC(d, p);
198 if (c != d && c != UCD_OTHERCASE(d)) return -1;
199 }
200 }
201 else
202 #endif
203 #endif
204
205 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
206 is no UCP support. */
207 {
208 while (length-- > 0)
209 {
210 if (eptr >= md->end_subject) return -2; /* Partial match */
211 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
212 p++;
213 eptr++;
214 }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 while (length-- > 0)
224 {
225 if (eptr >= md->end_subject) return -2; /* Partial match */
226 if (*p++ != *eptr++) return -1;
227 }
228 }
229
230 return (int)(eptr - eptr_start);
231 }
232
233
234
235 /***************************************************************************
236 ****************************************************************************
237 RECURSION IN THE match() FUNCTION
238
239 The match() function is highly recursive, though not every recursive call
240 increases the recursive depth. Nevertheless, some regular expressions can cause
241 it to recurse to a great depth. I was writing for Unix, so I just let it call
242 itself recursively. This uses the stack for saving everything that has to be
243 saved for a recursive call. On Unix, the stack can be large, and this works
244 fine.
245
246 It turns out that on some non-Unix-like systems there are problems with
247 programs that use a lot of stack. (This despite the fact that every last chip
248 has oodles of memory these days, and techniques for extending the stack have
249 been known for decades.) So....
250
251 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
252 calls by keeping local variables that need to be preserved in blocks of memory
253 obtained from malloc() instead of on the stack. Macros are used to
254 achieve this so that the actual code doesn't look very different to what it
255 always used to.
256
257 The original heap-recursive code used longjmp(). However, it seems that this
258 can be very slow on some operating systems. Following a suggestion from Stan
259 Switzer, the use of longjmp() has been abolished, at the cost of having to
260 provide a unique number for each call to RMATCH. There is no way of generating
261 a sequence of numbers at compile time in C. I have given them names, to make
262 them stand out more clearly.
263
264 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
265 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
266 tests. Furthermore, not using longjmp() means that local dynamic variables
267 don't have indeterminate values; this has meant that the frame size can be
268 reduced because the result can be "passed back" by straight setting of the
269 variable instead of being passed in the frame.
270 ****************************************************************************
271 ***************************************************************************/
272
/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync. RM1 is 1 and each following name is one greater
than its predecessor, so RMn has the value n. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66 };
283
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH: get a new frame, record in the current frame where to resume (rw),
copy the "arguments" into the new frame, link it to the current one, and jump
to HEAP_RECURSE. The label L_<rw> is where control comes back afterwards. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN: pop the current frame, freeing it unless it is frame_zero. If a
previous frame exists, pass the result back in rrc and jump to HEAP_RETURN;
otherwise do a real return. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame */

typedef struct heapframe {
  struct heapframe *Xprevframe;

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to */

  int Xwhere;

} heapframe;

#endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching, we then return
immediately. The second one is used when we already know we are past the end of
the subject. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 offset_top current top pointer
463 md pointer to "static" info for the match
464 eptrb pointer to chain of blocks containing eptr at start of
465 brackets - for testing for empty matches
466 rdepth the recursion depth
467
468 Returns: MATCH_MATCH if matched ) these values are >= 0
469 MATCH_NOMATCH if failed to match )
470 a negative MATCH_xxx value for PRUNE, SKIP, etc
471 a negative PCRE_ERROR_xxx value if aborted by an error condition
472 (e.g. stopped by repeated call or recursion limit)
473 */
474
475 static int
476 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
477 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
478 unsigned int rdepth)
479 {
480 /* These variables do not need to be preserved over recursion in this function,
481 so they can be ordinary variables in all cases. Mark some of them with
482 "register" because they are used a lot in loops. */
483
484 register int rrc; /* Returns from recursive calls */
485 register int i; /* Used for loops not involving calls to RMATCH() */
486 register unsigned int c; /* Character values not kept over RMATCH() calls */
487 register BOOL utf; /* Local copy of UTF flag for speed */
488
489 BOOL minimize, possessive; /* Quantifier options */
490 BOOL caseless;
491 int condcode;
492
493 /* When recursion is not being used, all "local" variables that have to be
494 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
495 frame on the stack here; subsequent instantiations are obtained from the heap
496 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
497 the top-level on the stack rather than malloc-ing them all gives a performance
498 boost in many cases where there is not much "recursion". */
499
500 #ifdef NO_RECURSE
501 heapframe frame_zero;
502 heapframe *frame = &frame_zero;
503 frame->Xprevframe = NULL; /* Marks the top level */
504
505 /* Copy in the original argument variables */
506
507 frame->Xeptr = eptr;
508 frame->Xecode = ecode;
509 frame->Xmstart = mstart;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to in order to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define offset_top frame->Xoffset_top
524 #define eptrb frame->Xeptrb
525 #define rdepth frame->Xrdepth
526
527 /* Ditto for the local variables */
528
529 #ifdef SUPPORT_UTF
530 #define charptr frame->Xcharptr
531 #endif
532 #define callpat frame->Xcallpat
533 #define codelink frame->Xcodelink
534 #define data frame->Xdata
535 #define next frame->Xnext
536 #define pp frame->Xpp
537 #define prev frame->Xprev
538 #define saved_eptr frame->Xsaved_eptr
539
540 #define new_recursive frame->Xnew_recursive
541
542 #define cur_is_word frame->Xcur_is_word
543 #define condition frame->Xcondition
544 #define prev_is_word frame->Xprev_is_word
545
546 #ifdef SUPPORT_UCP
547 #define prop_type frame->Xprop_type
548 #define prop_value frame->Xprop_value
549 #define prop_fail_result frame->Xprop_fail_result
550 #define oclength frame->Xoclength
551 #define occhars frame->Xocchars
552 #endif
553
554 #define ctype frame->Xctype
555 #define fc frame->Xfc
556 #define fi frame->Xfi
557 #define length frame->Xlength
558 #define max frame->Xmax
559 #define min frame->Xmin
560 #define number frame->Xnumber
561 #define offset frame->Xoffset
562 #define op frame->Xop
563 #define save_capture_last frame->Xsave_capture_last
564 #define save_offset1 frame->Xsave_offset1
565 #define save_offset2 frame->Xsave_offset2
566 #define save_offset3 frame->Xsave_offset3
567 #define stacksave frame->Xstacksave
568
569 #define newptrb frame->Xnewptrb
570
571 /* When recursion is being used, local variables are allocated on the stack and
572 get preserved during recursion in the normal way. In this environment, fi and
573 i, and fc and c, can be the same variables. */
574
575 #else /* NO_RECURSE not defined */
576 #define fi i
577 #define fc c
578
579 /* Many of the following variables are used only in small blocks of the code.
580 My normal style of coding would have declared them within each of those blocks.
581 However, in order to accommodate the version of this code that uses an external
582 "stack" implemented on the heap, it is easier to declare them all here, so the
583 declarations can be cut out in a block. The only declarations within blocks
584 below are for variables that do not have to be preserved over a recursive call
585 to RMATCH(). */
586
587 #ifdef SUPPORT_UTF
588 const pcre_uchar *charptr;
589 #endif
590 const pcre_uchar *callpat;
591 const pcre_uchar *data;
592 const pcre_uchar *next;
593 PCRE_PUCHAR pp;
594 const pcre_uchar *prev;
595 PCRE_PUCHAR saved_eptr;
596
597 recursion_info new_recursive;
598
599 BOOL cur_is_word;
600 BOOL condition;
601 BOOL prev_is_word;
602
603 #ifdef SUPPORT_UCP
604 int prop_type;
605 int prop_value;
606 int prop_fail_result;
607 int oclength;
608 pcre_uchar occhars[6];
609 #endif
610
611 int codelink;
612 int ctype;
613 int length;
614 int max;
615 int min;
616 int number;
617 int offset;
618 int op;
619 int save_capture_last;
620 int save_offset1, save_offset2, save_offset3;
621 int stacksave[REC_STACK_SAVE_MAX];
622
623 eptrblock newptrb;
624
625 /* There is a special fudge for calling match() in a way that causes it to
626 measure the size of its basic stack frame when the stack is being used for
627 recursion. The second argument (ecode) being NULL triggers this behaviour. It
628 cannot normally ever be NULL. The return is the negated value of the frame
629 size. */
630
631 if (ecode == NULL)
632 {
633 if (rdepth == 0)
634 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
635 else
636 {
637 int len = (char *)&rdepth - (char *)eptr;
638 return (len > 0)? -len : len;
639 }
640 }
641 #endif /* NO_RECURSE */
642
643 /* To save space on the stack and in the heap frame, I have doubled up on some
644 of the local variables that are used only in localised parts of the code, but
645 still need to be preserved over recursive calls of match(). These macros define
646 the alternative names that are used. */
647
648 #define allow_zero cur_is_word
649 #define cbegroup condition
650 #define code_offset codelink
651 #define condassert condition
652 #define matched_once prev_is_word
653 #define foc number
654 #define save_mark data
655
656 /* These statements are here to stop the compiler complaining about uninitialized
657 variables. */
658
659 #ifdef SUPPORT_UCP
660 prop_value = 0;
661 prop_fail_result = 0;
662 #endif
663
664
665 /* This label is used for tail recursion, which is used in a few cases even
666 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
667 used. Thanks to Ian Taylor for noticing this possibility and sending the
668 original patch. */
669
670 TAIL_RECURSE:
671
672 /* OK, now we can get on with the real code of the function. Recursive calls
673 are specified by the macro RMATCH and RRETURN is used to return. When
674 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
675 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
676 defined). However, RMATCH isn't like a function call because it's quite a
677 complicated macro. It has to be used in one particular way. This shouldn't,
678 however, impact performance when true recursion is being used. */
679
680 #ifdef SUPPORT_UTF
681 utf = md->utf; /* Local copy of the flag */
682 #else
683 utf = FALSE;
684 #endif
685
686 /* First check that we haven't called match() too many times, or that we
687 haven't exceeded the recursive call limit. */
688
689 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
690 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
691
692 /* At the start of a group with an unlimited repeat that may match an empty
693 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
694 done this way to save having to use another function argument, which would take
695 up space on the stack. See also MATCH_CONDASSERT below.
696
697 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
698 such remembered pointers, to be checked when we hit the closing ket, in order
699 to break infinite loops that match no characters. When match() is called in
700 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
701 NOT be used with tail recursion, because the memory block that is used is on
702 the stack, so a new one may be required for each match(). */
703
704 if (md->match_function_type == MATCH_CBEGROUP)
705 {
706 newptrb.epb_saved_eptr = eptr;
707 newptrb.epb_prev = eptrb;
708 eptrb = &newptrb;
709 md->match_function_type = 0;
710 }
711
712 /* Now start processing the opcodes. */
713
714 for (;;)
715 {
716 minimize = possessive = FALSE;
717 op = *ecode;
718
719 switch(op)
720 {
721 case OP_MARK:
722 md->nomatch_mark = ecode + 2;
723 md->mark = NULL; /* In case previously set by assertion */
724 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
725 eptrb, RM55);
726 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
727 md->mark == NULL) md->mark = ecode + 2;
728
729 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
730 argument, and we must check whether that argument matches this MARK's
731 argument. It is passed back in md->start_match_ptr (an overloading of that
732 variable). If it does match, we reset that variable to the current subject
733 position and return MATCH_SKIP. Otherwise, pass back the return code
734 unaltered. */
735
736 else if (rrc == MATCH_SKIP_ARG &&
737 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
738 {
739 md->start_match_ptr = eptr;
740 RRETURN(MATCH_SKIP);
741 }
742 RRETURN(rrc);
743
744 case OP_FAIL:
745 RRETURN(MATCH_NOMATCH);
746
747 /* COMMIT overrides PRUNE, SKIP, and THEN */
748
749 case OP_COMMIT:
750 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
751 eptrb, RM52);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
753 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
754 rrc != MATCH_THEN)
755 RRETURN(rrc);
756 RRETURN(MATCH_COMMIT);
757
758 /* PRUNE overrides THEN */
759
760 case OP_PRUNE:
761 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
762 eptrb, RM51);
763 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
764 RRETURN(MATCH_PRUNE);
765
766 case OP_PRUNE_ARG:
767 md->nomatch_mark = ecode + 2;
768 md->mark = NULL; /* In case previously set by assertion */
769 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
770 eptrb, RM56);
771 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
772 md->mark == NULL) md->mark = ecode + 2;
773 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
774 RRETURN(MATCH_PRUNE);
775
776 /* SKIP overrides PRUNE and THEN */
777
778 case OP_SKIP:
779 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
780 eptrb, RM53);
781 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
782 RRETURN(rrc);
783 md->start_match_ptr = eptr; /* Pass back current position */
784 RRETURN(MATCH_SKIP);
785
786 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
787 nomatch_mark. There is a flag that disables this opcode when re-matching a
788 pattern that ended with a SKIP for which there was not a matching MARK. */
789
790 case OP_SKIP_ARG:
791 if (md->ignore_skip_arg)
792 {
793 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
794 break;
795 }
796 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
797 eptrb, RM57);
798 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
799 RRETURN(rrc);
800
801 /* Pass back the current skip name by overloading md->start_match_ptr and
802 returning the special MATCH_SKIP_ARG return code. This will either be
803 caught by a matching MARK, or get to the top, where it causes a rematch
804 with the md->ignore_skip_arg flag set. */
805
806 md->start_match_ptr = ecode + 2;
807 RRETURN(MATCH_SKIP_ARG);
808
809 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
810 the branch in which it occurs can be determined. Overload the start of
811 match pointer to do this. */
812
813 case OP_THEN:
814 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
815 eptrb, RM54);
816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
817 md->start_match_ptr = ecode;
818 RRETURN(MATCH_THEN);
819
820 case OP_THEN_ARG:
821 md->nomatch_mark = ecode + 2;
822 md->mark = NULL; /* In case previously set by assertion */
823 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
824 md, eptrb, RM58);
825 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
826 md->mark == NULL) md->mark = ecode + 2;
827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
828 md->start_match_ptr = ecode;
829 RRETURN(MATCH_THEN);
830
831 /* Handle an atomic group that does not contain any capturing parentheses.
832 This can be handled like an assertion. Prior to 8.13, all atomic groups
833 were handled this way. In 8.13, the code was changed as below for ONCE, so
834 that backups pass through the group and thereby reset captured values.
835 However, this uses a lot more stack, so in 8.20, atomic groups that do not
836 contain any captures generate OP_ONCE_NC, which can be handled in the old,
837 less stack intensive way.
838
839 Check the alternative branches in turn - the matching won't pass the KET
840 for this kind of subpattern. If any one branch matches, we carry on as at
841 the end of a normal bracket, leaving the subject pointer, but resetting
842 the start-of-match value in case it was changed by \K. */
843
844 case OP_ONCE_NC:
845 prev = ecode;
846 saved_eptr = eptr;
847 save_mark = md->mark;
848 do
849 {
850 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
851 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
852 {
853 mstart = md->start_match_ptr;
854 break;
855 }
856 if (rrc == MATCH_THEN)
857 {
858 next = ecode + GET(ecode,1);
859 if (md->start_match_ptr < next &&
860 (*ecode == OP_ALT || *next == OP_ALT))
861 rrc = MATCH_NOMATCH;
862 }
863
864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865 ecode += GET(ecode,1);
866 md->mark = save_mark;
867 }
868 while (*ecode == OP_ALT);
869
870 /* If hit the end of the group (which could be repeated), fail */
871
872 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
873
874 /* Continue as from after the group, updating the offsets high water
875 mark, since extracts may have been taken. */
876
877 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
878
879 offset_top = md->end_offset_top;
880 eptr = md->end_match_ptr;
881
882 /* For a non-repeating ket, just continue at this level. This also
883 happens for a repeating ket if no characters were matched in the group.
884 This is the forcible breaking of infinite loops as implemented in Perl
885 5.005. */
886
887 if (*ecode == OP_KET || eptr == saved_eptr)
888 {
889 ecode += 1+LINK_SIZE;
890 break;
891 }
892
893 /* The repeating kets try the rest of the pattern or restart from the
894 preceding bracket, in the appropriate order. The second "call" of match()
895 uses tail recursion, to avoid using another stack frame. */
896
897 if (*ecode == OP_KETRMIN)
898 {
899 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
901 ecode = prev;
902 goto TAIL_RECURSE;
903 }
904 else /* OP_KETRMAX */
905 {
906 md->match_function_type = MATCH_CBEGROUP;
907 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
909 ecode += 1 + LINK_SIZE;
910 goto TAIL_RECURSE;
911 }
912 /* Control never gets here */
913
914 /* Handle a capturing bracket, other than those that are possessive with an
915 unlimited repeat. If there is space in the offset vector, save the current
916 subject position in the working slot at the top of the vector. We mustn't
917 change the current values of the data slot, because they may be set from a
918 previous iteration of this group, and be referred to by a reference inside
919 the group. A failure to match might occur after the group has succeeded,
920 if something later on doesn't match. For this reason, we need to restore
921 the working value and also the values of the final offsets, in case they
922 were set by a previous iteration of the same bracket.
923
924 If there isn't enough space in the offset vector, treat this as if it were
925 a non-capturing bracket. Don't worry about setting the flag for the error
926 case here; that is handled in the code for KET. */
927
928 case OP_CBRA:
929 case OP_SCBRA:
930 number = GET2(ecode, 1+LINK_SIZE);
931 offset = number << 1;
932
933 #ifdef PCRE_DEBUG
934 printf("start bracket %d\n", number);
935 printf("subject=");
936 pchars(eptr, 16, TRUE, md);
937 printf("\n");
938 #endif
939
940 if (offset < md->offset_max)
941 {
942 save_offset1 = md->offset_vector[offset];
943 save_offset2 = md->offset_vector[offset+1];
944 save_offset3 = md->offset_vector[md->offset_end - number];
945 save_capture_last = md->capture_last;
946 save_mark = md->mark;
947
948 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
949 md->offset_vector[md->offset_end - number] =
950 (int)(eptr - md->start_subject);
951
952 for (;;)
953 {
954 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
955 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
956 eptrb, RM1);
957 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
958
959 /* If we backed up to a THEN, check whether it is within the current
960 branch by comparing the address of the THEN that is passed back with
961 the end of the branch. If it is within the current branch, and the
962 branch is one of two or more alternatives (it either starts or ends
963 with OP_ALT), we have reached the limit of THEN's action, so convert
964 the return code to NOMATCH, which will cause normal backtracking to
965 happen from now on. Otherwise, THEN is passed back to an outer
966 alternative. This implements Perl's treatment of parenthesized groups,
967 where a group not containing | does not affect the current alternative,
968 that is, (X) is NOT the same as (X|(*F)). */
969
970 if (rrc == MATCH_THEN)
971 {
972 next = ecode + GET(ecode,1);
973 if (md->start_match_ptr < next &&
974 (*ecode == OP_ALT || *next == OP_ALT))
975 rrc = MATCH_NOMATCH;
976 }
977
978 /* Anything other than NOMATCH is passed back. */
979
980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 md->mark = save_mark;
984 if (*ecode != OP_ALT) break;
985 }
986
987 DPRINTF(("bracket %d failed\n", number));
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991
992 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
993
994 RRETURN(rrc);
995 }
996
997 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
998 as a non-capturing bracket. */
999
1000 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1001 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1002
1003 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1004
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007
1008 /* Non-capturing or atomic group, except for possessive with unlimited
1009 repeat and ONCE group with no captures. Loop for all the alternatives.
1010
1011 When we get to the final alternative within the brackets, we used to return
1012 the result of a recursive call to match() whatever happened so it was
1013 possible to reduce stack usage by turning this into a tail recursion,
1014 except in the case of a possibly empty group. However, now that there is
1015 the possibility of (*THEN) occurring in the final alternative, this
1016 optimization is no longer always possible.
1017
1018 We can optimize if we know there are no (*THEN)s in the pattern; at present
1019 this is the best that can be done.
1020
1021 MATCH_ONCE is returned when the end of an atomic group is successfully
1022 reached, but subsequent matching fails. It passes back up the tree (causing
1023 captured values to be reset) until the original atomic group level is
1024 reached. This is tested by comparing md->once_target with the start of the
1025 group. At this point, the return is converted into MATCH_NOMATCH so that
1026 previous backup points can be taken. */
1027
1028 case OP_ONCE:
1029 case OP_BRA:
1030 case OP_SBRA:
1031 DPRINTF(("start non-capturing bracket\n"));
1032
1033 for (;;)
1034 {
1035 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1036
1037 /* If this is not a possibly empty group, and there are no (*THEN)s in
1038 the pattern, and this is the final alternative, optimize as described
1039 above. */
1040
1041 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1042 {
1043 ecode += PRIV(OP_lengths)[*ecode];
1044 goto TAIL_RECURSE;
1045 }
1046
1047 /* In all other cases, we have to make another call to match(). */
1048
1049 save_mark = md->mark;
1050 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1051 RM2);
1052
1053 /* See comment in the code for capturing groups above about handling
1054 THEN. */
1055
1056 if (rrc == MATCH_THEN)
1057 {
1058 next = ecode + GET(ecode,1);
1059 if (md->start_match_ptr < next &&
1060 (*ecode == OP_ALT || *next == OP_ALT))
1061 rrc = MATCH_NOMATCH;
1062 }
1063
1064 if (rrc != MATCH_NOMATCH)
1065 {
1066 if (rrc == MATCH_ONCE)
1067 {
1068 const pcre_uchar *scode = ecode;
1069 if (*scode != OP_ONCE) /* If not at start, find it */
1070 {
1071 while (*scode == OP_ALT) scode += GET(scode, 1);
1072 scode -= GET(scode, 1);
1073 }
1074 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1075 }
1076 RRETURN(rrc);
1077 }
1078 ecode += GET(ecode, 1);
1079 md->mark = save_mark;
1080 if (*ecode != OP_ALT) break;
1081 }
1082
1083 RRETURN(MATCH_NOMATCH);
1084
1085 /* Handle possessive capturing brackets with an unlimited repeat. We come
1086 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1087 handled similarly to the normal case above. However, the matching is
1088 different. The end of these brackets will always be OP_KETRPOS, which
1089 returns MATCH_KETRPOS without going further in the pattern. By this means
1090 we can handle the group by iteration rather than recursion, thereby
1091 reducing the amount of stack needed. */
1092
1093 case OP_CBRAPOS:
1094 case OP_SCBRAPOS:
1095 allow_zero = FALSE;
1096
1097 POSSESSIVE_CAPTURE:
1098 number = GET2(ecode, 1+LINK_SIZE);
1099 offset = number << 1;
1100
1101 #ifdef PCRE_DEBUG
1102 printf("start possessive bracket %d\n", number);
1103 printf("subject=");
1104 pchars(eptr, 16, TRUE, md);
1105 printf("\n");
1106 #endif
1107
1108 if (offset < md->offset_max)
1109 {
1110 matched_once = FALSE;
1111 code_offset = (int)(ecode - md->start_code);
1112
1113 save_offset1 = md->offset_vector[offset];
1114 save_offset2 = md->offset_vector[offset+1];
1115 save_offset3 = md->offset_vector[md->offset_end - number];
1116 save_capture_last = md->capture_last;
1117
1118 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1119
1120 /* Each time round the loop, save the current subject position for use
1121 when the group matches. For MATCH_MATCH, the group has matched, so we
1122 restart it with a new subject starting position, remembering that we had
1123 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1124 usual. If we haven't matched any alternatives in any iteration, check to
1125 see if a previous iteration matched. If so, the group has matched;
1126 continue from afterwards. Otherwise it has failed; restore the previous
1127 capture values before returning NOMATCH. */
1128
1129 for (;;)
1130 {
1131 md->offset_vector[md->offset_end - number] =
1132 (int)(eptr - md->start_subject);
1133 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1134 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1135 eptrb, RM63);
1136 if (rrc == MATCH_KETRPOS)
1137 {
1138 offset_top = md->end_offset_top;
1139 eptr = md->end_match_ptr;
1140 ecode = md->start_code + code_offset;
1141 save_capture_last = md->capture_last;
1142 matched_once = TRUE;
1143 continue;
1144 }
1145
1146 /* See comment in the code for capturing groups above about handling
1147 THEN. */
1148
1149 if (rrc == MATCH_THEN)
1150 {
1151 next = ecode + GET(ecode,1);
1152 if (md->start_match_ptr < next &&
1153 (*ecode == OP_ALT || *next == OP_ALT))
1154 rrc = MATCH_NOMATCH;
1155 }
1156
1157 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1158 md->capture_last = save_capture_last;
1159 ecode += GET(ecode, 1);
1160 if (*ecode != OP_ALT) break;
1161 }
1162
1163 if (!matched_once)
1164 {
1165 md->offset_vector[offset] = save_offset1;
1166 md->offset_vector[offset+1] = save_offset2;
1167 md->offset_vector[md->offset_end - number] = save_offset3;
1168 }
1169
1170 if (allow_zero || matched_once)
1171 {
1172 ecode += 1 + LINK_SIZE;
1173 break;
1174 }
1175
1176 RRETURN(MATCH_NOMATCH);
1177 }
1178
1179 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1180 as a non-capturing bracket. */
1181
1182 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1183 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1184
1185 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1186
1187 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1189
1190 /* Non-capturing possessive bracket with unlimited repeat. We come here
1191 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1192 without the capturing complication. It is written out separately for speed
1193 and cleanliness. */
1194
1195 case OP_BRAPOS:
1196 case OP_SBRAPOS:
1197 allow_zero = FALSE;
1198
1199 POSSESSIVE_NON_CAPTURE:
1200 matched_once = FALSE;
1201 code_offset = (int)(ecode - md->start_code);
1202
1203 for (;;)
1204 {
1205 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1206 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1207 eptrb, RM48);
1208 if (rrc == MATCH_KETRPOS)
1209 {
1210 offset_top = md->end_offset_top;
1211 eptr = md->end_match_ptr;
1212 ecode = md->start_code + code_offset;
1213 matched_once = TRUE;
1214 continue;
1215 }
1216
1217 /* See comment in the code for capturing groups above about handling
1218 THEN. */
1219
1220 if (rrc == MATCH_THEN)
1221 {
1222 next = ecode + GET(ecode,1);
1223 if (md->start_match_ptr < next &&
1224 (*ecode == OP_ALT || *next == OP_ALT))
1225 rrc = MATCH_NOMATCH;
1226 }
1227
1228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1229 ecode += GET(ecode, 1);
1230 if (*ecode != OP_ALT) break;
1231 }
1232
1233 if (matched_once || allow_zero)
1234 {
1235 ecode += 1 + LINK_SIZE;
1236 break;
1237 }
1238 RRETURN(MATCH_NOMATCH);
1239
1240 /* Control never reaches here. */
1241
1242 /* Conditional group: compilation checked that there are no more than
1243 two branches. If the condition is false, skipping the first branch takes us
1244 past the end if there is only one branch, but that's OK because that is
1245 exactly what going to the ket would do. */
1246
1247 case OP_COND:
1248 case OP_SCOND:
1249 codelink = GET(ecode, 1);
1250
1251 /* Because of the way auto-callout works during compile, a callout item is
1252 inserted between OP_COND and an assertion condition. */
1253
1254 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1255 {
1256 if (PUBL(callout) != NULL)
1257 {
1258 PUBL(callout_block) cb;
1259 cb.version = 2; /* Version 2 of the callout block */
1260 cb.callout_number = ecode[LINK_SIZE+2];
1261 cb.offset_vector = md->offset_vector;
1262 #ifdef COMPILE_PCRE8
1263 cb.subject = (PCRE_SPTR)md->start_subject;
1264 #else
1265 cb.subject = (PCRE_SPTR16)md->start_subject;
1266 #endif
1267 cb.subject_length = (int)(md->end_subject - md->start_subject);
1268 cb.start_match = (int)(mstart - md->start_subject);
1269 cb.current_position = (int)(eptr - md->start_subject);
1270 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1271 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1272 cb.capture_top = offset_top/2;
1273 cb.capture_last = md->capture_last;
1274 cb.callout_data = md->callout_data;
1275 cb.mark = md->nomatch_mark;
1276 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1277 if (rrc < 0) RRETURN(rrc);
1278 }
1279 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1280 }
1281
1282 condcode = ecode[LINK_SIZE+1];
1283
1284 /* Now see what the actual condition is */
1285
1286 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1287 {
1288 if (md->recursive == NULL) /* Not recursing => FALSE */
1289 {
1290 condition = FALSE;
1291 ecode += GET(ecode, 1);
1292 }
1293 else
1294 {
1295 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1296 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1297
1298 /* If the test is for recursion into a specific subpattern, and it is
1299 false, but the test was set up by name, scan the table to see if the
1300 name refers to any other numbers, and test them. The condition is true
1301 if any one is set. */
1302
1303 if (!condition && condcode == OP_NRREF)
1304 {
1305 pcre_uchar *slotA = md->name_table;
1306 for (i = 0; i < md->name_count; i++)
1307 {
1308 if (GET2(slotA, 0) == recno) break;
1309 slotA += md->name_entry_size;
1310 }
1311
1312 /* Found a name for the number - there can be only one; duplicate
1313 names for different numbers are allowed, but not vice versa. First
1314 scan down for duplicates. */
1315
1316 if (i < md->name_count)
1317 {
1318 pcre_uchar *slotB = slotA;
1319 while (slotB > md->name_table)
1320 {
1321 slotB -= md->name_entry_size;
1322 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1323 {
1324 condition = GET2(slotB, 0) == md->recursive->group_num;
1325 if (condition) break;
1326 }
1327 else break;
1328 }
1329
1330 /* Scan up for duplicates */
1331
1332 if (!condition)
1333 {
1334 slotB = slotA;
1335 for (i++; i < md->name_count; i++)
1336 {
1337 slotB += md->name_entry_size;
1338 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1339 {
1340 condition = GET2(slotB, 0) == md->recursive->group_num;
1341 if (condition) break;
1342 }
1343 else break;
1344 }
1345 }
1346 }
1347 }
1348
1349 /* Choose branch according to the condition */
1350
1351 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1352 }
1353 }
1354
1355 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1356 {
1357 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1358 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1359
1360 /* If the numbered capture is unset, but the reference was by name,
1361 scan the table to see if the name refers to any other numbers, and test
1362 them. The condition is true if any one is set. This is tediously similar
1363 to the code above, but not close enough to try to amalgamate. */
1364
1365 if (!condition && condcode == OP_NCREF)
1366 {
1367 int refno = offset >> 1;
1368 pcre_uchar *slotA = md->name_table;
1369
1370 for (i = 0; i < md->name_count; i++)
1371 {
1372 if (GET2(slotA, 0) == refno) break;
1373 slotA += md->name_entry_size;
1374 }
1375
1376 /* Found a name for the number - there can be only one; duplicate names
1377 for different numbers are allowed, but not vice versa. First scan down
1378 for duplicates. */
1379
1380 if (i < md->name_count)
1381 {
1382 pcre_uchar *slotB = slotA;
1383 while (slotB > md->name_table)
1384 {
1385 slotB -= md->name_entry_size;
1386 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1387 {
1388 offset = GET2(slotB, 0) << 1;
1389 condition = offset < offset_top &&
1390 md->offset_vector[offset] >= 0;
1391 if (condition) break;
1392 }
1393 else break;
1394 }
1395
1396 /* Scan up for duplicates */
1397
1398 if (!condition)
1399 {
1400 slotB = slotA;
1401 for (i++; i < md->name_count; i++)
1402 {
1403 slotB += md->name_entry_size;
1404 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1405 {
1406 offset = GET2(slotB, 0) << 1;
1407 condition = offset < offset_top &&
1408 md->offset_vector[offset] >= 0;
1409 if (condition) break;
1410 }
1411 else break;
1412 }
1413 }
1414 }
1415 }
1416
1417 /* Choose branch according to the condition */
1418
1419 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1420 }
1421
1422 else if (condcode == OP_DEF) /* DEFINE - always false */
1423 {
1424 condition = FALSE;
1425 ecode += GET(ecode, 1);
1426 }
1427
1428 /* The condition is an assertion. Call match() to evaluate it - setting
1429 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1430 an assertion. */
1431
1432 else
1433 {
1434 md->match_function_type = MATCH_CONDASSERT;
1435 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1436 if (rrc == MATCH_MATCH)
1437 {
1438 if (md->end_offset_top > offset_top)
1439 offset_top = md->end_offset_top; /* Captures may have happened */
1440 condition = TRUE;
1441 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1442 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1443 }
1444
1445 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1446 assertion; it is therefore treated as NOMATCH. */
1447
1448 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1449 {
1450 RRETURN(rrc); /* Need braces because of following else */
1451 }
1452 else
1453 {
1454 condition = FALSE;
1455 ecode += codelink;
1456 }
1457 }
1458
1459 /* We are now at the branch that is to be obeyed. As there is only one, can
1460 use tail recursion to avoid using another stack frame, except when there is
1461 unlimited repeat of a possibly empty group. In the latter case, a recursive
1462 call to match() is always required, unless the second alternative doesn't
1463 exist, in which case we can just plough on. Note that, for compatibility
1464 with Perl, the | in a conditional group is NOT treated as creating two
1465 alternatives. If a THEN is encountered in the branch, it propagates out to
1466 the enclosing alternative (unless nested in a deeper set of alternatives,
1467 of course). */
1468
1469 if (condition || *ecode == OP_ALT)
1470 {
1471 if (op != OP_SCOND)
1472 {
1473 ecode += 1 + LINK_SIZE;
1474 goto TAIL_RECURSE;
1475 }
1476
1477 md->match_function_type = MATCH_CBEGROUP;
1478 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1479 RRETURN(rrc);
1480 }
1481
1482 /* Condition false & no alternative; continue after the group. */
1483
1484 else
1485 {
1486 ecode += 1 + LINK_SIZE;
1487 }
1488 break;
1489
1490
1491 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1492 to close any currently open capturing brackets. */
1493
1494 case OP_CLOSE:
1495 number = GET2(ecode, 1);
1496 offset = number << 1;
1497
1498 #ifdef PCRE_DEBUG
1499 printf("end bracket %d at *ACCEPT", number);
1500 printf("\n");
1501 #endif
1502
1503 md->capture_last = number;
1504 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1505 {
1506 md->offset_vector[offset] =
1507 md->offset_vector[md->offset_end - number];
1508 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1509 if (offset_top <= offset) offset_top = offset + 2;
1510 }
1511 ecode += 1 + IMM2_SIZE;
1512 break;
1513
1514
1515 /* End of the pattern, either real or forced. */
1516
1517 case OP_END:
1518 case OP_ACCEPT:
1519 case OP_ASSERT_ACCEPT:
1520
1521 /* If we have matched an empty string, fail if not in an assertion and not
1522 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1523 is set and we have matched at the start of the subject. In both cases,
1524 backtracking will then try other alternatives, if any. */
1525
1526 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1527 md->recursive == NULL &&
1528 (md->notempty ||
1529 (md->notempty_atstart &&
1530 mstart == md->start_subject + md->start_offset)))
1531 RRETURN(MATCH_NOMATCH);
1532
1533 /* Otherwise, we have a match. */
1534
1535 md->end_match_ptr = eptr; /* Record where we ended */
1536 md->end_offset_top = offset_top; /* and how many extracts were taken */
1537 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1538
1539 /* For some reason, the macros don't work properly if an expression is
1540 given as the argument to RRETURN when the heap is in use. */
1541
1542 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1543 RRETURN(rrc);
1544
1545 /* Assertion brackets. Check the alternative branches in turn - the
1546 matching won't pass the KET for an assertion. If any one branch matches,
1547 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1548 start of each branch to move the current point backwards, so the code at
1549 this level is identical to the lookahead case. When the assertion is part
1550 of a condition, we want to return immediately afterwards. The caller of
1551 this incarnation of the match() function will have set MATCH_CONDASSERT in
1552 md->match_function_type, and one of these opcodes will be the first opcode
1553 that is processed. We use a local variable that is preserved over calls to
1554 match() to remember this case. */
1555
1556 case OP_ASSERT:
1557 case OP_ASSERTBACK:
1558 save_mark = md->mark;
1559 if (md->match_function_type == MATCH_CONDASSERT)
1560 {
1561 condassert = TRUE;
1562 md->match_function_type = 0;
1563 }
1564 else condassert = FALSE;
1565
1566 do
1567 {
1568 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1569 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1570 {
1571 mstart = md->start_match_ptr; /* In case \K reset it */
1572 break;
1573 }
1574
1575 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1576 as NOMATCH. */
1577
1578 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1579 ecode += GET(ecode, 1);
1580 md->mark = save_mark;
1581 }
1582 while (*ecode == OP_ALT);
1583
1584 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1585
1586 /* If checking an assertion for a condition, return MATCH_MATCH. */
1587
1588 if (condassert) RRETURN(MATCH_MATCH);
1589
1590 /* Continue from after the assertion, updating the offsets high water
1591 mark, since extracts may have been taken during the assertion. */
1592
1593 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1594 ecode += 1 + LINK_SIZE;
1595 offset_top = md->end_offset_top;
1596 continue;
1597
1598 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1599 PRUNE, or COMMIT means we must assume failure without checking subsequent
1600 branches. */
1601
1602 case OP_ASSERT_NOT:
1603 case OP_ASSERTBACK_NOT:
1604 save_mark = md->mark;
1605 if (md->match_function_type == MATCH_CONDASSERT)
1606 {
1607 condassert = TRUE;
1608 md->match_function_type = 0;
1609 }
1610 else condassert = FALSE;
1611
1612 do
1613 {
1614 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1615 md->mark = save_mark;
1616 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1617 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1618 {
1619 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1620 break;
1621 }
1622
1623 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1624 as NOMATCH. */
1625
1626 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1627 ecode += GET(ecode,1);
1628 }
1629 while (*ecode == OP_ALT);
1630
1631 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1632
1633 ecode += 1 + LINK_SIZE;
1634 continue;
1635
1636 /* Move the subject pointer back. This occurs only at the start of
1637 each branch of a lookbehind assertion. If we are too close to the start to
1638 move back, this match function fails. When working with UTF-8 we move
1639 back a number of characters, not bytes. */
1640
1641 case OP_REVERSE:
1642 #ifdef SUPPORT_UTF
1643 if (utf)
1644 {
1645 i = GET(ecode, 1);
1646 while (i-- > 0)
1647 {
1648 eptr--;
1649 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1650 BACKCHAR(eptr);
1651 }
1652 }
1653 else
1654 #endif
1655
1656 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1657
1658 {
1659 eptr -= GET(ecode, 1);
1660 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1661 }
1662
1663 /* Save the earliest consulted character, then skip to next op code */
1664
1665 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1666 ecode += 1 + LINK_SIZE;
1667 break;
1668
1669 /* The callout item calls an external function, if one is provided, passing
1670 details of the match so far. This is mainly for debugging, though the
1671 function is able to force a failure. */
1672
1673 case OP_CALLOUT:
1674 if (PUBL(callout) != NULL)
1675 {
1676 PUBL(callout_block) cb;
1677 cb.version = 2; /* Version 2 of the callout block */
1678 cb.callout_number = ecode[1];
1679 cb.offset_vector = md->offset_vector;
1680 #ifdef COMPILE_PCRE8
1681 cb.subject = (PCRE_SPTR)md->start_subject;
1682 #else
1683 cb.subject = (PCRE_SPTR16)md->start_subject;
1684 #endif
1685 cb.subject_length = (int)(md->end_subject - md->start_subject);
1686 cb.start_match = (int)(mstart - md->start_subject);
1687 cb.current_position = (int)(eptr - md->start_subject);
1688 cb.pattern_position = GET(ecode, 2);
1689 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1690 cb.capture_top = offset_top/2;
1691 cb.capture_last = md->capture_last;
1692 cb.callout_data = md->callout_data;
1693 cb.mark = md->nomatch_mark;
1694 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1695 if (rrc < 0) RRETURN(rrc);
1696 }
1697 ecode += 2 + 2*LINK_SIZE;
1698 break;
1699
1700 /* Recursion either matches the current regex, or some subexpression. The
1701 offset data is the offset to the starting bracket from the start of the
1702 whole pattern. (This is so that it works from duplicated subpatterns.)
1703
1704 The state of the capturing groups is preserved over recursion, and
1705 re-instated afterwards. We don't know how many are started and not yet
1706 finished (offset_top records the completed total) so we just have to save
1707 all the potential data. There may be up to 65535 such values, which is too
1708 large to put on the stack, but using malloc for small numbers seems
1709 expensive. As a compromise, the stack is used when there are no more than
1710 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1711
1712 There are also other values that have to be saved. We use a chained
1713 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1714 for the original version of this logic. It has, however, been hacked around
1715 a lot, so he is not to blame for the current way it works. */
1716
1717 case OP_RECURSE:
1718 {
1719 recursion_info *ri;
1720 int recno;
1721
1722 callpat = md->start_code + GET(ecode, 1);
1723 recno = (callpat == md->start_code)? 0 :
1724 GET2(callpat, 1 + LINK_SIZE);
1725
1726 /* Check for repeating a recursion without advancing the subject pointer.
1727 This should catch convoluted mutual recursions. (Some simple cases are
1728 caught at compile time.) */
1729
1730 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1731 if (recno == ri->group_num && eptr == ri->subject_position)
1732 RRETURN(PCRE_ERROR_RECURSELOOP);
1733
1734 /* Add to "recursing stack" */
1735
1736 new_recursive.group_num = recno;
1737 new_recursive.subject_position = eptr;
1738 new_recursive.prevrec = md->recursive;
1739 md->recursive = &new_recursive;
1740
1741 /* Where to continue from afterwards */
1742
1743 ecode += 1 + LINK_SIZE;
1744
1745 /* Now save the offset data */
1746
1747 new_recursive.saved_max = md->offset_end;
1748 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1749 new_recursive.offset_save = stacksave;
1750 else
1751 {
1752 new_recursive.offset_save =
1753 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1754 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1755 }
1756 memcpy(new_recursive.offset_save, md->offset_vector,
1757 new_recursive.saved_max * sizeof(int));
1758
1759 /* OK, now we can do the recursion. After processing each alternative,
1760 restore the offset data. If there were nested recursions, md->recursive
1761 might be changed, so reset it before looping. */
1762
1763 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1764 cbegroup = (*callpat >= OP_SBRA);
1765 do
1766 {
1767 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1768 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1769 md, eptrb, RM6);
1770 memcpy(md->offset_vector, new_recursive.offset_save,
1771 new_recursive.saved_max * sizeof(int));
1772 md->recursive = new_recursive.prevrec;
1773 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1774 {
1775 DPRINTF(("Recursion matched\n"));
1776 if (new_recursive.offset_save != stacksave)
1777 (PUBL(free))(new_recursive.offset_save);
1778
1779 /* Set where we got to in the subject, and reset the start in case
1780 it was changed by \K. This *is* propagated back out of a recursion,
1781 for Perl compatibility. */
1782
1783 eptr = md->end_match_ptr;
1784 mstart = md->start_match_ptr;
1785 goto RECURSION_MATCHED; /* Exit loop; end processing */
1786 }
1787
1788 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1789 as NOMATCH. */
1790
1791 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1792 {
1793 DPRINTF(("Recursion gave error %d\n", rrc));
1794 if (new_recursive.offset_save != stacksave)
1795 (PUBL(free))(new_recursive.offset_save);
1796 RRETURN(rrc);
1797 }
1798
1799 md->recursive = &new_recursive;
1800 callpat += GET(callpat, 1);
1801 }
1802 while (*callpat == OP_ALT);
1803
1804 DPRINTF(("Recursion didn't match\n"));
1805 md->recursive = new_recursive.prevrec;
1806 if (new_recursive.offset_save != stacksave)
1807 (PUBL(free))(new_recursive.offset_save);
1808 RRETURN(MATCH_NOMATCH);
1809 }
1810
1811 RECURSION_MATCHED:
1812 break;
1813
1814 /* An alternation is the end of a branch; scan along to find the end of the
1815 bracketed group and go to there. */
1816
1817 case OP_ALT:
1818 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1819 break;
1820
1821 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1822 indicating that it may occur zero times. It may repeat infinitely, or not
1823 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1824 with fixed upper repeat limits are compiled as a number of copies, with the
1825 optional ones preceded by BRAZERO or BRAMINZERO. */
1826
1827 case OP_BRAZERO:
1828 next = ecode + 1;
1829 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1831 do next += GET(next, 1); while (*next == OP_ALT);
1832 ecode = next + 1 + LINK_SIZE;
1833 break;
1834
1835 case OP_BRAMINZERO:
1836 next = ecode + 1;
1837 do next += GET(next, 1); while (*next == OP_ALT);
1838 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840 ecode++;
1841 break;
1842
1843 case OP_SKIPZERO:
1844 next = ecode+1;
1845 do next += GET(next,1); while (*next == OP_ALT);
1846 ecode = next + 1 + LINK_SIZE;
1847 break;
1848
1849 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1850 here; just jump to the group, with allow_zero set TRUE. */
1851
1852 case OP_BRAPOSZERO:
1853 op = *(++ecode);
1854 allow_zero = TRUE;
1855 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1856 goto POSSESSIVE_NON_CAPTURE;
1857
1858 /* End of a group, repeated or non-repeating. */
1859
1860 case OP_KET:
1861 case OP_KETRMIN:
1862 case OP_KETRMAX:
1863 case OP_KETRPOS:
1864 prev = ecode - GET(ecode, 1);
1865
1866 /* If this was a group that remembered the subject start, in order to break
1867 infinite repeats of empty string matches, retrieve the subject start from
1868 the chain. Otherwise, set it NULL. */
1869
1870 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1871 {
1872 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1873 eptrb = eptrb->epb_prev; /* Backup to previous group */
1874 }
1875 else saved_eptr = NULL;
1876
1877 /* If we are at the end of an assertion group or a non-capturing atomic
1878 group, stop matching and return MATCH_MATCH, but record the current high
1879 water mark for use by positive assertions. We also need to record the match
1880 start in case it was changed by \K. */
1881
1882 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1883 *prev == OP_ONCE_NC)
1884 {
1885 md->end_match_ptr = eptr; /* For ONCE_NC */
1886 md->end_offset_top = offset_top;
1887 md->start_match_ptr = mstart;
1888 RRETURN(MATCH_MATCH); /* Sets md->mark */
1889 }
1890
1891 /* For capturing groups we have to check the group number back at the start
1892 and if necessary complete handling an extraction by setting the offsets and
1893 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1894 into group 0, so it won't be picked up here. Instead, we catch it when the
1895 OP_END is reached. Other recursion is handled here. We just have to record
1896 the current subject position and start match pointer and give a MATCH
1897 return. */
1898
1899 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1900 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1901 {
1902 number = GET2(prev, 1+LINK_SIZE);
1903 offset = number << 1;
1904
1905 #ifdef PCRE_DEBUG
1906 printf("end bracket %d", number);
1907 printf("\n");
1908 #endif
1909
1910 /* Handle a recursively called group. */
1911
1912 if (md->recursive != NULL && md->recursive->group_num == number)
1913 {
1914 md->end_match_ptr = eptr;
1915 md->start_match_ptr = mstart;
1916 RRETURN(MATCH_MATCH);
1917 }
1918
1919 /* Deal with capturing */
1920
1921 md->capture_last = number;
1922 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1923 {
1924 /* If offset is greater than offset_top, it means that we are
1925 "skipping" a capturing group, and that group's offsets must be marked
1926 unset. In earlier versions of PCRE, all the offsets were unset at the
1927 start of matching, but this doesn't work because atomic groups and
1928 assertions can cause a value to be set that should later be unset.
1929 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1930 part of the atomic group, but this is not on the final matching path,
1931 so must be unset when 2 is set. (If there is no group 2, there is no
1932 problem, because offset_top will then be 2, indicating no capture.) */
1933
1934 if (offset > offset_top)
1935 {
1936 register int *iptr = md->offset_vector + offset_top;
1937 register int *iend = md->offset_vector + offset;
1938 while (iptr < iend) *iptr++ = -1;
1939 }
1940
1941 /* Now make the extraction */
1942
1943 md->offset_vector[offset] =
1944 md->offset_vector[md->offset_end - number];
1945 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1946 if (offset_top <= offset) offset_top = offset + 2;
1947 }
1948 }
1949
1950 /* For an ordinary non-repeating ket, just continue at this level. This
1951 also happens for a repeating ket if no characters were matched in the
1952 group. This is the forcible breaking of infinite loops as implemented in
1953 Perl 5.005. For a non-repeating atomic group that includes captures,
1954 establish a backup point by processing the rest of the pattern at a lower
1955 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1956 original OP_ONCE level, thereby bypassing intermediate backup points, but
1957 resetting any captures that happened along the way. */
1958
1959 if (*ecode == OP_KET || eptr == saved_eptr)
1960 {
1961 if (*prev == OP_ONCE)
1962 {
1963 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1965 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1966 RRETURN(MATCH_ONCE);
1967 }
1968 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1969 break;
1970 }
1971
1972 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1973 and return MATCH_KETRPOS. This makes it possible to do the repeats one
1974 at a time from the outer level, thus saving stack. */
1975
1976 if (*ecode == OP_KETRPOS)
1977 {
1978 md->end_match_ptr = eptr;
1979 md->end_offset_top = offset_top;
1980 RRETURN(MATCH_KETRPOS);
1981 }
1982
1983 /* The normal repeating kets try the rest of the pattern or restart from
1984 the preceding bracket, in the appropriate order. In the second case, we can
1985 use tail recursion to avoid using another stack frame, unless we have
1986 an atomic group or an unlimited repeat of a group that can match an empty
1987 string. */
1988
1989 if (*ecode == OP_KETRMIN)
1990 {
1991 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1992 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1993 if (*prev == OP_ONCE)
1994 {
1995 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1998 RRETURN(MATCH_ONCE);
1999 }
2000 if (*prev >= OP_SBRA) /* Could match an empty string */
2001 {
2002 md->match_function_type = MATCH_CBEGROUP;
2003 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2004 RRETURN(rrc);
2005 }
2006 ecode = prev;
2007 goto TAIL_RECURSE;
2008 }
2009 else /* OP_KETRMAX */
2010 {
2011 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2012 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2013 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2015 if (*prev == OP_ONCE)
2016 {
2017 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 md->once_target = prev;
2020 RRETURN(MATCH_ONCE);
2021 }
2022 ecode += 1 + LINK_SIZE;
2023 goto TAIL_RECURSE;
2024 }
2025 /* Control never gets here */
2026
2027 /* Not multiline mode: start of subject assertion, unless notbol. */
2028
2029 case OP_CIRC:
2030 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2031
2032 /* Start of subject assertion */
2033
2034 case OP_SOD:
2035 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2036 ecode++;
2037 break;
2038
2039 /* Multiline mode: start of subject unless notbol, or after any newline. */
2040
2041 case OP_CIRCM:
2042 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2043 if (eptr != md->start_subject &&
2044 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2045 RRETURN(MATCH_NOMATCH);
2046 ecode++;
2047 break;
2048
2049 /* Start of match assertion */
2050
2051 case OP_SOM:
2052 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2053 ecode++;
2054 break;
2055
2056 /* Reset the start of match point */
2057
2058 case OP_SET_SOM:
2059 mstart = eptr;
2060 ecode++;
2061 break;
2062
2063 /* Multiline mode: assert before any newline, or before end of subject
2064 unless noteol is set. */
2065
2066 case OP_DOLLM:
2067 if (eptr < md->end_subject)
2068 {
2069 if (!IS_NEWLINE(eptr))
2070 {
2071 if (md->partial != 0 &&
2072 eptr + 1 >= md->end_subject &&
2073 NLBLOCK->nltype == NLTYPE_FIXED &&
2074 NLBLOCK->nllen == 2 &&
2075 *eptr == NLBLOCK->nl[0])
2076 {
2077 md->hitend = TRUE;
2078 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2079 }
2080 RRETURN(MATCH_NOMATCH);
2081 }
2082 }
2083 else
2084 {
2085 if (md->noteol) RRETURN(MATCH_NOMATCH);
2086 SCHECK_PARTIAL();
2087 }
2088 ecode++;
2089 break;
2090
2091 /* Not multiline mode: assert before a terminating newline or before end of
2092 subject unless noteol is set. */
2093
2094 case OP_DOLL:
2095 if (md->noteol) RRETURN(MATCH_NOMATCH);
2096 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2097
2098 /* ... else fall through for endonly */
2099
2100 /* End of subject assertion (\z) */
2101
2102 case OP_EOD:
2103 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2104 SCHECK_PARTIAL();
2105 ecode++;
2106 break;
2107
2108 /* End of subject or ending \n assertion (\Z) */
2109
2110 case OP_EODN:
2111 ASSERT_NL_OR_EOS:
2112 if (eptr < md->end_subject &&
2113 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2114 {
2115 if (md->partial != 0 &&
2116 eptr + 1 >= md->end_subject &&
2117 NLBLOCK->nltype == NLTYPE_FIXED &&
2118 NLBLOCK->nllen == 2 &&
2119 *eptr == NLBLOCK->nl[0])
2120 {
2121 md->hitend = TRUE;
2122 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2123 }
2124 RRETURN(MATCH_NOMATCH);
2125 }
2126
2127 /* Either at end of string or \n before end. */
2128
2129 SCHECK_PARTIAL();
2130 ecode++;
2131 break;
2132
2133 /* Word boundary assertions */
2134
2135 case OP_NOT_WORD_BOUNDARY:
2136 case OP_WORD_BOUNDARY:
2137 {
2138
2139 /* Find out if the previous and current characters are "word" characters.
2140 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2141 be "non-word" characters. Remember the earliest consulted character for
2142 partial matching. */
2143
2144 #ifdef SUPPORT_UTF
2145 if (utf)
2146 {
2147 /* Get status of previous character */
2148
2149 if (eptr == md->start_subject) prev_is_word = FALSE; else
2150 {
2151 PCRE_PUCHAR lastptr = eptr - 1;
2152 BACKCHAR(lastptr);
2153 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2154 GETCHAR(c, lastptr);
2155 #ifdef SUPPORT_UCP
2156 if (md->use_ucp)
2157 {
2158 if (c == '_') prev_is_word = TRUE; else
2159 {
2160 int cat = UCD_CATEGORY(c);
2161 prev_is_word = (cat == ucp_L || cat == ucp_N);
2162 }
2163 }
2164 else
2165 #endif
2166 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2167 }
2168
2169 /* Get status of next character */
2170
2171 if (eptr >= md->end_subject)
2172 {
2173 SCHECK_PARTIAL();
2174 cur_is_word = FALSE;
2175 }
2176 else
2177 {
2178 GETCHAR(c, eptr);
2179 #ifdef SUPPORT_UCP
2180 if (md->use_ucp)
2181 {
2182 if (c == '_') cur_is_word = TRUE; else
2183 {
2184 int cat = UCD_CATEGORY(c);
2185 cur_is_word = (cat == ucp_L || cat == ucp_N);
2186 }
2187 }
2188 else
2189 #endif
2190 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2191 }
2192 }
2193 else
2194 #endif
2195
2196 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2197 consistency with the behaviour of \w we do use it in this case. */
2198
2199 {
2200 /* Get status of previous character */
2201
2202 if (eptr == md->start_subject) prev_is_word = FALSE; else
2203 {
2204 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2205 #ifdef SUPPORT_UCP
2206 if (md->use_ucp)
2207 {
2208 c = eptr[-1];
2209 if (c == '_') prev_is_word = TRUE; else
2210 {
2211 int cat = UCD_CATEGORY(c);
2212 prev_is_word = (cat == ucp_L || cat == ucp_N);
2213 }
2214 }
2215 else
2216 #endif
2217 prev_is_word = MAX_255(eptr[-1])
2218 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2219 }
2220
2221 /* Get status of next character */
2222
2223 if (eptr >= md->end_subject)
2224 {
2225 SCHECK_PARTIAL();
2226 cur_is_word = FALSE;
2227 }
2228 else
2229 #ifdef SUPPORT_UCP
2230 if (md->use_ucp)
2231 {
2232 c = *eptr;
2233 if (c == '_') cur_is_word = TRUE; else
2234 {
2235 int cat = UCD_CATEGORY(c);
2236 cur_is_word = (cat == ucp_L || cat == ucp_N);
2237 }
2238 }
2239 else
2240 #endif
2241 cur_is_word = MAX_255(*eptr)
2242 && ((md->ctypes[*eptr] & ctype_word) != 0);
2243 }
2244
2245 /* Now see if the situation is what we want */
2246
2247 if ((*ecode++ == OP_WORD_BOUNDARY)?
2248 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2249 RRETURN(MATCH_NOMATCH);
2250 }
2251 break;
2252
2253 /* Match any single character type except newline; have to take care with
2254 CRLF newlines and partial matching. */
2255
2256 case OP_ANY:
2257 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2258 if (md->partial != 0 &&
2259 eptr + 1 >= md->end_subject &&
2260 NLBLOCK->nltype == NLTYPE_FIXED &&
2261 NLBLOCK->nllen == 2 &&
2262 *eptr == NLBLOCK->nl[0])
2263 {
2264 md->hitend = TRUE;
2265 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2266 }
2267
2268 /* Fall through */
2269
2270 /* Match any single character whatsoever. */
2271
2272 case OP_ALLANY:
2273 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2274 { /* not be updated before SCHECK_PARTIAL. */
2275 SCHECK_PARTIAL();
2276 RRETURN(MATCH_NOMATCH);
2277 }
2278 eptr++;
2279 #ifdef SUPPORT_UTF
2280 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2281 #endif
2282 ecode++;
2283 break;
2284
2285 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2286 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2287
2288 case OP_ANYBYTE:
2289 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2290 { /* not be updated before SCHECK_PARTIAL. */
2291 SCHECK_PARTIAL();
2292 RRETURN(MATCH_NOMATCH);
2293 }
2294 eptr++;
2295 ecode++;
2296 break;
2297
2298 case OP_NOT_DIGIT:
2299 if (eptr >= md->end_subject)
2300 {
2301 SCHECK_PARTIAL();
2302 RRETURN(MATCH_NOMATCH);
2303 }
2304 GETCHARINCTEST(c, eptr);
2305 if (
2306 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2307 c < 256 &&
2308 #endif
2309 (md->ctypes[c] & ctype_digit) != 0
2310 )
2311 RRETURN(MATCH_NOMATCH);
2312 ecode++;
2313 break;
2314
2315 case OP_DIGIT:
2316 if (eptr >= md->end_subject)
2317 {
2318 SCHECK_PARTIAL();
2319 RRETURN(MATCH_NOMATCH);
2320 }
2321 GETCHARINCTEST(c, eptr);
2322 if (
2323 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2324 c > 255 ||
2325 #endif
2326 (md->ctypes[c] & ctype_digit) == 0
2327 )
2328 RRETURN(MATCH_NOMATCH);
2329 ecode++;
2330 break;
2331
2332 case OP_NOT_WHITESPACE:
2333 if (eptr >= md->end_subject)
2334 {
2335 SCHECK_PARTIAL();
2336 RRETURN(MATCH_NOMATCH);
2337 }
2338 GETCHARINCTEST(c, eptr);
2339 if (
2340 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2341 c < 256 &&
2342 #endif
2343 (md->ctypes[c] & ctype_space) != 0
2344 )
2345 RRETURN(MATCH_NOMATCH);
2346 ecode++;
2347 break;
2348
2349 case OP_WHITESPACE:
2350 if (eptr >= md->end_subject)
2351 {
2352 SCHECK_PARTIAL();
2353 RRETURN(MATCH_NOMATCH);
2354 }
2355 GETCHARINCTEST(c, eptr);
2356 if (
2357 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2358 c > 255 ||
2359 #endif
2360 (md->ctypes[c] & ctype_space) == 0
2361 )
2362 RRETURN(MATCH_NOMATCH);
2363 ecode++;
2364 break;
2365
2366 case OP_NOT_WORDCHAR:
2367 if (eptr >= md->end_subject)
2368 {
2369 SCHECK_PARTIAL();
2370 RRETURN(MATCH_NOMATCH);
2371 }
2372 GETCHARINCTEST(c, eptr);
2373 if (
2374 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2375 c < 256 &&
2376 #endif
2377 (md->ctypes[c] & ctype_word) != 0
2378 )
2379 RRETURN(MATCH_NOMATCH);
2380 ecode++;
2381 break;
2382
2383 case OP_WORDCHAR:
2384 if (eptr >= md->end_subject)
2385 {
2386 SCHECK_PARTIAL();
2387 RRETURN(MATCH_NOMATCH);
2388 }
2389 GETCHARINCTEST(c, eptr);
2390 if (
2391 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2392 c > 255 ||
2393 #endif
2394 (md->ctypes[c] & ctype_word) == 0
2395 )
2396 RRETURN(MATCH_NOMATCH);
2397 ecode++;
2398 break;
2399
2400 case OP_ANYNL:
2401 if (eptr >= md->end_subject)
2402 {
2403 SCHECK_PARTIAL();
2404 RRETURN(MATCH_NOMATCH);
2405 }
2406 GETCHARINCTEST(c, eptr);
2407 switch(c)
2408 {
2409 default: RRETURN(MATCH_NOMATCH);
2410
2411 case 0x000d:
2412 if (eptr >= md->end_subject)
2413 {
2414 SCHECK_PARTIAL();
2415 }
2416 else if (*eptr == 0x0a) eptr++;
2417 break;
2418
2419 case 0x000a:
2420 break;
2421
2422 case 0x000b:
2423 case 0x000c:
2424 case 0x0085:
2425 case 0x2028:
2426 case 0x2029:
2427 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2428 break;
2429 }
2430 ecode++;
2431 break;
2432
2433 case OP_NOT_HSPACE:
2434 if (eptr >= md->end_subject)
2435 {
2436 SCHECK_PARTIAL();
2437 RRETURN(MATCH_NOMATCH);
2438 }
2439 GETCHARINCTEST(c, eptr);
2440 switch(c)
2441 {
2442 default: break;
2443 case 0x09: /* HT */
2444 case 0x20: /* SPACE */
2445 case 0xa0: /* NBSP */
2446 case 0x1680: /* OGHAM SPACE MARK */
2447 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2448 case 0x2000: /* EN QUAD */
2449 case 0x2001: /* EM QUAD */
2450 case 0x2002: /* EN SPACE */
2451 case 0x2003: /* EM SPACE */
2452 case 0x2004: /* THREE-PER-EM SPACE */
2453 case 0x2005: /* FOUR-PER-EM SPACE */
2454 case 0x2006: /* SIX-PER-EM SPACE */
2455 case 0x2007: /* FIGURE SPACE */
2456 case 0x2008: /* PUNCTUATION SPACE */
2457 case 0x2009: /* THIN SPACE */
2458 case 0x200A: /* HAIR SPACE */
2459 case 0x202f: /* NARROW NO-BREAK SPACE */
2460 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2461 case 0x3000: /* IDEOGRAPHIC SPACE */
2462 RRETURN(MATCH_NOMATCH);
2463 }
2464 ecode++;
2465 break;
2466
2467 case OP_HSPACE:
2468 if (eptr >= md->end_subject)
2469 {
2470 SCHECK_PARTIAL();
2471 RRETURN(MATCH_NOMATCH);
2472 }
2473 GETCHARINCTEST(c, eptr);
2474 switch(c)
2475 {
2476 default: RRETURN(MATCH_NOMATCH);
2477 case 0x09: /* HT */
2478 case 0x20: /* SPACE */
2479 case 0xa0: /* NBSP */
2480 case 0x1680: /* OGHAM SPACE MARK */
2481 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2482 case 0x2000: /* EN QUAD */
2483 case 0x2001: /* EM QUAD */
2484 case 0x2002: /* EN SPACE */
2485 case 0x2003: /* EM SPACE */
2486 case 0x2004: /* THREE-PER-EM SPACE */
2487 case 0x2005: /* FOUR-PER-EM SPACE */
2488 case 0x2006: /* SIX-PER-EM SPACE */
2489 case 0x2007: /* FIGURE SPACE */
2490 case 0x2008: /* PUNCTUATION SPACE */
2491 case 0x2009: /* THIN SPACE */
2492 case 0x200A: /* HAIR SPACE */
2493 case 0x202f: /* NARROW NO-BREAK SPACE */
2494 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2495 case 0x3000: /* IDEOGRAPHIC SPACE */
2496 break;
2497 }
2498 ecode++;
2499 break;
2500
2501 case OP_NOT_VSPACE:
2502 if (eptr >= md->end_subject)
2503 {
2504 SCHECK_PARTIAL();
2505 RRETURN(MATCH_NOMATCH);
2506 }
2507 GETCHARINCTEST(c, eptr);
2508 switch(c)
2509 {
2510 default: break;
2511 case 0x0a: /* LF */
2512 case 0x0b: /* VT */
2513 case 0x0c: /* FF */
2514 case 0x0d: /* CR */
2515 case 0x85: /* NEL */
2516 case 0x2028: /* LINE SEPARATOR */
2517 case 0x2029: /* PARAGRAPH SEPARATOR */
2518 RRETURN(MATCH_NOMATCH);
2519 }
2520 ecode++;
2521 break;
2522
2523 case OP_VSPACE:
2524 if (eptr >= md->end_subject)
2525 {
2526 SCHECK_PARTIAL();
2527 RRETURN(MATCH_NOMATCH);
2528 }
2529 GETCHARINCTEST(c, eptr);
2530 switch(c)
2531 {
2532 default: RRETURN(MATCH_NOMATCH);
2533 case 0x0a: /* LF */
2534 case 0x0b: /* VT */
2535 case 0x0c: /* FF */
2536 case 0x0d: /* CR */
2537 case 0x85: /* NEL */
2538 case 0x2028: /* LINE SEPARATOR */
2539 case 0x2029: /* PARAGRAPH SEPARATOR */
2540 break;
2541 }
2542 ecode++;
2543 break;
2544
2545 #ifdef SUPPORT_UCP
2546 /* Check the next character by Unicode property. We will get here only
2547 if the support is in the binary; otherwise a compile-time error occurs. */
2548
2549 case OP_PROP:
2550 case OP_NOTPROP:
2551 if (eptr >= md->end_subject)
2552 {
2553 SCHECK_PARTIAL();
2554 RRETURN(MATCH_NOMATCH);
2555 }
2556 GETCHARINCTEST(c, eptr);
2557 {
2558 const ucd_record *prop = GET_UCD(c);
2559
2560 switch(ecode[1])
2561 {
2562 case PT_ANY:
2563 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2564 break;
2565
2566 case PT_LAMP:
2567 if ((prop->chartype == ucp_Lu ||
2568 prop->chartype == ucp_Ll ||
2569 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2570 RRETURN(MATCH_NOMATCH);
2571 break;
2572
2573 case PT_GC:
2574 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2575 RRETURN(MATCH_NOMATCH);
2576 break;
2577
2578 case PT_PC:
2579 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2580 RRETURN(MATCH_NOMATCH);
2581 break;
2582
2583 case PT_SC:
2584 if ((ecode[2] != prop->script) == (op == OP_PROP))
2585 RRETURN(MATCH_NOMATCH);
2586 break;
2587
2588 /* These are specials */
2589
2590 case PT_ALNUM:
2591 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2592 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2593 RRETURN(MATCH_NOMATCH);
2594 break;
2595
2596 case PT_SPACE: /* Perl space */
2597 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2598 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2599 == (op == OP_NOTPROP))
2600 RRETURN(MATCH_NOMATCH);
2601 break;
2602
2603 case PT_PXSPACE: /* POSIX space */
2604 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2605 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2606 c == CHAR_FF || c == CHAR_CR)
2607 == (op == OP_NOTPROP))
2608 RRETURN(MATCH_NOMATCH);
2609 break;
2610
2611 case PT_WORD:
2612 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2613 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2614 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2615 RRETURN(MATCH_NOMATCH);
2616 break;
2617
2618 /* This should never occur */
2619
2620 default:
2621 RRETURN(PCRE_ERROR_INTERNAL);
2622 }
2623
2624 ecode += 3;
2625 }
2626 break;
2627
2628 /* Match an extended Unicode sequence. We will get here only if the support
2629 is in the binary; otherwise a compile-time error occurs. */
2630
2631 case OP_EXTUNI:
2632 if (eptr >= md->end_subject)
2633 {
2634 SCHECK_PARTIAL();
2635 RRETURN(MATCH_NOMATCH);
2636 }
2637 GETCHARINCTEST(c, eptr);
2638 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2639 while (eptr < md->end_subject)
2640 {
2641 int len = 1;
2642 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2643 if (UCD_CATEGORY(c) != ucp_M) break;
2644 eptr += len;
2645 }
2646 CHECK_PARTIAL();
2647 ecode++;
2648 break;
2649 #endif
2650
2651
2652 /* Match a back reference, possibly repeatedly. Look past the end of the
2653 item to see if there is repeat information following. The code is similar
2654 to that for character classes, but repeated for efficiency. Then obey
2655 similar code to character type repeats - written out again for speed.
2656 However, if the referenced string is the empty string, always treat
2657 it as matched, any number of times (otherwise there could be infinite
2658 loops). */
2659
2660 case OP_REF:
2661 case OP_REFI:
2662 caseless = op == OP_REFI;
2663 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2664 ecode += 1 + IMM2_SIZE;
2665
2666 /* If the reference is unset, there are two possibilities:
2667
2668 (a) In the default, Perl-compatible state, set the length negative;
2669 this ensures that every attempt at a match fails. We can't just fail
2670 here, because of the possibility of quantifiers with zero minima.
2671
2672 (b) If the JavaScript compatibility flag is set, set the length to zero
2673 so that the back reference matches an empty string.
2674
2675 Otherwise, set the length to the length of what was matched by the
2676 referenced subpattern. */
2677
2678 if (offset >= offset_top || md->offset_vector[offset] < 0)
2679 length = (md->jscript_compat)? 0 : -1;
2680 else
2681 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2682
2683 /* Set up for repetition, or handle the non-repeated case */
2684
2685 switch (*ecode)
2686 {
2687 case OP_CRSTAR:
2688 case OP_CRMINSTAR:
2689 case OP_CRPLUS:
2690 case OP_CRMINPLUS:
2691 case OP_CRQUERY:
2692 case OP_CRMINQUERY:
2693 c = *ecode++ - OP_CRSTAR;
2694 minimize = (c & 1) != 0;
2695 min = rep_min[c]; /* Pick up values from tables; */
2696 max = rep_max[c]; /* zero for max => infinity */
2697 if (max == 0) max = INT_MAX;
2698 break;
2699
2700 case OP_CRRANGE:
2701 case OP_CRMINRANGE:
2702 minimize = (*ecode == OP_CRMINRANGE);
2703 min = GET2(ecode, 1);
2704 max = GET2(ecode, 1 + IMM2_SIZE);
2705 if (max == 0) max = INT_MAX;
2706 ecode += 1 + 2 * IMM2_SIZE;
2707 break;
2708
2709 default: /* No repeat follows */
2710 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2711 {
2712 if (length == -2) eptr = md->end_subject; /* Partial match */
2713 CHECK_PARTIAL();
2714 RRETURN(MATCH_NOMATCH);
2715 }
2716 eptr += length;
2717 continue; /* With the main loop */
2718 }
2719
2720 /* Handle repeated back references. If the length of the reference is
2721 zero, just continue with the main loop. If the length is negative, it
2722 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2723 zero, we can continue at the same level without recursion. For any other
2724 minimum, carrying on will result in NOMATCH. */
2725
2726 if (length == 0) continue;
2727 if (length < 0 && min == 0) continue;
2728
2729 /* First, ensure the minimum number of matches are present. We get back
2730 the length of the reference string explicitly rather than passing the
2731 address of eptr, so that eptr can be a register variable. */
2732
2733 for (i = 1; i <= min; i++)
2734 {
2735 int slength;
2736 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2737 {
2738 if (slength == -2) eptr = md->end_subject; /* Partial match */
2739 CHECK_PARTIAL();
2740 RRETURN(MATCH_NOMATCH);
2741 }
2742 eptr += slength;
2743 }
2744
2745 /* If min = max, continue at the same level without recursion.
2746 They are not both allowed to be zero. */
2747
2748 if (min == max) continue;
2749
2750 /* If minimizing, keep trying and advancing the pointer */
2751
2752 if (minimize)
2753 {
2754 for (fi = min;; fi++)
2755 {
2756 int slength;
2757 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2759 if (fi >= max) RRETURN(MATCH_NOMATCH);
2760 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2761 {
2762 if (slength == -2) eptr = md->end_subject; /* Partial match */
2763 CHECK_PARTIAL();
2764 RRETURN(MATCH_NOMATCH);
2765 }
2766 eptr += slength;
2767 }
2768 /* Control never gets here */
2769 }
2770
2771 /* If maximizing, find the longest string and work backwards */
2772
2773 else
2774 {
2775 pp = eptr;
2776 for (i = min; i < max; i++)
2777 {
2778 int slength;
2779 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2780 {
2781 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2782 the soft partial matching case. */
2783
2784 if (slength == -2 && md->partial != 0 &&
2785 md->end_subject > md->start_used_ptr)
2786 {
2787 md->hitend = TRUE;
2788 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2789 }
2790 break;
2791 }
2792 eptr += slength;
2793 }
2794
2795 while (eptr >= pp)
2796 {
2797 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2799 eptr -= length;
2800 }
2801 RRETURN(MATCH_NOMATCH);
2802 }
2803 /* Control never gets here */
2804
2805 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2806 used when all the characters in the class have values in the range 0-255,
2807 and either the matching is caseful, or the characters are in the range
2808 0-127 when UTF-8 processing is enabled. The only difference between
2809 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2810 encountered.
2811
2812 First, look past the end of the item to see if there is repeat information
2813 following. Then obey similar code to character type repeats - written out
2814 again for speed. */
2815
2816 case OP_NCLASS:
2817 case OP_CLASS:
2818 {
2819 /* The data variable is saved across frames, so the byte map needs to
2820 be stored there. */
2821 #define BYTE_MAP ((pcre_uint8 *)data)
2822 data = ecode + 1; /* Save for matching */
2823 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2824
2825 switch (*ecode)
2826 {
2827 case OP_CRSTAR:
2828 case OP_CRMINSTAR:
2829 case OP_CRPLUS:
2830 case OP_CRMINPLUS:
2831 case OP_CRQUERY:
2832 case OP_CRMINQUERY:
2833 c = *ecode++ - OP_CRSTAR;
2834 minimize = (c & 1) != 0;
2835 min = rep_min[c]; /* Pick up values from tables; */
2836 max = rep_max[c]; /* zero for max => infinity */
2837 if (max == 0) max = INT_MAX;
2838 break;
2839
2840 case OP_CRRANGE:
2841 case OP_CRMINRANGE:
2842 minimize = (*ecode == OP_CRMINRANGE);
2843 min = GET2(ecode, 1);
2844 max = GET2(ecode, 1 + IMM2_SIZE);
2845 if (max == 0) max = INT_MAX;
2846 ecode += 1 + 2 * IMM2_SIZE;
2847 break;
2848
2849 default: /* No repeat follows */
2850 min = max = 1;
2851 break;
2852 }
2853
2854 /* First, ensure the minimum number of matches are present. */
2855
2856 #ifdef SUPPORT_UTF
2857 if (utf)
2858 {
2859 for (i = 1; i <= min; i++)
2860 {
2861 if (eptr >= md->end_subject)
2862 {
2863 SCHECK_PARTIAL();
2864 RRETURN(MATCH_NOMATCH);
2865 }
2866 GETCHARINC(c, eptr);
2867 if (c > 255)
2868 {
2869 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2870 }
2871 else
2872 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2873 }
2874 }
2875 else
2876 #endif
2877 /* Not UTF mode */
2878 {
2879 for (i = 1; i <= min; i++)
2880 {
2881 if (eptr >= md->end_subject)
2882 {
2883 SCHECK_PARTIAL();
2884 RRETURN(MATCH_NOMATCH);
2885 }
2886 c = *eptr++;
2887 #ifndef COMPILE_PCRE8
2888 if (c > 255)
2889 {
2890 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2891 }
2892 else
2893 #endif
2894 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2895 }
2896 }
2897
2898 /* If max == min we can continue with the main loop without the
2899 need to recurse. */
2900
2901 if (min == max) continue;
2902
2903 /* If minimizing, keep testing the rest of the expression and advancing
2904 the pointer while it matches the class. */
2905
2906 if (minimize)
2907 {
2908 #ifdef SUPPORT_UTF
2909 if (utf)
2910 {
2911 for (fi = min;; fi++)
2912 {
2913 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2915 if (fi >= max) RRETURN(MATCH_NOMATCH);
2916 if (eptr >= md->end_subject)
2917 {
2918 SCHECK_PARTIAL();
2919 RRETURN(MATCH_NOMATCH);
2920 }
2921 GETCHARINC(c, eptr);
2922 if (c > 255)
2923 {
2924 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2925 }
2926 else
2927 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2928 }
2929 }
2930 else
2931 #endif
2932 /* Not UTF mode */
2933 {
2934 for (fi = min;; fi++)
2935 {
2936 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2938 if (fi >= max) RRETURN(MATCH_NOMATCH);
2939 if (eptr >= md->end_subject)
2940 {
2941 SCHECK_PARTIAL();
2942 RRETURN(MATCH_NOMATCH);
2943 }
2944 c = *eptr++;
2945 #ifndef COMPILE_PCRE8
2946 if (c > 255)
2947 {
2948 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2949 }
2950 else
2951 #endif
2952 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2953 }
2954 }
2955 /* Control never gets here */
2956 }
2957
2958 /* If maximizing, find the longest possible run, then work backwards. */
2959
2960 else
2961 {
2962 pp = eptr;
2963
2964 #ifdef SUPPORT_UTF
2965 if (utf)
2966 {
2967 for (i = min; i < max; i++)
2968 {
2969 int len = 1;
2970 if (eptr >= md->end_subject)
2971 {
2972 SCHECK_PARTIAL();
2973 break;
2974 }
2975 GETCHARLEN(c, eptr, len);
2976 if (c > 255)
2977 {
2978 if (op == OP_CLASS) break;
2979 }
2980 else
2981 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2982 eptr += len;
2983 }
2984 for (;;)
2985 {
2986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 if (eptr-- == pp) break; /* Stop if tried at original pos */
2989 BACKCHAR(eptr);
2990 }
2991 }
2992 else
2993 #endif
2994 /* Not UTF mode */
2995 {
2996 for (i = min; i < max; i++)
2997 {
2998 if (eptr >= md->end_subject)
2999 {
3000 SCHECK_PARTIAL();
3001 break;
3002 }
3003 c = *eptr;
3004 #ifndef COMPILE_PCRE8
3005 if (c > 255)
3006 {
3007 if (op == OP_CLASS) break;
3008 }
3009 else
3010 #endif
3011 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3012 eptr++;
3013 }
3014 while (eptr >= pp)
3015 {
3016 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3017 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3018 eptr--;
3019 }
3020 }
3021
3022 RRETURN(MATCH_NOMATCH);
3023 }
3024 #undef BYTE_MAP
3025 }
3026 /* Control never gets here */
3027
3028
3029 /* Match an extended character class. This opcode is encountered only
3030 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3031 mode, because Unicode properties are supported in non-UTF-8 mode. */
3032
3033 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3034 case OP_XCLASS:
3035 {
3036 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3037 ecode += GET(ecode, 1); /* Advance past the item */
3038
3039 switch (*ecode)
3040 {
3041 case OP_CRSTAR:
3042 case OP_CRMINSTAR:
3043 case OP_CRPLUS:
3044 case OP_CRMINPLUS:
3045 case OP_CRQUERY:
3046 case OP_CRMINQUERY:
3047 c = *ecode++ - OP_CRSTAR;
3048 minimize = (c & 1) != 0;
3049 min = rep_min[c]; /* Pick up values from tables; */
3050 max = rep_max[c]; /* zero for max => infinity */
3051 if (max == 0) max = INT_MAX;
3052 break;
3053
3054 case OP_CRRANGE:
3055 case OP_CRMINRANGE:
3056 minimize = (*ecode == OP_CRMINRANGE);
3057 min = GET2(ecode, 1);
3058 max = GET2(ecode, 1 + IMM2_SIZE);
3059 if (max == 0) max = INT_MAX;
3060 ecode += 1 + 2 * IMM2_SIZE;
3061 break;
3062
3063 default: /* No repeat follows */
3064 min = max = 1;
3065 break;
3066 }
3067
3068 /* First, ensure the minimum number of matches are present. */
3069
3070 for (i = 1; i <= min; i++)
3071 {
3072 if (eptr >= md->end_subject)
3073 {
3074 SCHECK_PARTIAL();
3075 RRETURN(MATCH_NOMATCH);
3076 }
3077 GETCHARINCTEST(c, eptr);
3078 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3079 }
3080
3081 /* If max == min we can continue with the main loop without the
3082 need to recurse. */
3083
3084 if (min == max) continue;
3085
3086 /* If minimizing, keep testing the rest of the expression and advancing
3087 the pointer while it matches the class. */
3088
3089 if (minimize)
3090 {
3091 for (fi = min;; fi++)
3092 {
3093 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3095 if (fi >= max) RRETURN(MATCH_NOMATCH);
3096 if (eptr >= md->end_subject)
3097 {
3098 SCHECK_PARTIAL();
3099 RRETURN(MATCH_NOMATCH);
3100 }
3101 GETCHARINCTEST(c, eptr);
3102 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3103 }
3104 /* Control never gets here */
3105 }
3106
3107 /* If maximizing, find the longest possible run, then work backwards. */
3108
3109 else
3110 {
3111 pp = eptr;
3112 for (i = min; i < max; i++)
3113 {
3114 int len = 1;
3115 if (eptr >= md->end_subject)
3116 {
3117 SCHECK_PARTIAL();
3118 break;
3119 }
3120 #ifdef SUPPORT_UTF
3121 GETCHARLENTEST(c, eptr, len);
3122 #else
3123 c = *eptr;
3124 #endif
3125 if (!PRIV(xclass)(c, data, utf)) break;
3126 eptr += len;
3127 }
3128 for(;;)
3129 {
3130 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3131 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3132 if (eptr-- == pp) break; /* Stop if tried at original pos */
3133 #ifdef SUPPORT_UTF
3134 if (utf) BACKCHAR(eptr);
3135 #endif
3136 }
3137 RRETURN(MATCH_NOMATCH);
3138 }
3139
3140 /* Control never gets here */
3141 }
3142 #endif /* End of XCLASS */
3143
3144 /* Match a single character, casefully */
3145
3146 case OP_CHAR:
3147 #ifdef SUPPORT_UTF
3148 if (utf)
3149 {
3150 length = 1;
3151 ecode++;
3152 GETCHARLEN(fc, ecode, length);
3153 if (length > md->end_subject - eptr)
3154 {
3155 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3156 RRETURN(MATCH_NOMATCH);
3157 }
3158 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3159 }
3160 else
3161 #endif
3162 /* Not UTF mode */
3163 {
3164 if (md->end_subject - eptr < 1)
3165 {
3166 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3167 RRETURN(MATCH_NOMATCH);
3168 }
3169 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3170 ecode += 2;
3171 }
3172 break;
3173
3174 /* Match a single character, caselessly. If we are at the end of the
3175 subject, give up immediately. */
3176
3177 case OP_CHARI:
3178 if (eptr >= md->end_subject)
3179 {
3180 SCHECK_PARTIAL();
3181 RRETURN(MATCH_NOMATCH);
3182 }
3183
3184 #ifdef SUPPORT_UTF
3185 if (utf)
3186 {
3187 length = 1;
3188 ecode++;
3189 GETCHARLEN(fc, ecode, length);
3190
3191 /* If the pattern character's value is < 128, we have only one byte, and
3192 we know that its other case must also be one byte long, so we can use the
3193 fast lookup table. We know that there is at least one byte left in the
3194 subject. */
3195
3196 if (fc < 128)
3197 {
3198 if (md->lcc[fc]
3199 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3200 ecode++;
3201 eptr++;
3202 }
3203
3204 /* Otherwise we must pick up the subject character. Note that we cannot
3205 use the value of "length" to check for sufficient bytes left, because the
3206 other case of the character may have more or fewer bytes. */
3207
3208 else
3209 {
3210 unsigned int dc;
3211 GETCHARINC(dc, eptr);
3212 ecode += length;
3213
3214 /* If we have Unicode property support, we can use it to test the other
3215 case of the character, if there is one. */
3216
3217 if (fc != dc)
3218 {
3219 #ifdef SUPPORT_UCP
3220 if (dc != UCD_OTHERCASE(fc))
3221 #endif
3222 RRETURN(MATCH_NOMATCH);
3223 }
3224 }
3225 }
3226 else
3227 #endif /* SUPPORT_UTF */
3228
3229 /* Not UTF mode */
3230 {
3231 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3232 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3233 eptr++;
3234 ecode += 2;
3235 }
3236 break;
3237
3238 /* Match a single character repeatedly. */
3239
3240 case OP_EXACT:
3241 case OP_EXACTI:
3242 min = max = GET2(ecode, 1);
3243 ecode += 1 + IMM2_SIZE;
3244 goto REPEATCHAR;
3245
3246 case OP_POSUPTO:
3247 case OP_POSUPTOI:
3248 possessive = TRUE;
3249 /* Fall through */
3250
3251 case OP_UPTO:
3252 case OP_UPTOI:
3253 case OP_MINUPTO:
3254 case OP_MINUPTOI:
3255 min = 0;
3256 max = GET2(ecode, 1);
3257 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3258 ecode += 1 + IMM2_SIZE;
3259 goto REPEATCHAR;
3260
3261 case OP_POSSTAR:
3262 case OP_POSSTARI:
3263 possessive = TRUE;
3264 min = 0;
3265 max = INT_MAX;
3266 ecode++;
3267 goto REPEATCHAR;
3268
3269 case OP_POSPLUS:
3270 case OP_POSPLUSI:
3271 possessive = TRUE;
3272 min = 1;
3273 max = INT_MAX;
3274 ecode++;
3275 goto REPEATCHAR;
3276
3277 case OP_POSQUERY:
3278 case OP_POSQUERYI:
3279 possessive = TRUE;
3280 min = 0;
3281 max = 1;
3282 ecode++;
3283 goto REPEATCHAR;
3284
3285 case OP_STAR:
3286 case OP_STARI:
3287 case OP_MINSTAR:
3288 case OP_MINSTARI:
3289 case OP_PLUS:
3290 case OP_PLUSI:
3291 case OP_MINPLUS:
3292 case OP_MINPLUSI:
3293 case OP_QUERY:
3294 case OP_QUERYI:
3295 case OP_MINQUERY:
3296 case OP_MINQUERYI:
3297 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3298 minimize = (c & 1) != 0;
3299 min = rep_min[c]; /* Pick up values from tables; */
3300 max = rep_max[c]; /* zero for max => infinity */
3301 if (max == 0) max = INT_MAX;
3302
3303 /* Common code for all repeated single-character matches. */
3304
3305 REPEATCHAR:
3306 #ifdef SUPPORT_UTF
3307 if (utf)
3308 {
3309 length = 1;
3310 charptr = ecode;
3311 GETCHARLEN(fc, ecode, length);
3312 ecode += length;
3313
3314 /* Handle multibyte character matching specially here. There is
3315 support for caseless matching if UCP support is present. */
3316
3317 if (length > 1)
3318 {
3319 #ifdef SUPPORT_UCP
3320 unsigned int othercase;
3321 if (op >= OP_STARI && /* Caseless */
3322 (othercase = UCD_OTHERCASE(fc)) != fc)
3323 oclength = PRIV(ord2utf)(othercase, occhars);
3324 else oclength = 0;
3325 #endif /* SUPPORT_UCP */
3326
3327 for (i = 1; i <= min; i++)
3328 {
3329 if (eptr <= md->end_subject - length &&
3330 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3331 #ifdef SUPPORT_UCP
3332 else if (oclength > 0 &&
3333 eptr <= md->end_subject - oclength &&
3334 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3335 #endif /* SUPPORT_UCP */
3336 else
3337 {
3338 CHECK_PARTIAL();
3339 RRETURN(MATCH_NOMATCH);
3340 }
3341 }
3342
3343 if (min == max) continue;
3344
3345 if (minimize)
3346 {
3347 for (fi = min;; fi++)
3348 {
3349 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3350 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3351 if (fi >= max) RRETURN(MATCH_NOMATCH);
3352 if (eptr <= md->end_subject - length &&
3353 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3354 #ifdef SUPPORT_UCP
3355 else if (oclength > 0 &&
3356 eptr <= md->end_subject - oclength &&
3357 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3358 #endif /* SUPPORT_UCP */
3359 else
3360 {
3361 CHECK_PARTIAL();
3362 RRETURN(MATCH_NOMATCH);
3363 }
3364 }
3365 /* Control never gets here */
3366 }
3367
3368 else /* Maximize */
3369 {
3370 pp = eptr;
3371 for (i = min; i < max; i++)
3372 {
3373 if (eptr <= md->end_subject - length &&
3374 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3375 #ifdef SUPPORT_UCP
3376 else if (oclength > 0 &&
3377 eptr <= md->end_subject - oclength &&
3378 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3379 #endif /* SUPPORT_UCP */
3380 else
3381 {
3382 CHECK_PARTIAL();
3383 break;
3384 }
3385 }
3386
3387 if (possessive) continue;
3388
3389 for(;;)
3390 {
3391 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3393 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3394 #ifdef SUPPORT_UCP
3395 eptr--;
3396 BACKCHAR(eptr);
3397 #else /* without SUPPORT_UCP */
3398 eptr -= length;
3399 #endif /* SUPPORT_UCP */
3400 }
3401 }
3402 /* Control never gets here */
3403 }
3404
3405 /* If the length of a UTF-8 character is 1, we fall through here, and
3406 obey the code as for non-UTF-8 characters below, though in this case the
3407 value of fc will always be < 128. */
3408 }
3409 else
3410 #endif /* SUPPORT_UTF */
3411 /* When not in UTF-8 mode, load a single-byte character. */
3412 fc = *ecode++;
3413
3414 /* The value of fc at this point is always one character, though we may
3415 or may not be in UTF mode. The code is duplicated for the caseless and
3416 caseful cases, for speed, since matching characters is likely to be quite
3417 common. First, ensure the minimum number of matches are present. If min =
3418 max, continue at the same level without recursing. Otherwise, if
3419 minimizing, keep trying the rest of the expression and advancing one
3420 matching character if failing, up to the maximum. Alternatively, if
3421 maximizing, find the maximum number of characters and work backwards. */
3422
3423 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3424 max, eptr));
3425
3426 if (op >= OP_STARI) /* Caseless */
3427 {
3428 #ifdef COMPILE_PCRE8
3429 /* fc must be < 128 if UTF is enabled. */
3430 foc = md->fcc[fc];
3431 #else
3432 #ifdef SUPPORT_UTF
3433 #ifdef SUPPORT_UCP
3434 if (utf && fc > 127)
3435 foc = UCD_OTHERCASE(fc);
3436 #else
3437 if (utf && fc > 127)
3438 foc = fc;
3439 #endif /* SUPPORT_UCP */
3440 else
3441 #endif /* SUPPORT_UTF */
3442 foc = TABLE_GET(fc, md->fcc, fc);
3443 #endif /* COMPILE_PCRE8 */
3444
3445 for (i = 1; i <= min; i++)
3446 {
3447 if (eptr >= md->end_subject)
3448 {
3449 SCHECK_PARTIAL();
3450 RRETURN(MATCH_NOMATCH);
3451 }
3452 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3453 eptr++;
3454 }
3455 if (min == max) continue;
3456 if (minimize)
3457 {
3458 for (fi = min;; fi++)
3459 {
3460 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3461 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3462 if (fi >= max) RRETURN(MATCH_NOMATCH);
3463 if (eptr >= md->end_subject)
3464 {
3465 SCHECK_PARTIAL();
3466 RRETURN(MATCH_NOMATCH);
3467 }
3468 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3469 eptr++;
3470 }
3471 /* Control never gets here */
3472 }
3473 else /* Maximize */
3474 {
3475 pp = eptr;
3476 for (i = min; i < max; i++)
3477 {
3478 if (eptr >= md->end_subject)
3479 {
3480 SCHECK_PARTIAL();
3481 break;
3482 }
3483 if (fc != *eptr && foc != *eptr) break;
3484 eptr++;
3485 }
3486
3487 if (possessive) continue;
3488
3489 while (eptr >= pp)
3490 {
3491 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3492 eptr--;
3493 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3494 }
3495 RRETURN(MATCH_NOMATCH);
3496 }
3497 /* Control never gets here */
3498 }
3499
3500 /* Caseful comparisons (includes all multi-byte characters) */
3501
3502 else
3503 {
3504 for (i = 1; i <= min; i++)
3505 {
3506 if (eptr >= md->end_subject)
3507 {
3508 SCHECK_PARTIAL();
3509 RRETURN(MATCH_NOMATCH);
3510 }
3511 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3512 }
3513
3514 if (min == max) continue;
3515
3516 if (minimize)
3517 {
3518 for (fi = min;; fi++)
3519 {
3520 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3521 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3522 if (fi >= max) RRETURN(MATCH_NOMATCH);
3523 if (eptr >= md->end_subject)
3524 {
3525 SCHECK_PARTIAL();
3526 RRETURN(MATCH_NOMATCH);
3527 }
3528 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3529 }
3530 /* Control never gets here */
3531 }
3532 else /* Maximize */
3533 {
3534 pp = eptr;
3535 for (i = min; i < max; i++)
3536 {
3537 if (eptr >= md->end_subject)
3538 {
3539 SCHECK_PARTIAL();
3540 break;
3541 }
3542 if (fc != *eptr) break;
3543 eptr++;
3544 }
3545 if (possessive) continue;
3546
3547 while (eptr >= pp)
3548 {
3549 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3550 eptr--;
3551 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3552 }
3553 RRETURN(MATCH_NOMATCH);
3554 }
3555 }
3556 /* Control never gets here */
3557
3558 /* Match a negated single one-byte character. The pattern character is one
3559 byte, but the subject character we check it against can be multibyte. */
3560
3561 case OP_NOT:
3562 case OP_NOTI:
3563 if (eptr >= md->end_subject)
3564 {
3565 SCHECK_PARTIAL();
3566 RRETURN(MATCH_NOMATCH);
3567 }
3568 ecode++;
3569 GETCHARINCTEST(c, eptr);
3570 if (op == OP_NOTI) /* The caseless case */
3571 {
3572 register unsigned int ch, och;
3573 ch = *ecode++;
3574 #ifdef COMPILE_PCRE8
3575 /* ch must be < 128 if UTF is enabled. */
3576 och = md->fcc[ch];
3577 #else
3578 #ifdef SUPPORT_UTF
3579 #ifdef SUPPORT_UCP
3580 if (utf && ch > 127)
3581 och = UCD_OTHERCASE(ch);
3582 #else
3583 if (utf && ch > 127)
3584 och = ch;
3585 #endif /* SUPPORT_UCP */
3586 else
3587 #endif /* SUPPORT_UTF */
3588 och = TABLE_GET(ch, md->fcc, ch);
3589 #endif /* COMPILE_PCRE8 */
3590 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3591 }
3592 else /* Caseful */
3593 {
3594 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3595 }
3596 break;
3597
3598 /* Match a negated single one-byte character repeatedly. This is almost a
3599 repeat of the code for a repeated single character, but I haven't found a
3600 nice way of commoning these up that doesn't require a test of the
3601 positive/negative option for each character match. Maybe that wouldn't add
3602 very much to the time taken, but character matching *is* what this is all
3603 about... */
3604
3605 case OP_NOTEXACT:
3606 case OP_NOTEXACTI:
3607 min = max = GET2(ecode, 1);
3608 ecode += 1 + IMM2_SIZE;
3609 goto REPEATNOTCHAR;
3610
3611 case OP_NOTUPTO:
3612 case OP_NOTUPTOI:
3613 case OP_NOTMINUPTO:
3614 case OP_NOTMINUPTOI:
3615 min = 0;
3616 max = GET2(ecode, 1);
3617 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3618 ecode += 1 + IMM2_SIZE;
3619 goto REPEATNOTCHAR;
3620
3621 case OP_NOTPOSSTAR:
3622 case OP_NOTPOSSTARI:
3623 possessive = TRUE;
3624 min = 0;
3625 max = INT_MAX;
3626 ecode++;
3627 goto REPEATNOTCHAR;
3628
3629 case OP_NOTPOSPLUS:
3630 case OP_NOTPOSPLUSI:
3631 possessive = TRUE;
3632 min = 1;
3633 max = INT_MAX;
3634 ecode++;
3635 goto REPEATNOTCHAR;
3636
3637 case OP_NOTPOSQUERY:
3638 case OP_NOTPOSQUERYI:
3639 possessive = TRUE;
3640 min = 0;
3641 max = 1;
3642 ecode++;
3643 goto REPEATNOTCHAR;
3644
3645 case OP_NOTPOSUPTO:
3646 case OP_NOTPOSUPTOI:
3647 possessive = TRUE;
3648 min = 0;
3649 max = GET2(ecode, 1);
3650 ecode += 1 + IMM2_SIZE;
3651 goto REPEATNOTCHAR;
3652
3653 case OP_NOTSTAR:
3654 case OP_NOTSTARI:
3655 case OP_NOTMINSTAR:
3656 case OP_NOTMINSTARI:
3657 case OP_NOTPLUS:
3658 case OP_NOTPLUSI:
3659 case OP_NOTMINPLUS:
3660 case OP_NOTMINPLUSI:
3661 case OP_NOTQUERY:
3662 case OP_NOTQUERYI:
3663 case OP_NOTMINQUERY:
3664 case OP_NOTMINQUERYI:
3665 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3666 minimize = (c & 1) != 0;
3667 min = rep_min[c]; /* Pick up values from tables; */
3668 max = rep_max[c]; /* zero for max => infinity */
3669 if (max == 0) max = INT_MAX;
3670
3671 /* Common code for all repeated single-byte matches. */
3672
3673 REPEATNOTCHAR:
3674 fc = *ecode++;
3675
3676 /* The code is duplicated for the caseless and caseful cases, for speed,
3677 since matching characters is likely to be quite common. First, ensure the
3678 minimum number of matches are present. If min = max, continue at the same
3679 level without recursing. Otherwise, if minimizing, keep trying the rest of
3680 the expression and advancing one matching character if failing, up to the
3681 maximum. Alternatively, if maximizing, find the maximum number of
3682 characters and work backwards. */
3683
3684 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3685 max, eptr));
3686
3687 if (op >= OP_NOTSTARI) /* Caseless */
3688 {
3689 #ifdef COMPILE_PCRE8
3690 /* fc must be < 128 if UTF is enabled. */
3691 foc = md->fcc[fc];
3692 #else
3693 #ifdef SUPPORT_UTF
3694 #ifdef SUPPORT_UCP
3695 if (utf && fc > 127)
3696 foc = UCD_OTHERCASE(fc);
3697 #else
3698 if (utf && fc > 127)
3699 foc = fc;
3700 #endif /* SUPPORT_UCP */
3701 else
3702 #endif /* SUPPORT_UTF */
3703 foc = TABLE_GET(fc, md->fcc, fc);
3704 #endif /* COMPILE_PCRE8 */
3705
3706 #ifdef SUPPORT_UTF
3707 if (utf)
3708 {
3709 register unsigned int d;
3710 for (i = 1; i <= min; i++)
3711 {
3712 if (eptr >= md->end_subject)
3713 {
3714 SCHECK_PARTIAL();
3715 RRETURN(MATCH_NOMATCH);
3716 }
3717 GETCHARINC(d, eptr);
3718 if (fc == d || (unsigned int) foc == d) RRETURN(MATCH_NOMATCH);
3719 }
3720 }
3721 else
3722 #endif
3723 /* Not UTF mode */
3724 {
3725 for (i = 1; i <= min; i++)
3726 {
3727 if (eptr >= md->end_subject)
3728 {
3729 SCHECK_PARTIAL();
3730 RRETURN(MATCH_NOMATCH);
3731 }
3732 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3733 eptr++;
3734 }
3735 }
3736
3737 if (min == max) continue;
3738
3739 if (minimize)
3740 {
3741 #ifdef SUPPORT_UTF
3742 if (utf)
3743 {
3744 register unsigned int d;
3745 for (fi = min;; fi++)
3746 {
3747 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3748 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3749 if (fi >= max) RRETURN(MATCH_NOMATCH);
3750 if (eptr >= md->end_subject)
3751 {
3752 SCHECK_PARTIAL();
3753 RRETURN(MATCH_NOMATCH);
3754 }
3755 GETCHARINC(d, eptr);
3756 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3757 }
3758 }
3759 else
3760 #endif
3761 /* Not UTF mode */
3762 {
3763 for (fi = min;; fi++)
3764 {
3765 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3766 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3767 if (fi >= max) RRETURN(MATCH_NOMATCH);
3768 if (eptr >= md->end_subject)
3769 {
3770 SCHECK_PARTIAL();
3771 RRETURN(MATCH_NOMATCH);
3772 }
3773 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3774 eptr++;
3775 }
3776 }
3777 /* Control never gets here */
3778 }
3779
3780 /* Maximize case */
3781
3782 else
3783 {
3784 pp = eptr;
3785
3786 #ifdef SUPPORT_UTF
3787 if (utf)
3788 {
3789 register unsigned int d;
3790 for (i = min; i < max; i++)
3791 {
3792 int len = 1;
3793 if (eptr >= md->end_subject)
3794 {
3795 SCHECK_PARTIAL();
3796 break;
3797 }
3798 GETCHARLEN(d, eptr, len);
3799 if (fc == d || (unsigned int)foc == d) break;
3800 eptr += len;
3801 }
3802 if (possessive) continue;
3803 for(;;)
3804 {
3805 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3807 if (eptr-- == pp) break; /* Stop if tried at original pos */
3808 BACKCHAR(eptr);
3809 }
3810 }
3811 else
3812 #endif
3813 /* Not UTF mode */
3814 {
3815 for (i = min; i < max; i++)
3816 {
3817 if (eptr >= md->end_subject)
3818 {
3819 SCHECK_PARTIAL();
3820 break;
3821 }
3822 if (fc == *eptr || foc == *eptr) break;
3823 eptr++;
3824 }
3825 if (possessive) continue;
3826 while (eptr >= pp)
3827 {
3828 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3830 eptr--;
3831 }
3832 }
3833
3834 RRETURN(MATCH_NOMATCH);
3835 }
3836 /* Control never gets here */
3837 }
3838
3839 /* Caseful comparisons */
3840
3841 else
3842 {
3843 #ifdef SUPPORT_UTF
3844 if (utf)
3845 {
3846 register unsigned int d;
3847 for (i = 1; i <= min; i++)
3848 {
3849 if (eptr >= md->end_subject)
3850 {
3851 SCHECK_PARTIAL();
3852 RRETURN(MATCH_NOMATCH);
3853 }
3854 GETCHARINC(d, eptr);
3855 if (fc == d) RRETURN(MATCH_NOMATCH);
3856 }
3857 }
3858 else
3859 #endif
3860 /* Not UTF mode */
3861 {
3862 for (i = 1; i <= min; i++)
3863 {
3864 if (eptr >= md->end_subject)
3865 {
3866 SCHECK_PARTIAL();
3867 RRETURN(MATCH_NOMATCH);
3868 }
3869 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3870 }
3871 }
3872
3873 if (min == max) continue;
3874
3875 if (minimize)
3876 {
3877 #ifdef SUPPORT_UTF
3878 if (utf)
3879 {
3880 register unsigned int d;
3881 for (fi = min;; fi++)
3882 {
3883 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3885 if (fi >= max) RRETURN(MATCH_NOMATCH);
3886 if (eptr >= md->end_subject)
3887 {
3888 SCHECK_PARTIAL();
3889 RRETURN(MATCH_NOMATCH);
3890 }
3891 GETCHARINC(d, eptr);
3892 if (fc == d) RRETURN(MATCH_NOMATCH);
3893 }
3894 }
3895 else
3896 #endif
3897 /* Not UTF mode */
3898 {
3899 for (fi = min;; fi++)
3900 {
3901 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3902 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3903 if (fi >= max) RRETURN(MATCH_NOMATCH);
3904 if (eptr >= md->end_subject)
3905 {
3906 SCHECK_PARTIAL();
3907 RRETURN(MATCH_NOMATCH);
3908 }
3909 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3910 }
3911 }
3912 /* Control never gets here */
3913 }
3914
3915 /* Maximize case */
3916
3917 else
3918 {
3919 pp = eptr;
3920
3921 #ifdef SUPPORT_UTF
3922 if (utf)
3923 {
3924 register unsigned int d;
3925 for (i = min; i < max; i++)
3926 {
3927 int len = 1;
3928 if (eptr >= md->end_subject)
3929 {
3930 SCHECK_PARTIAL();
3931 break;
3932 }
3933 GETCHARLEN(d, eptr, len);
3934 if (fc == d) break;
3935 eptr += len;
3936 }
3937 if (possessive) continue;
3938 for(;;)
3939 {
3940 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3942 if (eptr-- == pp) break; /* Stop if tried at original pos */
3943 BACKCHAR(eptr);
3944 }
3945 }
3946 else
3947 #endif
3948 /* Not UTF mode */
3949 {
3950 for (i = min; i < max; i++)
3951 {
3952 if (eptr >= md->end_subject)
3953 {
3954 SCHECK_PARTIAL();
3955 break;
3956 }
3957 if (fc == *eptr) break;
3958 eptr++;
3959 }
3960 if (possessive) continue;
3961 while (eptr >= pp)
3962 {
3963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3965 eptr--;
3966 }
3967 }
3968
3969 RRETURN(MATCH_NOMATCH);
3970 }
3971 }
3972 /* Control never gets here */
3973
3974 /* Match a single character type repeatedly; several different opcodes
3975 share code. This is very similar to the code for single characters, but we
3976 repeat it in the interests of efficiency. */
3977
3978 case OP_TYPEEXACT:
3979 min = max = GET2(ecode, 1);
3980 minimize = TRUE;
3981 ecode += 1 + IMM2_SIZE;
3982 goto REPEATTYPE;
3983
3984 case OP_TYPEUPTO:
3985 case OP_TYPEMINUPTO:
3986 min = 0;
3987 max = GET2(ecode, 1);
3988 minimize = *ecode == OP_TYPEMINUPTO;
3989 ecode += 1 + IMM2_SIZE;
3990 goto REPEATTYPE;
3991
3992 case OP_TYPEPOSSTAR:
3993 possessive = TRUE;
3994 min = 0;
3995 max = INT_MAX;
3996 ecode++;
3997 goto REPEATTYPE;
3998
3999 case OP_TYPEPOSPLUS:
4000 possessive = TRUE;
4001 min = 1;
4002 max = INT_MAX;
4003 ecode++;
4004 goto REPEATTYPE;
4005
4006 case OP_TYPEPOSQUERY:
4007 possessive = TRUE;
4008 min = 0;
4009 max = 1;
4010 ecode++;
4011 goto REPEATTYPE;
4012
4013 case OP_TYPEPOSUPTO:
4014 possessive = TRUE;
4015 min = 0;
4016 max = GET2(ecode, 1);
4017 ecode += 1 + IMM2_SIZE;
4018 goto REPEATTYPE;
4019
4020 case OP_TYPESTAR:
4021 case OP_TYPEMINSTAR:
4022 case OP_TYPEPLUS:
4023 case OP_TYPEMINPLUS:
4024 case OP_TYPEQUERY:
4025 case OP_TYPEMINQUERY:
4026 c = *ecode++ - OP_TYPESTAR;
4027 minimize = (c & 1) != 0;
4028 min = rep_min[c]; /* Pick up values from tables; */
4029 max = rep_max[c]; /* zero for max => infinity */
4030 if (max == 0) max = INT_MAX;
4031
4032 /* Common code for all repeated single character type matches. Note that
4033 in UTF-8 mode, '.' matches a character of any length, but for the other
4034 character types, the valid characters are all one-byte long. */
4035
4036 REPEATTYPE:
4037 ctype = *ecode++; /* Code for the character type */
4038
4039 #ifdef SUPPORT_UCP
4040 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4041 {
4042 prop_fail_result = ctype == OP_NOTPROP;
4043 prop_type = *ecode++;
4044 prop_value = *ecode++;
4045 }
4046 else prop_type = -1;
4047 #endif
4048
4049 /* First, ensure the minimum number of matches are present. Use inline
4050 code for maximizing the speed, and do the type test once at the start
4051 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4052 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4053 and single-bytes. */
4054
4055 if (min > 0)
4056 {
4057 #ifdef SUPPORT_UCP
4058 if (prop_type >= 0)
4059 {
4060 switch(prop_type)
4061 {
4062 case PT_ANY:
4063 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4064 for (i = 1; i <= min; i++)
4065 {
4066 if (eptr >= md->end_subject)
4067 {
4068 SCHECK_PARTIAL();
4069 RRETURN(MATCH_NOMATCH);
4070 }
4071 GETCHARINCTEST(c, eptr);
4072 }
4073 break;
4074
4075 case PT_LAMP:
4076 for (i = 1; i <= min; i++)
4077 {
4078 int chartype;
4079 if (eptr >= md->end_subject)
4080 {
4081 SCHECK_PARTIAL();
4082 RRETURN(MATCH_NOMATCH);
4083 }
4084 GETCHARINCTEST(c, eptr);
4085 chartype = UCD_CHARTYPE(c);
4086 if ((chartype == ucp_Lu ||
4087 chartype == ucp_Ll ||
4088 chartype == ucp_Lt) == prop_fail_result)
4089 RRETURN(MATCH_NOMATCH);
4090 }
4091 break;
4092
4093 case PT_GC:
4094 for (i = 1; i <= min; i++)
4095 {
4096 if (eptr >= md->end_subject)
4097 {
4098 SCHECK_PARTIAL();
4099 RRETURN(MATCH_NOMATCH);
4100 }
4101 GETCHARINCTEST(c, eptr);
4102 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4103 RRETURN(MATCH_NOMATCH);
4104 }
4105 break;
4106
4107 case PT_PC:
4108 for (i = 1; i <= min; i++)
4109 {
4110 if (eptr >= md->end_subject)
4111 {
4112 SCHECK_PARTIAL();
4113 RRETURN(MATCH_NOMATCH);
4114 }
4115 GETCHARINCTEST(c, eptr);
4116 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4117 RRETURN(MATCH_NOMATCH);
4118 }
4119 break;
4120
4121 case PT_SC:
4122 for (i = 1; i <= min; i++)
4123 {
4124 if (eptr >= md->end_subject)
4125 {
4126 SCHECK_PARTIAL();
4127 RRETURN(MATCH_NOMATCH);
4128 }
4129 GETCHARINCTEST(c, eptr);
4130 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4131 RRETURN(MATCH_NOMATCH);
4132 }
4133 break;
4134
4135 case PT_ALNUM:
4136 for (i = 1; i <= min; i++)
4137 {
4138 int category;
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 RRETURN(MATCH_NOMATCH);
4143 }
4144 GETCHARINCTEST(c, eptr);
4145 category = UCD_CATEGORY(c);
4146 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4147 RRETURN(MATCH_NOMATCH);
4148 }
4149 break;
4150
4151 case PT_SPACE: /* Perl space */
4152 for (i = 1; i <= min; i++)
4153 {
4154 if (eptr >= md->end_subject)
4155 {
4156 SCHECK_PARTIAL();
4157 RRETURN(MATCH_NOMATCH);
4158 }
4159 GETCHARINCTEST(c, eptr);
4160 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4161 c == CHAR_FF || c == CHAR_CR)
4162 == prop_fail_result)
4163 RRETURN(MATCH_NOMATCH);
4164 }
4165 break;
4166
4167 case PT_PXSPACE: /* POSIX space */
4168 for (i = 1; i <= min; i++)
4169 {
4170 if (eptr >= md->end_subject)
4171 {
4172 SCHECK_PARTIAL();
4173 RRETURN(MATCH_NOMATCH);
4174 }
4175 GETCHARINCTEST(c, eptr);
4176 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4177 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4178 == prop_fail_result)
4179 RRETURN(MATCH_NOMATCH);
4180 }
4181 break;
4182
4183 case PT_WORD:
4184 for (i = 1; i <= min; i++)
4185 {
4186 int category;
4187 if (eptr >= md->end_subject)
4188 {
4189 SCHECK_PARTIAL();
4190 RRETURN(MATCH_NOMATCH);
4191 }
4192 GETCHARINCTEST(c, eptr);
4193 category = UCD_CATEGORY(c);
4194 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4195 == prop_fail_result)
4196 RRETURN(MATCH_NOMATCH);
4197 }
4198 break;
4199
4200 /* This should not occur */
4201
4202 default:
4203 RRETURN(PCRE_ERROR_INTERNAL);
4204 }
4205 }
4206
4207 /* Match extended Unicode sequences. We will get here only if the
4208 support is in the binary; otherwise a compile-time error occurs. */
4209
4210 else if (ctype == OP_EXTUNI)
4211 {
4212 for (i = 1; i <= min; i++)
4213 {
4214 if (eptr >= md->end_subject)
4215 {
4216 SCHECK_PARTIAL();
4217 RRETURN(MATCH_NOMATCH);
4218 }
4219 GETCHARINCTEST(c, eptr);
4220 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4221 while (eptr < md->end_subject)
4222 {
4223 int len = 1;
4224 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4225 if (UCD_CATEGORY(c) != ucp_M) break;
4226 eptr += len;
4227 }
4228 CHECK_PARTIAL();
4229 }
4230 }
4231
4232 else
4233 #endif /* SUPPORT_UCP */
4234
4235 /* Handle all other cases when the coding is UTF-8 */
4236
4237 #ifdef SUPPORT_UTF
4238 if (utf) switch(ctype)
4239 {
4240 case OP_ANY:
4241 for (i = 1; i <= min; i++)
4242 {
4243 if (eptr >= md->end_subject)
4244 {
4245 SCHECK_PARTIAL();
4246 RRETURN(MATCH_NOMATCH);
4247 }
4248 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4249 if (md->partial != 0 &&
4250 eptr + 1 >= md->end_subject &&
4251 NLBLOCK->nltype == NLTYPE_FIXED &&
4252 NLBLOCK->nllen == 2 &&
4253 *eptr == NLBLOCK->nl[0])
4254 {
4255 md->hitend = TRUE;
4256 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4257 }
4258 eptr++;
4259 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4260 }
4261 break;
4262
4263 case OP_ALLANY:
4264 for (i = 1; i <= min; i++)
4265 {
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 RRETURN(MATCH_NOMATCH);
4270 }
4271 eptr++;
4272 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4273 }
4274 break;
4275
4276 case OP_ANYBYTE:
4277 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4278 eptr += min;
4279 break;
4280
4281 case OP_ANYNL:
4282 for (i = 1; i <= min; i++)
4283 {
4284 if (eptr >= md->end_subject)
4285 {
4286 SCHECK_PARTIAL();
4287 RRETURN(MATCH_NOMATCH);
4288 }
4289 GETCHARINC(c, eptr);
4290 switch(c)
4291 {
4292 default: RRETURN(MATCH_NOMATCH);
4293
4294 case 0x000d:
4295 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4296 break;
4297
4298 case 0x000a:
4299 break;
4300
4301 case 0x000b:
4302 case 0x000c:
4303 case 0x0085:
4304 case 0x2028:
4305 case 0x2029:
4306 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4307 break;
4308 }
4309 }
4310 break;
4311
4312 case OP_NOT_HSPACE:
4313 for (i = 1; i <= min; i++)
4314 {
4315 if (eptr >= md->end_subject)
4316 {
4317 SCHECK_PARTIAL();
4318 RRETURN(MATCH_NOMATCH);
4319 }
4320 GETCHARINC(c, eptr);
4321 switch(c)
4322 {
4323 default: break;
4324 case 0x09: /* HT */
4325 case 0x20: /* SPACE */
4326 case 0xa0: /* NBSP */
4327 case 0x1680: /* OGHAM SPACE MARK */
4328 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4329 case 0x2000: /* EN QUAD */
4330 case 0x2001: /* EM QUAD */
4331 case 0x2002: /* EN SPACE */
4332 case 0x2003: /* EM SPACE */
4333 case 0x2004: /* THREE-PER-EM SPACE */
4334 case 0x2005: /* FOUR-PER-EM SPACE */
4335 case 0x2006: /* SIX-PER-EM SPACE */
4336 case 0x2007: /* FIGURE SPACE */
4337 case 0x2008: /* PUNCTUATION SPACE */
4338 case 0x2009: /* THIN SPACE */
4339 case 0x200A: /* HAIR SPACE */
4340 case 0x202f: /* NARROW NO-BREAK SPACE */
4341 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4342 case 0x3000: /* IDEOGRAPHIC SPACE */
4343 RRETURN(MATCH_NOMATCH);
4344 }
4345 }
4346 break;
4347
4348 case OP_HSPACE:
4349 for (i = 1; i <= min; i++)
4350 {
4351 if (eptr >= md->end_subject)
4352 {
4353 SCHECK_PARTIAL();
4354 RRETURN(MATCH_NOMATCH);
4355 }
4356 GETCHARINC(c, eptr);
4357 switch(c)
4358 {
4359 default: RRETURN(MATCH_NOMATCH);
4360 case 0x09: /* HT */
4361 case 0x20: /* SPACE */
4362 case 0xa0: /* NBSP */
4363 case 0x1680: /* OGHAM SPACE MARK */
4364 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4365 case 0x2000: /* EN QUAD */
4366 case 0x2001: /* EM QUAD */
4367 case 0x2002: /* EN SPACE */
4368 case 0x2003: /* EM SPACE */
4369 case 0x2004: /* THREE-PER-EM SPACE */
4370 case 0x2005: /* FOUR-PER-EM SPACE */
4371 case 0x2006: /* SIX-PER-EM SPACE */
4372 case 0x2007: /* FIGURE SPACE */
4373 case 0x2008: /* PUNCTUATION SPACE */
4374 case 0x2009: /* THIN SPACE */
4375 case 0x200A: /* HAIR SPACE */
4376 case 0x202f: /* NARROW NO-BREAK SPACE */
4377 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4378 case 0x3000: /* IDEOGRAPHIC SPACE */
4379 break;
4380 }
4381 }
4382 break;
4383
4384 case OP_NOT_VSPACE:
4385 for (i = 1; i <= min; i++)
4386 {
4387 if (eptr >= md->end_subject)
4388 {
4389 SCHECK_PARTIAL();
4390 RRETURN(MATCH_NOMATCH);
4391 }
4392 GETCHARINC(c, eptr);
4393 switch(c)
4394 {
4395 default: break;
4396 case 0x0a: /* LF */
4397 case 0x0b: /* VT */
4398 case 0x0c: /* FF */
4399 case 0x0d: /* CR */
4400 case 0x85: /* NEL */
4401 case 0x2028: /* LINE SEPARATOR */
4402 case 0x2029: /* PARAGRAPH SEPARATOR */
4403 RRETURN(MATCH_NOMATCH);
4404 }
4405 }
4406 break;
4407
4408 case OP_VSPACE:
4409 for (i = 1; i <= min; i++)
4410 {
4411 if (eptr >= md->end_subject)
4412 {
4413 SCHECK_PARTIAL();
4414 RRETURN(MATCH_NOMATCH);
4415 }
4416 GETCHARINC(c, eptr);
4417 switch(c)
4418 {
4419 default: RRETURN(MATCH_NOMATCH);
4420 case 0x0a: /* LF */
4421 case 0x0b: /* VT */
4422 case 0x0c: /* FF */
4423 case 0x0d: /* CR */
4424 case 0x85: /* NEL */
4425 case 0x2028: /* LINE SEPARATOR */
4426 case 0x2029: /* PARAGRAPH SEPARATOR */
4427 break;
4428 }
4429 }
4430 break;
4431
4432 case OP_NOT_DIGIT:
4433 for (i = 1; i <= min; i++)
4434 {
4435 if (eptr >= md->end_subject)
4436 {
4437 SCHECK_PARTIAL();
4438 RRETURN(MATCH_NOMATCH);
4439 }
4440 GETCHARINC(c, eptr);
4441 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4442 RRETURN(MATCH_NOMATCH);
4443 }
4444 break;
4445
4446 case OP_DIGIT:
4447 for (i = 1; i <= min; i++)
4448 {
4449 if (eptr >= md->end_subject)
4450 {
4451 SCHECK_PARTIAL();
4452 RRETURN(MATCH_NOMATCH);
4453 }
4454 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4455 RRETURN(MATCH_NOMATCH);
4456 eptr++;
4457 /* No need to skip more bytes - we know it's a 1-byte character */
4458 }
4459 break;
4460
4461 case OP_NOT_WHITESPACE:
4462 for (i = 1; i <= min; i++)
4463 {
4464 if (eptr >= md->end_subject)
4465 {
4466 SCHECK_PARTIAL();
4467 RRETURN(MATCH_NOMATCH);
4468 }
4469 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4470 RRETURN(MATCH_NOMATCH);
4471 eptr++;
4472 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4473 }
4474 break;
4475
4476 case OP_WHITESPACE:
4477 for (i = 1; i <= min; i++)
4478 {
4479 if (eptr >= md->end_subject)
4480 {
4481 SCHECK_PARTIAL();
4482 RRETURN(MATCH_NOMATCH);
4483 }
4484 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4485 RRETURN(MATCH_NOMATCH);
4486 eptr++;
4487 /* No need to skip more bytes - we know it's a 1-byte character */
4488 }
4489 break;
4490
4491 case OP_NOT_WORDCHAR:
4492 for (i = 1; i <= min; i++)
4493 {
4494 if (eptr >= md->end_subject)
4495 {
4496 SCHECK_PARTIAL();
4497 RRETURN(MATCH_NOMATCH);
4498 }
4499 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4500 RRETURN(MATCH_NOMATCH);
4501 eptr++;
4502 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4503 }
4504 break;
4505
4506 case OP_WORDCHAR:
4507 for (i = 1; i <= min; i++)
4508 {
4509 if (eptr >= md->end_subject)
4510 {
4511 SCHECK_PARTIAL();
4512 RRETURN(MATCH_NOMATCH);
4513 }
4514 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4515 RRETURN(MATCH_NOMATCH);
4516 eptr++;
4517 /* No need to skip more bytes - we know it's a 1-byte character */
4518 }
4519 break;
4520
4521 default:
4522 RRETURN(PCRE_ERROR_INTERNAL);
4523 } /* End switch(ctype) */
4524
4525 else
4526 #endif /* SUPPORT_UTF */
4527
4528 /* Code for the non-UTF-8 case for minimum matching of operators other
4529 than OP_PROP and OP_NOTPROP. */
4530
4531 switch(ctype)
4532 {
4533 case OP_ANY:
4534 for (i = 1; i <= min; i++)
4535 {
4536 if (eptr >= md->end_subject)
4537 {
4538 SCHECK_PARTIAL();
4539 RRETURN(MATCH_NOMATCH);
4540 }
4541 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4542 if (md->partial != 0 &&
4543 eptr + 1 >= md->end_subject &&
4544 NLBLOCK->nltype == NLTYPE_FIXED &&
4545 NLBLOCK->nllen == 2 &&
4546 *eptr == NLBLOCK->nl[0])
4547 {
4548 md->hitend = TRUE;
4549 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4550 }
4551 eptr++;
4552 }
4553 break;
4554
4555 case OP_ALLANY:
4556 if (eptr > md->end_subject - min)
4557 {
4558 SCHECK_PARTIAL();
4559 RRETURN(MATCH_NOMATCH);
4560 }
4561 eptr += min;
4562 break;
4563
4564 case OP_ANYBYTE:
4565 if (eptr > md->end_subject - min)
4566 {
4567 SCHECK_PARTIAL();
4568 RRETURN(MATCH_NOMATCH);
4569 }
4570 eptr += min;
4571 break;
4572
4573 case OP_ANYNL:
4574 for (i = 1; i <= min; i++)
4575 {
4576 if (eptr >= md->end_subject)
4577 {
4578 SCHECK_PARTIAL();
4579 RRETURN(MATCH_NOMATCH);
4580 }
4581 switch(*eptr++)
4582 {
4583 default: RRETURN(MATCH_NOMATCH);
4584
4585 case 0x000d:
4586 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4587 break;
4588
4589 case 0x000a:
4590 break;
4591
4592 case 0x000b:
4593 case 0x000c:
4594 case 0x0085:
4595 #ifdef COMPILE_PCRE16
4596 case 0x2028:
4597 case 0x2029:
4598 #endif
4599 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4600 break;
4601 }
4602 }
4603 break;
4604
4605 case OP_NOT_HSPACE:
4606 for (i = 1; i <= min; i++)
4607 {
4608 if (eptr >= md->end_subject)
4609 {
4610 SCHECK_PARTIAL();
4611 RRETURN(MATCH_NOMATCH);
4612 }
4613 switch(*eptr++)
4614 {
4615 default: break;
4616 case 0x09: /* HT */
4617 case 0x20: /* SPACE */
4618 case 0xa0: /* NBSP */
4619 #ifdef COMPILE_PCRE16
4620 case 0x1680: /* OGHAM SPACE MARK */
4621 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4622 case 0x2000: /* EN QUAD */
4623 case 0x2001: /* EM QUAD */
4624 case 0x2002: /* EN SPACE */
4625 case 0x2003: /* EM SPACE */
4626 case 0x2004: /* THREE-PER-EM SPACE */
4627 case 0x2005: /* FOUR-PER-EM SPACE */
4628 case 0x2006: /* SIX-PER-EM SPACE */
4629 case 0x2007: /* FIGURE SPACE */
4630 case 0x2008: /* PUNCTUATION SPACE */
4631 case 0x2009: /* THIN SPACE */
4632 case 0x200A: /* HAIR SPACE */
4633 case 0x202f: /* NARROW NO-BREAK SPACE */
4634 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4635 case 0x3000: /* IDEOGRAPHIC SPACE */
4636 #endif
4637 RRETURN(MATCH_NOMATCH);
4638 }
4639 }
4640 break;
4641
4642 case OP_HSPACE:
4643 for (i = 1; i <= min; i++)
4644 {
4645 if (eptr >= md->end_subject)
4646 {
4647 SCHECK_PARTIAL();
4648 RRETURN(MATCH_NOMATCH);
4649 }
4650 switch(*eptr++)
4651 {
4652 default: RRETURN(MATCH_NOMATCH);
4653 case 0x09: /* HT */
4654 case 0x20: /* SPACE */
4655 case 0xa0: /* NBSP */
4656 #ifdef COMPILE_PCRE16
4657 case 0x1680: /* OGHAM SPACE MARK */
4658 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4659 case 0x2000: /* EN QUAD */
4660 case 0x2001: /* EM QUAD */
4661 case 0x2002: /* EN SPACE */
4662 case 0x2003: /* EM SPACE */
4663 case 0x2004: /* THREE-PER-EM SPACE */
4664 case 0x2005: /* FOUR-PER-EM SPACE */
4665 case 0x2006: /* SIX-PER-EM SPACE */
4666 case 0x2007: /* FIGURE SPACE */
4667 case 0x2008: /* PUNCTUATION SPACE */
4668 case 0x2009: /* THIN SPACE */
4669 case 0x200A: /* HAIR SPACE */
4670 case 0x202f: /* NARROW NO-BREAK SPACE */
4671 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4672 case 0x3000: /* IDEOGRAPHIC SPACE */
4673 #endif
4674 break;
4675 }
4676 }
4677 break;
4678
4679 case OP_NOT_VSPACE:
4680 for (i = 1; i <= min; i++)
4681 {
4682 if (eptr >= md->end_subject)
4683 {
4684 SCHECK_PARTIAL();
4685 RRETURN(MATCH_NOMATCH);
4686 }
4687 switch(*eptr++)
4688 {
4689 default: break;
4690 case 0x0a: /* LF */
4691 case 0x0b: /* VT */
4692 case 0x0c: /* FF */
4693 case 0x0d: /* CR */
4694 case 0x85: /* NEL */
4695 #ifdef COMPILE_PCRE16
4696 case 0x2028: /* LINE SEPARATOR */
4697 case 0x2029: /* PARAGRAPH SEPARATOR */
4698 #endif
4699 RRETURN(MATCH_NOMATCH);
4700 }
4701 }
4702 break;
4703
4704 case OP_VSPACE:
4705 for (i = 1; i <= min; i++)
4706 {
4707 if (eptr >= md->end_subject)
4708 {
4709 SCHECK_PARTIAL();
4710 RRETURN(MATCH_NOMATCH);
4711 }
4712 switch(*eptr++)
4713 {
4714 default: RRETURN(MATCH_NOMATCH);
4715 case 0x0a: /* LF */
4716 case 0x0b: /* VT */
4717 case 0x0c: /* FF */
4718 case 0x0d: /* CR */
4719 case 0x85: /* NEL */
4720 #ifdef COMPILE_PCRE16
4721 case 0x2028: /* LINE SEPARATOR */
4722 case 0x2029: /* PARAGRAPH SEPARATOR */
4723 #endif
4724 break;
4725 }
4726 }
4727 break;
4728
4729 case OP_NOT_DIGIT:
4730 for (i = 1; i <= min; i++)
4731 {
4732 if (eptr >= md->end_subject)
4733 {
4734 SCHECK_PARTIAL();
4735 RRETURN(MATCH_NOMATCH);
4736 }
4737 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4738 RRETURN(MATCH_NOMATCH);
4739 eptr++;
4740 }
4741 break;
4742
4743 case OP_DIGIT:
4744 for (i = 1; i <= min; i++)
4745 {
4746 if (eptr >= md->end_subject)
4747 {
4748 SCHECK_PARTIAL();
4749 RRETURN(MATCH_NOMATCH);
4750 }
4751 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4752 RRETURN(MATCH_NOMATCH);
4753 eptr++;
4754 }
4755 break;
4756
4757 case OP_NOT_WHITESPACE:
4758 for (i = 1; i <= min; i++)
4759 {
4760 if (eptr >= md->end_subject)
4761 {
4762 SCHECK_PARTIAL();
4763 RRETURN(MATCH_NOMATCH);
4764 }
4765 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4766 RRETURN(MATCH_NOMATCH);
4767 eptr++;
4768 }
4769 break;
4770
4771 case OP_WHITESPACE:
4772 for (i = 1; i <= min; i++)
4773 {
4774 if (eptr >= md->end_subject)
4775 {
4776 SCHECK_PARTIAL();
4777 RRETURN(MATCH_NOMATCH);
4778 }
4779 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4780 RRETURN(MATCH_NOMATCH);
4781 eptr++;
4782 }
4783 break;
4784
4785 case OP_NOT_WORDCHAR:
4786 for (i = 1; i <= min; i++)
4787 {
4788 if (eptr >= md->end_subject)
4789 {
4790 SCHECK_PARTIAL();
4791 RRETURN(MATCH_NOMATCH);
4792 }
4793 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4794 RRETURN(MATCH_NOMATCH);
4795 eptr++;
4796 }
4797 break;
4798
4799 case OP_WORDCHAR:
4800 for (i = 1; i <= min; i++)
4801 {
4802 if (eptr >= md->end_subject)
4803 {
4804 SCHECK_PARTIAL();
4805 RRETURN(MATCH_NOMATCH);
4806 }
4807 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4808 RRETURN(MATCH_NOMATCH);
4809 eptr++;
4810 }
4811 break;
4812
4813 default:
4814 RRETURN(PCRE_ERROR_INTERNAL);
4815 }
4816 }
4817
4818 /* If min = max, continue at the same level without recursing */
4819
4820 if (min == max) continue;
4821
4822 /* If minimizing, we have to test the rest of the pattern before each
4823 subsequent match. Again, separate the UTF-8 case for speed, and also
4824 separate the UCP cases. */
4825
4826 if (minimize)
4827 {
4828 #ifdef SUPPORT_UCP
4829 if (prop_type >= 0)
4830 {
4831 switch(prop_type)
4832 {
4833 case PT_ANY:
4834 for (fi = min;; fi++)
4835 {
4836 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4838 if (fi >= max) RRETURN(MATCH_NOMATCH);
4839 if (eptr >= md->end_subject)
4840 {
4841 SCHECK_PARTIAL();
4842 RRETURN(MATCH_NOMATCH);
4843 }
4844 GETCHARINCTEST(c, eptr);
4845 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4846 }
4847 /* Control never gets here */
4848
4849 case PT_LAMP:
4850 for (fi = min;; fi++)
4851 {
4852 int chartype;
4853 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4854 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4855 if (fi >= max) RRETURN(MATCH_NOMATCH);
4856 if (eptr >= md->end_subject)
4857 {
4858 SCHECK_PARTIAL();
4859 RRETURN(MATCH_NOMATCH);
4860 }
4861 GETCHARINCTEST(c, eptr);
4862 chartype = UCD_CHARTYPE(c);
4863 if ((chartype == ucp_Lu ||
4864 chartype == ucp_Ll ||
4865 chartype == ucp_Lt) == prop_fail_result)
4866 RRETURN(MATCH_NOMATCH);
4867 }
4868 /* Control never gets here */
4869
4870 case PT_GC:
4871 for (fi = min;; fi++)
4872 {
4873 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4875 if (fi >= max) RRETURN(MATCH_NOMATCH);
4876 if (eptr >= md->end_subject)
4877 {
4878 SCHECK_PARTIAL();
4879 RRETURN(MATCH_NOMATCH);
4880 }
4881 GETCHARINCTEST(c, eptr);
4882 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4883 RRETURN(MATCH_NOMATCH);
4884 }
4885 /* Control never gets here */
4886
4887 case PT_PC:
4888 for (fi = min;; fi++)
4889 {
4890 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4892 if (fi >= max) RRETURN(MATCH_NOMATCH);
4893 if (eptr >= md->end_subject)
4894 {
4895 SCHECK_PARTIAL();
4896 RRETURN(MATCH_NOMATCH);
4897 }
4898 GETCHARINCTEST(c, eptr);
4899 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4900 RRETURN(MATCH_NOMATCH);
4901 }
4902 /* Control never gets here */
4903
4904 case PT_SC:
4905 for (fi = min;; fi++)
4906 {
4907 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4909 if (fi >= max) RRETURN(MATCH_NOMATCH);
4910 if (eptr >= md->end_subject)
4911 {
4912 SCHECK_PARTIAL();
4913 RRETURN(MATCH_NOMATCH);
4914 }
4915 GETCHARINCTEST(c, eptr);
4916 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4917 RRETURN(MATCH_NOMATCH);
4918 }
4919 /* Control never gets here */
4920
4921 case PT_ALNUM:
4922 for (fi = min;; fi++)
4923 {
4924 int category;
4925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4927 if (fi >= max) RRETURN(MATCH_NOMATCH);
4928 if (eptr >= md->end_subject)
4929 {
4930 SCHECK_PARTIAL();
4931 RRETURN(MATCH_NOMATCH);
4932 }
4933 GETCHARINCTEST(c, eptr);
4934 category = UCD_CATEGORY(c);
4935 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4936 RRETURN(MATCH_NOMATCH);
4937 }
4938 /* Control never gets here */
4939
4940 case PT_SPACE: /* Perl space */
4941 for (fi = min;; fi++)
4942 {
4943 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4945 if (fi >= max) RRETURN(MATCH_NOMATCH);
4946 if (eptr >= md->end_subject)
4947 {
4948 SCHECK_PARTIAL();
4949 RRETURN(MATCH_NOMATCH);
4950 }
4951 GETCHARINCTEST(c, eptr);
4952 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4953 c == CHAR_FF || c == CHAR_CR)
4954 == prop_fail_result)
4955 RRETURN(MATCH_NOMATCH);
4956 }
4957 /* Control never gets here */
4958
4959 case PT_PXSPACE: /* POSIX space */
4960 for (fi = min;; fi++)
4961 {
4962 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4964 if (fi >= max) RRETURN(MATCH_NOMATCH);
4965 if (eptr >= md->end_subject)
4966 {
4967 SCHECK_PARTIAL();
4968 RRETURN(MATCH_NOMATCH);
4969 }
4970 GETCHARINCTEST(c, eptr);
4971 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4972 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4973 == prop_fail_result)
4974 RRETURN(MATCH_NOMATCH);
4975 }
4976 /* Control never gets here */
4977
4978 case PT_WORD:
4979 for (fi = min;; fi++)
4980 {
4981 int category;
4982 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4983 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4984 if (fi >= max) RRETURN(MATCH_NOMATCH);
4985 if (eptr >= md->end_subject)
4986 {
4987 SCHECK_PARTIAL();
4988 RRETURN(MATCH_NOMATCH);
4989 }
4990 GETCHARINCTEST(c, eptr);
4991 category = UCD_CATEGORY(c);
4992 if ((category == ucp_L ||
4993 category == ucp_N ||
4994 c == CHAR_UNDERSCORE)
4995 == prop_fail_result)
4996 RRETURN(MATCH_NOMATCH);
4997 }
4998 /* Control never gets here */
4999
5000 /* This should never occur */
5001
5002 default:
5003 RRETURN(PCRE_ERROR_INTERNAL);
5004 }
5005 }
5006
5007 /* Match extended Unicode sequences. We will get here only if the
5008 support is in the binary; otherwise a compile-time error occurs. */
5009
5010 else if (ctype == OP_EXTUNI)
5011 {
5012 for (fi = min;; fi++)
5013 {
5014 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5016 if (fi >= max) RRETURN(MATCH_NOMATCH);
5017 if (eptr >= md->end_subject)
5018 {
5019 SCHECK_PARTIAL();
5020 RRETURN(MATCH_NOMATCH);
5021 }
5022 GETCHARINCTEST(c, eptr);
5023 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
5024 while (eptr < md->end_subject)
5025 {
5026 int len = 1;
5027 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5028 if (UCD_CATEGORY(c) != ucp_M) break;
5029 eptr += len;
5030 }
5031 CHECK_PARTIAL();
5032 }
5033 }
5034 else
5035 #endif /* SUPPORT_UCP */
5036
5037 #ifdef SUPPORT_UTF
5038 if (utf)
5039 {
5040 for (fi = min;; fi++)
5041 {
5042 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5044 if (fi >= max) RRETURN(MATCH_NOMATCH);
5045 if (eptr >= md->end_subject)
5046 {
5047 SCHECK_PARTIAL();
5048 RRETURN(MATCH_NOMATCH);
5049 }
5050 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5051 RRETURN(MATCH_NOMATCH);
5052 GETCHARINC(c, eptr);
5053 switch(ctype)
5054 {
5055 case OP_ANY: /* This is the non-NL case */
5056 if (md->partial != 0 && /* Take care with CRLF partial */
5057 eptr >= md->end_subject &&
5058 NLBLOCK->nltype == NLTYPE_FIXED &&
5059 NLBLOCK->nllen == 2 &&
5060 c == NLBLOCK->nl[0])
5061 {
5062 md->hitend = TRUE;
5063 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5064 }
5065 break;
5066
5067 case OP_ALLANY:
5068 case OP_ANYBYTE:
5069 break;
5070
5071 case OP_ANYNL:
5072 switch(c)
5073 {
5074 default: RRETURN(MATCH_NOMATCH);
5075 case 0x000d:
5076 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5077 break;
5078 case 0x000a:
5079 break;
5080
5081 case 0x000b:
5082 case 0x000c:
5083 case 0x0085:
5084 case 0x2028:
5085 case 0x2029:
5086 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5087 break;
5088 }
5089 break;
5090
5091 case OP_NOT_HSPACE:
5092 switch(c)
5093 {
5094 default: break;
5095 case 0x09: /* HT */
5096 case 0x20: /* SPACE */
5097 case 0xa0: /* NBSP */
5098 case 0x1680: /* OGHAM SPACE MARK */
5099 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5100 case 0x2000: /* EN QUAD */
5101 case 0x2001: /* EM QUAD */
5102 case 0x2002: /* EN SPACE */
5103 case 0x2003: /* EM SPACE */
5104 case 0x2004: /* THREE-PER-EM SPACE */
5105 case 0x2005: /* FOUR-PER-EM SPACE */
5106 case 0x2006: /* SIX-PER-EM SPACE */
5107 case 0x2007: /* FIGURE SPACE */
5108 case 0x2008: /* PUNCTUATION SPACE */
5109 case 0x2009: /* THIN SPACE */
5110 case 0x200A: /* HAIR SPACE */
5111 case 0x202f: /* NARROW NO-BREAK SPACE */
5112 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5113 case 0x3000: /* IDEOGRAPHIC SPACE */
5114 RRETURN(MATCH_NOMATCH);
5115 }
5116 break;
5117
5118 case OP_HSPACE:
5119 switch(c)
5120 {
5121 default: RRETURN(MATCH_NOMATCH);
5122 case 0x09: /* HT */
5123 case 0x20: /* SPACE */
5124 case 0xa0: /* NBSP */
5125 case 0x1680: /* OGHAM SPACE MARK */
5126 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5127 case 0x2000: /* EN QUAD */
5128 case 0x2001: /* EM QUAD */
5129 case 0x2002: /* EN SPACE */
5130 case 0x2003: /* EM SPACE */
5131 case 0x2004: /* THREE-PER-EM SPACE */
5132 case 0x2005: /* FOUR-PER-EM SPACE */
5133 case 0x2006: /* SIX-PER-EM SPACE */
5134 case 0x2007: /* FIGURE SPACE */
5135 case 0x2008: /* PUNCTUATION SPACE */
5136 case 0x2009: /* THIN SPACE */
5137 case 0x200A: /* HAIR SPACE */
5138 case 0x202f: /* NARROW NO-BREAK SPACE */
5139 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5140 case 0x3000: /* IDEOGRAPHIC SPACE */
5141 break;
5142 }
5143 break;
5144
5145 case OP_NOT_VSPACE:
5146 switch(c)
5147 {
5148 default: break;
5149 case 0x0a: /* LF */
5150 case 0x0b: /* VT */
5151 case 0x0c: /* FF */
5152 case 0x0d: /* CR */
5153 case 0x85: /* NEL */
5154 case 0x2028: /* LINE SEPARATOR */
5155 case 0x2029: /* PARAGRAPH SEPARATOR */
5156 RRETURN(MATCH_NOMATCH);
5157 }
5158 break;
5159
5160 case OP_VSPACE:
5161 switch(c)
5162 {
5163 default: RRETURN(MATCH_NOMATCH);
5164 case 0x0a: /* LF */
5165 case 0x0b: /* VT */
5166 case 0x0c: /* FF */
5167 case 0x0d: /* CR */
5168 case 0x85: /* NEL */
5169 case 0x2028: /* LINE SEPARATOR */
5170 case 0x2029: /* PARAGRAPH SEPARATOR */
5171 break;
5172 }
5173 break;
5174
5175 case OP_NOT_DIGIT:
5176 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5177 RRETURN(MATCH_NOMATCH);
5178 break;
5179
5180 case OP_DIGIT:
5181 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5182 RRETURN(MATCH_NOMATCH);
5183 break;
5184
5185 case OP_NOT_WHITESPACE:
5186 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5187 RRETURN(MATCH_NOMATCH);
5188 break;
5189
5190 case OP_WHITESPACE:
5191 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5192 RRETURN(MATCH_NOMATCH);
5193 break;
5194
5195 case OP_NOT_WORDCHAR:
5196 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5197 RRETURN(MATCH_NOMATCH);
5198 break;
5199
5200 case OP_WORDCHAR:
5201 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5202 RRETURN(MATCH_NOMATCH);
5203 break;
5204
5205 default:
5206 RRETURN(PCRE_ERROR_INTERNAL);
5207 }
5208 }
5209 }
5210 else
5211 #endif
5212 /* Not UTF mode */
5213 {
5214 for (fi = min;; fi++)
5215 {
5216 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5217 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5218 if (fi >= max) RRETURN(MATCH_NOMATCH);
5219 if (eptr >= md->end_subject)
5220 {
5221 SCHECK_PARTIAL();
5222 RRETURN(MATCH_NOMATCH);
5223 }
5224 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5225 RRETURN(MATCH_NOMATCH);
5226 c = *eptr++;
5227 switch(ctype)
5228 {
5229 case OP_ANY: /* This is the non-NL case */
5230 if (md->partial != 0 && /* Take care with CRLF partial */
5231 eptr >= md->end_subject &&
5232 NLBLOCK->nltype == NLTYPE_FIXED &&
5233 NLBLOCK->nllen == 2 &&
5234 c == NLBLOCK->nl[0])
5235 {
5236 md->hitend = TRUE;
5237 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5238 }
5239 break;
5240
5241 case OP_ALLANY:
5242 case OP_ANYBYTE:
5243 break;
5244
5245 case OP_ANYNL:
5246 switch(c)
5247 {
5248 default: RRETURN(MATCH_NOMATCH);
5249 case 0x000d:
5250 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5251 break;
5252
5253 case 0x000a:
5254 break;
5255
5256 case 0x000b:
5257 case 0x000c:
5258 case 0x0085:
5259 #ifdef COMPILE_PCRE16
5260 case 0x2028:
5261 case 0x2029:
5262 #endif
5263 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5264 break;
5265 }
5266 break;
5267
5268 case OP_NOT_HSPACE:
5269 switch(c)
5270 {
5271 default: break;
5272 case 0x09: /* HT */
5273 case 0x20: /* SPACE */
5274 case 0xa0: /* NBSP */
5275 #ifdef COMPILE_PCRE16
5276 case 0x1680: /* OGHAM SPACE MARK */
5277 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5278 case 0x2000: /* EN QUAD */
5279 case 0x2001: /* EM QUAD */
5280 case 0x2002: /* EN SPACE */
5281 case 0x2003: /* EM SPACE */
5282 case 0x2004: /* THREE-PER-EM SPACE */
5283 case 0x2005: /* FOUR-PER-EM SPACE */
5284 case 0x2006: /* SIX-PER-EM SPACE */
5285 case 0x2007: /* FIGURE SPACE */
5286 case 0x2008: /* PUNCTUATION SPACE */
5287 case 0x2009: /* THIN SPACE */
5288 case 0x200A: /* HAIR SPACE */
5289 case 0x202f: /* NARROW NO-BREAK SPACE */
5290 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5291 case 0x3000: /* IDEOGRAPHIC SPACE */
5292 #endif
5293 RRETURN(MATCH_NOMATCH);
5294 }
5295 break;
5296
5297 case OP_HSPACE:
5298 switch(c)
5299 {
5300 default: RRETURN(MATCH_NOMATCH);
5301 case 0x09: /* HT */
5302 case 0x20: /* SPACE */
5303 case 0xa0: /* NBSP */
5304 #ifdef COMPILE_PCRE16
5305 case 0x1680: /* OGHAM SPACE MARK */
5306 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5307 case 0x2000: /* EN QUAD */
5308 case 0x2001: /* EM QUAD */
5309 case 0x2002: /* EN SPACE */
5310 case 0x2003: /* EM SPACE */
5311 case 0x2004: /* THREE-PER-EM SPACE */
5312 case 0x2005: /* FOUR-PER-EM SPACE */
5313 case 0x2006: /* SIX-PER-EM SPACE */
5314 case 0x2007: /* FIGURE SPACE */
5315 case 0x2008: /* PUNCTUATION SPACE */
5316 case 0x2009: /* THIN SPACE */
5317 case 0x200A: /* HAIR SPACE */
5318 case 0x202f: /* NARROW NO-BREAK SPACE */
5319 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5320 case 0x3000: /* IDEOGRAPHIC SPACE */
5321 #endif
5322 break;
5323 }
5324 break;
5325
5326 case OP_NOT_VSPACE:
5327 switch(c)
5328 {
5329 default: break;
5330 case 0x0a: /* LF */
5331 case 0x0b: /* VT */
5332 case 0x0c: /* FF */
5333 case 0x0d: /* CR */
5334 case 0x85: /* NEL */
5335 #ifdef COMPILE_PCRE16
5336 case 0x2028: /* LINE SEPARATOR */
5337 case 0x2029: /* PARAGRAPH SEPARATOR */
5338 #endif
5339 RRETURN(MATCH_NOMATCH);
5340 }
5341 break;
5342
5343 case OP_VSPACE:
5344 switch(c)
5345 {
5346 default: RRETURN(MATCH_NOMATCH);
5347 case 0x0a: /* LF */
5348 case 0x0b: /* VT */
5349 case 0x0c: /* FF */
5350 case 0x0d: /* CR */
5351 case 0x85: /* NEL */
5352 #ifdef COMPILE_PCRE16
5353 case 0x2028: /* LINE SEPARATOR */
5354 case 0x2029: /* PARAGRAPH SEPARATOR */
5355 #endif
5356 break;
5357 }
5358 break;
5359
5360 case OP_NOT_DIGIT:
5361 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5362 break;
5363
5364 case OP_DIGIT:
5365 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5366 break;
5367
5368 case OP_NOT_WHITESPACE:
5369 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5370 break;
5371
5372 case OP_WHITESPACE:
5373 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5374 break;
5375
5376 case OP_NOT_WORDCHAR:
5377 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5378 break;
5379
5380 case OP_WORDCHAR:
5381 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5382 break;
5383
5384 default:
5385 RRETURN(PCRE_ERROR_INTERNAL);
5386 }
5387 }
5388 }
5389 /* Control never gets here */
5390 }
5391
5392 /* If maximizing, it is worth using inline code for speed, doing the type
5393 test once at the start (i.e. keep it out of the loop). Again, keep the
5394 UTF-8 and UCP stuff separate. */
5395
5396 else
5397 {
5398 pp = eptr; /* Remember where we started */
5399
5400 #ifdef SUPPORT_UCP
5401 if (prop_type >= 0)
5402 {
5403 switch(prop_type)
5404 {
5405 case PT_ANY:
5406 for (i = min; i < max; i++)
5407 {
5408 int len = 1;
5409 if (eptr >= md->end_subject)
5410 {
5411 SCHECK_PARTIAL();
5412 break;
5413 }
5414 GETCHARLENTEST(c, eptr, len);
5415 if (prop_fail_result) break;
5416 eptr+= len;
5417 }
5418 break;
5419
5420 case PT_LAMP:
5421 for (i = min; i < max; i++)
5422 {
5423 int chartype;
5424 int len = 1;
5425 if (eptr >= md->end_subject)
5426 {
5427 SCHECK_PARTIAL();
5428 break;
5429 }
5430 GETCHARLENTEST(c, eptr, len);
5431 chartype = UCD_CHARTYPE(c);
5432 if ((chartype == ucp_Lu ||
5433 chartype == ucp_Ll ||
5434 chartype == ucp_Lt) == prop_fail_result)
5435 break;
5436 eptr+= len;
5437 }
5438 break;
5439
5440 case PT_GC:
5441 for (i = min; i < max; i++)
5442 {
5443 int len = 1;
5444 if (eptr >= md->end_subject)
5445 {
5446 SCHECK_PARTIAL();
5447 break;
5448 }
5449 GETCHARLENTEST(c, eptr, len);
5450 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5451 eptr+= len;
5452 }
5453 break;
5454
5455 case PT_PC:
5456 for (i = min; i < max; i++)
5457 {
5458 int len = 1;
5459 if (eptr >= md->end_subject)
5460 {
5461 SCHECK_PARTIAL();
5462 break;
5463 }
5464 GETCHARLENTEST(c, eptr, len);
5465 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5466 eptr+= len;
5467 }
5468 break;
5469
5470 case PT_SC:
5471 for (i = min; i < max; i++)
5472 {
5473 int len = 1;
5474 if (eptr >= md->end_subject)
5475 {
5476 SCHECK_PARTIAL();
5477 break;
5478 }
5479 GETCHARLENTEST(c, eptr, len);
5480 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5481 eptr+= len;
5482 }
5483 break;
5484
5485 case PT_ALNUM:
5486 for (i = min; i < max; i++)
5487 {
5488 int category;
5489 int len = 1;
5490 if (eptr >= md->end_subject)
5491 {
5492 SCHECK_PARTIAL();
5493 break;
5494 }
5495 GETCHARLENTEST(c, eptr, len);
5496 category = UCD_CATEGORY(c);
5497 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5498 break;
5499 eptr+= len;
5500 }
5501 break;
5502
5503 case PT_SPACE: /* Perl space */
5504 for (i = min; i < max; i++)
5505 {
5506 int len = 1;
5507 if (eptr >= md->end_subject)
5508 {
5509 SCHECK_PARTIAL();
5510 break;
5511 }
5512 GETCHARLENTEST(c, eptr, len);
5513 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5514 c == CHAR_FF || c == CHAR_CR)
5515 == prop_fail_result)
5516 break;
5517 eptr+= len;
5518 }
5519 break;
5520
5521 case PT_PXSPACE: /* POSIX space */
5522 for (i = min; i < max; i++)
5523 {
5524 int len = 1;
5525 if (eptr >= md->end_subject)
5526 {
5527 SCHECK_PARTIAL();
5528 break;
5529 }
5530 GETCHARLENTEST(c, eptr, len);
5531 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5532 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5533 == prop_fail_result)
5534 break;
5535 eptr+= len;
5536 }
5537 break;
5538
5539 case PT_WORD:
5540 for (i = min; i < max; i++)
5541 {
5542 int category;
5543 int len = 1;
5544 if (eptr >= md->end_subject)
5545 {
5546 SCHECK_PARTIAL();
5547 break;
5548 }
5549 GETCHARLENTEST(c, eptr, len);
5550 category = UCD_CATEGORY(c);
5551 if ((category == ucp_L || category == ucp_N ||
5552 c == CHAR_UNDERSCORE) == prop_fail_result)
5553 break;
5554 eptr+= len;
5555 }
5556 break;
5557
5558 default:
5559 RRETURN(PCRE_ERROR_INTERNAL);
5560 }
5561
5562 /* eptr is now past the end of the maximum run */
5563
5564 if (possessive) continue;
5565 for(;;)
5566 {
5567 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5569 if (eptr-- == pp) break; /* Stop if tried at original pos */
5570 if (utf) BACKCHAR(eptr);
5571 }
5572 }
5573
5574 /* Match extended Unicode sequences. We will get here only if the
5575 support is in the binary; otherwise a compile-time error occurs. */
5576
5577 else if (ctype == OP_EXTUNI)
5578 {
5579 for (i = min; i < max; i++)
5580 {
5581 int len = 1;
5582 if (eptr >= md->end_subject)
5583 {
5584 SCHECK_PARTIAL();
5585 break;
5586 }
5587 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5588 if (UCD_CATEGORY(c) == ucp_M) break;
5589 eptr += len;
5590 while (eptr < md->end_subject)
5591 {
5592 len = 1;
5593 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5594 if (UCD_CATEGORY(c) != ucp_M) break;
5595 eptr += len;
5596 }
5597 CHECK_PARTIAL();
5598 }
5599
5600 /* eptr is now past the end of the maximum run */
5601
5602 if (possessive) continue;
5603
5604 for(;;)
5605 {
5606 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5608 if (eptr-- == pp) break; /* Stop if tried at original pos */
5609 for (;;) /* Move back over one extended */
5610 {
5611 if (!utf) c = *eptr; else
5612 {
5613 BACKCHAR(eptr);
5614 GETCHAR(c, eptr);
5615 }
5616 if (UCD_CATEGORY(c) != ucp_M) break;
5617 eptr--;
5618 }
5619 }
5620 }
5621
5622 else
5623 #endif /* SUPPORT_UCP */
5624
5625 #ifdef SUPPORT_UTF
5626 if (utf)
5627 {
5628 switch(ctype)
5629 {
5630 case OP_ANY:
5631 if (max < INT_MAX)
5632 {
5633 for (i = min; i < max; i++)
5634 {
5635 if (eptr >= md->end_subject)
5636 {
5637 SCHECK_PARTIAL();
5638 break;
5639 }
5640 if (IS_NEWLINE(eptr)) break;
5641 if (md->partial != 0 && /* Take care with CRLF partial */
5642 eptr + 1 >= md->end_subject &&
5643 NLBLOCK->nltype == NLTYPE_FIXED &&
5644 NLBLOCK->nllen == 2 &&
5645 *eptr == NLBLOCK->nl[0])
5646 {
5647 md->hitend = TRUE;
5648 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5649 }
5650 eptr++;
5651 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5652 }
5653 }
5654
5655 /* Handle unlimited UTF-8 repeat */
5656
5657 else
5658 {
5659 for (i = min; i < max; i++)
5660 {
5661 if (eptr >= md->end_subject)
5662 {
5663 SCHECK_PARTIAL();
5664 break;
5665 }
5666 if (IS_NEWLINE(eptr)) break;
5667 if (md->partial != 0 && /* Take care with CRLF partial */
5668 eptr + 1 >= md->end_subject &&
5669 NLBLOCK->nltype == NLTYPE_FIXED &&
5670 NLBLOCK->nllen == 2 &&
5671 *eptr == NLBLOCK->nl[0])
5672 {
5673 md->hitend = TRUE;
5674 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5675 }
5676 eptr++;
5677 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5678 }
5679 }
5680 break;
5681
5682 case OP_ALLANY:
5683 if (max < INT_MAX)
5684 {
5685 for (i = min; i < max; i++)
5686 {
5687 if (eptr >= md->end_subject)
5688 {
5689 SCHECK_PARTIAL();
5690 break;
5691 }
5692 eptr++;
5693 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5694 }
5695 }
5696 else
5697 {
5698 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5699 SCHECK_PARTIAL();
5700 }
5701 break;
5702
5703 /* The byte case is the same as non-UTF8 */
5704
5705 case OP_ANYBYTE:
5706 c = max - min;
5707 if (c > (unsigned int)(md->end_subject - eptr))
5708 {
5709 eptr = md->end_subject;
5710 SCHECK_PARTIAL();
5711 }
5712 else eptr += c;
5713 break;
5714
5715 case OP_ANYNL:
5716 for (i = min; i < max; i++)
5717 {
5718 int len = 1;
5719 if (eptr >= md->end_subject)
5720 {
5721 SCHECK_PARTIAL();
5722 break;
5723 }
5724 GETCHARLEN(c, eptr, len);
5725 if (c == 0x000d)
5726 {
5727 if (++eptr >= md->end_subject) break;
5728 if (*eptr == 0x000a) eptr++;
5729 }
5730 else
5731 {
5732 if (c != 0x000a &&
5733 (md->bsr_anycrlf ||
5734 (c != 0x000b && c != 0x000c &&
5735 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5736 break;
5737 eptr += len;
5738 }
5739 }
5740 break;
5741
5742 case OP_NOT_HSPACE:
5743 case OP_HSPACE:
5744 for (i = min; i < max; i++)
5745 {
5746 BOOL gotspace;
5747 int len = 1;
5748 if (eptr >= md->end_subject)
5749 {
5750 SCHECK_PARTIAL();
5751 break;
5752 }
5753 GETCHARLEN(c, eptr, len);
5754 switch(c)
5755 {
5756 default: gotspace = FALSE; break;
5757 case 0x09: /* HT */
5758 case 0x20: /* SPACE */
5759 case 0xa0: /* NBSP */
5760 case 0x1680: /* OGHAM SPACE MARK */
5761 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5762 case 0x2000: /* EN QUAD */
5763 case 0x2001: /* EM QUAD */
5764 case 0x2002: /* EN SPACE */
5765 case 0x2003: /* EM SPACE */
5766 case 0x2004: /* THREE-PER-EM SPACE */
5767 case 0x2005: /* FOUR-PER-EM SPACE */
5768 case 0x2006: /* SIX-PER-EM SPACE */
5769 case 0x2007: /* FIGURE SPACE */
5770 case 0x2008: /* PUNCTUATION SPACE */
5771 case 0x2009: /* THIN SPACE */
5772 case 0x200A: /* HAIR SPACE */
5773 case 0x202f: /* NARROW NO-BREAK SPACE */
5774 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5775 case 0x3000: /* IDEOGRAPHIC SPACE */
5776 gotspace = TRUE;
5777 break;
5778 }
5779 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5780 eptr += len;
5781 }
5782 break;
5783
5784 case OP_NOT_VSPACE:
5785 case OP_VSPACE:
5786 for (i = min; i < max; i++)
5787 {
5788 BOOL gotspace;
5789 int len = 1;
5790 if (eptr >= md->end_subject)
5791 {
5792 SCHECK_PARTIAL();
5793 break;
5794 }
5795 GETCHARLEN(c, eptr, len);
5796 switch(c)
5797 {
5798 default: gotspace = FALSE; break;
5799 case 0x0a: /* LF */
5800 case 0x0b: /* VT */
5801 case 0x0c: /* FF */
5802 case 0x0d: /* CR */
5803 case 0x85: /* NEL */
5804 case 0x2028: /* LINE SEPARATOR */
5805 case 0x2029: /* PARAGRAPH SEPARATOR */
5806 gotspace = TRUE;
5807 break;
5808 }
5809 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5810 eptr += len;
5811 }
5812 break;
5813
5814 case OP_NOT_DIGIT:
5815 for (i = min; i < max; i++)
5816 {
5817 int len = 1;
5818 if (eptr >= md->end_subject)
5819 {
5820 SCHECK_PARTIAL();
5821 break;
5822 }
5823 GETCHARLEN(c, eptr, len);
5824 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5825 eptr+= len;
5826 }
5827 break;
5828
5829 case OP_DIGIT:
5830 for (i = min; i < max; i++)
5831 {
5832 int len = 1;
5833 if (eptr >= md->end_subject)
5834 {
5835 SCHECK_PARTIAL();
5836 break;
5837 }
5838 GETCHARLEN(c, eptr, len);
5839 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5840 eptr+= len;
5841 }
5842 break;
5843
5844 case OP_NOT_WHITESPACE:
5845 for (i = min; i < max; i++)
5846 {
5847 int len = 1;
5848 if (eptr >= md->end_subject)
5849 {
5850 SCHECK_PARTIAL();
5851 break;
5852 }
5853 GETCHARLEN(c, eptr, len);
5854 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5855 eptr+= len;
5856 }
5857 break;
5858
5859 case OP_WHITESPACE:
5860 for (i = min; i < max; i++)
5861 {
5862 int len = 1;
5863 if (eptr >= md->end_subject)
5864 {
5865 SCHECK_PARTIAL();
5866 break;
5867 }
5868 GETCHARLEN(c, eptr, len);
5869 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5870 eptr+= len;
5871 }
5872 break;
5873
5874 case OP_NOT_WORDCHAR:
5875 for (i = min; i < max; i++)
5876 {
5877 int len = 1;
5878 if (eptr >= md->end_subject)
5879 {
5880 SCHECK_PARTIAL();
5881 break;
5882 }
5883 GETCHARLEN(c, eptr, len);
5884 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5885 eptr+= len;
5886 }
5887 break;
5888
5889 case OP_WORDCHAR:
5890 for (i = min; i < max; i++)
5891 {
5892 int len = 1;
5893 if (eptr >= md->end_subject)
5894 {
5895 SCHECK_PARTIAL();
5896 break;
5897 }
5898 GETCHARLEN(c, eptr, len);
5899 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5900 eptr+= len;
5901 }
5902 break;
5903
5904 default:
5905 RRETURN(PCRE_ERROR_INTERNAL);
5906 }
5907
5908 /* eptr is now past the end of the maximum run. If possessive, we are
5909 done (no backing up). Otherwise, match at this position; anything other
5910 than no match is immediately returned. For nomatch, back up one
5911 character, unless we are matching \R and the last thing matched was
5912 \r\n, in which case, back up two bytes. */
5913
5914 if (possessive) continue;
5915 for(;;)
5916 {
5917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5919 if (eptr-- == pp) break; /* Stop if tried at original pos */
5920 BACKCHAR(eptr);
5921 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5922 eptr[-1] == '\r') eptr--;
5923 }
5924 }
5925 else
5926 #endif /* SUPPORT_UTF */
5927 /* Not UTF mode */
5928 {
5929 switch(ctype)
5930 {
5931 case OP_ANY:
5932 for (i = min; i < max; i++)
5933 {
5934 if (eptr >= md->end_subject)
5935 {
5936 SCHECK_PARTIAL();
5937 break;
5938 }
5939 if (IS_NEWLINE(eptr)) break;
5940 if (md->partial != 0 && /* Take care with CRLF partial */
5941 eptr + 1 >= md->end_subject &&
5942 NLBLOCK->nltype == NLTYPE_FIXED &&
5943 NLBLOCK->nllen == 2 &&
5944 *eptr == NLBLOCK->nl[0])
5945 {
5946 md->hitend = TRUE;
5947 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5948 }
5949 eptr++;
5950 }
5951 break;
5952
5953 case OP_ALLANY:
5954 case OP_ANYBYTE:
5955 c = max - min;
5956 if (c > (unsigned int)(md->end_subject - eptr))
5957 {
5958 eptr = md->end_subject;
5959 SCHECK_PARTIAL();
5960 }
5961 else eptr += c;
5962 break;
5963
5964 case OP_ANYNL:
5965 for (i = min; i < max; i++)
5966 {
5967 if (eptr >= md->end_subject)
5968 {
5969 SCHECK_PARTIAL();
5970 break;
5971 }
5972 c = *eptr;
5973 if (c == 0x000d)
5974 {
5975 if (++eptr >= md->end_subject) break;
5976 if (*eptr == 0x000a) eptr++;
5977 }
5978 else
5979 {
5980 if (c != 0x000a && (md->bsr_anycrlf ||
5981 (c != 0x000b && c != 0x000c && c != 0x0085
5982 #ifdef COMPILE_PCRE16
5983 && c != 0x2028 && c != 0x2029
5984 #endif
5985 ))) break;
5986 eptr++;
5987 }
5988 }
5989 break;
5990
5991 case OP_NOT_HSPACE:
5992 for (i = min; i < max; i++)
5993 {
5994 if (eptr >= md->end_subject)
5995 {
5996 SCHECK_PARTIAL();
5997 break;
5998 }
5999 c = *eptr;
6000 if (c == 0x09 || c == 0x20 || c == 0xa0
6001 #ifdef COMPILE_PCRE16
6002 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6003 || c == 0x202f || c == 0x205f || c == 0x3000
6004 #endif
6005 ) break;
6006 eptr++;
6007 }
6008 break;
6009
6010 case OP_HSPACE:
6011 for (i = min; i < max; i++)
6012 {
6013 if (eptr >= md->end_subject)
6014 {
6015 SCHECK_PARTIAL();
6016 break;
6017 }
6018 c = *eptr;
6019 if (c != 0x09 && c != 0x20 && c != 0xa0
6020 #ifdef COMPILE_PCRE16
6021 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6022 && c != 0x202f && c != 0x205f && c != 0x3000
6023 #endif
6024 ) break;
6025 eptr++;
6026 }
6027 break;
6028
6029 case OP_NOT_VSPACE:
6030 for (i = min; i < max; i++)
6031 {
6032 if (eptr >= md->end_subject)
6033 {
6034 SCHECK_PARTIAL();
6035 break;
6036 }
6037 c = *eptr;
6038 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6039 #ifdef COMPILE_PCRE16
6040 || c == 0x2028 || c == 0x2029
6041 #endif
6042 ) break;
6043 eptr++;
6044 }
6045 break;
6046
6047 case OP_VSPACE:
6048 for (i = min; i < max; i++)
6049 {
6050 if (eptr >= md->end_subject)
6051 {
6052 SCHECK_PARTIAL();
6053 break;
6054 }
6055 c = *eptr;
6056 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6057 #ifdef COMPILE_PCRE16
6058 && c != 0x2028 && c != 0x2029
6059 #endif
6060 ) break;
6061 eptr++;
6062 }
6063 break;
6064
6065 case OP_NOT_DIGIT:
6066 for (i = min; i < max; i++)
6067 {
6068 if (eptr >= md->end_subject)
6069 {
6070 SCHECK_PARTIAL();
6071 break;
6072 }
6073 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6074 eptr++;
6075 }
6076 break;
6077
6078 case OP_DIGIT:
6079 for (i = min; i < max; i++)
6080 {
6081 if (eptr >= md->end_subject)
6082 {
6083 SCHECK_PARTIAL();
6084 break;
6085 }
6086 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6087 eptr++;
6088 }
6089 break;
6090
6091 case OP_NOT_WHITESPACE:
6092 for (i = min; i < max; i++)
6093 {
6094 if (eptr >= md->end_subject)
6095 {
6096 SCHECK_PARTIAL();
6097 break;
6098 }
6099 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6100 eptr++;
6101 }
6102 break;
6103
6104 case OP_WHITESPACE:
6105 for (i = min; i < max; i++)
6106 {
6107 if (eptr >= md->end_subject)
6108 {
6109 SCHECK_PARTIAL();
6110 break;
6111 }
6112 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6113 eptr++;
6114 }
6115 break;
6116
6117 case OP_NOT_WORDCHAR:
6118 for (i = min; i < max; i++)
6119 {
6120 if (eptr >= md->end_subject)
6121 {
6122 SCHECK_PARTIAL();
6123 break;
6124 }
6125 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6126 eptr++;
6127 }
6128 break;
6129
6130 case OP_WORDCHAR:
6131 for (i = min; i < max; i++)
6132 {
6133 if (eptr >= md->end_subject)
6134 {
6135 SCHECK_PARTIAL();
6136 break;
6137 }
6138 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6139 eptr++;
6140 }
6141 break;
6142
6143 default:
6144 RRETURN(PCRE_ERROR_INTERNAL);
6145 }
6146
6147 /* eptr is now past the end of the maximum run. If possessive, we are
6148 done (no backing up). Otherwise, match at this position; anything other
6149 than no match is immediately returned. For nomatch, back up one
6150 character (byte), unless we are matching \R and the last thing matched
6151 was \r\n, in which case, back up two bytes. */
6152
6153 if (possessive) continue;
6154 while (eptr >= pp)
6155 {
6156 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6157 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6158 eptr--;
6159 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6160 eptr[-1] == '\r') eptr--;
6161 }
6162 }
6163
6164 /* Get here if we can't make it match with any permitted repetitions */
6165
6166 RRETURN(MATCH_NOMATCH);
6167 }
6168 /* Control never gets here */
6169
6170 /* There's been some horrible disaster. Arrival here can only mean there is
6171 something seriously wrong in the code above or the OP_xxx definitions. */
6172
6173 default:
6174 DPRINTF(("Unknown opcode %d\n", *ecode));
6175 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6176 }
6177
6178 /* Do not stick any code in here without much thought; it is assumed
6179 that "continue" in the code above comes out to here to repeat the main
6180 loop. */
6181
6182 } /* End of main loop */
6183 /* Control never reaches here */
6184
6185
6186 /* When compiling to use the heap rather than the stack for recursive calls to
6187 match(), the RRETURN() macro jumps here. The number that is saved in
6188 frame->Xwhere indicates which label we actually want to return to. */
6189
6190 #ifdef NO_RECURSE
6191 #define LBL(val) case val: goto L_RM##val;
6192 HEAP_RETURN:
6193 switch (frame->Xwhere)
6194 {
6195 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6196 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6197 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6198 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6199 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6200 LBL(65) LBL(66)
6201 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6202 LBL(21)
6203 #endif
6204 #ifdef SUPPORT_UTF
6205 LBL(16) LBL(18) LBL(20)
6206 LBL(22) LBL(23) LBL(28) LBL(30)
6207 LBL(32) LBL(34) LBL(42) LBL(46)
6208 #ifdef SUPPORT_UCP
6209 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6210 LBL(59) LBL(60) LBL(61) LBL(62)
6211 #endif /* SUPPORT_UCP */
6212 #endif /* SUPPORT_UTF */
6213 default:
6214 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6215
6216 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6217
6218 return PCRE_ERROR_INTERNAL;
6219 }
6220 #undef LBL
6221 #endif /* NO_RECURSE */
6222 }
6223
6224
6225 /***************************************************************************
6226 ****************************************************************************
6227 RECURSION IN THE match() FUNCTION
6228
6229 Undefine all the macros that were defined above to handle this. */
6230
6231 #ifdef NO_RECURSE /* Only the NO_RECURSE (heap-frame) build needs these #undefs */
6232 #undef eptr
6233 #undef ecode
6234 #undef mstart
6235 #undef offset_top
6236 #undef eptrb
6237 #undef flags
6238
6239 #undef callpat
6240 #undef charptr
6241 #undef data
6242 #undef next
6243 #undef pp
6244 #undef prev
6245 #undef saved_eptr
6246
6247 #undef new_recursive
6248
6249 #undef cur_is_word
6250 #undef condition
6251 #undef prev_is_word
6252
6253 #undef ctype
6254 #undef length
6255 #undef max
6256 #undef min
6257 #undef number
6258 #undef offset
6259 #undef op
6260 #undef save_capture_last
6261 #undef save_offset1
6262 #undef save_offset2
6263 #undef save_offset3
6264 #undef stacksave
6265
6266 #undef newptrb
6267
6268 #endif /* NO_RECURSE */
6269
6270 /* These two are defined as macros in both cases */
6271
6272 #undef fc
6273 #undef fi
6274
6275 /***************************************************************************
6276 ***************************************************************************/
6277
6278
6279
6280 /*************************************************
6281 * Execute a Regular Expression *
6282 *************************************************/
6283
6284 /* This function applies a compiled re to a subject string and picks out
6285 portions of the string if it matches. Two elements in the vector are set for
6286 each substring: the offsets to the start and end of the substring.
6287
6288 Arguments:
6289 argument_re points to the compiled expression
6290 extra_data points to extra data or is NULL
6291 subject points to the subject string
6292 length length of subject string (may contain binary zeros)
6293 start_offset where to start in the subject string
6294 options option bits
6295 offsets points to a vector of ints to be filled in with offsets
6296 offsetcount the number of elements in the vector
6297
6298 Returns: > 0 => success; value is the number of elements filled in
6299 = 0 => success, but offsets is not big enough
6300 -1 => failed to match
6301 < -1 => some kind of unexpected problem
6302 */
6303
6304 #ifdef COMPILE_PCRE8
6305 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6306 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6307 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6308 int offsetcount)
6309 #else
6310 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6311 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6312 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6313 int offsetcount)
6314 #endif
6315 {
6316 int rc, ocount, arg_offset_max;
6317 int newline;
6318 BOOL using_temporary_offsets = FALSE;
6319 BOOL anchored;
6320 BOOL startline;
6321 BOOL firstline;
6322 BOOL utf;
6323 BOOL has_first_char = FALSE;
6324 BOOL has_req_char = FALSE;
6325 pcre_uchar first_char = 0;
6326 pcre_uchar first_char2 = 0;
6327 pcre_uchar req_char = 0;
6328 pcre_uchar req_char2 = 0;
6329 match_data match_block;
6330 match_data *md = &match_block;
6331 const pcre_uint8 *tables;
6332 const pcre_uint8 *start_bits = NULL;
6333 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6334 PCRE_PUCHAR end_subject;
6335 PCRE_PUCHAR start_partial = NULL;
6336 PCRE_PUCHAR req_char_ptr = start_match - 1;
6337
6338 const pcre_study_data *study;
6339 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6340
6341 /* Check for the special magic call that measures the size of the stack used
6342 per recursive call of match(). Without the funny casting for sizeof, a Windows
6343 compiler gave this error: "unary minus operator applied to unsigned type,
6344 result still unsigned". Hopefully the cast fixes that. */
6345
6346 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6347 start_offset == -999)
6348 #ifdef NO_RECURSE
6349 return -((int)sizeof(heapframe));
6350 #else
6351 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6352 #endif
6353
6354 /* Plausibility checks */
6355
6356 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6357 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6358 return PCRE_ERROR_NULL;
6359 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6360 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6361
6362 /* Check that the first field in the block is the magic number. If it is not,
6363 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6364 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6365 means that the pattern is likely compiled with different endianness. */
6366
6367 if (re->magic_number != MAGIC_NUMBER)
6368 return re->magic_number == REVERSED_MAGIC_NUMBER?
6369 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6370 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6371
6372 /* These two settings are used in the code for checking a UTF-8 string that
6373 follows immediately afterwards. Other values in the md block are used only
6374 during "normal" pcre_exec() processing, not when the JIT support is in use,
6375 so they are set up later. */
6376
6377 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6378 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6379 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6380 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6381
6382 /* Check a UTF-8 string if required. Pass back the character offset and error
6383 code for an invalid string if a results vector is available. */
6384
6385 #ifdef SUPPORT_UTF
6386 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6387 {
6388 int erroroffset;
6389 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6390 if (errorcode != 0)
6391 {
6392 if (offsetcount >= 2)
6393 {
6394 offsets[0] = erroroffset;
6395 offsets[1] = errorcode;
6396 }
6397 #ifdef COMPILE_PCRE16
6398 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6399 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6400 #else
6401 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6402 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6403 #endif
6404 }
6405
6406 /* Check that a start_offset points to the start of a UTF character. */
6407 if (start_offset > 0 && start_offset < length &&
6408 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6409 return PCRE_ERROR_BADUTF8_OFFSET;
6410 }
6411 #endif
6412
6413 /* If the pattern was successfully studied with JIT support, run the JIT
6414 executable instead of the rest of this function. Most options must be set at
6415 compile time for the JIT code to be usable. Fall back to the normal code path if
6416 an unsupported flag is set. */
6417
6418 #ifdef SUPPORT_JIT
6419 if (extra_data != NULL
6420 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6421 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6422 && extra_data->executable_jit != NULL
6423 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6424 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6425 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6426 {
6427 rc = PRIV(jit_exec)(re, extra_data->executable_jit,
6428 (const pcre_uchar *)subject, length, start_offset, options,
6429 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6430 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6431
6432 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6433 mode is not compiled. In this case we simply fall back to the interpreter. */
6434
6435 if (rc != PCRE_ERROR_NULL) /* JIT was used */
6436 {
6437 ((pcre_extra *)extra_data)->flags |= PCRE_EXTRA_USED_JIT;
6438 return rc;
6439 }
6440 }
6441 #endif
6442
6443 /* Carry on with non-JIT matching. This information is for finding all the
6444 numbers associated with a given name, for condition testing. */
6445
6446 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6447 md->name_count = re->name_count;
6448 md->name_entry_size = re->name_entry_size;
6449
6450 /* Fish out the optional data from the extra_data structure, first setting
6451 the default values. */
6452
6453 study = NULL;
6454 md->match_limit = MATCH_LIMIT;
6455 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6456 md->callout_data = NULL;
6457
6458 /* The table pointer is always in native byte order. */
6459
6460 tables = re->tables;
6461
6462 if (extra_data != NULL)
6463 {
6464 register unsigned int flags = extra_data->flags;
6465 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6466 study = (const pcre_study_data *)extra_data->study_data;
6467 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6468 md->match_limit = extra_data->match_limit;
6469 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6470 md->match_limit_recursion = extra_data->match_limit_recursion;
6471 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6472 md->callout_data = extra_data->callout_data;
6473 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6474 ((pcre_extra *)extra_data)->flags &= ~PCRE_EXTRA_USED_JIT; /* JIT not used */
6475 }
6476
6477 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6478 is a feature that makes it possible to save compiled regex and re-use them
6479 in other programs later. */
6480
6481 if (tables == NULL) tables = PRIV(default_tables);
6482
6483 /* Set up other data */
6484
6485 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6486 startline = (re->flags & PCRE_STARTLINE) != 0;
6487 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6488
6489 /* The code starts after the real_pcre block and the capture name table. */
6490
6491 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6492 re->name_count * re->name_entry_size;
6493
6494 md->start_subject = (PCRE_PUCHAR)subject;
6495 md->start_offset = start_offset;
6496 md->end_subject = md->start_subject + length;
6497 end_subject = md->end_subject;
6498
6499 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6500 md->use_ucp = (re->options & PCRE_UCP) != 0;
6501 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6502 md->ignore_skip_arg = FALSE;
6503
6504 /* Some options are unpacked into BOOL variables in the hope that testing
6505 them will be faster than individual option bits. */
6506
6507 md->notbol = (options & PCRE_NOTBOL) != 0;
6508 md->noteol = (options & PCRE_NOTEOL) != 0;
6509 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6510 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6511
6512 md->hitend = FALSE;
6513 md->mark = md->nomatch_mark = NULL; /* In case never set */
6514
6515 md->recursive = NULL; /* No recursion at top level */
6516 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6517
6518 md->lcc = tables + lcc_offset;
6519 md->fcc = tables + fcc_offset;
6520 md->ctypes = tables + ctypes_offset;
6521
6522 /* Handle different \R options. */
6523
6524 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6525 {
6526 case 0:
6527 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6528 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6529 else
6530 #ifdef BSR_ANYCRLF
6531 md->bsr_anycrlf = TRUE;
6532 #else
6533 md->bsr_anycrlf = FALSE;
6534 #endif
6535 break;
6536
6537 case PCRE_BSR_ANYCRLF:
6538 md->bsr_anycrlf = TRUE;
6539 break;
6540
6541 case PCRE_BSR_UNICODE:
6542 md->bsr_anycrlf = FALSE;
6543 break;
6544
6545 default: return PCRE_ERROR_BADNEWLINE;
6546 }
6547
6548 /* Handle different types of newline. The three bits give eight cases. If
6549 nothing is set at run time, whatever was used at compile time applies. */
6550
6551 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6552 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6553 {
6554 case 0: newline = NEWLINE; break; /* Compile-time default */
6555 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6556 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6557 case PCRE_NEWLINE_CR+
6558 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6559 case PCRE_NEWLINE_ANY: newline = -1; break;
6560 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6561 default: return PCRE_ERROR_BADNEWLINE;
6562 }
6563
6564 if (newline == -2)
6565 {
6566 md->nltype = NLTYPE_ANYCRLF;
6567 }
6568 else if (newline < 0)
6569 {
6570 md->nltype = NLTYPE_ANY;
6571 }
6572 else
6573 {
6574 md->nltype = NLTYPE_FIXED;
6575 if (newline > 255)
6576 {
6577 md->nllen = 2;
6578 md->nl[0] = (newline >> 8) & 255;
6579 md->nl[1] = newline & 255;
6580 }
6581 else
6582 {
6583 md->nllen = 1;
6584 md->nl[0] = newline;
6585 }
6586 }
6587
6588 /* Partial matching was originally supported only for a restricted set of
6589 regexes; from release 8.00 there are no restrictions, but the bits are still
6590 defined (though never set). So there's no harm in leaving this code. */
6591
6592 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6593 return PCRE_ERROR_BADPARTIAL;
6594
6595 /* If the expression has got more back references than the offsets supplied can
6596 hold, we get a temporary chunk of working store to use during the matching.
6597 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6598 of 3. */
6599
6600 ocount = offsetcount - (offsetcount % 3);
6601 arg_offset_max = (2*ocount)/3;
6602
6603 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6604 {
6605 ocount = re->top_backref * 3 + 3;
6606 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6607 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6608 using_temporary_offsets = TRUE;
6609 DPRINTF(("Got memory to hold back references\n"));
6610 }
6611 else md->offset_vector = offsets;
6612
6613 md->offset_end = ocount;
6614 md->offset_max = (2*ocount)/3;
6615 md->offset_overflow = FALSE;
6616 md->capture_last = -1;
6617
6618 /* Reset the working variable associated with each extraction. These should
6619 never be used unless previously set, but they get saved and restored, and so we
6620 initialize them to avoid reading uninitialized locations. Also, unset the
6621 offsets for the matched string. This is really just for tidiness with callouts,
6622 in case they inspect these fields. */
6623
6624 if (md->offset_vector != NULL)
6625 {
6626 register int *iptr = md->offset_vector + ocount;
6627 register int *iend = iptr - re->top_bracket;
6628 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6629 while (--iptr >= iend) *iptr = -1;
6630 md->offset_vector[0] = md->offset_vector[1] = -1;
6631 }
6632
6633 /* Set up the first character to match, if available. The first_char value is
6634 never set for an anchored regular expression, but the anchoring may be forced
6635 at run time, so we have to test for anchoring. The first char may be unset for
6636 an unanchored pattern, of course. If there's no first char and the pattern was
6637 studied, there may be a bitmap of possible first characters. */
6638
6639 if (!anchored)
6640 {
6641 if ((re->flags & PCRE_FIRSTSET) != 0)
6642 {
6643 has_first_char = TRUE;
6644 first_char = first_char2 = (pcre_uchar)(re->first_char);
6645 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6646 {
6647 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6648 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6649 if (utf && first_char > 127)
6650 first_char2 = UCD_OTHERCASE(first_char);
6651 #endif
6652 }
6653 }
6654 else
6655 if (!startline && study != NULL &&
6656 (study->flags & PCRE_STUDY_MAPPED) != 0)
6657 start_bits = study->start_bits;
6658 }
6659
6660 /* For anchored or unanchored matches, there may be a "last known required
6661 character" set. */
6662
6663 if ((re->flags & PCRE_REQCHSET) != 0)
6664 {
6665 has_req_char = TRUE;
6666 req_char = req_char2 = (pcre_uchar)(re->req_char);
6667 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6668 {
6669 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6670 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6671 if (utf && req_char > 127)
6672 req_char2 = UCD_OTHERCASE(req_char);
6673 #endif
6674 }
6675 }
6676
6677
6678 /* ==========================================================================*/
6679
6680 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6681 the loop runs just once. */
6682
6683 for(;;)
6684 {
6685 PCRE_PUCHAR save_end_subject = end_subject;
6686 PCRE_PUCHAR new_start_match;
6687
6688 /* If firstline is TRUE, the start of the match is constrained to the first
6689 line of a multiline string. That is, the match must be before or at the first
6690 newline. Implement this by temporarily adjusting end_subject so that we stop
6691 scanning at a newline. If the match fails at the newline, later code breaks
6692 this loop. */
6693
6694 if (firstline)
6695 {
6696 PCRE_PUCHAR t = start_match;
6697 #ifdef SUPPORT_UTF
6698 if (utf)
6699 {
6700 while (t < md->end_subject && !IS_NEWLINE(t))
6701 {
6702 t++;
6703 ACROSSCHAR(t < end_subject, *t, t++);
6704 }
6705 }
6706 else
6707 #endif
6708 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6709 end_subject = t;
6710 }
6711
6712 /* There are some optimizations that avoid running the match if a known
6713 starting point is not found, or if a known later character is not present.
6714 However, there is an option that disables these, for testing and for ensuring
6715 that all callouts do actually occur. The option can be set in the regex by
6716 (*NO_START_OPT) or passed in match-time options. */
6717
6718 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6719 {
6720 /* Advance to a unique first char if there is one. */
6721
6722 if (has_first_char)
6723 {
6724 if (first_char != first_char2)
6725 while (start_match < end_subject &&
6726 *start_match != first_char && *start_match != first_char2)
6727 start_match++;
6728 else
6729 while (start_match < end_subject && *start_match != first_char)
6730 start_match++;
6731 }
6732
6733 /* Or to just after a linebreak for a multiline match */
6734
6735 else if (startline)
6736 {
6737 if (start_match > md->start_subject + start_offset)
6738 {
6739 #ifdef SUPPORT_UTF