/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 940 - (show annotations)
Tue Feb 28 10:30:51 2012 UTC (7 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 219092 byte(s)
Previous patch to fix (*COMMIT) in assertions was bad; fix it.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: >= 0 the number of subject bytes matched
144 -1 no match
145 -2 partial match; always given if at end subject
146 */
147
148 static int
149 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
150 BOOL caseless)
151 {
152 PCRE_PUCHAR eptr_start = eptr;
153 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
154
155 #ifdef PCRE_DEBUG
156 if (eptr >= md->end_subject)
157 printf("matching subject <null>");
158 else
159 {
160 printf("matching subject ");
161 pchars(eptr, length, TRUE, md);
162 }
163 printf(" against backref ");
164 pchars(p, length, FALSE, md);
165 printf("\n");
166 #endif
167
168 /* Always fail if reference not set (and not JavaScript compatible - in that
169 case the length is passed as zero). */
170
171 if (length < 0) return -1;
172
173 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
174 properly if Unicode properties are supported. Otherwise, we can check only
175 ASCII characters. */
176
177 if (caseless)
178 {
179 #ifdef SUPPORT_UTF
180 #ifdef SUPPORT_UCP
181 if (md->utf)
182 {
183 /* Match characters up to the end of the reference. NOTE: the number of
184 bytes matched may differ, because there are some characters whose upper and
185 lower case versions code as different numbers of bytes. For example, U+023A
186 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
187 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
188 the latter. It is important, therefore, to check the length along the
189 reference, not along the subject (earlier code did this wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 int c, d;
195 if (eptr >= md->end_subject) return -2; /* Partial match */
196 GETCHARINC(c, eptr);
197 GETCHARINC(d, p);
198 if (c != d && c != UCD_OTHERCASE(d)) return -1;
199 }
200 }
201 else
202 #endif
203 #endif
204
205 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
206 is no UCP support. */
207 {
208 while (length-- > 0)
209 {
210 if (eptr >= md->end_subject) return -2; /* Partial match */
211 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
212 p++;
213 eptr++;
214 }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 while (length-- > 0)
224 {
225 if (eptr >= md->end_subject) return -2; /* Partial match */
226 if (*p++ != *eptr++) return -1;
227 }
228 }
229
230 return (int)(eptr - eptr_start);
231 }
232
233
234
235 /***************************************************************************
236 ****************************************************************************
237 RECURSION IN THE match() FUNCTION
238
239 The match() function is highly recursive, though not every recursive call
240 increases the recursive depth. Nevertheless, some regular expressions can cause
241 it to recurse to a great depth. I was writing for Unix, so I just let it call
242 itself recursively. This uses the stack for saving everything that has to be
243 saved for a recursive call. On Unix, the stack can be large, and this works
244 fine.
245
246 It turns out that on some non-Unix-like systems there are problems with
247 programs that use a lot of stack. (This despite the fact that every last chip
248 has oodles of memory these days, and techniques for extending the stack have
249 been known for decades.) So....
250
251 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
252 calls by keeping local variables that need to be preserved in blocks of memory
253 obtained from malloc() instead of on the stack. Macros are used to
254 achieve this so that the actual code doesn't look very different to what it
255 always used to.
256
257 The original heap-recursive code used longjmp(). However, it seems that this
258 can be very slow on some operating systems. Following a suggestion from Stan
259 Switzer, the use of longjmp() has been abolished, at the cost of having to
260 provide a unique number for each call to RMATCH. There is no way of generating
261 a sequence of numbers at compile time in C. I have given them names, to make
262 them stand out more clearly.
263
264 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
265 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
266 tests. Furthermore, not using longjmp() means that local dynamic variables
267 don't have indeterminate values; this has meant that the frame size can be
268 reduced because the result can be "passed back" by straight setting of the
269 variable instead of being passed in the frame.
270 ****************************************************************************
271 ***************************************************************************/
272
273 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
274 below must be updated in sync. */
275
276 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
277 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
278 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
279 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
280 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
281 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
282 RM61, RM62, RM63, RM64, RM65, RM66 };
283
284 /* These versions of the macros use the stack, as normal. There are debugging
285 versions and production versions. Note that the "rw" argument of RMATCH isn't
286 actually used in this definition. */
287
288 #ifndef NO_RECURSE
289 #define REGISTER register
290
291 #ifdef PCRE_DEBUG
292 #define RMATCH(ra,rb,rc,rd,re,rw) \
293 { \
294 printf("match() called in line %d\n", __LINE__); \
295 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
296 printf("to line %d\n", __LINE__); \
297 }
298 #define RRETURN(ra) \
299 { \
300 printf("match() returned %d from line %d ", ra, __LINE__); \
301 return ra; \
302 }
303 #else
304 #define RMATCH(ra,rb,rc,rd,re,rw) \
305 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
306 #define RRETURN(ra) return ra
307 #endif
308
309 #else
310
311
312 /* These versions of the macros manage a private stack on the heap. Note that
313 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
314 argument of match(), which never changes. */
315
316 #define REGISTER
317
318 #define RMATCH(ra,rb,rc,rd,re,rw)\
319 {\
320 heapframe *newframe = frame->Xnextframe;\
321 if (newframe == NULL)\
322 {\
323 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
324 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
325 newframe->Xnextframe = NULL;\
326 frame->Xnextframe = newframe;\
327 }\
328 frame->Xwhere = rw;\
329 newframe->Xeptr = ra;\
330 newframe->Xecode = rb;\
331 newframe->Xmstart = mstart;\
332 newframe->Xoffset_top = rc;\
333 newframe->Xeptrb = re;\
334 newframe->Xrdepth = frame->Xrdepth + 1;\
335 newframe->Xprevframe = frame;\
336 frame = newframe;\
337 DPRINTF(("restarting from line %d\n", __LINE__));\
338 goto HEAP_RECURSE;\
339 L_##rw:\
340 DPRINTF(("jumped back to line %d\n", __LINE__));\
341 }
342
343 #define RRETURN(ra)\
344 {\
345 heapframe *oldframe = frame;\
346 frame = oldframe->Xprevframe;\
347 if (frame != NULL)\
348 {\
349 rrc = ra;\
350 goto HEAP_RETURN;\
351 }\
352 return ra;\
353 }
354
355
356 /* Structure for remembering the local variables in a private frame */
357
358 typedef struct heapframe {
359 struct heapframe *Xprevframe;
360 struct heapframe *Xnextframe;
361
362 /* Function arguments that may change */
363
364 PCRE_PUCHAR Xeptr;
365 const pcre_uchar *Xecode;
366 PCRE_PUCHAR Xmstart;
367 int Xoffset_top;
368 eptrblock *Xeptrb;
369 unsigned int Xrdepth;
370
371 /* Function local variables */
372
373 PCRE_PUCHAR Xcallpat;
374 #ifdef SUPPORT_UTF
375 PCRE_PUCHAR Xcharptr;
376 #endif
377 PCRE_PUCHAR Xdata;
378 PCRE_PUCHAR Xnext;
379 PCRE_PUCHAR Xpp;
380 PCRE_PUCHAR Xprev;
381 PCRE_PUCHAR Xsaved_eptr;
382
383 recursion_info Xnew_recursive;
384
385 BOOL Xcur_is_word;
386 BOOL Xcondition;
387 BOOL Xprev_is_word;
388
389 #ifdef SUPPORT_UCP
390 int Xprop_type;
391 int Xprop_value;
392 int Xprop_fail_result;
393 int Xoclength;
394 pcre_uchar Xocchars[6];
395 #endif
396
397 int Xcodelink;
398 int Xctype;
399 unsigned int Xfc;
400 int Xfi;
401 int Xlength;
402 int Xmax;
403 int Xmin;
404 int Xnumber;
405 int Xoffset;
406 int Xop;
407 int Xsave_capture_last;
408 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
409 int Xstacksave[REC_STACK_SAVE_MAX];
410
411 eptrblock Xnewptrb;
412
413 /* Where to jump back to */
414
415 int Xwhere;
416
417 } heapframe;
418
419 #endif
420
421
422 /***************************************************************************
423 ***************************************************************************/
424
425
426
427 /*************************************************
428 * Match from current position *
429 *************************************************/
430
431 /* This function is called recursively in many circumstances. Whenever it
432 returns a negative (error) response, the outer incarnation must also return the
433 same response. */
434
435 /* These macros pack up tests that are used for partial matching, and which
436 appear several times in the code. We set the "hit end" flag if the pointer is
437 at the end of the subject and also past the start of the subject (i.e.
438 something has been matched). For hard partial matching, we then return
439 immediately. The second one is used when we already know we are past the end of
440 the subject. */
441
442 #define CHECK_PARTIAL()\
443 if (md->partial != 0 && eptr >= md->end_subject && \
444 eptr > md->start_used_ptr) \
445 { \
446 md->hitend = TRUE; \
447 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
448 }
449
450 #define SCHECK_PARTIAL()\
451 if (md->partial != 0 && eptr > md->start_used_ptr) \
452 { \
453 md->hitend = TRUE; \
454 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
455 }
456
457
458 /* Performance note: It might be tempting to extract commonly used fields from
459 the md structure (e.g. utf, end_subject) into individual variables to improve
460 performance. Tests using gcc on a SPARC disproved this; in the first case, it
461 made performance worse.
462
463 Arguments:
464 eptr pointer to current character in subject
465 ecode pointer to current position in compiled code
466 mstart pointer to the current match start position (can be modified
467 by encountering \K)
468 offset_top current top pointer
469 md pointer to "static" info for the match
470 eptrb pointer to chain of blocks containing eptr at start of
471 brackets - for testing for empty matches
472 rdepth the recursion depth
473
474 Returns: MATCH_MATCH if matched ) these values are >= 0
475 MATCH_NOMATCH if failed to match )
476 a negative MATCH_xxx value for PRUNE, SKIP, etc
477 a negative PCRE_ERROR_xxx value if aborted by an error condition
478 (e.g. stopped by repeated call or recursion limit)
479 */
480
481 static int
482 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
483 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
484 unsigned int rdepth)
485 {
486 /* These variables do not need to be preserved over recursion in this function,
487 so they can be ordinary variables in all cases. Mark some of them with
488 "register" because they are used a lot in loops. */
489
490 register int rrc; /* Returns from recursive calls */
491 register int i; /* Used for loops not involving calls to RMATCH() */
492 register unsigned int c; /* Character values not kept over RMATCH() calls */
493 register BOOL utf; /* Local copy of UTF flag for speed */
494
495 BOOL minimize, possessive; /* Quantifier options */
496 BOOL caseless;
497 int condcode;
498
499 /* When recursion is not being used, all "local" variables that have to be
500 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
501 frame on the stack here; subsequent instantiations are obtained from the heap
502 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
503 the top-level on the stack rather than malloc-ing them all gives a performance
504 boost in many cases where there is not much "recursion". */
505
506 #ifdef NO_RECURSE
507 heapframe *frame = (heapframe *)md->match_frames_base;
508
509 /* Copy in the original argument variables */
510
511 frame->Xeptr = eptr;
512 frame->Xecode = ecode;
513 frame->Xmstart = mstart;
514 frame->Xoffset_top = offset_top;
515 frame->Xeptrb = eptrb;
516 frame->Xrdepth = rdepth;
517
518 /* This is where control jumps back to to effect "recursion" */
519
520 HEAP_RECURSE:
521
522 /* Macros make the argument variables come from the current frame */
523
524 #define eptr frame->Xeptr
525 #define ecode frame->Xecode
526 #define mstart frame->Xmstart
527 #define offset_top frame->Xoffset_top
528 #define eptrb frame->Xeptrb
529 #define rdepth frame->Xrdepth
530
531 /* Ditto for the local variables */
532
533 #ifdef SUPPORT_UTF
534 #define charptr frame->Xcharptr
535 #endif
536 #define callpat frame->Xcallpat
537 #define codelink frame->Xcodelink
538 #define data frame->Xdata
539 #define next frame->Xnext
540 #define pp frame->Xpp
541 #define prev frame->Xprev
542 #define saved_eptr frame->Xsaved_eptr
543
544 #define new_recursive frame->Xnew_recursive
545
546 #define cur_is_word frame->Xcur_is_word
547 #define condition frame->Xcondition
548 #define prev_is_word frame->Xprev_is_word
549
550 #ifdef SUPPORT_UCP
551 #define prop_type frame->Xprop_type
552 #define prop_value frame->Xprop_value
553 #define prop_fail_result frame->Xprop_fail_result
554 #define oclength frame->Xoclength
555 #define occhars frame->Xocchars
556 #endif
557
558 #define ctype frame->Xctype
559 #define fc frame->Xfc
560 #define fi frame->Xfi
561 #define length frame->Xlength
562 #define max frame->Xmax
563 #define min frame->Xmin
564 #define number frame->Xnumber
565 #define offset frame->Xoffset
566 #define op frame->Xop
567 #define save_capture_last frame->Xsave_capture_last
568 #define save_offset1 frame->Xsave_offset1
569 #define save_offset2 frame->Xsave_offset2
570 #define save_offset3 frame->Xsave_offset3
571 #define stacksave frame->Xstacksave
572
573 #define newptrb frame->Xnewptrb
574
575 /* When recursion is being used, local variables are allocated on the stack and
576 get preserved during recursion in the normal way. In this environment, fi and
577 i, and fc and c, can be the same variables. */
578
579 #else /* NO_RECURSE not defined */
580 #define fi i
581 #define fc c
582
583 /* Many of the following variables are used only in small blocks of the code.
584 My normal style of coding would have declared them within each of those blocks.
585 However, in order to accommodate the version of this code that uses an external
586 "stack" implemented on the heap, it is easier to declare them all here, so the
587 declarations can be cut out in a block. The only declarations within blocks
588 below are for variables that do not have to be preserved over a recursive call
589 to RMATCH(). */
590
591 #ifdef SUPPORT_UTF
592 const pcre_uchar *charptr;
593 #endif
594 const pcre_uchar *callpat;
595 const pcre_uchar *data;
596 const pcre_uchar *next;
597 PCRE_PUCHAR pp;
598 const pcre_uchar *prev;
599 PCRE_PUCHAR saved_eptr;
600
601 recursion_info new_recursive;
602
603 BOOL cur_is_word;
604 BOOL condition;
605 BOOL prev_is_word;
606
607 #ifdef SUPPORT_UCP
608 int prop_type;
609 int prop_value;
610 int prop_fail_result;
611 int oclength;
612 pcre_uchar occhars[6];
613 #endif
614
615 int codelink;
616 int ctype;
617 int length;
618 int max;
619 int min;
620 int number;
621 int offset;
622 int op;
623 int save_capture_last;
624 int save_offset1, save_offset2, save_offset3;
625 int stacksave[REC_STACK_SAVE_MAX];
626
627 eptrblock newptrb;
628
629 /* There is a special fudge for calling match() in a way that causes it to
630 measure the size of its basic stack frame when the stack is being used for
631 recursion. The second argument (ecode) being NULL triggers this behaviour. It
632 cannot normally ever be NULL. The return is the negated value of the frame
633 size. */
634
635 if (ecode == NULL)
636 {
637 if (rdepth == 0)
638 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
639 else
640 {
641 int len = (char *)&rdepth - (char *)eptr;
642 return (len > 0)? -len : len;
643 }
644 }
645 #endif /* NO_RECURSE */
646
647 /* To save space on the stack and in the heap frame, I have doubled up on some
648 of the local variables that are used only in localised parts of the code, but
649 still need to be preserved over recursive calls of match(). These macros define
650 the alternative names that are used. */
651
652 #define allow_zero cur_is_word
653 #define cbegroup condition
654 #define code_offset codelink
655 #define condassert condition
656 #define matched_once prev_is_word
657 #define foc number
658 #define save_mark data
659
660 /* These statements are here to stop the compiler complaining about unitialized
661 variables. */
662
663 #ifdef SUPPORT_UCP
664 prop_value = 0;
665 prop_fail_result = 0;
666 #endif
667
668
669 /* This label is used for tail recursion, which is used in a few cases even
670 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
671 used. Thanks to Ian Taylor for noticing this possibility and sending the
672 original patch. */
673
674 TAIL_RECURSE:
675
676 /* OK, now we can get on with the real code of the function. Recursive calls
677 are specified by the macro RMATCH and RRETURN is used to return. When
678 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
679 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
680 defined). However, RMATCH isn't like a function call because it's quite a
681 complicated macro. It has to be used in one particular way. This shouldn't,
682 however, impact performance when true recursion is being used. */
683
684 #ifdef SUPPORT_UTF
685 utf = md->utf; /* Local copy of the flag */
686 #else
687 utf = FALSE;
688 #endif
689
690 /* First check that we haven't called match() too many times, or that we
691 haven't exceeded the recursive call limit. */
692
693 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
694 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
695
696 /* At the start of a group with an unlimited repeat that may match an empty
697 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
698 done this way to save having to use another function argument, which would take
699 up space on the stack. See also MATCH_CONDASSERT below.
700
701 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
702 such remembered pointers, to be checked when we hit the closing ket, in order
703 to break infinite loops that match no characters. When match() is called in
704 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
705 NOT be used with tail recursion, because the memory block that is used is on
706 the stack, so a new one may be required for each match(). */
707
708 if (md->match_function_type == MATCH_CBEGROUP)
709 {
710 newptrb.epb_saved_eptr = eptr;
711 newptrb.epb_prev = eptrb;
712 eptrb = &newptrb;
713 md->match_function_type = 0;
714 }
715
716 /* Now start processing the opcodes. */
717
718 for (;;)
719 {
720 minimize = possessive = FALSE;
721 op = *ecode;
722
723 switch(op)
724 {
725 case OP_MARK:
726 md->nomatch_mark = ecode + 2;
727 md->mark = NULL; /* In case previously set by assertion */
728 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
729 eptrb, RM55);
730 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
731 md->mark == NULL) md->mark = ecode + 2;
732
733 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
734 argument, and we must check whether that argument matches this MARK's
735 argument. It is passed back in md->start_match_ptr (an overloading of that
736 variable). If it does match, we reset that variable to the current subject
737 position and return MATCH_SKIP. Otherwise, pass back the return code
738 unaltered. */
739
740 else if (rrc == MATCH_SKIP_ARG &&
741 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
742 {
743 md->start_match_ptr = eptr;
744 RRETURN(MATCH_SKIP);
745 }
746 RRETURN(rrc);
747
748 case OP_FAIL:
749 RRETURN(MATCH_NOMATCH);
750
751 /* COMMIT overrides PRUNE, SKIP, and THEN */
752
753 case OP_COMMIT:
754 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
755 eptrb, RM52);
756 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
757 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
758 rrc != MATCH_THEN)
759 RRETURN(rrc);
760 RRETURN(MATCH_COMMIT);
761
762 /* PRUNE overrides THEN */
763
764 case OP_PRUNE:
765 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
766 eptrb, RM51);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
768 RRETURN(MATCH_PRUNE);
769
770 case OP_PRUNE_ARG:
771 md->nomatch_mark = ecode + 2;
772 md->mark = NULL; /* In case previously set by assertion */
773 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM56);
775 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
776 md->mark == NULL) md->mark = ecode + 2;
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 RRETURN(MATCH_PRUNE);
779
780 /* SKIP overrides PRUNE and THEN */
781
782 case OP_SKIP:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM53);
785 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
786 RRETURN(rrc);
787 md->start_match_ptr = eptr; /* Pass back current position */
788 RRETURN(MATCH_SKIP);
789
790 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
791 nomatch_mark. There is a flag that disables this opcode when re-matching a
792 pattern that ended with a SKIP for which there was not a matching MARK. */
793
794 case OP_SKIP_ARG:
795 if (md->ignore_skip_arg)
796 {
797 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
798 break;
799 }
800 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
801 eptrb, RM57);
802 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
803 RRETURN(rrc);
804
805 /* Pass back the current skip name by overloading md->start_match_ptr and
806 returning the special MATCH_SKIP_ARG return code. This will either be
807 caught by a matching MARK, or get to the top, where it causes a rematch
808 with the md->ignore_skip_arg flag set. */
809
810 md->start_match_ptr = ecode + 2;
811 RRETURN(MATCH_SKIP_ARG);
812
813 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
814 the branch in which it occurs can be determined. Overload the start of
815 match pointer to do this. */
816
817 case OP_THEN:
818 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
819 eptrb, RM54);
820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
821 md->start_match_ptr = ecode;
822 RRETURN(MATCH_THEN);
823
824 case OP_THEN_ARG:
825 md->nomatch_mark = ecode + 2;
826 md->mark = NULL; /* In case previously set by assertion */
827 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
828 md, eptrb, RM58);
829 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
830 md->mark == NULL) md->mark = ecode + 2;
831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832 md->start_match_ptr = ecode;
833 RRETURN(MATCH_THEN);
834
835 /* Handle an atomic group that does not contain any capturing parentheses.
836 This can be handled like an assertion. Prior to 8.13, all atomic groups
837 were handled this way. In 8.13, the code was changed as below for ONCE, so
838 that backups pass through the group and thereby reset captured values.
839 However, this uses a lot more stack, so in 8.20, atomic groups that do not
840 contain any captures generate OP_ONCE_NC, which can be handled in the old,
841 less stack intensive way.
842
843 Check the alternative branches in turn - the matching won't pass the KET
844 for this kind of subpattern. If any one branch matches, we carry on as at
845 the end of a normal bracket, leaving the subject pointer, but resetting
846 the start-of-match value in case it was changed by \K. */
847
848 case OP_ONCE_NC:
849 prev = ecode;
850 saved_eptr = eptr;
851 save_mark = md->mark;
852 do
853 {
854 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
855 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
856 {
857 mstart = md->start_match_ptr;
858 break;
859 }
860 if (rrc == MATCH_THEN)
861 {
862 next = ecode + GET(ecode,1);
863 if (md->start_match_ptr < next &&
864 (*ecode == OP_ALT || *next == OP_ALT))
865 rrc = MATCH_NOMATCH;
866 }
867
868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
869 ecode += GET(ecode,1);
870 md->mark = save_mark;
871 }
872 while (*ecode == OP_ALT);
873
874 /* If hit the end of the group (which could be repeated), fail */
875
876 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
877
878 /* Continue as from after the group, updating the offsets high water
879 mark, since extracts may have been taken. */
880
881 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
882
883 offset_top = md->end_offset_top;
884 eptr = md->end_match_ptr;
885
886 /* For a non-repeating ket, just continue at this level. This also
887 happens for a repeating ket if no characters were matched in the group.
888 This is the forcible breaking of infinite loops as implemented in Perl
889 5.005. */
890
891 if (*ecode == OP_KET || eptr == saved_eptr)
892 {
893 ecode += 1+LINK_SIZE;
894 break;
895 }
896
897 /* The repeating kets try the rest of the pattern or restart from the
898 preceding bracket, in the appropriate order. The second "call" of match()
899 uses tail recursion, to avoid using another stack frame. */
900
901 if (*ecode == OP_KETRMIN)
902 {
903 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
905 ecode = prev;
906 goto TAIL_RECURSE;
907 }
908 else /* OP_KETRMAX */
909 {
910 md->match_function_type = MATCH_CBEGROUP;
911 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
912 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
913 ecode += 1 + LINK_SIZE;
914 goto TAIL_RECURSE;
915 }
916 /* Control never gets here */
917
918 /* Handle a capturing bracket, other than those that are possessive with an
919 unlimited repeat. If there is space in the offset vector, save the current
920 subject position in the working slot at the top of the vector. We mustn't
921 change the current values of the data slot, because they may be set from a
922 previous iteration of this group, and be referred to by a reference inside
923 the group. A failure to match might occur after the group has succeeded,
924 if something later on doesn't match. For this reason, we need to restore
925 the working value and also the values of the final offsets, in case they
926 were set by a previous iteration of the same bracket.
927
928 If there isn't enough space in the offset vector, treat this as if it were
929 a non-capturing bracket. Don't worry about setting the flag for the error
930 case here; that is handled in the code for KET. */
931
932 case OP_CBRA:
933 case OP_SCBRA:
934 number = GET2(ecode, 1+LINK_SIZE);
935 offset = number << 1;
936
937 #ifdef PCRE_DEBUG
938 printf("start bracket %d\n", number);
939 printf("subject=");
940 pchars(eptr, 16, TRUE, md);
941 printf("\n");
942 #endif
943
944 if (offset < md->offset_max)
945 {
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950 save_mark = md->mark;
951
952 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
953 md->offset_vector[md->offset_end - number] =
954 (int)(eptr - md->start_subject);
955
956 for (;;)
957 {
958 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
959 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
960 eptrb, RM1);
961 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
962
963 /* If we backed up to a THEN, check whether it is within the current
964 branch by comparing the address of the THEN that is passed back with
965 the end of the branch. If it is within the current branch, and the
966 branch is one of two or more alternatives (it either starts or ends
967 with OP_ALT), we have reached the limit of THEN's action, so convert
968 the return code to NOMATCH, which will cause normal backtracking to
969 happen from now on. Otherwise, THEN is passed back to an outer
970 alternative. This implements Perl's treatment of parenthesized groups,
971 where a group not containing | does not affect the current alternative,
972 that is, (X) is NOT the same as (X|(*F)). */
973
974 if (rrc == MATCH_THEN)
975 {
976 next = ecode + GET(ecode,1);
977 if (md->start_match_ptr < next &&
978 (*ecode == OP_ALT || *next == OP_ALT))
979 rrc = MATCH_NOMATCH;
980 }
981
982 /* Anything other than NOMATCH is passed back. */
983
984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
985 md->capture_last = save_capture_last;
986 ecode += GET(ecode, 1);
987 md->mark = save_mark;
988 if (*ecode != OP_ALT) break;
989 }
990
991 DPRINTF(("bracket %d failed\n", number));
992 md->offset_vector[offset] = save_offset1;
993 md->offset_vector[offset+1] = save_offset2;
994 md->offset_vector[md->offset_end - number] = save_offset3;
995
996 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
997
998 RRETURN(rrc);
999 }
1000
1001 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1002 as a non-capturing bracket. */
1003
1004 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006
1007 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1008
1009 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1010 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1011
1012 /* Non-capturing or atomic group, except for possessive with unlimited
1013 repeat and ONCE group with no captures. Loop for all the alternatives.
1014
1015 When we get to the final alternative within the brackets, we used to return
1016 the result of a recursive call to match() whatever happened so it was
1017 possible to reduce stack usage by turning this into a tail recursion,
1018 except in the case of a possibly empty group. However, now that there is
1019 the possibility of (*THEN) occurring in the final alternative, this
1020 optimization is no longer always possible.
1021
1022 We can optimize if we know there are no (*THEN)s in the pattern; at present
1023 this is the best that can be done.
1024
1025 MATCH_ONCE is returned when the end of an atomic group is successfully
1026 reached, but subsequent matching fails. It passes back up the tree (causing
1027 captured values to be reset) until the original atomic group level is
1028 reached. This is tested by comparing md->once_target with the start of the
1029 group. At this point, the return is converted into MATCH_NOMATCH so that
1030 previous backup points can be taken. */
1031
1032 case OP_ONCE:
1033 case OP_BRA:
1034 case OP_SBRA:
1035 DPRINTF(("start non-capturing bracket\n"));
1036
1037 for (;;)
1038 {
1039 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1040
1041 /* If this is not a possibly empty group, and there are no (*THEN)s in
1042 the pattern, and this is the final alternative, optimize as described
1043 above. */
1044
1045 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1046 {
1047 ecode += PRIV(OP_lengths)[*ecode];
1048 goto TAIL_RECURSE;
1049 }
1050
1051 /* In all other cases, we have to make another call to match(). */
1052
1053 save_mark = md->mark;
1054 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1055 RM2);
1056
1057 /* See comment in the code for capturing groups above about handling
1058 THEN. */
1059
1060 if (rrc == MATCH_THEN)
1061 {
1062 next = ecode + GET(ecode,1);
1063 if (md->start_match_ptr < next &&
1064 (*ecode == OP_ALT || *next == OP_ALT))
1065 rrc = MATCH_NOMATCH;
1066 }
1067
1068 if (rrc != MATCH_NOMATCH)
1069 {
1070 if (rrc == MATCH_ONCE)
1071 {
1072 const pcre_uchar *scode = ecode;
1073 if (*scode != OP_ONCE) /* If not at start, find it */
1074 {
1075 while (*scode == OP_ALT) scode += GET(scode, 1);
1076 scode -= GET(scode, 1);
1077 }
1078 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1079 }
1080 RRETURN(rrc);
1081 }
1082 ecode += GET(ecode, 1);
1083 md->mark = save_mark;
1084 if (*ecode != OP_ALT) break;
1085 }
1086
1087 RRETURN(MATCH_NOMATCH);
1088
1089 /* Handle possessive capturing brackets with an unlimited repeat. We come
1090 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1091 handled similarly to the normal case above. However, the matching is
1092 different. The end of these brackets will always be OP_KETRPOS, which
1093 returns MATCH_KETRPOS without going further in the pattern. By this means
1094 we can handle the group by iteration rather than recursion, thereby
1095 reducing the amount of stack needed. */
1096
1097 case OP_CBRAPOS:
1098 case OP_SCBRAPOS:
1099 allow_zero = FALSE;
1100
1101 POSSESSIVE_CAPTURE:
1102 number = GET2(ecode, 1+LINK_SIZE);
1103 offset = number << 1;
1104
1105 #ifdef PCRE_DEBUG
1106 printf("start possessive bracket %d\n", number);
1107 printf("subject=");
1108 pchars(eptr, 16, TRUE, md);
1109 printf("\n");
1110 #endif
1111
1112 if (offset < md->offset_max)
1113 {
1114 matched_once = FALSE;
1115 code_offset = (int)(ecode - md->start_code);
1116
1117 save_offset1 = md->offset_vector[offset];
1118 save_offset2 = md->offset_vector[offset+1];
1119 save_offset3 = md->offset_vector[md->offset_end - number];
1120 save_capture_last = md->capture_last;
1121
1122 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1123
1124 /* Each time round the loop, save the current subject position for use
1125 when the group matches. For MATCH_MATCH, the group has matched, so we
1126 restart it with a new subject starting position, remembering that we had
1127 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1128 usual. If we haven't matched any alternatives in any iteration, check to
1129 see if a previous iteration matched. If so, the group has matched;
1130 continue from afterwards. Otherwise it has failed; restore the previous
1131 capture values before returning NOMATCH. */
1132
1133 for (;;)
1134 {
1135 md->offset_vector[md->offset_end - number] =
1136 (int)(eptr - md->start_subject);
1137 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1138 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1139 eptrb, RM63);
1140 if (rrc == MATCH_KETRPOS)
1141 {
1142 offset_top = md->end_offset_top;
1143 eptr = md->end_match_ptr;
1144 ecode = md->start_code + code_offset;
1145 save_capture_last = md->capture_last;
1146 matched_once = TRUE;
1147 continue;
1148 }
1149
1150 /* See comment in the code for capturing groups above about handling
1151 THEN. */
1152
1153 if (rrc == MATCH_THEN)
1154 {
1155 next = ecode + GET(ecode,1);
1156 if (md->start_match_ptr < next &&
1157 (*ecode == OP_ALT || *next == OP_ALT))
1158 rrc = MATCH_NOMATCH;
1159 }
1160
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 md->capture_last = save_capture_last;
1163 ecode += GET(ecode, 1);
1164 if (*ecode != OP_ALT) break;
1165 }
1166
1167 if (!matched_once)
1168 {
1169 md->offset_vector[offset] = save_offset1;
1170 md->offset_vector[offset+1] = save_offset2;
1171 md->offset_vector[md->offset_end - number] = save_offset3;
1172 }
1173
1174 if (allow_zero || matched_once)
1175 {
1176 ecode += 1 + LINK_SIZE;
1177 break;
1178 }
1179
1180 RRETURN(MATCH_NOMATCH);
1181 }
1182
1183 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1184 as a non-capturing bracket. */
1185
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188
1189 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1190
1191 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1192 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1193
1194 /* Non-capturing possessive bracket with unlimited repeat. We come here
1195 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1196 without the capturing complication. It is written out separately for speed
1197 and cleanliness. */
1198
1199 case OP_BRAPOS:
1200 case OP_SBRAPOS:
1201 allow_zero = FALSE;
1202
1203 POSSESSIVE_NON_CAPTURE:
1204 matched_once = FALSE;
1205 code_offset = (int)(ecode - md->start_code);
1206
1207 for (;;)
1208 {
1209 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1210 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1211 eptrb, RM48);
1212 if (rrc == MATCH_KETRPOS)
1213 {
1214 offset_top = md->end_offset_top;
1215 eptr = md->end_match_ptr;
1216 ecode = md->start_code + code_offset;
1217 matched_once = TRUE;
1218 continue;
1219 }
1220
1221 /* See comment in the code for capturing groups above about handling
1222 THEN. */
1223
1224 if (rrc == MATCH_THEN)
1225 {
1226 next = ecode + GET(ecode,1);
1227 if (md->start_match_ptr < next &&
1228 (*ecode == OP_ALT || *next == OP_ALT))
1229 rrc = MATCH_NOMATCH;
1230 }
1231
1232 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1233 ecode += GET(ecode, 1);
1234 if (*ecode != OP_ALT) break;
1235 }
1236
1237 if (matched_once || allow_zero)
1238 {
1239 ecode += 1 + LINK_SIZE;
1240 break;
1241 }
1242 RRETURN(MATCH_NOMATCH);
1243
1244 /* Control never reaches here. */
1245
1246 /* Conditional group: compilation checked that there are no more than
1247 two branches. If the condition is false, skipping the first branch takes us
1248 past the end if there is only one branch, but that's OK because that is
1249 exactly what going to the ket would do. */
1250
1251 case OP_COND:
1252 case OP_SCOND:
1253 codelink = GET(ecode, 1);
1254
1255 /* Because of the way auto-callout works during compile, a callout item is
1256 inserted between OP_COND and an assertion condition. */
1257
1258 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1259 {
1260 if (PUBL(callout) != NULL)
1261 {
1262 PUBL(callout_block) cb;
1263 cb.version = 2; /* Version 1 of the callout block */
1264 cb.callout_number = ecode[LINK_SIZE+2];
1265 cb.offset_vector = md->offset_vector;
1266 #ifdef COMPILE_PCRE8
1267 cb.subject = (PCRE_SPTR)md->start_subject;
1268 #else
1269 cb.subject = (PCRE_SPTR16)md->start_subject;
1270 #endif
1271 cb.subject_length = (int)(md->end_subject - md->start_subject);
1272 cb.start_match = (int)(mstart - md->start_subject);
1273 cb.current_position = (int)(eptr - md->start_subject);
1274 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1275 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1276 cb.capture_top = offset_top/2;
1277 cb.capture_last = md->capture_last;
1278 cb.callout_data = md->callout_data;
1279 cb.mark = md->nomatch_mark;
1280 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1281 if (rrc < 0) RRETURN(rrc);
1282 }
1283 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1284 }
1285
1286 condcode = ecode[LINK_SIZE+1];
1287
1288 /* Now see what the actual condition is */
1289
1290 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1291 {
1292 if (md->recursive == NULL) /* Not recursing => FALSE */
1293 {
1294 condition = FALSE;
1295 ecode += GET(ecode, 1);
1296 }
1297 else
1298 {
1299 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1300 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1301
1302 /* If the test is for recursion into a specific subpattern, and it is
1303 false, but the test was set up by name, scan the table to see if the
1304 name refers to any other numbers, and test them. The condition is true
1305 if any one is set. */
1306
1307 if (!condition && condcode == OP_NRREF)
1308 {
1309 pcre_uchar *slotA = md->name_table;
1310 for (i = 0; i < md->name_count; i++)
1311 {
1312 if (GET2(slotA, 0) == recno) break;
1313 slotA += md->name_entry_size;
1314 }
1315
1316 /* Found a name for the number - there can be only one; duplicate
1317 names for different numbers are allowed, but not vice versa. First
1318 scan down for duplicates. */
1319
1320 if (i < md->name_count)
1321 {
1322 pcre_uchar *slotB = slotA;
1323 while (slotB > md->name_table)
1324 {
1325 slotB -= md->name_entry_size;
1326 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1327 {
1328 condition = GET2(slotB, 0) == md->recursive->group_num;
1329 if (condition) break;
1330 }
1331 else break;
1332 }
1333
1334 /* Scan up for duplicates */
1335
1336 if (!condition)
1337 {
1338 slotB = slotA;
1339 for (i++; i < md->name_count; i++)
1340 {
1341 slotB += md->name_entry_size;
1342 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1343 {
1344 condition = GET2(slotB, 0) == md->recursive->group_num;
1345 if (condition) break;
1346 }
1347 else break;
1348 }
1349 }
1350 }
1351 }
1352
1353 /* Choose branch according to the condition */
1354
1355 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1356 }
1357 }
1358
1359 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1360 {
1361 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1362 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1363
1364 /* If the numbered capture is unset, but the reference was by name,
1365 scan the table to see if the name refers to any other numbers, and test
1366 them. The condition is true if any one is set. This is tediously similar
1367 to the code above, but not close enough to try to amalgamate. */
1368
1369 if (!condition && condcode == OP_NCREF)
1370 {
1371 int refno = offset >> 1;
1372 pcre_uchar *slotA = md->name_table;
1373
1374 for (i = 0; i < md->name_count; i++)
1375 {
1376 if (GET2(slotA, 0) == refno) break;
1377 slotA += md->name_entry_size;
1378 }
1379
1380 /* Found a name for the number - there can be only one; duplicate names
1381 for different numbers are allowed, but not vice versa. First scan down
1382 for duplicates. */
1383
1384 if (i < md->name_count)
1385 {
1386 pcre_uchar *slotB = slotA;
1387 while (slotB > md->name_table)
1388 {
1389 slotB -= md->name_entry_size;
1390 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1391 {
1392 offset = GET2(slotB, 0) << 1;
1393 condition = offset < offset_top &&
1394 md->offset_vector[offset] >= 0;
1395 if (condition) break;
1396 }
1397 else break;
1398 }
1399
1400 /* Scan up for duplicates */
1401
1402 if (!condition)
1403 {
1404 slotB = slotA;
1405 for (i++; i < md->name_count; i++)
1406 {
1407 slotB += md->name_entry_size;
1408 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1409 {
1410 offset = GET2(slotB, 0) << 1;
1411 condition = offset < offset_top &&
1412 md->offset_vector[offset] >= 0;
1413 if (condition) break;
1414 }
1415 else break;
1416 }
1417 }
1418 }
1419 }
1420
1421 /* Choose branch according to the condition */
1422
1423 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1424 }
1425
1426 else if (condcode == OP_DEF) /* DEFINE - always false */
1427 {
1428 condition = FALSE;
1429 ecode += GET(ecode, 1);
1430 }
1431
1432 /* The condition is an assertion. Call match() to evaluate it - setting
1433 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1434 an assertion. */
1435
1436 else
1437 {
1438 md->match_function_type = MATCH_CONDASSERT;
1439 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1440 if (rrc == MATCH_MATCH)
1441 {
1442 if (md->end_offset_top > offset_top)
1443 offset_top = md->end_offset_top; /* Captures may have happened */
1444 condition = TRUE;
1445 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1446 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1447 }
1448
1449 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1450 assertion; it is therefore treated as NOMATCH. */
1451
1452 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1453 {
1454 RRETURN(rrc); /* Need braces because of following else */
1455 }
1456 else
1457 {
1458 condition = FALSE;
1459 ecode += codelink;
1460 }
1461 }
1462
1463 /* We are now at the branch that is to be obeyed. As there is only one, can
1464 use tail recursion to avoid using another stack frame, except when there is
1465 unlimited repeat of a possibly empty group. In the latter case, a recursive
1466 call to match() is always required, unless the second alternative doesn't
1467 exist, in which case we can just plough on. Note that, for compatibility
1468 with Perl, the | in a conditional group is NOT treated as creating two
1469 alternatives. If a THEN is encountered in the branch, it propagates out to
1470 the enclosing alternative (unless nested in a deeper set of alternatives,
1471 of course). */
1472
1473 if (condition || *ecode == OP_ALT)
1474 {
1475 if (op != OP_SCOND)
1476 {
1477 ecode += 1 + LINK_SIZE;
1478 goto TAIL_RECURSE;
1479 }
1480
1481 md->match_function_type = MATCH_CBEGROUP;
1482 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1483 RRETURN(rrc);
1484 }
1485
1486 /* Condition false & no alternative; continue after the group. */
1487
1488 else
1489 {
1490 ecode += 1 + LINK_SIZE;
1491 }
1492 break;
1493
1494
1495 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1496 to close any currently open capturing brackets. */
1497
1498 case OP_CLOSE:
1499 number = GET2(ecode, 1);
1500 offset = number << 1;
1501
1502 #ifdef PCRE_DEBUG
1503 printf("end bracket %d at *ACCEPT", number);
1504 printf("\n");
1505 #endif
1506
1507 md->capture_last = number;
1508 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1509 {
1510 md->offset_vector[offset] =
1511 md->offset_vector[md->offset_end - number];
1512 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1513 if (offset_top <= offset) offset_top = offset + 2;
1514 }
1515 ecode += 1 + IMM2_SIZE;
1516 break;
1517
1518
1519 /* End of the pattern, either real or forced. */
1520
1521 case OP_END:
1522 case OP_ACCEPT:
1523 case OP_ASSERT_ACCEPT:
1524
1525 /* If we have matched an empty string, fail if not in an assertion and not
1526 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1527 is set and we have matched at the start of the subject. In both cases,
1528 backtracking will then try other alternatives, if any. */
1529
1530 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1531 md->recursive == NULL &&
1532 (md->notempty ||
1533 (md->notempty_atstart &&
1534 mstart == md->start_subject + md->start_offset)))
1535 RRETURN(MATCH_NOMATCH);
1536
1537 /* Otherwise, we have a match. */
1538
1539 md->end_match_ptr = eptr; /* Record where we ended */
1540 md->end_offset_top = offset_top; /* and how many extracts were taken */
1541 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1542
1543 /* For some reason, the macros don't work properly if an expression is
1544 given as the argument to RRETURN when the heap is in use. */
1545
1546 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1547 RRETURN(rrc);
1548
1549 /* Assertion brackets. Check the alternative branches in turn - the
1550 matching won't pass the KET for an assertion. If any one branch matches,
1551 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1552 start of each branch to move the current point backwards, so the code at
1553 this level is identical to the lookahead case. When the assertion is part
1554 of a condition, we want to return immediately afterwards. The caller of
1555 this incarnation of the match() function will have set MATCH_CONDASSERT in
1556 md->match_function_type, and one of these opcodes will be the first opcode
1557 that is processed. We use a local variable that is preserved over calls to
1558 match() to remember this case. */
1559
1560 case OP_ASSERT:
1561 case OP_ASSERTBACK:
1562 save_mark = md->mark;
1563 if (md->match_function_type == MATCH_CONDASSERT)
1564 {
1565 condassert = TRUE;
1566 md->match_function_type = 0;
1567 }
1568 else condassert = FALSE;
1569
1570 do
1571 {
1572 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1574 {
1575 mstart = md->start_match_ptr; /* In case \K reset it */
1576 break;
1577 }
1578 md->mark = save_mark;
1579
1580 /* A COMMIT failure must fail the entire assertion, without trying any
1581 subsequent branches. */
1582
1583 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1584
1585 /* PCRE does not allow THEN to escape beyond an assertion; it
1586 is treated as NOMATCH. */
1587
1588 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1589 ecode += GET(ecode, 1);
1590 }
1591 while (*ecode == OP_ALT);
1592
1593 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1594
1595 /* If checking an assertion for a condition, return MATCH_MATCH. */
1596
1597 if (condassert) RRETURN(MATCH_MATCH);
1598
1599 /* Continue from after the assertion, updating the offsets high water
1600 mark, since extracts may have been taken during the assertion. */
1601
1602 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1603 ecode += 1 + LINK_SIZE;
1604 offset_top = md->end_offset_top;
1605 continue;
1606
1607 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1608 PRUNE, or COMMIT means we must assume failure without checking subsequent
1609 branches. */
1610
1611 case OP_ASSERT_NOT:
1612 case OP_ASSERTBACK_NOT:
1613 save_mark = md->mark;
1614 if (md->match_function_type == MATCH_CONDASSERT)
1615 {
1616 condassert = TRUE;
1617 md->match_function_type = 0;
1618 }
1619 else condassert = FALSE;
1620
1621 do
1622 {
1623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1624 md->mark = save_mark;
1625 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1626 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1627 {
1628 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1629 break;
1630 }
1631
1632 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1633 as NOMATCH. */
1634
1635 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1636 ecode += GET(ecode,1);
1637 }
1638 while (*ecode == OP_ALT);
1639
1640 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1641
1642 ecode += 1 + LINK_SIZE;
1643 continue;
1644
1645 /* Move the subject pointer back. This occurs only at the start of
1646 each branch of a lookbehind assertion. If we are too close to the start to
1647 move back, this match function fails. When working with UTF-8 we move
1648 back a number of characters, not bytes. */
1649
1650 case OP_REVERSE:
1651 #ifdef SUPPORT_UTF
1652 if (utf)
1653 {
1654 i = GET(ecode, 1);
1655 while (i-- > 0)
1656 {
1657 eptr--;
1658 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1659 BACKCHAR(eptr);
1660 }
1661 }
1662 else
1663 #endif
1664
1665 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1666
1667 {
1668 eptr -= GET(ecode, 1);
1669 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1670 }
1671
1672 /* Save the earliest consulted character, then skip to next op code */
1673
1674 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1675 ecode += 1 + LINK_SIZE;
1676 break;
1677
1678 /* The callout item calls an external function, if one is provided, passing
1679 details of the match so far. This is mainly for debugging, though the
1680 function is able to force a failure. */
1681
1682 case OP_CALLOUT:
1683 if (PUBL(callout) != NULL)
1684 {
1685 PUBL(callout_block) cb;
1686 cb.version = 2; /* Version 1 of the callout block */
1687 cb.callout_number = ecode[1];
1688 cb.offset_vector = md->offset_vector;
1689 #ifdef COMPILE_PCRE8
1690 cb.subject = (PCRE_SPTR)md->start_subject;
1691 #else
1692 cb.subject = (PCRE_SPTR16)md->start_subject;
1693 #endif
1694 cb.subject_length = (int)(md->end_subject - md->start_subject);
1695 cb.start_match = (int)(mstart - md->start_subject);
1696 cb.current_position = (int)(eptr - md->start_subject);
1697 cb.pattern_position = GET(ecode, 2);
1698 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1699 cb.capture_top = offset_top/2;
1700 cb.capture_last = md->capture_last;
1701 cb.callout_data = md->callout_data;
1702 cb.mark = md->nomatch_mark;
1703 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1704 if (rrc < 0) RRETURN(rrc);
1705 }
1706 ecode += 2 + 2*LINK_SIZE;
1707 break;
1708
1709 /* Recursion either matches the current regex, or some subexpression. The
1710 offset data is the offset to the starting bracket from the start of the
1711 whole pattern. (This is so that it works from duplicated subpatterns.)
1712
1713 The state of the capturing groups is preserved over recursion, and
1714 re-instated afterwards. We don't know how many are started and not yet
1715 finished (offset_top records the completed total) so we just have to save
1716 all the potential data. There may be up to 65535 such values, which is too
1717 large to put on the stack, but using malloc for small numbers seems
1718 expensive. As a compromise, the stack is used when there are no more than
1719 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1720
1721 There are also other values that have to be saved. We use a chained
1722 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1723 for the original version of this logic. It has, however, been hacked around
1724 a lot, so he is not to blame for the current way it works. */
1725
1726 case OP_RECURSE:
1727 {
1728 recursion_info *ri;
1729 int recno;
1730
1731 callpat = md->start_code + GET(ecode, 1);
1732 recno = (callpat == md->start_code)? 0 :
1733 GET2(callpat, 1 + LINK_SIZE);
1734
1735 /* Check for repeating a recursion without advancing the subject pointer.
1736 This should catch convoluted mutual recursions. (Some simple cases are
1737 caught at compile time.) */
1738
1739 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1740 if (recno == ri->group_num && eptr == ri->subject_position)
1741 RRETURN(PCRE_ERROR_RECURSELOOP);
1742
1743 /* Add to "recursing stack" */
1744
1745 new_recursive.group_num = recno;
1746 new_recursive.subject_position = eptr;
1747 new_recursive.prevrec = md->recursive;
1748 md->recursive = &new_recursive;
1749
1750 /* Where to continue from afterwards */
1751
1752 ecode += 1 + LINK_SIZE;
1753
1754 /* Now save the offset data */
1755
1756 new_recursive.saved_max = md->offset_end;
1757 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1758 new_recursive.offset_save = stacksave;
1759 else
1760 {
1761 new_recursive.offset_save =
1762 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1763 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1764 }
1765 memcpy(new_recursive.offset_save, md->offset_vector,
1766 new_recursive.saved_max * sizeof(int));
1767
1768 /* OK, now we can do the recursion. After processing each alternative,
1769 restore the offset data. If there were nested recursions, md->recursive
1770 might be changed, so reset it before looping. */
1771
1772 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1773 cbegroup = (*callpat >= OP_SBRA);
1774 do
1775 {
1776 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1777 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1778 md, eptrb, RM6);
1779 memcpy(md->offset_vector, new_recursive.offset_save,
1780 new_recursive.saved_max * sizeof(int));
1781 md->recursive = new_recursive.prevrec;
1782 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1783 {
1784 DPRINTF(("Recursion matched\n"));
1785 if (new_recursive.offset_save != stacksave)
1786 (PUBL(free))(new_recursive.offset_save);
1787
1788 /* Set where we got to in the subject, and reset the start in case
1789 it was changed by \K. This *is* propagated back out of a recursion,
1790 for Perl compatibility. */
1791
1792 eptr = md->end_match_ptr;
1793 mstart = md->start_match_ptr;
1794 goto RECURSION_MATCHED; /* Exit loop; end processing */
1795 }
1796
1797 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1798 is treated as NOMATCH. */
1799
1800 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1801 rrc != MATCH_COMMIT)
1802 {
1803 DPRINTF(("Recursion gave error %d\n", rrc));
1804 if (new_recursive.offset_save != stacksave)
1805 (PUBL(free))(new_recursive.offset_save);
1806 RRETURN(rrc);
1807 }
1808
1809 md->recursive = &new_recursive;
1810 callpat += GET(callpat, 1);
1811 }
1812 while (*callpat == OP_ALT);
1813
1814 DPRINTF(("Recursion didn't match\n"));
1815 md->recursive = new_recursive.prevrec;
1816 if (new_recursive.offset_save != stacksave)
1817 (PUBL(free))(new_recursive.offset_save);
1818 RRETURN(MATCH_NOMATCH);
1819 }
1820
1821 RECURSION_MATCHED:
1822 break;
1823
1824 /* An alternation is the end of a branch; scan along to find the end of the
1825 bracketed group and go to there. */
1826
1827 case OP_ALT:
1828 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1829 break;
1830
1831 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1832 indicating that it may occur zero times. It may repeat infinitely, or not
1833 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1834 with fixed upper repeat limits are compiled as a number of copies, with the
1835 optional ones preceded by BRAZERO or BRAMINZERO. */
1836
1837 case OP_BRAZERO:
1838 next = ecode + 1;
1839 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1840 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1841 do next += GET(next, 1); while (*next == OP_ALT);
1842 ecode = next + 1 + LINK_SIZE;
1843 break;
1844
1845 case OP_BRAMINZERO:
1846 next = ecode + 1;
1847 do next += GET(next, 1); while (*next == OP_ALT);
1848 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1850 ecode++;
1851 break;
1852
1853 case OP_SKIPZERO:
1854 next = ecode+1;
1855 do next += GET(next,1); while (*next == OP_ALT);
1856 ecode = next + 1 + LINK_SIZE;
1857 break;
1858
1859 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1860 here; just jump to the group, with allow_zero set TRUE. */
1861
1862 case OP_BRAPOSZERO:
1863 op = *(++ecode);
1864 allow_zero = TRUE;
1865 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1866 goto POSSESSIVE_NON_CAPTURE;
1867
1868 /* End of a group, repeated or non-repeating. */
1869
1870 case OP_KET:
1871 case OP_KETRMIN:
1872 case OP_KETRMAX:
1873 case OP_KETRPOS:
1874 prev = ecode - GET(ecode, 1);
1875
1876 /* If this was a group that remembered the subject start, in order to break
1877 infinite repeats of empty string matches, retrieve the subject start from
1878 the chain. Otherwise, set it NULL. */
1879
1880 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1881 {
1882 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1883 eptrb = eptrb->epb_prev; /* Backup to previous group */
1884 }
1885 else saved_eptr = NULL;
1886
1887 /* If we are at the end of an assertion group or a non-capturing atomic
1888 group, stop matching and return MATCH_MATCH, but record the current high
1889 water mark for use by positive assertions. We also need to record the match
1890 start in case it was changed by \K. */
1891
1892 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1893 *prev == OP_ONCE_NC)
1894 {
1895 md->end_match_ptr = eptr; /* For ONCE_NC */
1896 md->end_offset_top = offset_top;
1897 md->start_match_ptr = mstart;
1898 RRETURN(MATCH_MATCH); /* Sets md->mark */
1899 }
1900
1901 /* For capturing groups we have to check the group number back at the start
1902 and if necessary complete handling an extraction by setting the offsets and
1903 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1904 into group 0, so it won't be picked up here. Instead, we catch it when the
1905 OP_END is reached. Other recursion is handled here. We just have to record
1906 the current subject position and start match pointer and give a MATCH
1907 return. */
1908
1909 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1910 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1911 {
1912 number = GET2(prev, 1+LINK_SIZE);
1913 offset = number << 1;
1914
1915 #ifdef PCRE_DEBUG
1916 printf("end bracket %d", number);
1917 printf("\n");
1918 #endif
1919
1920 /* Handle a recursively called group. */
1921
1922 if (md->recursive != NULL && md->recursive->group_num == number)
1923 {
1924 md->end_match_ptr = eptr;
1925 md->start_match_ptr = mstart;
1926 RRETURN(MATCH_MATCH);
1927 }
1928
1929 /* Deal with capturing */
1930
1931 md->capture_last = number;
1932 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1933 {
1934 /* If offset is greater than offset_top, it means that we are
1935 "skipping" a capturing group, and that group's offsets must be marked
1936 unset. In earlier versions of PCRE, all the offsets were unset at the
1937 start of matching, but this doesn't work because atomic groups and
1938 assertions can cause a value to be set that should later be unset.
1939 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1940 part of the atomic group, but this is not on the final matching path,
1941 so must be unset when 2 is set. (If there is no group 2, there is no
1942 problem, because offset_top will then be 2, indicating no capture.) */
1943
1944 if (offset > offset_top)
1945 {
1946 register int *iptr = md->offset_vector + offset_top;
1947 register int *iend = md->offset_vector + offset;
1948 while (iptr < iend) *iptr++ = -1;
1949 }
1950
1951 /* Now make the extraction */
1952
1953 md->offset_vector[offset] =
1954 md->offset_vector[md->offset_end - number];
1955 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1956 if (offset_top <= offset) offset_top = offset + 2;
1957 }
1958 }
1959
1960 /* For an ordinary non-repeating ket, just continue at this level. This
1961 also happens for a repeating ket if no characters were matched in the
1962 group. This is the forcible breaking of infinite loops as implemented in
1963 Perl 5.005. For a non-repeating atomic group that includes captures,
1964 establish a backup point by processing the rest of the pattern at a lower
1965 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1966 original OP_ONCE level, thereby bypassing intermediate backup points, but
1967 resetting any captures that happened along the way. */
1968
1969 if (*ecode == OP_KET || eptr == saved_eptr)
1970 {
1971 if (*prev == OP_ONCE)
1972 {
1973 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1975 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1976 RRETURN(MATCH_ONCE);
1977 }
1978 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1979 break;
1980 }
1981
1982 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1983 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1984 at a time from the outer level, thus saving stack. */
1985
1986 if (*ecode == OP_KETRPOS)
1987 {
1988 md->end_match_ptr = eptr;
1989 md->end_offset_top = offset_top;
1990 RRETURN(MATCH_KETRPOS);
1991 }
1992
1993 /* The normal repeating kets try the rest of the pattern or restart from
1994 the preceding bracket, in the appropriate order. In the second case, we can
1995 use tail recursion to avoid using another stack frame, unless we have an
1996 atomic group or an unlimited repeat of a group that can match an empty
1997 string. */
1998
1999 if (*ecode == OP_KETRMIN)
2000 {
2001 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2002 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2003 if (*prev == OP_ONCE)
2004 {
2005 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2007 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2008 RRETURN(MATCH_ONCE);
2009 }
2010 if (*prev >= OP_SBRA) /* Could match an empty string */
2011 {
2012 md->match_function_type = MATCH_CBEGROUP;
2013 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2014 RRETURN(rrc);
2015 }
2016 ecode = prev;
2017 goto TAIL_RECURSE;
2018 }
2019 else /* OP_KETRMAX */
2020 {
2021 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2022 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2023 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2025 if (*prev == OP_ONCE)
2026 {
2027 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2029 md->once_target = prev;
2030 RRETURN(MATCH_ONCE);
2031 }
2032 ecode += 1 + LINK_SIZE;
2033 goto TAIL_RECURSE;
2034 }
2035 /* Control never gets here */
2036
2037 /* Not multiline mode: start of subject assertion, unless notbol. */
2038
2039 case OP_CIRC:
2040 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2041
2042 /* Start of subject assertion */
2043
2044 case OP_SOD:
2045 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2046 ecode++;
2047 break;
2048
2049 /* Multiline mode: start of subject unless notbol, or after any newline. */
2050
2051 case OP_CIRCM:
2052 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2053 if (eptr != md->start_subject &&
2054 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2055 RRETURN(MATCH_NOMATCH);
2056 ecode++;
2057 break;
2058
2059 /* Start of match assertion */
2060
2061 case OP_SOM:
2062 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2063 ecode++;
2064 break;
2065
2066 /* Reset the start of match point */
2067
2068 case OP_SET_SOM:
2069 mstart = eptr;
2070 ecode++;
2071 break;
2072
2073 /* Multiline mode: assert before any newline, or before end of subject
2074 unless noteol is set. */
2075
2076 case OP_DOLLM:
2077 if (eptr < md->end_subject)
2078 {
2079 if (!IS_NEWLINE(eptr))
2080 {
2081 if (md->partial != 0 &&
2082 eptr + 1 >= md->end_subject &&
2083 NLBLOCK->nltype == NLTYPE_FIXED &&
2084 NLBLOCK->nllen == 2 &&
2085 *eptr == NLBLOCK->nl[0])
2086 {
2087 md->hitend = TRUE;
2088 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2089 }
2090 RRETURN(MATCH_NOMATCH);
2091 }
2092 }
2093 else
2094 {
2095 if (md->noteol) RRETURN(MATCH_NOMATCH);
2096 SCHECK_PARTIAL();
2097 }
2098 ecode++;
2099 break;
2100
2101 /* Not multiline mode: assert before a terminating newline or before end of
2102 subject unless noteol is set. */
2103
2104 case OP_DOLL:
2105 if (md->noteol) RRETURN(MATCH_NOMATCH);
2106 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2107
2108 /* ... else fall through for endonly */
2109
2110 /* End of subject assertion (\z) */
2111
2112 case OP_EOD:
2113 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2114 SCHECK_PARTIAL();
2115 ecode++;
2116 break;
2117
2118 /* End of subject or ending \n assertion (\Z) */
2119
2120 case OP_EODN:
2121 ASSERT_NL_OR_EOS:
2122 if (eptr < md->end_subject &&
2123 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2124 {
2125 if (md->partial != 0 &&
2126 eptr + 1 >= md->end_subject &&
2127 NLBLOCK->nltype == NLTYPE_FIXED &&
2128 NLBLOCK->nllen == 2 &&
2129 *eptr == NLBLOCK->nl[0])
2130 {
2131 md->hitend = TRUE;
2132 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2133 }
2134 RRETURN(MATCH_NOMATCH);
2135 }
2136
2137 /* Either at end of string or \n before end. */
2138
2139 SCHECK_PARTIAL();
2140 ecode++;
2141 break;
2142
2143 /* Word boundary assertions */
2144
2145 case OP_NOT_WORD_BOUNDARY:
2146 case OP_WORD_BOUNDARY:
2147 {
2148
2149 /* Find out if the previous and current characters are "word" characters.
2150 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2151 be "non-word" characters. Remember the earliest consulted character for
2152 partial matching. */
2153
2154 #ifdef SUPPORT_UTF
2155 if (utf)
2156 {
2157 /* Get status of previous character */
2158
2159 if (eptr == md->start_subject) prev_is_word = FALSE; else
2160 {
2161 PCRE_PUCHAR lastptr = eptr - 1;
2162 BACKCHAR(lastptr);
2163 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2164 GETCHAR(c, lastptr);
2165 #ifdef SUPPORT_UCP
2166 if (md->use_ucp)
2167 {
2168 if (c == '_') prev_is_word = TRUE; else
2169 {
2170 int cat = UCD_CATEGORY(c);
2171 prev_is_word = (cat == ucp_L || cat == ucp_N);
2172 }
2173 }
2174 else
2175 #endif
2176 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2177 }
2178
2179 /* Get status of next character */
2180
2181 if (eptr >= md->end_subject)
2182 {
2183 SCHECK_PARTIAL();
2184 cur_is_word = FALSE;
2185 }
2186 else
2187 {
2188 GETCHAR(c, eptr);
2189 #ifdef SUPPORT_UCP
2190 if (md->use_ucp)
2191 {
2192 if (c == '_') cur_is_word = TRUE; else
2193 {
2194 int cat = UCD_CATEGORY(c);
2195 cur_is_word = (cat == ucp_L || cat == ucp_N);
2196 }
2197 }
2198 else
2199 #endif
2200 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2201 }
2202 }
2203 else
2204 #endif
2205
2206 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2207 consistency with the behaviour of \w we do use it in this case. */
2208
2209 {
2210 /* Get status of previous character */
2211
2212 if (eptr == md->start_subject) prev_is_word = FALSE; else
2213 {
2214 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2215 #ifdef SUPPORT_UCP
2216 if (md->use_ucp)
2217 {
2218 c = eptr[-1];
2219 if (c == '_') prev_is_word = TRUE; else
2220 {
2221 int cat = UCD_CATEGORY(c);
2222 prev_is_word = (cat == ucp_L || cat == ucp_N);
2223 }
2224 }
2225 else
2226 #endif
2227 prev_is_word = MAX_255(eptr[-1])
2228 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2229 }
2230
2231 /* Get status of next character */
2232
2233 if (eptr >= md->end_subject)
2234 {
2235 SCHECK_PARTIAL();
2236 cur_is_word = FALSE;
2237 }
2238 else
2239 #ifdef SUPPORT_UCP
2240 if (md->use_ucp)
2241 {
2242 c = *eptr;
2243 if (c == '_') cur_is_word = TRUE; else
2244 {
2245 int cat = UCD_CATEGORY(c);
2246 cur_is_word = (cat == ucp_L || cat == ucp_N);
2247 }
2248 }
2249 else
2250 #endif
2251 cur_is_word = MAX_255(*eptr)
2252 && ((md->ctypes[*eptr] & ctype_word) != 0);
2253 }
2254
2255 /* Now see if the situation is what we want */
2256
2257 if ((*ecode++ == OP_WORD_BOUNDARY)?
2258 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2259 RRETURN(MATCH_NOMATCH);
2260 }
2261 break;
2262
2263 /* Match any single character type except newline; have to take care with
2264 CRLF newlines and partial matching. */
2265
2266 case OP_ANY:
2267 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2268 if (md->partial != 0 &&
2269 eptr + 1 >= md->end_subject &&
2270 NLBLOCK->nltype == NLTYPE_FIXED &&
2271 NLBLOCK->nllen == 2 &&
2272 *eptr == NLBLOCK->nl[0])
2273 {
2274 md->hitend = TRUE;
2275 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2276 }
2277
2278 /* Fall through */
2279
2280 /* Match any single character whatsoever. */
2281
2282 case OP_ALLANY:
2283 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2284 { /* not be updated before SCHECK_PARTIAL. */
2285 SCHECK_PARTIAL();
2286 RRETURN(MATCH_NOMATCH);
2287 }
2288 eptr++;
2289 #ifdef SUPPORT_UTF
2290 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2291 #endif
2292 ecode++;
2293 break;
2294
2295 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2296 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2297
2298 case OP_ANYBYTE:
2299 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2300 { /* not be updated before SCHECK_PARTIAL. */
2301 SCHECK_PARTIAL();
2302 RRETURN(MATCH_NOMATCH);
2303 }
2304 eptr++;
2305 ecode++;
2306 break;
2307
2308 case OP_NOT_DIGIT:
2309 if (eptr >= md->end_subject)
2310 {
2311 SCHECK_PARTIAL();
2312 RRETURN(MATCH_NOMATCH);
2313 }
2314 GETCHARINCTEST(c, eptr);
2315 if (
2316 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2317 c < 256 &&
2318 #endif
2319 (md->ctypes[c] & ctype_digit) != 0
2320 )
2321 RRETURN(MATCH_NOMATCH);
2322 ecode++;
2323 break;
2324
2325 case OP_DIGIT:
2326 if (eptr >= md->end_subject)
2327 {
2328 SCHECK_PARTIAL();
2329 RRETURN(MATCH_NOMATCH);
2330 }
2331 GETCHARINCTEST(c, eptr);
2332 if (
2333 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2334 c > 255 ||
2335 #endif
2336 (md->ctypes[c] & ctype_digit) == 0
2337 )
2338 RRETURN(MATCH_NOMATCH);
2339 ecode++;
2340 break;
2341
2342 case OP_NOT_WHITESPACE:
2343 if (eptr >= md->end_subject)
2344 {
2345 SCHECK_PARTIAL();
2346 RRETURN(MATCH_NOMATCH);
2347 }
2348 GETCHARINCTEST(c, eptr);
2349 if (
2350 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2351 c < 256 &&
2352 #endif
2353 (md->ctypes[c] & ctype_space) != 0
2354 )
2355 RRETURN(MATCH_NOMATCH);
2356 ecode++;
2357 break;
2358
2359 case OP_WHITESPACE:
2360 if (eptr >= md->end_subject)
2361 {
2362 SCHECK_PARTIAL();
2363 RRETURN(MATCH_NOMATCH);
2364 }
2365 GETCHARINCTEST(c, eptr);
2366 if (
2367 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2368 c > 255 ||
2369 #endif
2370 (md->ctypes[c] & ctype_space) == 0
2371 )
2372 RRETURN(MATCH_NOMATCH);
2373 ecode++;
2374 break;
2375
2376 case OP_NOT_WORDCHAR:
2377 if (eptr >= md->end_subject)
2378 {
2379 SCHECK_PARTIAL();
2380 RRETURN(MATCH_NOMATCH);
2381 }
2382 GETCHARINCTEST(c, eptr);
2383 if (
2384 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2385 c < 256 &&
2386 #endif
2387 (md->ctypes[c] & ctype_word) != 0
2388 )
2389 RRETURN(MATCH_NOMATCH);
2390 ecode++;
2391 break;
2392
2393 case OP_WORDCHAR:
2394 if (eptr >= md->end_subject)
2395 {
2396 SCHECK_PARTIAL();
2397 RRETURN(MATCH_NOMATCH);
2398 }
2399 GETCHARINCTEST(c, eptr);
2400 if (
2401 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2402 c > 255 ||
2403 #endif
2404 (md->ctypes[c] & ctype_word) == 0
2405 )
2406 RRETURN(MATCH_NOMATCH);
2407 ecode++;
2408 break;
2409
2410 case OP_ANYNL:
2411 if (eptr >= md->end_subject)
2412 {
2413 SCHECK_PARTIAL();
2414 RRETURN(MATCH_NOMATCH);
2415 }
2416 GETCHARINCTEST(c, eptr);
2417 switch(c)
2418 {
2419 default: RRETURN(MATCH_NOMATCH);
2420
2421 case 0x000d:
2422 if (eptr >= md->end_subject)
2423 {
2424 SCHECK_PARTIAL();
2425 }
2426 else if (*eptr == 0x0a) eptr++;
2427 break;
2428
2429 case 0x000a:
2430 break;
2431
2432 case 0x000b:
2433 case 0x000c:
2434 case 0x0085:
2435 case 0x2028:
2436 case 0x2029:
2437 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2438 break;
2439 }
2440 ecode++;
2441 break;
2442
2443 case OP_NOT_HSPACE:
2444 if (eptr >= md->end_subject)
2445 {
2446 SCHECK_PARTIAL();
2447 RRETURN(MATCH_NOMATCH);
2448 }
2449 GETCHARINCTEST(c, eptr);
2450 switch(c)
2451 {
2452 default: break;
2453 case 0x09: /* HT */
2454 case 0x20: /* SPACE */
2455 case 0xa0: /* NBSP */
2456 case 0x1680: /* OGHAM SPACE MARK */
2457 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2458 case 0x2000: /* EN QUAD */
2459 case 0x2001: /* EM QUAD */
2460 case 0x2002: /* EN SPACE */
2461 case 0x2003: /* EM SPACE */
2462 case 0x2004: /* THREE-PER-EM SPACE */
2463 case 0x2005: /* FOUR-PER-EM SPACE */
2464 case 0x2006: /* SIX-PER-EM SPACE */
2465 case 0x2007: /* FIGURE SPACE */
2466 case 0x2008: /* PUNCTUATION SPACE */
2467 case 0x2009: /* THIN SPACE */
2468 case 0x200A: /* HAIR SPACE */
2469 case 0x202f: /* NARROW NO-BREAK SPACE */
2470 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2471 case 0x3000: /* IDEOGRAPHIC SPACE */
2472 RRETURN(MATCH_NOMATCH);
2473 }
2474 ecode++;
2475 break;
2476
2477 case OP_HSPACE:
2478 if (eptr >= md->end_subject)
2479 {
2480 SCHECK_PARTIAL();
2481 RRETURN(MATCH_NOMATCH);
2482 }
2483 GETCHARINCTEST(c, eptr);
2484 switch(c)
2485 {
2486 default: RRETURN(MATCH_NOMATCH);
2487 case 0x09: /* HT */
2488 case 0x20: /* SPACE */
2489 case 0xa0: /* NBSP */
2490 case 0x1680: /* OGHAM SPACE MARK */
2491 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2492 case 0x2000: /* EN QUAD */
2493 case 0x2001: /* EM QUAD */
2494 case 0x2002: /* EN SPACE */
2495 case 0x2003: /* EM SPACE */
2496 case 0x2004: /* THREE-PER-EM SPACE */
2497 case 0x2005: /* FOUR-PER-EM SPACE */
2498 case 0x2006: /* SIX-PER-EM SPACE */
2499 case 0x2007: /* FIGURE SPACE */
2500 case 0x2008: /* PUNCTUATION SPACE */
2501 case 0x2009: /* THIN SPACE */
2502 case 0x200A: /* HAIR SPACE */
2503 case 0x202f: /* NARROW NO-BREAK SPACE */
2504 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2505 case 0x3000: /* IDEOGRAPHIC SPACE */
2506 break;
2507 }
2508 ecode++;
2509 break;
2510
2511 case OP_NOT_VSPACE:
2512 if (eptr >= md->end_subject)
2513 {
2514 SCHECK_PARTIAL();
2515 RRETURN(MATCH_NOMATCH);
2516 }
2517 GETCHARINCTEST(c, eptr);
2518 switch(c)
2519 {
2520 default: break;
2521 case 0x0a: /* LF */
2522 case 0x0b: /* VT */
2523 case 0x0c: /* FF */
2524 case 0x0d: /* CR */
2525 case 0x85: /* NEL */
2526 case 0x2028: /* LINE SEPARATOR */
2527 case 0x2029: /* PARAGRAPH SEPARATOR */
2528 RRETURN(MATCH_NOMATCH);
2529 }
2530 ecode++;
2531 break;
2532
2533 case OP_VSPACE:
2534 if (eptr >= md->end_subject)
2535 {
2536 SCHECK_PARTIAL();
2537 RRETURN(MATCH_NOMATCH);
2538 }
2539 GETCHARINCTEST(c, eptr);
2540 switch(c)
2541 {
2542 default: RRETURN(MATCH_NOMATCH);
2543 case 0x0a: /* LF */
2544 case 0x0b: /* VT */
2545 case 0x0c: /* FF */
2546 case 0x0d: /* CR */
2547 case 0x85: /* NEL */
2548 case 0x2028: /* LINE SEPARATOR */
2549 case 0x2029: /* PARAGRAPH SEPARATOR */
2550 break;
2551 }
2552 ecode++;
2553 break;
2554
2555 #ifdef SUPPORT_UCP
2556 /* Check the next character by Unicode property. We will get here only
2557 if the support is in the binary; otherwise a compile-time error occurs. */
2558
2559 case OP_PROP:
2560 case OP_NOTPROP:
2561 if (eptr >= md->end_subject)
2562 {
2563 SCHECK_PARTIAL();
2564 RRETURN(MATCH_NOMATCH);
2565 }
2566 GETCHARINCTEST(c, eptr);
2567 {
2568 const ucd_record *prop = GET_UCD(c);
2569
2570 switch(ecode[1])
2571 {
2572 case PT_ANY:
2573 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2574 break;
2575
2576 case PT_LAMP:
2577 if ((prop->chartype == ucp_Lu ||
2578 prop->chartype == ucp_Ll ||
2579 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2580 RRETURN(MATCH_NOMATCH);
2581 break;
2582
2583 case PT_GC:
2584 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2585 RRETURN(MATCH_NOMATCH);
2586 break;
2587
2588 case PT_PC:
2589 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2590 RRETURN(MATCH_NOMATCH);
2591 break;
2592
2593 case PT_SC:
2594 if ((ecode[2] != prop->script) == (op == OP_PROP))
2595 RRETURN(MATCH_NOMATCH);
2596 break;
2597
2598 /* These are specials */
2599
2600 case PT_ALNUM:
2601 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2602 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2603 RRETURN(MATCH_NOMATCH);
2604 break;
2605
2606 case PT_SPACE: /* Perl space */
2607 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2608 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2609 == (op == OP_NOTPROP))
2610 RRETURN(MATCH_NOMATCH);
2611 break;
2612
2613 case PT_PXSPACE: /* POSIX space */
2614 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2615 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2616 c == CHAR_FF || c == CHAR_CR)
2617 == (op == OP_NOTPROP))
2618 RRETURN(MATCH_NOMATCH);
2619 break;
2620
2621 case PT_WORD:
2622 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2623 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2624 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2625 RRETURN(MATCH_NOMATCH);
2626 break;
2627
2628 /* This should never occur */
2629
2630 default:
2631 RRETURN(PCRE_ERROR_INTERNAL);
2632 }
2633
2634 ecode += 3;
2635 }
2636 break;
2637
2638 /* Match an extended Unicode sequence. We will get here only if the support
2639 is in the binary; otherwise a compile-time error occurs. */
2640
2641 case OP_EXTUNI:
2642 if (eptr >= md->end_subject)
2643 {
2644 SCHECK_PARTIAL();
2645 RRETURN(MATCH_NOMATCH);
2646 }
2647 GETCHARINCTEST(c, eptr);
2648 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2649 while (eptr < md->end_subject)
2650 {
2651 int len = 1;
2652 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2653 if (UCD_CATEGORY(c) != ucp_M) break;
2654 eptr += len;
2655 }
2656 CHECK_PARTIAL();
2657 ecode++;
2658 break;
2659 #endif
2660
2661
2662 /* Match a back reference, possibly repeatedly. Look past the end of the
2663 item to see if there is repeat information following. The code is similar
2664 to that for character classes, but repeated for efficiency. Then obey
2665 similar code to character type repeats - written out again for speed.
2666 However, if the referenced string is the empty string, always treat
2667 it as matched, any number of times (otherwise there could be infinite
2668 loops). */
2669
2670 case OP_REF:
2671 case OP_REFI:
2672 caseless = op == OP_REFI;
2673 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2674 ecode += 1 + IMM2_SIZE;
2675
2676 /* If the reference is unset, there are two possibilities:
2677
2678 (a) In the default, Perl-compatible state, set the length negative;
2679 this ensures that every attempt at a match fails. We can't just fail
2680 here, because of the possibility of quantifiers with zero minima.
2681
2682 (b) If the JavaScript compatibility flag is set, set the length to zero
2683 so that the back reference matches an empty string.
2684
2685 Otherwise, set the length to the length of what was matched by the
2686 referenced subpattern. */
2687
2688 if (offset >= offset_top || md->offset_vector[offset] < 0)
2689 length = (md->jscript_compat)? 0 : -1;
2690 else
2691 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2692
2693 /* Set up for repetition, or handle the non-repeated case */
2694
2695 switch (*ecode)
2696 {
2697 case OP_CRSTAR:
2698 case OP_CRMINSTAR:
2699 case OP_CRPLUS:
2700 case OP_CRMINPLUS:
2701 case OP_CRQUERY:
2702 case OP_CRMINQUERY:
2703 c = *ecode++ - OP_CRSTAR;
2704 minimize = (c & 1) != 0;
2705 min = rep_min[c]; /* Pick up values from tables; */
2706 max = rep_max[c]; /* zero for max => infinity */
2707 if (max == 0) max = INT_MAX;
2708 break;
2709
2710 case OP_CRRANGE:
2711 case OP_CRMINRANGE:
2712 minimize = (*ecode == OP_CRMINRANGE);
2713 min = GET2(ecode, 1);
2714 max = GET2(ecode, 1 + IMM2_SIZE);
2715 if (max == 0) max = INT_MAX;
2716 ecode += 1 + 2 * IMM2_SIZE;
2717 break;
2718
2719 default: /* No repeat follows */
2720 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2721 {
2722 if (length == -2) eptr = md->end_subject; /* Partial match */
2723 CHECK_PARTIAL();
2724 RRETURN(MATCH_NOMATCH);
2725 }
2726 eptr += length;
2727 continue; /* With the main loop */
2728 }
2729
2730 /* Handle repeated back references. If the length of the reference is
2731 zero, just continue with the main loop. If the length is negative, it
2732 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2733 zero, we can continue at the same level without recursion. For any other
2734 minimum, carrying on will result in NOMATCH. */
2735
2736 if (length == 0) continue;
2737 if (length < 0 && min == 0) continue;
2738
2739 /* First, ensure the minimum number of matches are present. We get back
2740 the length of the reference string explicitly rather than passing the
2741 address of eptr, so that eptr can be a register variable. */
2742
2743 for (i = 1; i <= min; i++)
2744 {
2745 int slength;
2746 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2747 {
2748 if (slength == -2) eptr = md->end_subject; /* Partial match */
2749 CHECK_PARTIAL();
2750 RRETURN(MATCH_NOMATCH);
2751 }
2752 eptr += slength;
2753 }
2754
2755 /* If min = max, continue at the same level without recursion.
2756 They are not both allowed to be zero. */
2757
2758 if (min == max) continue;
2759
2760 /* If minimizing, keep trying and advancing the pointer */
2761
2762 if (minimize)
2763 {
2764 for (fi = min;; fi++)
2765 {
2766 int slength;
2767 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2768 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2769 if (fi >= max) RRETURN(MATCH_NOMATCH);
2770 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2771 {
2772 if (slength == -2) eptr = md->end_subject; /* Partial match */
2773 CHECK_PARTIAL();
2774 RRETURN(MATCH_NOMATCH);
2775 }
2776 eptr += slength;
2777 }
2778 /* Control never gets here */
2779 }
2780
2781 /* If maximizing, find the longest string and work backwards */
2782
2783 else
2784 {
2785 pp = eptr;
2786 for (i = min; i < max; i++)
2787 {
2788 int slength;
2789 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2790 {
2791 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2792 the soft partial matching case. */
2793
2794 if (slength == -2 && md->partial != 0 &&
2795 md->end_subject > md->start_used_ptr)
2796 {
2797 md->hitend = TRUE;
2798 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2799 }
2800 break;
2801 }
2802 eptr += slength;
2803 }
2804
2805 while (eptr >= pp)
2806 {
2807 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2809 eptr -= length;
2810 }
2811 RRETURN(MATCH_NOMATCH);
2812 }
2813 /* Control never gets here */
2814
2815 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2816 used when all the characters in the class have values in the range 0-255,
2817 and either the matching is caseful, or the characters are in the range
2818 0-127 when UTF-8 processing is enabled. The only difference between
2819 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2820 encountered.
2821
2822 First, look past the end of the item to see if there is repeat information
2823 following. Then obey similar code to character type repeats - written out
2824 again for speed. */
2825
2826 case OP_NCLASS:
2827 case OP_CLASS:
2828 {
2829 /* The data variable is saved across frames, so the byte map needs to
2830 be stored there. */
2831 #define BYTE_MAP ((pcre_uint8 *)data)
2832 data = ecode + 1; /* Save for matching */
2833 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2834
2835 switch (*ecode)
2836 {
2837 case OP_CRSTAR:
2838 case OP_CRMINSTAR:
2839 case OP_CRPLUS:
2840 case OP_CRMINPLUS:
2841 case OP_CRQUERY:
2842 case OP_CRMINQUERY:
2843 c = *ecode++ - OP_CRSTAR;
2844 minimize = (c & 1) != 0;
2845 min = rep_min[c]; /* Pick up values from tables; */
2846 max = rep_max[c]; /* zero for max => infinity */
2847 if (max == 0) max = INT_MAX;
2848 break;
2849
2850 case OP_CRRANGE:
2851 case OP_CRMINRANGE:
2852 minimize = (*ecode == OP_CRMINRANGE);
2853 min = GET2(ecode, 1);
2854 max = GET2(ecode, 1 + IMM2_SIZE);
2855 if (max == 0) max = INT_MAX;
2856 ecode += 1 + 2 * IMM2_SIZE;
2857 break;
2858
2859 default: /* No repeat follows */
2860 min = max = 1;
2861 break;
2862 }
2863
2864 /* First, ensure the minimum number of matches are present. */
2865
2866 #ifdef SUPPORT_UTF
2867 if (utf)
2868 {
2869 for (i = 1; i <= min; i++)
2870 {
2871 if (eptr >= md->end_subject)
2872 {
2873 SCHECK_PARTIAL();
2874 RRETURN(MATCH_NOMATCH);
2875 }
2876 GETCHARINC(c, eptr);
2877 if (c > 255)
2878 {
2879 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2880 }
2881 else
2882 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2883 }
2884 }
2885 else
2886 #endif
2887 /* Not UTF mode */
2888 {
2889 for (i = 1; i <= min; i++)
2890 {
2891 if (eptr >= md->end_subject)
2892 {
2893 SCHECK_PARTIAL();
2894 RRETURN(MATCH_NOMATCH);
2895 }
2896 c = *eptr++;
2897 #ifndef COMPILE_PCRE8
2898 if (c > 255)
2899 {
2900 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2901 }
2902 else
2903 #endif
2904 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2905 }
2906 }
2907
2908 /* If max == min we can continue with the main loop without the
2909 need to recurse. */
2910
2911 if (min == max) continue;
2912
2913 /* If minimizing, keep testing the rest of the expression and advancing
2914 the pointer while it matches the class. */
2915
2916 if (minimize)
2917 {
2918 #ifdef SUPPORT_UTF
2919 if (utf)
2920 {
2921 for (fi = min;; fi++)
2922 {
2923 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2925 if (fi >= max) RRETURN(MATCH_NOMATCH);
2926 if (eptr >= md->end_subject)
2927 {
2928 SCHECK_PARTIAL();
2929 RRETURN(MATCH_NOMATCH);
2930 }
2931 GETCHARINC(c, eptr);
2932 if (c > 255)
2933 {
2934 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2935 }
2936 else
2937 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2938 }
2939 }
2940 else
2941 #endif
2942 /* Not UTF mode */
2943 {
2944 for (fi = min;; fi++)
2945 {
2946 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2948 if (fi >= max) RRETURN(MATCH_NOMATCH);
2949 if (eptr >= md->end_subject)
2950 {
2951 SCHECK_PARTIAL();
2952 RRETURN(MATCH_NOMATCH);
2953 }
2954 c = *eptr++;
2955 #ifndef COMPILE_PCRE8
2956 if (c > 255)
2957 {
2958 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2959 }
2960 else
2961 #endif
2962 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2963 }
2964 }
2965 /* Control never gets here */
2966 }
2967
2968 /* If maximizing, find the longest possible run, then work backwards. */
2969
2970 else
2971 {
2972 pp = eptr;
2973
2974 #ifdef SUPPORT_UTF
2975 if (utf)
2976 {
2977 for (i = min; i < max; i++)
2978 {
2979 int len = 1;
2980 if (eptr >= md->end_subject)
2981 {
2982 SCHECK_PARTIAL();
2983 break;
2984 }
2985 GETCHARLEN(c, eptr, len);
2986 if (c > 255)
2987 {
2988 if (op == OP_CLASS) break;
2989 }
2990 else
2991 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2992 eptr += len;
2993 }
2994 for (;;)
2995 {
2996 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2997 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2998 if (eptr-- == pp) break; /* Stop if tried at original pos */
2999 BACKCHAR(eptr);
3000 }
3001 }
3002 else
3003 #endif
3004 /* Not UTF mode */
3005 {
3006 for (i = min; i < max; i++)
3007 {
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 break;
3012 }
3013 c = *eptr;
3014 #ifndef COMPILE_PCRE8
3015 if (c > 255)
3016 {
3017 if (op == OP_CLASS) break;
3018 }
3019 else
3020 #endif
3021 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3022 eptr++;
3023 }
3024 while (eptr >= pp)
3025 {
3026 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3027 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3028 eptr--;
3029 }
3030 }
3031
3032 RRETURN(MATCH_NOMATCH);
3033 }
3034 #undef BYTE_MAP
3035 }
3036 /* Control never gets here */
3037
3038
3039 /* Match an extended character class. This opcode is encountered only
3040 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3041 mode, because Unicode properties are supported in non-UTF-8 mode. */
3042
3043 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3044 case OP_XCLASS:
3045 {
3046 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3047 ecode += GET(ecode, 1); /* Advance past the item */
3048
3049 switch (*ecode)
3050 {
3051 case OP_CRSTAR:
3052 case OP_CRMINSTAR:
3053 case OP_CRPLUS:
3054 case OP_CRMINPLUS:
3055 case OP_CRQUERY:
3056 case OP_CRMINQUERY:
3057 c = *ecode++ - OP_CRSTAR;
3058 minimize = (c & 1) != 0;
3059 min = rep_min[c]; /* Pick up values from tables; */
3060 max = rep_max[c]; /* zero for max => infinity */
3061 if (max == 0) max = INT_MAX;
3062 break;
3063
3064 case OP_CRRANGE:
3065 case OP_CRMINRANGE:
3066 minimize = (*ecode == OP_CRMINRANGE);
3067 min = GET2(ecode, 1);
3068 max = GET2(ecode, 1 + IMM2_SIZE);
3069 if (max == 0) max = INT_MAX;
3070 ecode += 1 + 2 * IMM2_SIZE;
3071 break;
3072
3073 default: /* No repeat follows */
3074 min = max = 1;
3075 break;
3076 }
3077
3078 /* First, ensure the minimum number of matches are present. */
3079
3080 for (i = 1; i <= min; i++)
3081 {
3082 if (eptr >= md->end_subject)
3083 {
3084 SCHECK_PARTIAL();
3085 RRETURN(MATCH_NOMATCH);
3086 }
3087 GETCHARINCTEST(c, eptr);
3088 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3089 }
3090
3091 /* If max == min we can continue with the main loop without the
3092 need to recurse. */
3093
3094 if (min == max) continue;
3095
3096 /* If minimizing, keep testing the rest of the expression and advancing
3097 the pointer while it matches the class. */
3098
3099 if (minimize)
3100 {
3101 for (fi = min;; fi++)
3102 {
3103 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3104 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3105 if (fi >= max) RRETURN(MATCH_NOMATCH);
3106 if (eptr >= md->end_subject)
3107 {
3108 SCHECK_PARTIAL();
3109 RRETURN(MATCH_NOMATCH);
3110 }
3111 GETCHARINCTEST(c, eptr);
3112 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3113 }
3114 /* Control never gets here */
3115 }
3116
3117 /* If maximizing, find the longest possible run, then work backwards. */
3118
3119 else
3120 {
3121 pp = eptr;
3122 for (i = min; i < max; i++)
3123 {
3124 int len = 1;
3125 if (eptr >= md->end_subject)
3126 {
3127 SCHECK_PARTIAL();
3128 break;
3129 }
3130 #ifdef SUPPORT_UTF
3131 GETCHARLENTEST(c, eptr, len);
3132 #else
3133 c = *eptr;
3134 #endif
3135 if (!PRIV(xclass)(c, data, utf)) break;
3136 eptr += len;
3137 }
3138 for(;;)
3139 {
3140 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3141 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3142 if (eptr-- == pp) break; /* Stop if tried at original pos */
3143 #ifdef SUPPORT_UTF
3144 if (utf) BACKCHAR(eptr);
3145 #endif
3146 }
3147 RRETURN(MATCH_NOMATCH);
3148 }
3149
3150 /* Control never gets here */
3151 }
3152 #endif /* End of XCLASS */
3153
3154 /* Match a single character, casefully */
3155
3156 case OP_CHAR:
3157 #ifdef SUPPORT_UTF
3158 if (utf)
3159 {
3160 length = 1;
3161 ecode++;
3162 GETCHARLEN(fc, ecode, length);
3163 if (length > md->end_subject - eptr)
3164 {
3165 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3166 RRETURN(MATCH_NOMATCH);
3167 }
3168 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3169 }
3170 else
3171 #endif
3172 /* Not UTF mode */
3173 {
3174 if (md->end_subject - eptr < 1)
3175 {
3176 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3177 RRETURN(MATCH_NOMATCH);
3178 }
3179 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3180 ecode += 2;
3181 }
3182 break;
3183
3184 /* Match a single character, caselessly. If we are at the end of the
3185 subject, give up immediately. */
3186
3187 case OP_CHARI:
3188 if (eptr >= md->end_subject)
3189 {
3190 SCHECK_PARTIAL();
3191 RRETURN(MATCH_NOMATCH);
3192 }
3193
3194 #ifdef SUPPORT_UTF
3195 if (utf)
3196 {
3197 length = 1;
3198 ecode++;
3199 GETCHARLEN(fc, ecode, length);
3200
3201 /* If the pattern character's value is < 128, we have only one byte, and
3202 we know that its other case must also be one byte long, so we can use the
3203 fast lookup table. We know that there is at least one byte left in the
3204 subject. */
3205
3206 if (fc < 128)
3207 {
3208 if (md->lcc[fc]
3209 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3210 ecode++;
3211 eptr++;
3212 }
3213
3214 /* Otherwise we must pick up the subject character. Note that we cannot
3215 use the value of "length" to check for sufficient bytes left, because the
3216 other case of the character may have more or fewer bytes. */
3217
3218 else
3219 {
3220 unsigned int dc;
3221 GETCHARINC(dc, eptr);
3222 ecode += length;
3223
3224 /* If we have Unicode property support, we can use it to test the other
3225 case of the character, if there is one. */
3226
3227 if (fc != dc)
3228 {
3229 #ifdef SUPPORT_UCP
3230 if (dc != UCD_OTHERCASE(fc))
3231 #endif
3232 RRETURN(MATCH_NOMATCH);
3233 }
3234 }
3235 }
3236 else
3237 #endif /* SUPPORT_UTF */
3238
3239 /* Not UTF mode */
3240 {
3241 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3242 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3243 eptr++;
3244 ecode += 2;
3245 }
3246 break;
3247
3248 /* Match a single character repeatedly. */
3249
3250 case OP_EXACT:
3251 case OP_EXACTI:
3252 min = max = GET2(ecode, 1);
3253 ecode += 1 + IMM2_SIZE;
3254 goto REPEATCHAR;
3255
3256 case OP_POSUPTO:
3257 case OP_POSUPTOI:
3258 possessive = TRUE;
3259 /* Fall through */
3260
3261 case OP_UPTO:
3262 case OP_UPTOI:
3263 case OP_MINUPTO:
3264 case OP_MINUPTOI:
3265 min = 0;
3266 max = GET2(ecode, 1);
3267 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3268 ecode += 1 + IMM2_SIZE;
3269 goto REPEATCHAR;
3270
3271 case OP_POSSTAR:
3272 case OP_POSSTARI:
3273 possessive = TRUE;
3274 min = 0;
3275 max = INT_MAX;
3276 ecode++;
3277 goto REPEATCHAR;
3278
3279 case OP_POSPLUS:
3280 case OP_POSPLUSI:
3281 possessive = TRUE;
3282 min = 1;
3283 max = INT_MAX;
3284 ecode++;
3285 goto REPEATCHAR;
3286
3287 case OP_POSQUERY:
3288 case OP_POSQUERYI:
3289 possessive = TRUE;
3290 min = 0;
3291 max = 1;
3292 ecode++;
3293 goto REPEATCHAR;
3294
3295 case OP_STAR:
3296 case OP_STARI:
3297 case OP_MINSTAR:
3298 case OP_MINSTARI:
3299 case OP_PLUS:
3300 case OP_PLUSI:
3301 case OP_MINPLUS:
3302 case OP_MINPLUSI:
3303 case OP_QUERY:
3304 case OP_QUERYI:
3305 case OP_MINQUERY:
3306 case OP_MINQUERYI:
3307 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3308 minimize = (c & 1) != 0;
3309 min = rep_min[c]; /* Pick up values from tables; */
3310 max = rep_max[c]; /* zero for max => infinity */
3311 if (max == 0) max = INT_MAX;
3312
3313 /* Common code for all repeated single-character matches. */
3314
3315 REPEATCHAR:
3316 #ifdef SUPPORT_UTF
3317 if (utf)
3318 {
3319 length = 1;
3320 charptr = ecode;
3321 GETCHARLEN(fc, ecode, length);
3322 ecode += length;
3323
3324 /* Handle multibyte character matching specially here. There is
3325 support for caseless matching if UCP support is present. */
3326
3327 if (length > 1)
3328 {
3329 #ifdef SUPPORT_UCP
3330 unsigned int othercase;
3331 if (op >= OP_STARI && /* Caseless */
3332 (othercase = UCD_OTHERCASE(fc)) != fc)
3333 oclength = PRIV(ord2utf)(othercase, occhars);
3334 else oclength = 0;
3335 #endif /* SUPPORT_UCP */
3336
3337 for (i = 1; i <= min; i++)
3338 {
3339 if (eptr <= md->end_subject - length &&
3340 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3341 #ifdef SUPPORT_UCP
3342 else if (oclength > 0 &&
3343 eptr <= md->end_subject - oclength &&
3344 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3345 #endif /* SUPPORT_UCP */
3346 else
3347 {
3348 CHECK_PARTIAL();
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 }
3352
3353 if (min == max) continue;
3354
3355 if (minimize)
3356 {
3357 for (fi = min;; fi++)
3358 {
3359 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3360 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3361 if (fi >= max) RRETURN(MATCH_NOMATCH);
3362 if (eptr <= md->end_subject - length &&
3363 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3364 #ifdef SUPPORT_UCP
3365 else if (oclength > 0 &&
3366 eptr <= md->end_subject - oclength &&
3367 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3368 #endif /* SUPPORT_UCP */
3369 else
3370 {
3371 CHECK_PARTIAL();
3372 RRETURN(MATCH_NOMATCH);
3373 }
3374 }
3375 /* Control never gets here */
3376 }
3377
3378 else /* Maximize */
3379 {
3380 pp = eptr;
3381 for (i = min; i < max; i++)
3382 {
3383 if (eptr <= md->end_subject - length &&
3384 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3385 #ifdef SUPPORT_UCP
3386 else if (oclength > 0 &&
3387 eptr <= md->end_subject - oclength &&
3388 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3389 #endif /* SUPPORT_UCP */
3390 else
3391 {
3392 CHECK_PARTIAL();
3393 break;
3394 }
3395 }
3396
3397 if (possessive) continue;
3398
3399 for(;;)
3400 {
3401 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3404 #ifdef SUPPORT_UCP
3405 eptr--;
3406 BACKCHAR(eptr);
3407 #else /* without SUPPORT_UCP */
3408 eptr -= length;
3409 #endif /* SUPPORT_UCP */
3410 }
3411 }
3412 /* Control never gets here */
3413 }
3414
3415 /* If the length of a UTF-8 character is 1, we fall through here, and
3416 obey the code as for non-UTF-8 characters below, though in this case the
3417 value of fc will always be < 128. */
3418 }
3419 else
3420 #endif /* SUPPORT_UTF */
3421 /* When not in UTF-8 mode, load a single-byte character. */
3422 fc = *ecode++;
3423
3424 /* The value of fc at this point is always one character, though we may
3425 or may not be in UTF mode. The code is duplicated for the caseless and
3426 caseful cases, for speed, since matching characters is likely to be quite
3427 common. First, ensure the minimum number of matches are present. If min =
3428 max, continue at the same level without recursing. Otherwise, if
3429 minimizing, keep trying the rest of the expression and advancing one
3430 matching character if failing, up to the maximum. Alternatively, if
3431 maximizing, find the maximum number of characters and work backwards. */
3432
3433 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3434 max, eptr));
3435
3436 if (op >= OP_STARI) /* Caseless */
3437 {
3438 #ifdef COMPILE_PCRE8
3439 /* fc must be < 128 if UTF is enabled. */
3440 foc = md->fcc[fc];
3441 #else
3442 #ifdef SUPPORT_UTF
3443 #ifdef SUPPORT_UCP
3444 if (utf && fc > 127)
3445 foc = UCD_OTHERCASE(fc);
3446 #else
3447 if (utf && fc > 127)
3448 foc = fc;
3449 #endif /* SUPPORT_UCP */
3450 else
3451 #endif /* SUPPORT_UTF */
3452 foc = TABLE_GET(fc, md->fcc, fc);
3453 #endif /* COMPILE_PCRE8 */
3454
3455 for (i = 1; i <= min; i++)
3456 {
3457 if (eptr >= md->end_subject)
3458 {
3459 SCHECK_PARTIAL();
3460 RRETURN(MATCH_NOMATCH);
3461 }
3462 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3463 eptr++;
3464 }
3465 if (min == max) continue;
3466 if (minimize)
3467 {
3468 for (fi = min;; fi++)
3469 {
3470 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3471 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3472 if (fi >= max) RRETURN(MATCH_NOMATCH);
3473 if (eptr >= md->end_subject)
3474 {
3475 SCHECK_PARTIAL();
3476 RRETURN(MATCH_NOMATCH);
3477 }
3478 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3479 eptr++;
3480 }
3481 /* Control never gets here */
3482 }
3483 else /* Maximize */
3484 {
3485 pp = eptr;
3486 for (i = min; i < max; i++)
3487 {
3488 if (eptr >= md->end_subject)
3489 {
3490 SCHECK_PARTIAL();
3491 break;
3492 }
3493 if (fc != *eptr && foc != *eptr) break;
3494 eptr++;
3495 }
3496
3497 if (possessive) continue;
3498
3499 while (eptr >= pp)
3500 {
3501 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3502 eptr--;
3503 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3504 }
3505 RRETURN(MATCH_NOMATCH);
3506 }
3507 /* Control never gets here */
3508 }
3509
3510 /* Caseful comparisons (includes all multi-byte characters) */
3511
3512 else
3513 {
3514 for (i = 1; i <= min; i++)
3515 {
3516 if (eptr >= md->end_subject)
3517 {
3518 SCHECK_PARTIAL();
3519 RRETURN(MATCH_NOMATCH);
3520 }
3521 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3522 }
3523
3524 if (min == max) continue;
3525
3526 if (minimize)
3527 {
3528 for (fi = min;; fi++)
3529 {
3530 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3531 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3532 if (fi >= max) RRETURN(MATCH_NOMATCH);
3533 if (eptr >= md->end_subject)
3534 {
3535 SCHECK_PARTIAL();
3536 RRETURN(MATCH_NOMATCH);
3537 }
3538 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3539 }
3540 /* Control never gets here */
3541 }
3542 else /* Maximize */
3543 {
3544 pp = eptr;
3545 for (i = min; i < max; i++)
3546 {
3547 if (eptr >= md->end_subject)
3548 {
3549 SCHECK_PARTIAL();
3550 break;
3551 }
3552 if (fc != *eptr) break;
3553 eptr++;
3554 }
3555 if (possessive) continue;
3556
3557 while (eptr >= pp)
3558 {
3559 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3560 eptr--;
3561 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3562 }
3563 RRETURN(MATCH_NOMATCH);
3564 }
3565 }
3566 /* Control never gets here */
3567
3568 /* Match a negated single one-byte character. The character we are
3569 checking can be multibyte. */
3570
3571 case OP_NOT:
3572 case OP_NOTI:
3573 if (eptr >= md->end_subject)
3574 {
3575 SCHECK_PARTIAL();
3576 RRETURN(MATCH_NOMATCH);
3577 }
3578 #ifdef SUPPORT_UTF
3579 if (utf)
3580 {
3581 register unsigned int ch, och;
3582
3583 ecode++;
3584 GETCHARINC(ch, ecode);
3585 GETCHARINC(c, eptr);
3586
3587 if (op == OP_NOT)
3588 {
3589 if (ch == c) RRETURN(MATCH_NOMATCH);
3590 }
3591 else
3592 {
3593 #ifdef SUPPORT_UCP
3594 if (ch > 127)
3595 och = UCD_OTHERCASE(ch);
3596 #else
3597 if (ch > 127)
3598 och = ch;
3599 #endif /* SUPPORT_UCP */
3600 else
3601 och = TABLE_GET(ch, md->fcc, ch);
3602 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3603 }
3604 }
3605 else
3606 #endif
3607 {
3608 register unsigned int ch = ecode[1];
3609 c = *eptr++;
3610 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3611 RRETURN(MATCH_NOMATCH);
3612 ecode += 2;
3613 }
3614 break;
3615
3616 /* Match a negated single one-byte character repeatedly. This is almost a
3617 repeat of the code for a repeated single character, but I haven't found a
3618 nice way of commoning these up that doesn't require a test of the
3619 positive/negative option for each character match. Maybe that wouldn't add
3620 very much to the time taken, but character matching *is* what this is all
3621 about... */
3622
3623 case OP_NOTEXACT:
3624 case OP_NOTEXACTI:
3625 min = max = GET2(ecode, 1);
3626 ecode += 1 + IMM2_SIZE;
3627 goto REPEATNOTCHAR;
3628
3629 case OP_NOTUPTO:
3630 case OP_NOTUPTOI:
3631 case OP_NOTMINUPTO:
3632 case OP_NOTMINUPTOI:
3633 min = 0;
3634 max = GET2(ecode, 1);
3635 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3636 ecode += 1 + IMM2_SIZE;
3637 goto REPEATNOTCHAR;
3638
3639 case OP_NOTPOSSTAR:
3640 case OP_NOTPOSSTARI:
3641 possessive = TRUE;
3642 min = 0;
3643 max = INT_MAX;
3644 ecode++;
3645 goto REPEATNOTCHAR;
3646
3647 case OP_NOTPOSPLUS:
3648 case OP_NOTPOSPLUSI:
3649 possessive = TRUE;
3650 min = 1;
3651 max = INT_MAX;
3652 ecode++;
3653 goto REPEATNOTCHAR;
3654
3655 case OP_NOTPOSQUERY:
3656 case OP_NOTPOSQUERYI:
3657 possessive = TRUE;
3658 min = 0;
3659 max = 1;
3660 ecode++;
3661 goto REPEATNOTCHAR;
3662
3663 case OP_NOTPOSUPTO:
3664 case OP_NOTPOSUPTOI:
3665 possessive = TRUE;
3666 min = 0;
3667 max = GET2(ecode, 1);
3668 ecode += 1 + IMM2_SIZE;
3669 goto REPEATNOTCHAR;
3670
3671 case OP_NOTSTAR:
3672 case OP_NOTSTARI:
3673 case OP_NOTMINSTAR:
3674 case OP_NOTMINSTARI:
3675 case OP_NOTPLUS:
3676 case OP_NOTPLUSI:
3677 case OP_NOTMINPLUS:
3678 case OP_NOTMINPLUSI:
3679 case OP_NOTQUERY:
3680 case OP_NOTQUERYI:
3681 case OP_NOTMINQUERY:
3682 case OP_NOTMINQUERYI:
3683 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3684 minimize = (c & 1) != 0;
3685 min = rep_min[c]; /* Pick up values from tables; */
3686 max = rep_max[c]; /* zero for max => infinity */
3687 if (max == 0) max = INT_MAX;
3688
3689 /* Common code for all repeated single-byte matches. */
3690
3691 REPEATNOTCHAR:
3692 GETCHARINCTEST(fc, ecode);
3693
3694 /* The code is duplicated for the caseless and caseful cases, for speed,
3695 since matching characters is likely to be quite common. First, ensure the
3696 minimum number of matches are present. If min = max, continue at the same
3697 level without recursing. Otherwise, if minimizing, keep trying the rest of
3698 the expression and advancing one matching character if failing, up to the
3699 maximum. Alternatively, if maximizing, find the maximum number of
3700 characters and work backwards. */
3701
3702 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3703 max, eptr));
3704
3705 if (op >= OP_NOTSTARI) /* Caseless */
3706 {
3707 #ifdef SUPPORT_UTF
3708 #ifdef SUPPORT_UCP
3709 if (utf && fc > 127)
3710 foc = UCD_OTHERCASE(fc);
3711 #else
3712 if (utf && fc > 127)
3713 foc = fc;
3714 #endif /* SUPPORT_UCP */
3715 else
3716 #endif /* SUPPORT_UTF */
3717 foc = TABLE_GET(fc, md->fcc, fc);
3718
3719 #ifdef SUPPORT_UTF
3720 if (utf)
3721 {
3722 register unsigned int d;
3723 for (i = 1; i <= min; i++)
3724 {
3725 if (eptr >= md->end_subject)
3726 {
3727 SCHECK_PARTIAL();
3728 RRETURN(MATCH_NOMATCH);
3729 }
3730 GETCHARINC(d, eptr);
3731 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3732 }
3733 }
3734 else
3735 #endif
3736 /* Not UTF mode */
3737 {
3738 for (i = 1; i <= min; i++)
3739 {
3740 if (eptr >= md->end_subject)
3741 {
3742 SCHECK_PARTIAL();
3743 RRETURN(MATCH_NOMATCH);
3744 }
3745 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3746 eptr++;
3747 }
3748 }
3749
3750 if (min == max) continue;
3751
3752 if (minimize)
3753 {
3754 #ifdef SUPPORT_UTF
3755 if (utf)
3756 {
3757 register unsigned int d;
3758 for (fi = min;; fi++)
3759 {
3760 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3761 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3762 if (fi >= max) RRETURN(MATCH_NOMATCH);
3763 if (eptr >= md->end_subject)
3764 {
3765 SCHECK_PARTIAL();
3766 RRETURN(MATCH_NOMATCH);
3767 }
3768 GETCHARINC(d, eptr);
3769 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3770 }
3771 }
3772 else
3773 #endif
3774 /* Not UTF mode */
3775 {
3776 for (fi = min;; fi++)
3777 {
3778 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3780 if (fi >= max) RRETURN(MATCH_NOMATCH);
3781 if (eptr >= md->end_subject)
3782 {
3783 SCHECK_PARTIAL();
3784 RRETURN(MATCH_NOMATCH);
3785 }
3786 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3787 eptr++;
3788 }
3789 }
3790 /* Control never gets here */
3791 }
3792
3793 /* Maximize case */
3794
3795 else
3796 {
3797 pp = eptr;
3798
3799 #ifdef SUPPORT_UTF
3800 if (utf)
3801 {
3802 register unsigned int d;
3803 for (i = min; i < max; i++)
3804 {
3805 int len = 1;
3806 if (eptr >= md->end_subject)
3807 {
3808 SCHECK_PARTIAL();
3809 break;
3810 }
3811 GETCHARLEN(d, eptr, len);
3812 if (fc == d || (unsigned int)foc == d) break;
3813 eptr += len;
3814 }
3815 if (possessive) continue;
3816 for(;;)
3817 {
3818 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3820 if (eptr-- == pp) break; /* Stop if tried at original pos */
3821 BACKCHAR(eptr);
3822 }
3823 }
3824 else
3825 #endif
3826 /* Not UTF mode */
3827 {
3828 for (i = min; i < max; i++)
3829 {
3830 if (eptr >= md->end_subject)
3831 {
3832 SCHECK_PARTIAL();
3833 break;
3834 }
3835 if (fc == *eptr || foc == *eptr) break;
3836 eptr++;
3837 }
3838 if (possessive) continue;
3839 while (eptr >= pp)
3840 {
3841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3843 eptr--;
3844 }
3845 }
3846
3847 RRETURN(MATCH_NOMATCH);
3848 }
3849 /* Control never gets here */
3850 }
3851
3852 /* Caseful comparisons */
3853
3854 else
3855 {
3856 #ifdef SUPPORT_UTF
3857 if (utf)
3858 {
3859 register unsigned int d;
3860 for (i = 1; i <= min; i++)
3861 {
3862 if (eptr >= md->end_subject)
3863 {
3864 SCHECK_PARTIAL();
3865 RRETURN(MATCH_NOMATCH);
3866 }
3867 GETCHARINC(d, eptr);
3868 if (fc == d) RRETURN(MATCH_NOMATCH);
3869 }
3870 }
3871 else
3872 #endif
3873 /* Not UTF mode */
3874 {
3875 for (i = 1; i <= min; i++)
3876 {
3877 if (eptr >= md->end_subject)
3878 {
3879 SCHECK_PARTIAL();
3880 RRETURN(MATCH_NOMATCH);
3881 }
3882 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3883 }
3884 }
3885
3886 if (min == max) continue;
3887
3888 if (minimize)
3889 {
3890 #ifdef SUPPORT_UTF
3891 if (utf)
3892 {
3893 register unsigned int d;
3894 for (fi = min;; fi++)
3895 {
3896 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3898 if (fi >= max) RRETURN(MATCH_NOMATCH);
3899 if (eptr >= md->end_subject)
3900 {
3901 SCHECK_PARTIAL();
3902 RRETURN(MATCH_NOMATCH);
3903 }
3904 GETCHARINC(d, eptr);
3905 if (fc == d) RRETURN(MATCH_NOMATCH);
3906 }
3907 }
3908 else
3909 #endif
3910 /* Not UTF mode */
3911 {
3912 for (fi = min;; fi++)
3913 {
3914 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3916 if (fi >= max) RRETURN(MATCH_NOMATCH);
3917 if (eptr >= md->end_subject)
3918 {
3919 SCHECK_PARTIAL();
3920 RRETURN(MATCH_NOMATCH);
3921 }
3922 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3923 }
3924 }
3925 /* Control never gets here */
3926 }
3927
3928 /* Maximize case */
3929
3930 else
3931 {
3932 pp = eptr;
3933
3934 #ifdef SUPPORT_UTF
3935 if (utf)
3936 {
3937 register unsigned int d;
3938 for (i = min; i < max; i++)
3939 {
3940 int len = 1;
3941 if (eptr >= md->end_subject)
3942 {
3943 SCHECK_PARTIAL();
3944 break;
3945 }
3946 GETCHARLEN(d, eptr, len);
3947 if (fc == d) break;
3948 eptr += len;
3949 }
3950 if (possessive) continue;
3951 for(;;)
3952 {
3953 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3955 if (eptr-- == pp) break; /* Stop if tried at original pos */
3956 BACKCHAR(eptr);
3957 }
3958 }
3959 else
3960 #endif
3961 /* Not UTF mode */
3962 {
3963 for (i = min; i < max; i++)
3964 {
3965 if (eptr >= md->end_subject)
3966 {
3967 SCHECK_PARTIAL();
3968 break;
3969 }
3970 if (fc == *eptr) break;
3971 eptr++;
3972 }
3973 if (possessive) continue;
3974 while (eptr >= pp)
3975 {
3976 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3978 eptr--;
3979 }
3980 }
3981
3982 RRETURN(MATCH_NOMATCH);
3983 }
3984 }
3985 /* Control never gets here */
3986
3987 /* Match a single character type repeatedly; several different opcodes
3988 share code. This is very similar to the code for single characters, but we
3989 repeat it in the interests of efficiency. */
3990
3991 case OP_TYPEEXACT:
3992 min = max = GET2(ecode, 1);
3993 minimize = TRUE;
3994 ecode += 1 + IMM2_SIZE;
3995 goto REPEATTYPE;
3996
3997 case OP_TYPEUPTO:
3998 case OP_TYPEMINUPTO:
3999 min = 0;
4000 max = GET2(ecode, 1);
4001 minimize = *ecode == OP_TYPEMINUPTO;
4002 ecode += 1 + IMM2_SIZE;
4003 goto REPEATTYPE;
4004
4005 case OP_TYPEPOSSTAR:
4006 possessive = TRUE;
4007 min = 0;
4008 max = INT_MAX;
4009 ecode++;
4010 goto REPEATTYPE;
4011
4012 case OP_TYPEPOSPLUS:
4013 possessive = TRUE;
4014 min = 1;
4015 max = INT_MAX;
4016 ecode++;
4017 goto REPEATTYPE;
4018
4019 case OP_TYPEPOSQUERY:
4020 possessive = TRUE;
4021 min = 0;
4022 max = 1;
4023 ecode++;
4024 goto REPEATTYPE;
4025
4026 case OP_TYPEPOSUPTO:
4027 possessive = TRUE;
4028 min = 0;
4029 max = GET2(ecode, 1);
4030 ecode += 1 + IMM2_SIZE;
4031 goto REPEATTYPE;
4032
4033 case OP_TYPESTAR:
4034 case OP_TYPEMINSTAR:
4035 case OP_TYPEPLUS:
4036 case OP_TYPEMINPLUS:
4037 case OP_TYPEQUERY:
4038 case OP_TYPEMINQUERY:
4039 c = *ecode++ - OP_TYPESTAR;
4040 minimize = (c & 1) != 0;
4041 min = rep_min[c]; /* Pick up values from tables; */
4042 max = rep_max[c]; /* zero for max => infinity */
4043 if (max == 0) max = INT_MAX;
4044
4045 /* Common code for all repeated single character type matches. Note that
4046 in UTF-8 mode, '.' matches a character of any length, but for the other
4047 character types, the valid characters are all one-byte long. */
4048
4049 REPEATTYPE:
4050 ctype = *ecode++; /* Code for the character type */
4051
4052 #ifdef SUPPORT_UCP
4053 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4054 {
4055 prop_fail_result = ctype == OP_NOTPROP;
4056 prop_type = *ecode++;
4057 prop_value = *ecode++;
4058 }
4059 else prop_type = -1;
4060 #endif
4061
4062 /* First, ensure the minimum number of matches are present. Use inline
4063 code for maximizing the speed, and do the type test once at the start
4064 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4065 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4066 and single-bytes. */
4067
4068 if (min > 0)
4069 {
4070 #ifdef SUPPORT_UCP
4071 if (prop_type >= 0)
4072 {
4073 switch(prop_type)
4074 {
4075 case PT_ANY:
4076 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4077 for (i = 1; i <= min; i++)
4078 {
4079 if (eptr >= md->end_subject)
4080 {
4081 SCHECK_PARTIAL();
4082 RRETURN(MATCH_NOMATCH);
4083 }
4084 GETCHARINCTEST(c, eptr);
4085 }
4086 break;
4087
4088 case PT_LAMP:
4089 for (i = 1; i <= min; i++)
4090 {
4091 int chartype;
4092 if (eptr >= md->end_subject)
4093 {
4094 SCHECK_PARTIAL();
4095 RRETURN(MATCH_NOMATCH);
4096 }
4097 GETCHARINCTEST(c, eptr);
4098 chartype = UCD_CHARTYPE(c);
4099 if ((chartype == ucp_Lu ||
4100 chartype == ucp_Ll ||
4101 chartype == ucp_Lt) == prop_fail_result)
4102 RRETURN(MATCH_NOMATCH);
4103 }
4104 break;
4105
4106 case PT_GC:
4107 for (i = 1; i <= min; i++)
4108 {
4109 if (eptr >= md->end_subject)
4110 {
4111 SCHECK_PARTIAL();
4112 RRETURN(MATCH_NOMATCH);
4113 }
4114 GETCHARINCTEST(c, eptr);
4115 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4116 RRETURN(MATCH_NOMATCH);
4117 }
4118 break;
4119
4120 case PT_PC:
4121 for (i = 1; i <= min; i++)
4122 {
4123 if (eptr >= md->end_subject)
4124 {
4125 SCHECK_PARTIAL();
4126 RRETURN(MATCH_NOMATCH);
4127 }
4128 GETCHARINCTEST(c, eptr);
4129 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4130 RRETURN(MATCH_NOMATCH);
4131 }
4132 break;
4133
4134 case PT_SC:
4135 for (i = 1; i <= min; i++)
4136 {
4137 if (eptr >= md->end_subject)
4138 {
4139 SCHECK_PARTIAL();
4140 RRETURN(MATCH_NOMATCH);
4141 }
4142 GETCHARINCTEST(c, eptr);
4143 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4144 RRETURN(MATCH_NOMATCH);
4145 }
4146 break;
4147
4148 case PT_ALNUM:
4149 for (i = 1; i <= min; i++)
4150 {
4151 int category;
4152 if (eptr >= md->end_subject)
4153 {
4154 SCHECK_PARTIAL();
4155 RRETURN(MATCH_NOMATCH);
4156 }
4157 GETCHARINCTEST(c, eptr);
4158 category = UCD_CATEGORY(c);
4159 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4160 RRETURN(MATCH_NOMATCH);
4161 }
4162 break;
4163
4164 case PT_SPACE: /* Perl space */
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 RRETURN(MATCH_NOMATCH);
4171 }
4172 GETCHARINCTEST(c, eptr);
4173 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4174 c == CHAR_FF || c == CHAR_CR)
4175 == prop_fail_result)
4176 RRETURN(MATCH_NOMATCH);
4177 }
4178 break;
4179
4180 case PT_PXSPACE: /* POSIX space */
4181 for (i = 1; i <= min; i++)
4182 {
4183 if (eptr >= md->end_subject)
4184 {
4185 SCHECK_PARTIAL();
4186 RRETURN(MATCH_NOMATCH);
4187 }
4188 GETCHARINCTEST(c, eptr);
4189 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4190 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4191 == prop_fail_result)
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 break;
4195
4196 case PT_WORD:
4197 for (i = 1; i <= min; i++)
4198 {
4199 int category;
4200 if (eptr >= md->end_subject)
4201 {
4202 SCHECK_PARTIAL();
4203 RRETURN(MATCH_NOMATCH);
4204 }
4205 GETCHARINCTEST(c, eptr);
4206 category = UCD_CATEGORY(c);
4207 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4208 == prop_fail_result)
4209 RRETURN(MATCH_NOMATCH);
4210 }
4211 break;
4212
4213 /* This should not occur */
4214
4215 default:
4216 RRETURN(PCRE_ERROR_INTERNAL);
4217 }
4218 }
4219
4220 /* Match extended Unicode sequences. We will get here only if the
4221 support is in the binary; otherwise a compile-time error occurs. */
4222
4223 else if (ctype == OP_EXTUNI)
4224 {
4225 for (i = 1; i <= min; i++)
4226 {
4227 if (eptr >= md->end_subject)
4228 {
4229 SCHECK_PARTIAL();
4230 RRETURN(MATCH_NOMATCH);
4231 }
4232 GETCHARINCTEST(c, eptr);
4233 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4234 while (eptr < md->end_subject)
4235 {
4236 int len = 1;
4237 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4238 if (UCD_CATEGORY(c) != ucp_M) break;
4239 eptr += len;
4240 }
4241 CHECK_PARTIAL();
4242 }
4243 }
4244
4245 else
4246 #endif /* SUPPORT_UCP */
4247
4248 /* Handle all other cases when the coding is UTF-8 */
4249
4250 #ifdef SUPPORT_UTF
4251 if (utf) switch(ctype)
4252 {
4253 case OP_ANY:
4254 for (i = 1; i <= min; i++)
4255 {
4256 if (eptr >= md->end_subject)
4257 {
4258 SCHECK_PARTIAL();
4259 RRETURN(MATCH_NOMATCH);
4260 }
4261 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4262 if (md->partial != 0 &&
4263 eptr + 1 >= md->end_subject &&
4264 NLBLOCK->nltype == NLTYPE_FIXED &&
4265 NLBLOCK->nllen == 2 &&
4266 *eptr == NLBLOCK->nl[0])
4267 {
4268 md->hitend = TRUE;
4269 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4270 }
4271 eptr++;
4272 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4273 }
4274 break;
4275
4276 case OP_ALLANY:
4277 for (i = 1; i <= min; i++)
4278 {
4279 if (eptr >= md->end_subject)
4280 {
4281 SCHECK_PARTIAL();
4282 RRETURN(MATCH_NOMATCH);
4283 }
4284 eptr++;
4285 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4286 }
4287 break;
4288
4289 case OP_ANYBYTE:
4290 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4291 eptr += min;
4292 break;
4293
4294 case OP_ANYNL:
4295 for (i = 1; i <= min; i++)
4296 {
4297 if (eptr >= md->end_subject)
4298 {
4299 SCHECK_PARTIAL();
4300 RRETURN(MATCH_NOMATCH);
4301 }
4302 GETCHARINC(c, eptr);
4303 switch(c)
4304 {
4305 default: RRETURN(MATCH_NOMATCH);
4306
4307 case 0x000d:
4308 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4309 break;
4310
4311 case 0x000a:
4312 break;
4313
4314 case 0x000b:
4315 case 0x000c:
4316 case 0x0085:
4317 case 0x2028:
4318 case 0x2029:
4319 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4320 break;
4321 }
4322 }
4323 break;
4324
4325 case OP_NOT_HSPACE:
4326 for (i = 1; i <= min; i++)
4327 {
4328 if (eptr >= md->end_subject)
4329 {
4330 SCHECK_PARTIAL();
4331 RRETURN(MATCH_NOMATCH);
4332 }
4333 GETCHARINC(c, eptr);
4334 switch(c)
4335 {
4336 default: break;
4337 case 0x09: /* HT */
4338 case 0x20: /* SPACE */
4339 case 0xa0: /* NBSP */
4340 case 0x1680: /* OGHAM SPACE MARK */
4341 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4342 case 0x2000: /* EN QUAD */
4343 case 0x2001: /* EM QUAD */
4344 case 0x2002: /* EN SPACE */
4345 case 0x2003: /* EM SPACE */
4346 case 0x2004: /* THREE-PER-EM SPACE */
4347 case 0x2005: /* FOUR-PER-EM SPACE */
4348 case 0x2006: /* SIX-PER-EM SPACE */
4349 case 0x2007: /* FIGURE SPACE */
4350 case 0x2008: /* PUNCTUATION SPACE */
4351 case 0x2009: /* THIN SPACE */
4352 case 0x200A: /* HAIR SPACE */
4353 case 0x202f: /* NARROW NO-BREAK SPACE */
4354 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4355 case 0x3000: /* IDEOGRAPHIC SPACE */
4356 RRETURN(MATCH_NOMATCH);
4357 }
4358 }
4359 break;
4360
4361 case OP_HSPACE:
4362 for (i = 1; i <= min; i++)
4363 {
4364 if (eptr >= md->end_subject)
4365 {
4366 SCHECK_PARTIAL();
4367 RRETURN(MATCH_NOMATCH);
4368 }
4369 GETCHARINC(c, eptr);
4370 switch(c)
4371 {
4372 default: RRETURN(MATCH_NOMATCH);
4373 case 0x09: /* HT */
4374 case 0x20: /* SPACE */
4375 case 0xa0: /* NBSP */
4376 case 0x1680: /* OGHAM SPACE MARK */
4377 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4378 case 0x2000: /* EN QUAD */
4379 case 0x2001: /* EM QUAD */
4380 case 0x2002: /* EN SPACE */
4381 case 0x2003: /* EM SPACE */
4382 case 0x2004: /* THREE-PER-EM SPACE */
4383 case 0x2005: /* FOUR-PER-EM SPACE */
4384 case 0x2006: /* SIX-PER-EM SPACE */
4385 case 0x2007: /* FIGURE SPACE */
4386 case 0x2008: /* PUNCTUATION SPACE */
4387 case 0x2009: /* THIN SPACE */
4388 case 0x200A: /* HAIR SPACE */
4389 case 0x202f: /* NARROW NO-BREAK SPACE */
4390 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4391 case 0x3000: /* IDEOGRAPHIC SPACE */
4392 break;
4393 }
4394 }
4395 break;
4396
4397 case OP_NOT_VSPACE:
4398 for (i = 1; i <= min; i++)
4399 {
4400 if (eptr >= md->end_subject)
4401 {
4402 SCHECK_PARTIAL();
4403 RRETURN(MATCH_NOMATCH);
4404 }
4405 GETCHARINC(c, eptr);
4406 switch(c)
4407 {
4408 default: break;
4409 case 0x0a: /* LF */
4410 case 0x0b: /* VT */
4411 case 0x0c: /* FF */
4412 case 0x0d: /* CR */
4413 case 0x85: /* NEL */
4414 case 0x2028: /* LINE SEPARATOR */
4415 case 0x2029: /* PARAGRAPH SEPARATOR */
4416 RRETURN(MATCH_NOMATCH);
4417 }
4418 }
4419 break;
4420
4421 case OP_VSPACE:
4422 for (i = 1; i <= min; i++)
4423 {
4424 if (eptr >= md->end_subject)
4425 {
4426 SCHECK_PARTIAL();
4427 RRETURN(MATCH_NOMATCH);
4428 }
4429 GETCHARINC(c, eptr);
4430 switch(c)
4431 {
4432 default: RRETURN(MATCH_NOMATCH);
4433 case 0x0a: /* LF */
4434 case 0x0b: /* VT */
4435 case 0x0c: /* FF */
4436 case 0x0d: /* CR */
4437 case 0x85: /* NEL */
4438 case 0x2028: /* LINE SEPARATOR */
4439 case 0x2029: /* PARAGRAPH SEPARATOR */
4440 break;
4441 }
4442 }
4443 break;
4444
4445 case OP_NOT_DIGIT:
4446 for (i = 1; i <= min; i++)
4447 {
4448 if (eptr >= md->end_subject)
4449 {
4450 SCHECK_PARTIAL();
4451 RRETURN(MATCH_NOMATCH);
4452 }
4453 GETCHARINC(c, eptr);
4454 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4455 RRETURN(MATCH_NOMATCH);
4456 }
4457 break;
4458
4459 case OP_DIGIT:
4460 for (i = 1; i <= min; i++)
4461 {
4462 if (eptr >= md->end_subject)
4463 {
4464 SCHECK_PARTIAL();
4465 RRETURN(MATCH_NOMATCH);
4466 }
4467 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4468 RRETURN(MATCH_NOMATCH);
4469 eptr++;
4470 /* No need to skip more bytes - we know it's a 1-byte character */
4471 }
4472 break;
4473
4474 case OP_NOT_WHITESPACE:
4475 for (i = 1; i <= min; i++)
4476 {
4477 if (eptr >= md->end_subject)
4478 {
4479 SCHECK_PARTIAL();
4480 RRETURN(MATCH_NOMATCH);
4481 }
4482 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4483 RRETURN(MATCH_NOMATCH);
4484 eptr++;
4485 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4486 }
4487 break;
4488
4489 case OP_WHITESPACE:
4490 for (i = 1; i <= min; i++)
4491 {
4492 if (eptr >= md->end_subject)
4493 {
4494 SCHECK_PARTIAL();
4495 RRETURN(MATCH_NOMATCH);
4496 }
4497 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4498 RRETURN(MATCH_NOMATCH);
4499 eptr++;
4500 /* No need to skip more bytes - we know it's a 1-byte character */
4501 }
4502 break;
4503
4504 case OP_NOT_WORDCHAR:
4505 for (i = 1; i <= min; i++)
4506 {
4507 if (eptr >= md->end_subject)
4508 {
4509 SCHECK_PARTIAL();
4510 RRETURN(MATCH_NOMATCH);
4511 }
4512 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4513 RRETURN(MATCH_NOMATCH);
4514 eptr++;
4515 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4516 }
4517 break;
4518
4519 case OP_WORDCHAR:
4520 for (i = 1; i <= min; i++)
4521 {
4522 if (eptr >= md->end_subject)
4523 {
4524 SCHECK_PARTIAL();
4525 RRETURN(MATCH_NOMATCH);
4526 }
4527 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4528 RRETURN(MATCH_NOMATCH);
4529 eptr++;
4530 /* No need to skip more bytes - we know it's a 1-byte character */
4531 }
4532 break;
4533
4534 default:
4535 RRETURN(PCRE_ERROR_INTERNAL);
4536 } /* End switch(ctype) */
4537
4538 else
4539 #endif /* SUPPORT_UTF */
4540
4541 /* Code for the non-UTF case (8-bit or 16-bit library) for minimum matching
4542 of operators other than OP_PROP and OP_NOTPROP. */
4543
4544 switch(ctype)
4545 {
4546 case OP_ANY:
4547 for (i = 1; i <= min; i++)
4548 {
4549 if (eptr >= md->end_subject)
4550 {
4551 SCHECK_PARTIAL();
4552 RRETURN(MATCH_NOMATCH);
4553 }
4554 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4555 if (md->partial != 0 &&
4556 eptr + 1 >= md->end_subject &&
4557 NLBLOCK->nltype == NLTYPE_FIXED &&
4558 NLBLOCK->nllen == 2 &&
4559 *eptr == NLBLOCK->nl[0])
4560 {
4561 md->hitend = TRUE;
4562 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4563 }
4564 eptr++;
4565 }
4566 break;
4567
4568 case OP_ALLANY:
4569 if (eptr > md->end_subject - min)
4570 {
4571 SCHECK_PARTIAL();
4572 RRETURN(MATCH_NOMATCH);
4573 }
4574 eptr += min;
4575 break;
4576
4577 case OP_ANYBYTE:
4578 if (eptr > md->end_subject - min)
4579 {
4580 SCHECK_PARTIAL();
4581 RRETURN(MATCH_NOMATCH);
4582 }
4583 eptr += min;
4584 break;
4585
4586 case OP_ANYNL:
4587 for (i = 1; i <= min; i++)
4588 {
4589 if (eptr >= md->end_subject)
4590 {
4591 SCHECK_PARTIAL();
4592 RRETURN(MATCH_NOMATCH);
4593 }
4594 switch(*eptr++)
4595 {
4596 default: RRETURN(MATCH_NOMATCH);
4597
4598 case 0x000d:
4599 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4600 break;
4601
4602 case 0x000a:
4603 break;
4604
4605 case 0x000b:
4606 case 0x000c:
4607 case 0x0085:
4608 #ifdef COMPILE_PCRE16
4609 case 0x2028:
4610 case 0x2029:
4611 #endif
4612 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4613 break;
4614 }
4615 }
4616 break;
4617
4618 case OP_NOT_HSPACE:
4619 for (i = 1; i <= min; i++)
4620 {
4621 if (eptr >= md->end_subject)
4622 {
4623 SCHECK_PARTIAL();
4624 RRETURN(MATCH_NOMATCH);
4625 }
4626 switch(*eptr++)
4627 {
4628 default: break;
4629 case 0x09: /* HT */
4630 case 0x20: /* SPACE */
4631 case 0xa0: /* NBSP */
4632 #ifdef COMPILE_PCRE16
4633 case 0x1680: /* OGHAM SPACE MARK */
4634 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4635 case 0x2000: /* EN QUAD */
4636 case 0x2001: /* EM QUAD */
4637 case 0x2002: /* EN SPACE */
4638 case 0x2003: /* EM SPACE */
4639 case 0x2004: /* THREE-PER-EM SPACE */
4640 case 0x2005: /* FOUR-PER-EM SPACE */
4641 case 0x2006: /* SIX-PER-EM SPACE */
4642 case 0x2007: /* FIGURE SPACE */
4643 case 0x2008: /* PUNCTUATION SPACE */
4644 case 0x2009: /* THIN SPACE */
4645 case 0x200A: /* HAIR SPACE */
4646 case 0x202f: /* NARROW NO-BREAK SPACE */
4647 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4648 case 0x3000: /* IDEOGRAPHIC SPACE */
4649 #endif
4650 RRETURN(MATCH_NOMATCH);
4651 }
4652 }
4653 break;
4654
4655 case OP_HSPACE:
4656 for (i = 1; i <= min; i++)
4657 {
4658 if (eptr >= md->end_subject)
4659 {
4660 SCHECK_PARTIAL();
4661 RRETURN(MATCH_NOMATCH);
4662 }
4663 switch(*eptr++)
4664 {
4665 default: RRETURN(MATCH_NOMATCH);
4666 case 0x09: /* HT */
4667 case 0x20: /* SPACE */
4668 case 0xa0: /* NBSP */
4669 #ifdef COMPILE_PCRE16
4670 case 0x1680: /* OGHAM SPACE MARK */
4671 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4672 case 0x2000: /* EN QUAD */
4673 case 0x2001: /* EM QUAD */
4674 case 0x2002: /* EN SPACE */
4675 case 0x2003: /* EM SPACE */
4676 case 0x2004: /* THREE-PER-EM SPACE */
4677 case 0x2005: /* FOUR-PER-EM SPACE */
4678 case 0x2006: /* SIX-PER-EM SPACE */
4679 case 0x2007: /* FIGURE SPACE */
4680 case 0x2008: /* PUNCTUATION SPACE */
4681 case 0x2009: /* THIN SPACE */
4682 case 0x200A: /* HAIR SPACE */
4683 case 0x202f: /* NARROW NO-BREAK SPACE */
4684 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4685 case 0x3000: /* IDEOGRAPHIC SPACE */
4686 #endif
4687 break;
4688 }
4689 }
4690 break;
4691
4692 case OP_NOT_VSPACE:
4693 for (i = 1; i <= min; i++)
4694 {
4695 if (eptr >= md->end_subject)
4696 {
4697 SCHECK_PARTIAL();
4698 RRETURN(MATCH_NOMATCH);
4699 }
4700 switch(*eptr++)
4701 {
4702 default: break;
4703 case 0x0a: /* LF */
4704 case 0x0b: /* VT */
4705 case 0x0c: /* FF */
4706 case 0x0d: /* CR */
4707 case 0x85: /* NEL */
4708 #ifdef COMPILE_PCRE16
4709 case 0x2028: /* LINE SEPARATOR */
4710 case 0x2029: /* PARAGRAPH SEPARATOR */
4711 #endif
4712 RRETURN(MATCH_NOMATCH);
4713 }
4714 }
4715 break;
4716
4717 case OP_VSPACE:
4718 for (i = 1; i <= min; i++)
4719 {
4720 if (eptr >= md->end_subject)
4721 {
4722 SCHECK_PARTIAL();
4723 RRETURN(MATCH_NOMATCH);
4724 }
4725 switch(*eptr++)
4726 {
4727 default: RRETURN(MATCH_NOMATCH);
4728 case 0x0a: /* LF */
4729 case 0x0b: /* VT */
4730 case 0x0c: /* FF */
4731 case 0x0d: /* CR */
4732 case 0x85: /* NEL */
4733 #ifdef COMPILE_PCRE16
4734 case 0x2028: /* LINE SEPARATOR */
4735 case 0x2029: /* PARAGRAPH SEPARATOR */
4736 #endif
4737 break;
4738 }
4739 }
4740 break;
4741
4742 case OP_NOT_DIGIT:
4743 for (i = 1; i <= min; i++)
4744 {
4745 if (eptr >= md->end_subject)
4746 {
4747 SCHECK_PARTIAL();
4748 RRETURN(MATCH_NOMATCH);
4749 }
4750 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4751 RRETURN(MATCH_NOMATCH);
4752 eptr++;
4753 }
4754 break;
4755
4756 case OP_DIGIT:
4757 for (i = 1; i <= min; i++)
4758 {
4759 if (eptr >= md->end_subject)
4760 {
4761 SCHECK_PARTIAL();
4762 RRETURN(MATCH_NOMATCH);
4763 }
4764 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4765 RRETURN(MATCH_NOMATCH);
4766 eptr++;
4767 }
4768 break;
4769
4770 case OP_NOT_WHITESPACE:
4771 for (i = 1; i <= min; i++)
4772 {
4773 if (eptr >= md->end_subject)
4774 {
4775 SCHECK_PARTIAL();
4776 RRETURN(MATCH_NOMATCH);
4777 }
4778 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4779 RRETURN(MATCH_NOMATCH);
4780 eptr++;
4781 }
4782 break;
4783
4784 case OP_WHITESPACE:
4785 for (i = 1; i <= min; i++)
4786 {
4787 if (eptr >= md->end_subject)
4788 {
4789 SCHECK_PARTIAL();
4790 RRETURN(MATCH_NOMATCH);
4791 }
4792 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4793 RRETURN(MATCH_NOMATCH);
4794 eptr++;
4795 }
4796 break;
4797
4798 case OP_NOT_WORDCHAR:
4799 for (i = 1; i <= min; i++)
4800 {
4801 if (eptr >= md->end_subject)
4802 {
4803 SCHECK_PARTIAL();
4804 RRETURN(MATCH_NOMATCH);
4805 }
4806 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4807 RRETURN(MATCH_NOMATCH);
4808 eptr++;
4809 }
4810 break;
4811
4812 case OP_WORDCHAR:
4813 for (i = 1; i <= min; i++)
4814 {
4815 if (eptr >= md->end_subject)
4816 {
4817 SCHECK_PARTIAL();
4818 RRETURN(MATCH_NOMATCH);
4819 }
4820 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4821 RRETURN(MATCH_NOMATCH);
4822 eptr++;
4823 }
4824 break;
4825
4826 default:
4827 RRETURN(PCRE_ERROR_INTERNAL);
4828 }
4829 }
4830
4831 /* If min = max, continue at the same level without recursing */
4832
4833 if (min == max) continue;
4834
4835 /* If minimizing, we have to test the rest of the pattern before each
4836 subsequent match. Again, separate the UTF case for speed, and also
4837 separate the UCP cases. */
4838
4839 if (minimize)
4840 {
4841 #ifdef SUPPORT_UCP
4842 if (prop_type >= 0)
4843 {
4844 switch(prop_type)
4845 {
4846 case PT_ANY:
4847 for (fi = min;; fi++)
4848 {
4849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4851 if (fi >= max) RRETURN(MATCH_NOMATCH);
4852 if (eptr >= md->end_subject)
4853 {
4854 SCHECK_PARTIAL();
4855 RRETURN(MATCH_NOMATCH);
4856 }
4857 GETCHARINCTEST(c, eptr);
4858 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4859 }
4860 /* Control never gets here */
4861
4862 case PT_LAMP:
4863 for (fi = min;; fi++)
4864 {
4865 int chartype;
4866 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4868 if (fi >= max) RRETURN(MATCH_NOMATCH);
4869 if (eptr >= md->end_subject)
4870 {
4871 SCHECK_PARTIAL();
4872 RRETURN(MATCH_NOMATCH);
4873 }
4874 GETCHARINCTEST(c, eptr);
4875 chartype = UCD_CHARTYPE(c);
4876 if ((chartype == ucp_Lu ||
4877 chartype == ucp_Ll ||
4878 chartype == ucp_Lt) == prop_fail_result)
4879 RRETURN(MATCH_NOMATCH);
4880 }
4881 /* Control never gets here */
4882
4883 case PT_GC:
4884 for (fi = min;; fi++)
4885 {
4886 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4887 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4888 if (fi >= max) RRETURN(MATCH_NOMATCH);
4889 if (eptr >= md->end_subject)
4890 {
4891 SCHECK_PARTIAL();
4892 RRETURN(MATCH_NOMATCH);
4893 }
4894 GETCHARINCTEST(c, eptr);
4895 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4896 RRETURN(MATCH_NOMATCH);
4897 }
4898 /* Control never gets here */
4899
4900 case PT_PC:
4901 for (fi = min;; fi++)
4902 {
4903 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4905 if (fi >= max) RRETURN(MATCH_NOMATCH);
4906 if (eptr >= md->end_subject)
4907 {
4908 SCHECK_PARTIAL();
4909 RRETURN(MATCH_NOMATCH);
4910 }
4911 GETCHARINCTEST(c, eptr);
4912 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4913 RRETURN(MATCH_NOMATCH);
4914 }
4915 /* Control never gets here */
4916
4917 case PT_SC:
4918 for (fi = min;; fi++)
4919 {
4920 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4922 if (fi >= max) RRETURN(MATCH_NOMATCH);
4923 if (eptr >= md->end_subject)
4924 {
4925 SCHECK_PARTIAL();
4926 RRETURN(MATCH_NOMATCH);
4927 }
4928 GETCHARINCTEST(c, eptr);
4929 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4930 RRETURN(MATCH_NOMATCH);
4931 }
4932 /* Control never gets here */
4933
4934 case PT_ALNUM:
4935 for (fi = min;; fi++)
4936 {
4937 int category;
4938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4940 if (fi >= max) RRETURN(MATCH_NOMATCH);
4941 if (eptr >= md->end_subject)
4942 {
4943 SCHECK_PARTIAL();
4944 RRETURN(MATCH_NOMATCH);
4945 }
4946 GETCHARINCTEST(c, eptr);
4947 category = UCD_CATEGORY(c);
4948 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4949 RRETURN(MATCH_NOMATCH);
4950 }
4951 /* Control never gets here */
4952
4953 case PT_SPACE: /* Perl space */
4954 for (fi = min;; fi++)
4955 {
4956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4958 if (fi >= max) RRETURN(MATCH_NOMATCH);
4959 if (eptr >= md->end_subject)
4960 {
4961 SCHECK_PARTIAL();
4962 RRETURN(MATCH_NOMATCH);
4963 }
4964 GETCHARINCTEST(c, eptr);
4965 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4966 c == CHAR_FF || c == CHAR_CR)
4967 == prop_fail_result)
4968 RRETURN(MATCH_NOMATCH);
4969 }
4970 /* Control never gets here */
4971
4972 case PT_PXSPACE: /* POSIX space */
4973 for (fi = min;; fi++)
4974 {
4975 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4976 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4977 if (fi >= max) RRETURN(MATCH_NOMATCH);
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 RRETURN(MATCH_NOMATCH);
4982 }
4983 GETCHARINCTEST(c, eptr);
4984 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4985 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4986 == prop_fail_result)
4987 RRETURN(MATCH_NOMATCH);
4988 }
4989 /* Control never gets here */
4990
4991 case PT_WORD:
4992 for (fi = min;; fi++)
4993 {
4994 int category;
4995 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4997 if (fi >= max) RRETURN(MATCH_NOMATCH);
4998 if (eptr >= md->end_subject)
4999 {
5000 SCHECK_PARTIAL();
5001 RRETURN(MATCH_NOMATCH);
5002 }
5003 GETCHARINCTEST(c, eptr);
5004 category = UCD_CATEGORY(c);
5005 if ((category == ucp_L ||
5006 category == ucp_N ||
5007 c == CHAR_UNDERSCORE)
5008 == prop_fail_result)
5009 RRETURN(MATCH_NOMATCH);
5010 }
5011 /* Control never gets here */
5012
5013 /* This should never occur */
5014
5015 default:
5016 RRETURN(PCRE_ERROR_INTERNAL);
5017 }
5018 }
5019
5020 /* Match extended Unicode sequences. We will get here only if the
5021 support is in the binary; otherwise a compile-time error occurs. */
5022
5023 else if (ctype == OP_EXTUNI)
5024 {
5025 for (fi = min;; fi++)
5026 {
5027 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5029 if (fi >= max) RRETURN(MATCH_NOMATCH);
5030 if (eptr >= md->end_subject)
5031 {
5032 SCHECK_PARTIAL();
5033 RRETURN(MATCH_NOMATCH);
5034 }
5035 GETCHARINCTEST(c, eptr);
5036 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
5037 while (eptr < md->end_subject)
5038 {
5039 int len = 1;
5040 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5041 if (UCD_CATEGORY(c) != ucp_M) break;
5042 eptr += len;
5043 }
5044 CHECK_PARTIAL();
5045 }
5046 }
5047 else
5048 #endif /* SUPPORT_UCP */
5049
5050 #ifdef SUPPORT_UTF
5051 if (utf)
5052 {
5053 for (fi = min;; fi++)
5054 {
5055 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5057 if (fi >= max) RRETURN(MATCH_NOMATCH);
5058 if (eptr >= md->end_subject)
5059 {
5060 SCHECK_PARTIAL();
5061 RRETURN(MATCH_NOMATCH);
5062 }
5063 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5064 RRETURN(MATCH_NOMATCH);
5065 GETCHARINC(c, eptr);
5066 switch(ctype)
5067 {
5068 case OP_ANY: /* This is the non-NL case */
5069 if (md->partial != 0 && /* Take care with CRLF partial */
5070 eptr >= md->end_subject &&
5071 NLBLOCK->nltype == NLTYPE_FIXED &&
5072 NLBLOCK->nllen == 2 &&
5073 c == NLBLOCK->nl[0])
5074 {
5075 md->hitend = TRUE;
5076 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5077 }
5078 break;
5079
5080 case OP_ALLANY:
5081 case OP_ANYBYTE:
5082 break;
5083
5084 case OP_ANYNL:
5085 switch(c)
5086 {
5087 default: RRETURN(MATCH_NOMATCH);
5088 case 0x000d:
5089 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5090 break;
5091 case 0x000a:
5092 break;
5093
5094 case 0x000b:
5095 case 0x000c:
5096 case 0x0085:
5097 case 0x2028:
5098 case 0x2029:
5099 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5100 break;
5101 }
5102 break;
5103
5104 case OP_NOT_HSPACE:
5105 switch(c)
5106 {
5107 default: break;
5108 case 0x09: /* HT */
5109 case 0x20: /* SPACE */
5110 case 0xa0: /* NBSP */
5111 case 0x1680: /* OGHAM SPACE MARK */
5112 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5113 case 0x2000: /* EN QUAD */
5114 case 0x2001: /* EM QUAD */
5115 case 0x2002: /* EN SPACE */
5116 case 0x2003: /* EM SPACE */
5117 case 0x2004: /* THREE-PER-EM SPACE */
5118 case 0x2005: /* FOUR-PER-EM SPACE */
5119 case 0x2006: /* SIX-PER-EM SPACE */
5120 case 0x2007: /* FIGURE SPACE */
5121 case 0x2008: /* PUNCTUATION SPACE */
5122 case 0x2009: /* THIN SPACE */
5123 case 0x200A: /* HAIR SPACE */
5124 case 0x202f: /* NARROW NO-BREAK SPACE */
5125 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5126 case 0x3000: /* IDEOGRAPHIC SPACE */
5127 RRETURN(MATCH_NOMATCH);
5128 }
5129 break;
5130
5131 case OP_HSPACE:
5132 switch(c)
5133 {
5134 default: RRETURN(MATCH_NOMATCH);
5135 case 0x09: /* HT */
5136 case 0x20: /* SPACE */
5137 case 0xa0: /* NBSP */
5138 case 0x1680: /* OGHAM SPACE MARK */
5139 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5140 case 0x2000: /* EN QUAD */
5141 case 0x2001: /* EM QUAD */
5142 case 0x2002: /* EN SPACE */
5143 case 0x2003: /* EM SPACE */
5144 case 0x2004: /* THREE-PER-EM SPACE */
5145 case 0x2005: /* FOUR-PER-EM SPACE */
5146 case 0x2006: /* SIX-PER-EM SPACE */
5147 case 0x2007: /* FIGURE SPACE */
5148 case 0x2008: /* PUNCTUATION SPACE */
5149 case 0x2009: /* THIN SPACE */
5150 case 0x200A: /* HAIR SPACE */
5151 case 0x202f: /* NARROW NO-BREAK SPACE */
5152 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5153 case 0x3000: /* IDEOGRAPHIC SPACE */
5154 break;
5155 }
5156 break;
5157
5158 case OP_NOT_VSPACE:
5159 switch(c)
5160 {
5161 default: break;
5162 case 0x0a: /* LF */
5163 case 0x0b: /* VT */
5164 case 0x0c: /* FF */
5165 case 0x0d: /* CR */
5166 case 0x85: /* NEL */
5167 case 0x2028: /* LINE SEPARATOR */
5168 case 0x2029: /* PARAGRAPH SEPARATOR */
5169 RRETURN(MATCH_NOMATCH);
5170 }
5171 break;
5172
5173 case OP_VSPACE:
5174 switch(c)
5175 {
5176 default: RRETURN(MATCH_NOMATCH);
5177 case 0x0a: /* LF */
5178 case 0x0b: /* VT */
5179 case 0x0c: /* FF */
5180 case 0x0d: /* CR */
5181 case 0x85: /* NEL */
5182 case 0x2028: /* LINE SEPARATOR */
5183 case 0x2029: /* PARAGRAPH SEPARATOR */
5184 break;
5185 }
5186 break;
5187
5188 case OP_NOT_DIGIT:
5189 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5190 RRETURN(MATCH_NOMATCH);
5191 break;
5192
5193 case OP_DIGIT:
5194 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5195 RRETURN(MATCH_NOMATCH);
5196 break;
5197
5198 case OP_NOT_WHITESPACE:
5199 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5200 RRETURN(MATCH_NOMATCH);
5201 break;
5202
5203 case OP_WHITESPACE:
5204 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5205 RRETURN(MATCH_NOMATCH);
5206 break;
5207
5208 case OP_NOT_WORDCHAR:
5209 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5210 RRETURN(MATCH_NOMATCH);
5211 break;
5212
5213 case OP_WORDCHAR:
5214 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5215 RRETURN(MATCH_NOMATCH);
5216 break;
5217
5218 default:
5219 RRETURN(PCRE_ERROR_INTERNAL);
5220 }
5221 }
5222 }
5223 else
5224 #endif
5225 /* Not UTF mode */
5226 {
5227 for (fi = min;; fi++)
5228 {
5229 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5230 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5231 if (fi >= max) RRETURN(MATCH_NOMATCH);
5232 if (eptr >= md->end_subject)
5233 {
5234 SCHECK_PARTIAL();
5235 RRETURN(MATCH_NOMATCH);
5236 }
5237 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5238 RRETURN(MATCH_NOMATCH);
5239 c = *eptr++;
5240 switch(ctype)
5241 {
5242 case OP_ANY: /* This is the non-NL case */
5243 if (md->partial != 0 && /* Take care with CRLF partial */
5244 eptr >= md->end_subject &&
5245 NLBLOCK->nltype == NLTYPE_FIXED &&
5246 NLBLOCK->nllen == 2 &&
5247 c == NLBLOCK->nl[0])
5248 {
5249 md->hitend = TRUE;
5250 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5251 }
5252 break;
5253
5254 case OP_ALLANY:
5255 case OP_ANYBYTE:
5256 break;
5257
5258 case OP_ANYNL:
5259 switch(c)
5260 {
5261 default: RRETURN(MATCH_NOMATCH);
5262 case 0x000d:
5263 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5264 break;
5265
5266 case 0x000a:
5267 break;
5268
5269 case 0x000b:
5270 case 0x000c:
5271 case 0x0085:
5272 #ifdef COMPILE_PCRE16
5273 case 0x2028:
5274 case 0x2029:
5275 #endif
5276 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5277 break;
5278 }
5279 break;
5280
5281 case OP_NOT_HSPACE:
5282 switch(c)
5283 {
5284 default: break;
5285 case 0x09: /* HT */
5286 case 0x20: /* SPACE */
5287 case 0xa0: /* NBSP */
5288 #ifdef COMPILE_PCRE16
5289 case 0x1680: /* OGHAM SPACE MARK */
5290 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5291 case 0x2000: /* EN QUAD */
5292 case 0x2001: /* EM QUAD */
5293 case 0x2002: /* EN SPACE */
5294 case 0x2003: /* EM SPACE */
5295 case 0x2004: /* THREE-PER-EM SPACE */
5296 case 0x2005: /* FOUR-PER-EM SPACE */
5297 case 0x2006: /* SIX-PER-EM SPACE */
5298 case 0x2007: /* FIGURE SPACE */
5299 case 0x2008: /* PUNCTUATION SPACE */
5300 case 0x2009: /* THIN SPACE */
5301 case 0x200A: /* HAIR SPACE */
5302 case 0x202f: /* NARROW NO-BREAK SPACE */
5303 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5304 case 0x3000: /* IDEOGRAPHIC SPACE */
5305 #endif
5306 RRETURN(MATCH_NOMATCH);
5307 }
5308 break;
5309
5310 case OP_HSPACE:
5311 switch(c)
5312 {
5313 default: RRETURN(MATCH_NOMATCH);
5314 case 0x09: /* HT */
5315 case 0x20: /* SPACE */
5316 case 0xa0: /* NBSP */
5317 #ifdef COMPILE_PCRE16
5318 case 0x1680: /* OGHAM SPACE MARK */
5319 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5320 case 0x2000: /* EN QUAD */
5321 case 0x2001: /* EM QUAD */
5322 case 0x2002: /* EN SPACE */
5323 case 0x2003: /* EM SPACE */
5324 case 0x2004: /* THREE-PER-EM SPACE */
5325 case 0x2005: /* FOUR-PER-EM SPACE */
5326 case 0x2006: /* SIX-PER-EM SPACE */
5327 case 0x2007: /* FIGURE SPACE */
5328 case 0x2008: /* PUNCTUATION SPACE */
5329 case 0x2009: /* THIN SPACE */
5330 case 0x200A: /* HAIR SPACE */
5331 case 0x202f: /* NARROW NO-BREAK SPACE */
5332 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5333 case 0x3000: /* IDEOGRAPHIC SPACE */
5334 #endif
5335 break;
5336 }
5337 break;
5338
5339 case OP_NOT_VSPACE:
5340 switch(c)
5341 {
5342 default: break;
5343 case 0x0a: /* LF */
5344 case 0x0b: /* VT */
5345 case 0x0c: /* FF */
5346 case 0x0d: /* CR */
5347 case 0x85: /* NEL */
5348 #ifdef COMPILE_PCRE16
5349 case 0x2028: /* LINE SEPARATOR */
5350 case 0x2029: /* PARAGRAPH SEPARATOR */
5351 #endif
5352 RRETURN(MATCH_NOMATCH);
5353 }
5354 break;
5355
5356 case OP_VSPACE:
5357 switch(c)
5358 {
5359 default: RRETURN(MATCH_NOMATCH);
5360 case 0x0a: /* LF */
5361 case 0x0b: /* VT */
5362 case 0x0c: /* FF */
5363 case 0x0d: /* CR */
5364 case 0x85: /* NEL */
5365 #ifdef COMPILE_PCRE16
5366 case 0x2028: /* LINE SEPARATOR */
5367 case 0x2029: /* PARAGRAPH SEPARATOR */
5368 #endif
5369 break;
5370 }
5371 break;
5372
5373 case OP_NOT_DIGIT:
5374 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5375 break;
5376
5377 case OP_DIGIT:
5378 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5379 break;
5380
5381 case OP_NOT_WHITESPACE:
5382 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5383 break;
5384
5385 case OP_WHITESPACE:
5386 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5387 break;
5388
5389 case OP_NOT_WORDCHAR:
5390 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5391 break;
5392
5393 case OP_WORDCHAR:
5394 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5395 break;
5396
5397 default:
5398 RRETURN(PCRE_ERROR_INTERNAL);
5399 }
5400 }
5401 }
5402 /* Control never gets here */
5403 }
5404
5405 /* If maximizing, it is worth using inline code for speed, doing the type
5406 test once at the start (i.e. keep it out of the loop). Again, keep the
5407 UTF and UCP stuff separate. */
5408
5409 else
5410 {
5411 pp = eptr; /* Remember where we started */
5412
5413 #ifdef SUPPORT_UCP
5414 if (prop_type >= 0)
5415 {
5416 switch(prop_type)
5417 {
5418 case PT_ANY:
5419 for (i = min; i < max; i++)
5420 {
5421 int len = 1;
5422 if (eptr >= md->end_subject)
5423 {
5424 SCHECK_PARTIAL();
5425 break;
5426 }
5427 GETCHARLENTEST(c, eptr, len);
5428 if (prop_fail_result) break;
5429 eptr+= len;
5430 }
5431 break;
5432
5433 case PT_LAMP:
5434 for (i = min; i < max; i++)
5435 {
5436 int chartype;
5437 int len = 1;
5438 if (eptr >= md->end_subject)
5439 {
5440 SCHECK_PARTIAL();
5441 break;
5442 }
5443 GETCHARLENTEST(c, eptr, len);
5444 chartype = UCD_CHARTYPE(c);
5445 if ((chartype == ucp_Lu ||
5446 chartype == ucp_Ll ||
5447 chartype == ucp_Lt) == prop_fail_result)
5448 break;
5449 eptr+= len;
5450 }
5451 break;
5452
5453 case PT_GC:
5454 for (i = min; i < max; i++)
5455 {
5456 int len = 1;
5457 if (eptr >= md->end_subject)
5458 {
5459 SCHECK_PARTIAL();
5460 break;
5461 }
5462 GETCHARLENTEST(c, eptr, len);
5463 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5464 eptr+= len;
5465 }
5466 break;
5467
5468 case PT_PC:
5469 for (i = min; i < max; i++)
5470 {
5471 int len = 1;
5472 if (eptr >= md->end_subject)
5473 {
5474 SCHECK_PARTIAL();
5475 break;
5476 }
5477 GETCHARLENTEST(c, eptr, len);
5478 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5479 eptr+= len;
5480 }
5481 break;
5482
5483 case PT_SC:
5484 for (i = min; i < max; i++)
5485 {
5486 int len = 1;
5487 if (eptr >= md->end_subject)
5488 {
5489 SCHECK_PARTIAL();
5490 break;
5491 }
5492 GETCHARLENTEST(c, eptr, len);
5493 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5494 eptr+= len;
5495 }
5496 break;
5497
5498 case PT_ALNUM:
5499 for (i = min; i < max; i++)
5500 {
5501 int category;
5502 int len = 1;
5503 if (eptr >= md->end_subject)
5504 {
5505 SCHECK_PARTIAL();
5506 break;
5507 }
5508 GETCHARLENTEST(c, eptr, len);
5509 category = UCD_CATEGORY(c);
5510 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5511 break;
5512 eptr+= len;
5513 }
5514 break;
5515
5516 case PT_SPACE: /* Perl space */
5517 for (i = min; i < max; i++)
5518 {
5519 int len = 1;
5520 if (eptr >= md->end_subject)
5521 {
5522 SCHECK_PARTIAL();
5523 break;
5524 }
5525 GETCHARLENTEST(c, eptr, len);
5526 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5527 c == CHAR_FF || c == CHAR_CR)
5528 == prop_fail_result)
5529 break;
5530 eptr+= len;
5531 }
5532 break;
5533
5534 case PT_PXSPACE: /* POSIX space */
5535 for (i = min; i < max; i++)
5536 {
5537 int len = 1;
5538 if (eptr >= md->end_subject)
5539 {
5540 SCHECK_PARTIAL();
5541 break;
5542 }
5543 GETCHARLENTEST(c, eptr, len);
5544 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5545 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5546 == prop_fail_result)
5547 break;
5548 eptr+= len;
5549 }
5550 break;
5551
5552 case PT_WORD:
5553 for (i = min; i < max; i++)
5554 {
5555 int category;
5556 int len = 1;
5557 if (eptr >= md->end_subject)
5558 {
5559 SCHECK_PARTIAL();
5560 break;
5561 }
5562 GETCHARLENTEST(c, eptr, len);
5563 category = UCD_CATEGORY(c);
5564 if ((category == ucp_L || category == ucp_N ||
5565 c == CHAR_UNDERSCORE) == prop_fail_result)
5566 break;
5567 eptr+= len;
5568 }
5569 break;
5570
5571 default:
5572 RRETURN(PCRE_ERROR_INTERNAL);
5573 }
5574
5575 /* eptr is now past the end of the maximum run */
5576
5577 if (possessive) continue;
5578 for(;;)
5579 {
5580 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5581 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5582 if (eptr-- == pp) break; /* Stop if tried at original pos */
5583 if (utf) BACKCHAR(eptr);
5584 }
5585 }
5586
5587 /* Match extended Unicode sequences. We will get here only if the
5588 support is in the binary; otherwise a compile-time error occurs. */
5589
5590 else if (ctype == OP_EXTUNI)
5591 {
5592 for (i = min; i < max; i++)
5593 {
5594 int len = 1;
5595 if (eptr >= md->end_subject)
5596 {
5597 SCHECK_PARTIAL();
5598 break;
5599 }
5600 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5601 if (UCD_CATEGORY(c) == ucp_M) break;
5602 eptr += len;
5603 while (eptr < md->end_subject)
5604 {
5605 len = 1;
5606 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5607 if (UCD_CATEGORY(c) != ucp_M) break;
5608 eptr += len;
5609 }
5610 CHECK_PARTIAL();
5611 }
5612
5613 /* eptr is now past the end of the maximum run */
5614
5615 if (possessive) continue;
5616
5617 for(;;)
5618 {
5619 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5620 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5621 if (eptr-- == pp) break; /* Stop if tried at original pos */
5622 for (;;) /* Move back over one extended */
5623 {
5624 if (!utf) c = *eptr; else
5625 {
5626 BACKCHAR(eptr);
5627 GETCHAR(c, eptr);
5628 }
5629 if (UCD_CATEGORY(c) != ucp_M) break;
5630 eptr--;
5631 }
5632 }
5633 }
5634
5635 else
5636 #endif /* SUPPORT_UCP */
5637
5638 #ifdef SUPPORT_UTF
5639 if (utf)
5640 {
5641 switch(ctype)
5642 {
5643 case OP_ANY:
5644 if (max < INT_MAX)
5645 {
5646 for (i = min; i < max; i++)
5647 {
5648 if (eptr >= md->end_subject)
5649 {
5650 SCHECK_PARTIAL();
5651 break;
5652 }
5653 if (IS_NEWLINE(eptr)) break;
5654 if (md->partial != 0 && /* Take care with CRLF partial */
5655 eptr + 1 >= md->end_subject &&
5656 NLBLOCK->nltype == NLTYPE_FIXED &&
5657 NLBLOCK->nllen == 2 &&
5658 *eptr == NLBLOCK->nl[0])
5659 {
5660 md->hitend = TRUE;
5661 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5662 }
5663 eptr++;
5664 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5665 }
5666 }
5667
5668 /* Handle unlimited UTF-8 repeat */
5669
5670 else
5671 {
5672 for (i = min; i < max; i++)
5673 {
5674 if (eptr >= md->end_subject)
5675 {
5676 SCHECK_PARTIAL();
5677 break;
5678 }
5679 if (IS_NEWLINE(eptr)) break;
5680 if (md->partial != 0 && /* Take care with CRLF partial */
5681 eptr + 1 >= md->end_subject &&
5682 NLBLOCK->nltype == NLTYPE_FIXED &&
5683 NLBLOCK->nllen == 2 &&
5684 *eptr == NLBLOCK->nl[0])
5685 {
5686 md->hitend = TRUE;
5687 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5688 }
5689 eptr++;
5690 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5691 }
5692 }
5693 break;
5694
5695 case OP_ALLANY:
5696 if (max < INT_MAX)
5697 {
5698 for (i = min; i < max; i++)
5699 {
5700 if (eptr >= md->end_subject)
5701 {
5702 SCHECK_PARTIAL();
5703 break;
5704 }
5705 eptr++;
5706 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5707 }
5708 }
5709 else
5710 {
5711 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5712 SCHECK_PARTIAL();
5713 }
5714 break;
5715
5716 /* The byte case is the same as non-UTF8 */
5717
5718 case OP_ANYBYTE:
5719 c = max - min;
5720 if (c > (unsigned int)(md->end_subject - eptr))
5721 {
5722 eptr = md->end_subject;
5723 SCHECK_PARTIAL();
5724 }
5725 else eptr += c;
5726 break;
5727
5728 case OP_ANYNL:
5729 for (i = min; i < max; i++)
5730 {
5731 int len = 1;
5732 if (eptr >= md->end_subject)
5733 {
5734 SCHECK_PARTIAL();
5735 break;
5736 }
5737 GETCHARLEN(c, eptr, len);
5738 if (c == 0x000d)
5739 {
5740 if (++eptr >= md->end_subject) break;
5741 if (*eptr == 0x000a) eptr++;
5742 }
5743 else
5744 {
5745 if (c != 0x000a &&
5746 (md->bsr_anycrlf ||
5747 (c != 0x000b && c != 0x000c &&
5748 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5749 break;
5750 eptr += len;
5751 }
5752 }
5753 break;
5754
5755 case OP_NOT_HSPACE:
5756 case OP_HSPACE:
5757 for (i = min; i < max; i++)
5758 {
5759 BOOL gotspace;
5760 int len = 1;
5761 if (eptr >= md->end_subject)
5762 {
5763 SCHECK_PARTIAL();
5764 break;
5765 }
5766 GETCHARLEN(c, eptr, len);
5767 switch(c)
5768 {
5769 default: gotspace = FALSE; break;
5770 case 0x09: /* HT */
5771 case 0x20: /* SPACE */
5772 case 0xa0: /* NBSP */
5773 case 0x1680: /* OGHAM SPACE MARK */
5774 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5775 case 0x2000: /* EN QUAD */
5776 case 0x2001: /* EM QUAD */
5777 case 0x2002: /* EN SPACE */
5778 case 0x2003: /* EM SPACE */
5779 case 0x2004: /* THREE-PER-EM SPACE */
5780 case 0x2005: /* FOUR-PER-EM SPACE */
5781 case 0x2006: /* SIX-PER-EM SPACE */
5782 case 0x2007: /* FIGURE SPACE */
5783 case 0x2008: /* PUNCTUATION SPACE */
5784 case 0x2009: /* THIN SPACE */
5785 case 0x200A: /* HAIR SPACE */
5786 case 0x202f: /* NARROW NO-BREAK SPACE */
5787 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5788 case 0x3000: /* IDEOGRAPHIC SPACE */
5789 gotspace = TRUE;
5790 break;
5791 }
5792 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5793 eptr += len;
5794 }
5795 break;
5796
5797 case OP_NOT_VSPACE:
5798 case OP_VSPACE:
5799 for (i = min; i < max; i++)
5800 {
5801 BOOL gotspace;
5802 int len = 1;
5803 if (eptr >= md->end_subject)
5804 {
5805 SCHECK_PARTIAL();
5806 break;
5807 }
5808 GETCHARLEN(c, eptr, len);
5809 switch(c)
5810 {
5811 default: gotspace = FALSE; break;
5812 case 0x0a: /* LF */
5813 case 0x0b: /* VT */
5814 case 0x0c: /* FF */
5815 case 0x0d: /* CR */
5816 case 0x85: /* NEL */
5817 case 0x2028: /* LINE SEPARATOR */
5818 case 0x2029: /* PARAGRAPH SEPARATOR */
5819 gotspace = TRUE;
5820 break;
5821 }
5822 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5823 eptr += len;
5824 }
5825 break;
5826
5827 case OP_NOT_DIGIT:
5828 for (i = min; i < max; i++)
5829 {
5830 int len = 1;
5831 if (eptr >= md->end_subject)
5832 {
5833 SCHECK_PARTIAL();
5834 break;
5835 }
5836 GETCHARLEN(c, eptr, len);
5837 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5838 eptr+= len;
5839 }
5840 break;
5841
5842 case OP_DIGIT:
5843 for (i = min; i < max; i++)
5844 {
5845 int len = 1;
5846 if (eptr >= md->end_subject)
5847 {
5848 SCHECK_PARTIAL();
5849 break;
5850 }
5851 GETCHARLEN(c, eptr, len);
5852 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5853 eptr+= len;
5854 }
5855 break;
5856
5857 case OP_NOT_WHITESPACE:
5858 for (i = min; i < max; i++)
5859 {
5860 int len = 1;
5861 if (eptr >= md->end_subject)
5862 {
5863 SCHECK_PARTIAL();
5864 break;
5865 }
5866 GETCHARLEN(c, eptr, len);
5867 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5868 eptr+= len;
5869 }
5870 break;
5871
5872 case OP_WHITESPACE:
5873 for (i = min; i < max; i++)
5874 {
5875 int len = 1;
5876 if (eptr >= md->end_subject)
5877 {
5878 SCHECK_PARTIAL();
5879 break;
5880 }
5881 GETCHARLEN(c, eptr, len);
5882 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5883 eptr+= len;
5884 }
5885 break;
5886
5887 case OP_NOT_WORDCHAR:
5888 for (i = min; i < max; i++)
5889 {
5890 int len = 1;
5891 if (eptr >= md->end_subject)
5892 {
5893 SCHECK_PARTIAL();
5894 break;
5895 }
5896 GETCHARLEN(c, eptr, len);
5897 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5898 eptr+= len;
5899 }
5900 break;
5901
5902 case OP_WORDCHAR:
5903 for (i = min; i < max; i++)
5904 {
5905 int len = 1;
5906 if (eptr >= md->end_subject)
5907 {
5908 SCHECK_PARTIAL();
5909 break;
5910 }
5911 GETCHARLEN(c, eptr, len);
5912 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5913 eptr+= len;
5914 }
5915 break;
5916
5917 default:
5918 RRETURN(PCRE_ERROR_INTERNAL);
5919 }
5920
5921 /* eptr is now past the end of the maximum run. If possessive, we are
5922 done (no backing up). Otherwise, match at this position; anything other
5923 than no match is immediately returned. For nomatch, back up one
5924 character, unless we are matching \R and the last thing matched was
5925 \r\n, in which case, back up two bytes. */
5926
5927 if (possessive) continue;
5928 for(;;)
5929 {
5930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5932 if (eptr-- == pp) break; /* Stop if tried at original pos */
5933 BACKCHAR(eptr);
5934 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5935 eptr[-1] == '\r') eptr--;
5936 }
5937 }
5938 else
5939 #endif /* SUPPORT_UTF */
5940 /* Not UTF mode */
5941 {
5942 switch(ctype)
5943 {
5944 case OP_ANY:
5945 for (i = min; i < max; i++)
5946 {
5947 if (eptr >= md->end_subject)
5948 {
5949 SCHECK_PARTIAL();
5950 break;
5951 }
5952 if (IS_NEWLINE(eptr)) break;
5953 if (md->partial != 0 && /* Take care with CRLF partial */
5954 eptr + 1 >= md->end_subject &&
5955 NLBLOCK->nltype == NLTYPE_FIXED &&
5956 NLBLOCK->nllen == 2 &&
5957 *eptr == NLBLOCK->nl[0])
5958 {
5959 md->hitend = TRUE;
5960 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5961 }
5962 eptr++;
5963 }
5964 break;
5965
5966 case OP_ALLANY:
5967 case OP_ANYBYTE:
5968 c = max - min;
5969 if (c > (unsigned int)(md->end_subject - eptr))
5970 {
5971 eptr = md->end_subject;
5972 SCHECK_PARTIAL();
5973 }
5974 else eptr += c;
5975 break;
5976
5977 case OP_ANYNL:
5978 for (i = min; i < max; i++)
5979 {
5980 if (eptr >= md->end_subject)
5981 {
5982 SCHECK_PARTIAL();
5983 break;
5984 }
5985 c = *eptr;
5986 if (c == 0x000d)
5987 {
5988 if (++eptr >= md->end_subject) break;
5989 if (*eptr == 0x000a) eptr++;
5990 }
5991 else
5992 {
5993 if (c != 0x000a && (md->bsr_anycrlf ||
5994 (c != 0x000b && c != 0x000c && c != 0x0085
5995 #ifdef COMPILE_PCRE16
5996 && c != 0x2028 && c != 0x2029
5997 #endif
5998 ))) break;
5999 eptr++;
6000 }
6001 }
6002 break;
6003
6004 case OP_NOT_HSPACE:
6005 for (i = min; i < max; i++)
6006 {
6007 if (eptr >= md->end_subject)
6008 {
6009 SCHECK_PARTIAL();
6010 break;
6011 }
6012 c = *eptr;
6013 if (c == 0x09 || c == 0x20 || c == 0xa0
6014 #ifdef COMPILE_PCRE16
6015 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6016 || c == 0x202f || c == 0x205f || c == 0x3000
6017 #endif
6018 ) break;
6019 eptr++;
6020 }
6021 break;
6022
6023 case OP_HSPACE:
6024 for (i = min; i < max; i++)
6025 {
6026 if (eptr >= md->end_subject)
6027 {
6028 SCHECK_PARTIAL();
6029 break;
6030 }
6031 c = *eptr;
6032 if (c != 0x09 && c != 0x20 && c != 0xa0
6033 #ifdef COMPILE_PCRE16
6034 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6035 && c != 0x202f && c != 0x205f && c != 0x3000
6036 #endif
6037 ) break;
6038 eptr++;
6039 }
6040 break;
6041
6042 case OP_NOT_VSPACE:
6043 for (i = min; i < max; i++)
6044 {
6045 if (eptr >= md->end_subject)
6046 {
6047 SCHECK_PARTIAL();
6048 break;
6049 }
6050 c = *eptr;
6051 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6052 #ifdef COMPILE_PCRE16
6053 || c == 0x2028 || c == 0x2029
6054 #endif
6055 ) break;
6056 eptr++;
6057 }
6058 break;
6059
6060 case OP_VSPACE:
6061 for (i = min; i < max; i++)
6062 {
6063 if (eptr >= md->end_subject)
6064 {
6065 SCHECK_PARTIAL();
6066 break;
6067 }
6068 c = *eptr;
6069 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6070 #ifdef COMPILE_PCRE16
6071 && c != 0x2028 && c != 0x2029
6072 #endif
6073 ) break;
6074 eptr++;
6075 }
6076 break;
6077
6078 case OP_NOT_DIGIT:
6079 for (i = min; i < max; i++)
6080 {
6081 if (eptr >= md->end_subject)
6082 {
6083 SCHECK_PARTIAL();
6084 break;
6085 }
6086 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6087 eptr++;
6088 }
6089 break;
6090
6091 case OP_DIGIT:
6092 for (i = min; i < max; i++)
6093 {
6094 if (eptr >= md->end_subject)
6095 {
6096 SCHECK_PARTIAL();
6097 break;
6098 }
6099 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6100 eptr++;
6101 }
6102 break;
6103
6104 case OP_NOT_WHITESPACE:
6105 for (i = min; i < max; i++)
6106 {
6107 if (eptr >= md->end_subject)
6108 {
6109 SCHECK_PARTIAL();
6110 break;
6111 }
6112 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6113 eptr++;
6114 }
6115 break;
6116
6117 case OP_WHITESPACE:
6118 for (i = min; i < max; i++)
6119 {
6120 if (eptr >= md->end_subject)
6121 {
6122 SCHECK_PARTIAL();
6123 break;
6124 }
6125 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6126 eptr++;
6127 }
6128 break;
6129
6130 case OP_NOT_WORDCHAR:
6131 for (i = min; i < max; i++)
6132 {
6133 if (eptr >= md->end_subject)
6134 {
6135 SCHECK_PARTIAL();
6136 break;
6137 }
6138 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6139 eptr++;
6140 }
6141 break;
6142
6143 case OP_WORDCHAR:
6144 for (i = min; i < max; i++)
6145 {
6146 if (eptr >= md->end_subject)
6147 {
6148 SCHECK_PARTIAL();
6149 break;
6150 }
6151 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6152 eptr++;
6153 }
6154 break;
6155
6156 default:
6157 RRETURN(PCRE_ERROR_INTERNAL);
6158 }
6159
6160 /* eptr is now past the end of the maximum run. If possessive, we are
6161 done (no backing up). Otherwise, match at this position; anything other
6162 than no match is immediately returned. For nomatch, back up one
6163 character (byte), unless we are matching \R and the last thing matched
6164 was \r\n, in which case, back up two bytes. */
6165
6166 if (possessive) continue;
6167 while (eptr >= pp)
6168 {
6169 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6171 eptr--;
6172 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6173 eptr[-1] == '\r') eptr--;
6174 }
6175 }
6176
6177 /* Get here if we can't make it match with any permitted repetitions */
6178
6179 RRETURN(MATCH_NOMATCH);
6180 }
6181 /* Control never gets here */
6182
6183 /* There's been some horrible disaster. Arrival here can only mean there is
6184 something seriously wrong in the code above or the OP_xxx definitions. */
6185
6186 default:
6187 DPRINTF(("Unknown opcode %d\n", *ecode));
6188 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6189 }
6190
6191 /* Do not stick any code in here without much thought; it is assumed
6192 that "continue" in the code above comes out to here to repeat the main
6193 loop. */
6194
6195 } /* End of main loop */
6196 /* Control never reaches here */
6197
6198
6199 /* When compiling to use the heap rather than the stack for recursive calls to
6200 match(), the RRETURN() macro jumps here. The number that is saved in
6201 frame->Xwhere indicates which label we actually want to return to. */
6202
6203 #ifdef NO_RECURSE
6204 #define LBL(val) case val: goto L_RM##val;
6205 HEAP_RETURN:
6206 switch (frame->Xwhere)
6207 {
6208 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6209 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6210 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6211 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6212 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6213 LBL(65) LBL(66)
6214 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6215 LBL(21)
6216 #endif
6217 #ifdef SUPPORT_UTF
6218 LBL(16) LBL(18) LBL(20)
6219 LBL(22) LBL(23) LBL(28) LBL(30)
6220 LBL(32) LBL(34) LBL(42) LBL(46)
6221 #ifdef SUPPORT_UCP
6222 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6223 LBL(59) LBL(60) LBL(61) LBL(62)
6224 #endif /* SUPPORT_UCP */
6225 #endif /* SUPPORT_UTF */
6226 default:
6227 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6228
6229 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6230
6231 return PCRE_ERROR_INTERNAL;
6232 }
6233 #undef LBL
6234 #endif /* NO_RECURSE */
6235 }
6236
6237
6238 /***************************************************************************
6239 ****************************************************************************
6240 RECURSION IN THE match() FUNCTION
6241
6242 Undefine all the macros that were defined above to handle this. */
6243
6244 #ifdef NO_RECURSE
6245 #undef eptr
6246 #undef ecode
6247 #undef mstart
6248 #undef offset_top
6249 #undef eptrb
6250 #undef flags
6251
6252 #undef callpat
6253 #undef charptr
6254 #undef data
6255 #undef next
6256 #undef pp
6257 #undef prev
6258 #undef saved_eptr
6259
6260 #undef new_recursive
6261
6262 #undef cur_is_word
6263 #undef condition
6264 #undef prev_is_word
6265
6266 #undef ctype
6267 #undef length
6268 #undef max
6269 #undef min
6270 #undef number
6271 #undef offset
6272 #undef op
6273 #undef save_capture_last
6274 #undef save_offset1
6275 #undef save_offset2
6276 #undef save_offset3
6277 #undef stacksave
6278
6279 #undef newptrb
6280
6281 #endif
6282
6283 /* These two are defined as macros in both cases */
6284
6285 #undef fc
6286 #undef fi
6287
6288 /***************************************************************************
6289 ***************************************************************************/
6290
6291
6292 #ifdef NO_RECURSE
6293 /*************************************************
6294 * Release allocated heap frames *
6295 *************************************************/
6296
6297 /* This function releases all the allocated frames. The base frame is on the
6298 machine stack, and so must not be freed.
6299
6300 Argument: the address of the base frame
6301 Returns: nothing
6302 */
6303
6304 static void
6305 release_match_heapframes (heapframe *frame_base)
6306 {
6307 heapframe *nextframe = frame_base->Xnextframe;
6308 while (nextframe != NULL)
6309 {
6310 heapframe *oldframe = nextframe;
6311 nextframe = nextframe->Xnextframe;
6312 (PUBL(stack_free))(oldframe);
6313 }
6314 }
6315 #endif
6316
6317
6318 /*************************************************
6319 * Execute a Regular Expression *
6320 *************************************************/
6321
6322 /* This function applies a compiled re to a subject string and picks out
6323 portions of the string if it matches. Two elements in the vector are set for
6324 each substring: the offsets to the start and end of the substring.
6325
6326 Arguments:
6327 argument_re points to the compiled expression
6328 extra_data points to extra data or is NULL
6329 subject points to the subject string
6330 length length of subject string (may contain binary zeros)
6331 start_offset where to start in the subject string
6332 options option bits
6333 offsets points to a vector of ints to be filled in with offsets
6334 offsetcount the number of elements in the vector
6335
6336 Returns: > 0 => success; value is the number of elements filled in
6337 = 0 => success, but offsets is not big enough
6338 -1 => failed to match
6339 < -1 => some kind of unexpected problem
6340 */
6341
6342 #ifdef COMPILE_PCRE8
6343 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6344 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6345 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6346 int offsetcount)
6347 #else
6348 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6349 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6350 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6351 int offsetcount)
6352 #endif
6353 {
6354 int rc, ocount, arg_offset_max;
6355 int newline;
6356 BOOL using_temporary_offsets = FALSE;
6357 BOOL anchored;
6358 BOOL startline;
6359 BOOL firstline;
6360 BOOL utf;
6361 BOOL has_first_char = FALSE;
6362 BOOL has_req_char = FALSE;
6363 pcre_uchar first_char = 0;
6364 pcre_uchar first_char2 = 0;
6365 pcre_uchar req_char = 0;
6366 pcre_uchar req_char2 = 0;
6367 match_data match_block;
6368 match_data *md = &match_block;
6369 const pcre_uint8 *tables;
6370 const pcre_uint8 *start_bits = NULL;
6371 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6372 PCRE_PUCHAR end_subject;
6373 PCRE_PUCHAR start_partial = NULL;
6374 PCRE_PUCHAR req_char_ptr = start_match - 1;
6375
6376 const pcre_study_data *study;
6377 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6378
6379 #ifdef NO_RECURSE
6380 heapframe frame_zero;
6381 frame_zero.Xprevframe = NULL; /* Marks the top level */
6382 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6383 md->match_frames_base = &frame_zero;
6384 #endif
6385
6386 /* Check for the special magic call that measures the size of the stack used
6387 per recursive call of match(). Without the funny casting for sizeof, a Windows
6388 compiler gave this error: "unary minus operator applied to unsigned type,
6389 result still unsigned". Hopefully the cast fixes that. */
6390
6391 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6392 start_offset == -999)
6393 #ifdef NO_RECURSE
6394 return -((int)sizeof(heapframe));
6395 #else
6396 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6397 #endif
6398
6399 /* Plausibility checks */
6400
6401 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6402 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6403 return PCRE_ERROR_NULL;
6404 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6405 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6406
6407 /* Check that the first field in the block is the magic number. If it is not,
6408 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6409 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6410 means that the pattern is likely compiled with different endianness. */
6411
6412 if (re->magic_number != MAGIC_NUMBER)
6413 return re->magic_number == REVERSED_MAGIC_NUMBER?
6414 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6415 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6416
6417 /* These two settings are used in the code for checking a UTF-8 string that
6418 follows immediately afterwards. Other values in the md block are used only
6419 during "normal" pcre_exec() processing, not when the JIT support is in use,
6420 so they are set up later. */
6421
6422 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6423 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6424 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6425 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6426
6427 /* Check a UTF-8 string if required. Pass back the character offset and error
6428 code for an invalid string if a results vector is available. */
6429
6430 #ifdef SUPPORT_UTF
6431 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6432 {
6433 int erroroffset;
6434 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6435 if (errorcode != 0)
6436 {
6437 if (offsetcount >= 2)
6438 {
6439 offsets[0] = erroroffset;
6440 offsets[1] = errorcode;
6441 }
6442 #ifdef COMPILE_PCRE16
6443 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6444 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6445 #else
6446 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6447 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6448 #endif
6449 }
6450
6451 /* Check that a start_offset points to the start of a UTF character. */
6452 if (start_offset > 0 && start_offset < length &&
6453 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6454 return PCRE_ERROR_BADUTF8_OFFSET;
6455 }
6456 #endif
6457
6458 /* If the pattern was successfully studied with JIT support, run the JIT
6459 executable instead of the rest of this function. Most options must be set at
6460 compile time for the JIT code to be usable. Fall back to the normal code path if
6461 an unsupported flag is set. */
6462
6463 #ifdef SUPPORT_JIT
6464 if (extra_data != NULL
6465 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6466 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6467 && extra_data->executable_jit != NULL
6468 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6469 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6470 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6471 {
6472 rc = PRIV(jit_exec)(re, extra_data->executable_jit,
6473 (const pcre_uchar *)subject, length, start_offset, options,
6474 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6475 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount,
6476 ((extra_data->flags & PCRE_EXTRA_MARK) != 0) ? extra_data->mark : NULL);
6477
6478 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6479 mode is not compiled. In this case we simply fall back to the interpreter. */
6480
6481 if (rc != PCRE_ERROR_NULL) return rc;
6482 }
6483 #endif
6484
6485 /* Carry on with non-JIT matching. This information is for finding all the
6486 numbers associated with a given name, for condition testing. */
6487
6488 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6489 md->name_count = re->name_count;
6490 md->name_entry_size = re->name_entry_size;
6491
6492 /* Fish out the optional data from the extra_data structure, first setting
6493 the default values. */
6494
6495 study = NULL;
6496 md->match_limit = MATCH_LIMIT;
6497 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6498 md->callout_data = NULL;
6499
6500 /* The table pointer is always in native byte order. */
6501
6502 tables = re->tables;
6503
6504 if (extra_data != NULL)
6505 {
6506 register unsigned int flags = extra_data->flags;
6507 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6508 study = (const pcre_study_data *)extra_data->study_data;
6509 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6510 md->match_limit = extra_data->match_limit;
6511 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6512 md->match_limit_recursion = extra_data->match_limit_recursion;
6513 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6514 md->callout_data = extra_data->callout_data;
6515 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6516 }
6517
6518 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6519 is a feature that makes it possible to save compiled regex and re-use them
6520 in other programs later. */
6521
6522 if (tables == NULL) tables = PRIV(default_tables);
6523
6524 /* Set up other data */
6525
6526 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6527 startline = (re->flags & PCRE_STARTLINE) != 0;
6528 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6529
6530 /* The code starts after the real_pcre block and the capture name table. */
6531
6532 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6533 re->name_count * re->name_entry_size;
6534
6535 md->start_subject = (PCRE_PUCHAR)subject;
6536 md->start_offset = start_offset;
6537 md->end_subject = md->start_subject + length;
6538 end_subject = md->end_subject;
6539
6540 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6541 md->use_ucp = (re->options & PCRE_UCP) != 0;
6542 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6543 md->ignore_skip_arg = FALSE;
6544
6545 /* Some options are unpacked into BOOL variables in the hope that testing
6546 them will be faster than individual option bits. */
6547
6548 md->notbol = (options & PCRE_NOTBOL) != 0;
6549 md->noteol = (options & PCRE_NOTEOL) != 0;
6550 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6551 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6552
6553 md->hitend = FALSE;
6554 md->mark = md->nomatch_mark = NULL; /* In case never set */
6555
6556 md->recursive = NULL; /* No recursion at top level */
6557 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6558
6559 md->lcc = tables + lcc_offset;
6560 md->fcc = tables + fcc_offset;
6561 md->ctypes = tables + ctypes_offset;
6562
6563 /* Handle different \R options. */
6564
6565 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6566 {
6567 case 0:
6568 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6569 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6570 else
6571 #ifdef BSR_ANYCRLF
6572 md->bsr_anycrlf = TRUE;
6573 #else
6574 md->bsr_anycrlf = FALSE;
6575 #endif
6576 break;
6577
6578 case PCRE_BSR_ANYCRLF:
6579 md->bsr_anycrlf = TRUE;
6580 break;
6581
6582 case PCRE_BSR_UNICODE:
6583 md->bsr_anycrlf = FALSE;
6584 break;
6585
6586 default: return PCRE_ERROR_BADNEWLINE;
6587 }
6588
6589 /* Handle different types of newline. The three bits give eight cases. If
6590 nothing is set at run time, whatever was used at compile time applies. */
6591
6592 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6593 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6594 {
6595 case 0: newline = NEWLINE; break; /* Compile-time default */
6596 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6597 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6598 case PCRE_NEWLINE_CR+
6599 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6600 case PCRE_NEWLINE_ANY: newline = -1; break;
6601 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6602 default: return PCRE_ERROR_BADNEWLINE;
6603 }
6604
6605 if (newline == -2)
6606 {
6607 md->nltype = NLTYPE_ANYCRLF;
6608 }
6609 else if (newline < 0)
6610 {
6611 md->nltype = NLTYPE_ANY;
6612 }
6613 else
6614 {
6615 md->nltype = NLTYPE_FIXED;
6616 if (newline > 255)
6617 {
6618 md->nllen = 2;
6619 md->nl[0] = (newline >> 8) & 255;
6620 md->nl[1] = newline & 255;
6621 }
6622 else
6623 {
6624 md->nllen = 1;
6625 md->nl[0] = newline;
6626 }
6627 }
6628
6629 /* Partial matching was originally supported only for a restricted set of
6630 regexes; from release 8.00 there are no restrictions, but the bits are still
6631 defined (though never set). So there's no harm in leaving this code. */
6632
6633 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6634 return PCRE_ERROR_BADPARTIAL;
6635
6636 /* If the expression has got more back references than the offsets supplied can
6637 hold, we get a temporary chunk of working store to use during the matching.
6638 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6639 of 3. */
6640
6641 ocount = offsetcount - (offsetcount % 3);
6642 arg_offset_max = (2*ocount)/3;
6643
6644 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6645 {
6646 ocount = re->top_backref * 3 + 3;
6647 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6648 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6649 using_temporary_offsets = TRUE;
6650 DPRINTF(("Got memory to hold back references\n"));
6651 }
6652 else md->offset_vector = offsets;
6653
6654 md->offset_end = ocount;
6655 md->offset_max = (2*ocount)/3;
6656 md->offset_overflow = FALSE;
6657 md->capture_last = -1;
6658
6659 /* Reset the working variable associated with each extraction. These should
6660 never be used unless previously set, but they get saved and restored, and so we
6661 initialize them to avoid reading uninitialized locations. Also, unset the
6662 offsets for the matched string. This is really just for tidiness with callouts,
6663 in case they inspect these fields. */
6664
6665 if (md->offset_vector != NULL)
6666 {
6667 register int *iptr = md->offset_vector + ocount;
6668 register int *iend = iptr - re->top_bracket;
6669 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6670 while (--iptr >= iend) *iptr = -1;
6671 md->offset_vector[0] = md->offset_vector[1] = -1;
6672 }
6673
6674 /* Set up the first character to match, if available. The first_char value is
6675 never set for an anchored regular expression, but the anchoring may be forced
6676 at run time, so we have to test for anchoring. The first char may be unset for
6677 an unanchored pattern, of course. If there's no first char and the pattern was
6678 studied, there may be a bitmap of possible first characters. */
6679
6680 if (!anchored)
6681 {
6682 if ((re->flags & PCRE_FIRSTSET) != 0)
6683 {
6684 has_first_char = TRUE;
6685 first_char = first_char2 = (pcre_uchar)(re->first_char);
6686 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6687 {
6688 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6689 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6690 if (utf && first_char > 127)
6691 first_char2 = UCD_OTHERCASE(first_char);
6692 #endif
6693 }
6694 }
6695 else
6696 if (!startline && study != NULL &&
6697 (study->flags & PCRE_STUDY_MAPPED) != 0)
6698 start_bits = study->start_bits;
6699 }
6700
6701 /* For anchored or unanchored matches, there may be a "last known required
6702 character" set. */
6703
6704 if ((re->flags & PCRE_REQCHSET) != 0)
6705 {
6706 has_req_char = TRUE;
6707 req_char = req_char2 = (pcre_uchar)(re->req_char);
6708 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6709 {
6710 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6711 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6712 if (utf && req_char > 127)
6713 req_char2 = UCD_OTHERCASE(req_char);
6714 #endif
6715 }
6716 }
6717
6718
6719 /* ==========================================================================*/
6720
6721 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6722 the loop runs just once. */
6723
6724 for(;;)
6725 {
6726 PCRE_PUCHAR save_end_subject = end_subject;
6727 PCRE_PUCHAR new_start_match;
6728
6729 /* If firstline is TRUE, the start of the match is constrained to the first
6730 line of a multiline string. That is, the match must be before or at the first
6731 newline. Implement this by temporarily adjusting end_subject so that we stop
6732 scanning at a newline. If the match fails at the newline, later code breaks
6733 this loop. */
6734
6735 if (firstline)
6736 {
6737 PCRE_PUCHAR t = start_match;
6738 #ifdef SUPPORT_UTF
6739 if (utf)
6740 {
6741 while (t < md->end_subject && !IS_NEWLINE(t))
6742 {
6743 t++;