/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 937 - (show annotations)
Sun Feb 26 15:58:56 2012 UTC (7 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 218978 byte(s)
Error occurred while calculating annotation data.
Confine (*COMMIT) inside positive assertions, as documented.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Special call types for match(). They are signalled through
md->match_function_type rather than through an extra argument, so that no
additional stack space is needed for each call. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH        0
#define MATCH_MATCH          1

/* Internal-only results of match(). They are kept well below zero so that they
stay clear of the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)

/* The largest number of offset-vector ints that a recursive call saves on the
stack; malloc is used when the offset vector is bigger than this. Keep the
value a multiple of 3, because the offset vector is always a multiple of 3
long. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum counts for the common repeats, one entry per repeat
kind. In rep_max, 0 stands for "no upper limit". */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if the requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: >= 0 the number of subject bytes matched
144 -1 no match
145 -2 partial match; always given if at end subject
146 */
147
148 static int
149 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
150 BOOL caseless)
151 {
152 PCRE_PUCHAR eptr_start = eptr;
153 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
154
155 #ifdef PCRE_DEBUG
156 if (eptr >= md->end_subject)
157 printf("matching subject <null>");
158 else
159 {
160 printf("matching subject ");
161 pchars(eptr, length, TRUE, md);
162 }
163 printf(" against backref ");
164 pchars(p, length, FALSE, md);
165 printf("\n");
166 #endif
167
168 /* Always fail if reference not set (and not JavaScript compatible - in that
169 case the length is passed as zero). */
170
171 if (length < 0) return -1;
172
173 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
174 properly if Unicode properties are supported. Otherwise, we can check only
175 ASCII characters. */
176
177 if (caseless)
178 {
179 #ifdef SUPPORT_UTF
180 #ifdef SUPPORT_UCP
181 if (md->utf)
182 {
183 /* Match characters up to the end of the reference. NOTE: the number of
184 bytes matched may differ, because there are some characters whose upper and
185 lower case versions code as different numbers of bytes. For example, U+023A
186 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
187 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
188 the latter. It is important, therefore, to check the length along the
189 reference, not along the subject (earlier code did this wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 int c, d;
195 if (eptr >= md->end_subject) return -2; /* Partial match */
196 GETCHARINC(c, eptr);
197 GETCHARINC(d, p);
198 if (c != d && c != UCD_OTHERCASE(d)) return -1;
199 }
200 }
201 else
202 #endif
203 #endif
204
205 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
206 is no UCP support. */
207 {
208 while (length-- > 0)
209 {
210 if (eptr >= md->end_subject) return -2; /* Partial match */
211 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
212 p++;
213 eptr++;
214 }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 while (length-- > 0)
224 {
225 if (eptr >= md->end_subject) return -2; /* Partial match */
226 if (*p++ != *eptr++) return -1;
227 }
228 }
229
230 return (int)(eptr - eptr_start);
231 }
232
233
234
235 /***************************************************************************
236 ****************************************************************************
237 RECURSION IN THE match() FUNCTION
238
239 The match() function is highly recursive, though not every recursive call
240 increases the recursive depth. Nevertheless, some regular expressions can cause
241 it to recurse to a great depth. I was writing for Unix, so I just let it call
242 itself recursively. This uses the stack for saving everything that has to be
243 saved for a recursive call. On Unix, the stack can be large, and this works
244 fine.
245
246 It turns out that on some non-Unix-like systems there are problems with
247 programs that use a lot of stack. (This despite the fact that every last chip
248 has oodles of memory these days, and techniques for extending the stack have
249 been known for decades.) So....
250
251 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
252 calls by keeping local variables that need to be preserved in blocks of memory
253 obtained from malloc() instead of on the stack. Macros are used to
254 achieve this so that the actual code doesn't look very different to what it
255 always used to.
256
257 The original heap-recursive code used longjmp(). However, it seems that this
258 can be very slow on some operating systems. Following a suggestion from Stan
259 Switzer, the use of longjmp() has been abolished, at the cost of having to
260 provide a unique number for each call to RMATCH. There is no way of generating
261 a sequence of numbers at compile time in C. I have given them names, to make
262 them stand out more clearly.
263
264 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
265 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
266 tests. Furthermore, not using longjmp() means that local dynamic variables
267 don't have indeterminate values; this has meant that the frame size can be
268 reduced because the result can be "passed back" by straight setting of the
269 variable instead of being passed in the frame.
270 ****************************************************************************
271 ***************************************************************************/
272
/* Identifying numbers for the RMATCH calls, counting up from 1. Whenever this
list is changed, the code at HEAP_RETURN below must be changed to match. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10, RM11,
  RM12,    RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22,
  RM23,    RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33,
  RM34,    RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44,
  RM45,    RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54, RM55,
  RM56,    RM57, RM58, RM59, RM60, RM61, RM62, RM63, RM64, RM65, RM66
};
283
284 /* These versions of the macros use the stack, as normal. There are debugging
285 versions and production versions. Note that the "rw" argument of RMATCH isn't
286 actually used in this definition. */
287
288 #ifndef NO_RECURSE
289 #define REGISTER register
290
#ifdef PCRE_DEBUG

/* Debugging flavour: make a true recursive call to match(), one level deeper,
tracing the source line on the way in and on the way out. The result is left in
rrc. The rw argument is not used. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match((ra), (rb), mstart, (rc), (rd), (re), rdepth + 1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging flavour: trace the value that is being returned, then return it.
Note that ra is expanded twice. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", (ra), __LINE__); \
  return (ra); \
  }

#else  /* PCRE_DEBUG not defined */

/* Production flavour: a plain recursive call to match(), one level deeper,
whose result is left in rrc, and a plain return. The rw argument is not
used. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match((ra), (rb), mstart, (rc), (rd), (re), rdepth + 1)

#define RRETURN(ra) return (ra)

#endif  /* PCRE_DEBUG */
308
309 #else
310
311
/* Heap-based versions of the macros: "recursion" is simulated with a chain of
heapframe blocks instead of the C stack. The "rd" argument of RMATCH is ignored
in this definition; it is the md argument of match(), which never changes. */

#define REGISTER

/* RMATCH: take the frame that is chained after the current one, or, if there
is none, get a new one from the stack_malloc function and chain it on (failing
with PCRE_ERROR_NOMEMORY, via RRETURN, if that is not possible). The number rw
is recorded in the current frame, the new frame is loaded with the argument
values, and control goes to HEAP_RECURSE. The label L_<rw> is planted directly
after that jump. Every argument is read before "frame" is moved on. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = (rw);\
  newframe->Xprevframe = frame;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xeptr = (ra);\
  newframe->Xecode = (rb);\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = (rc);\
  newframe->Xeptrb = (re);\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
342
/* RRETURN (heap version): step back to the previous frame. If there is none,
this is the outermost level, so really return the value; otherwise put the
value in rrc and go to HEAP_RETURN. Note that ra is evaluated only after
"frame" has been stepped back. */

#define RRETURN(ra)\
  {\
  frame = frame->Xprevframe;\
  if (frame == NULL) return (ra);\
  rrc = (ra);\
  goto HEAP_RETURN;\
  }
354
355
356 /* Structure for remembering the local variables in a private frame */
357
358 typedef struct heapframe {
359 struct heapframe *Xprevframe;
360 struct heapframe *Xnextframe;
361
362 /* Function arguments that may change */
363
364 PCRE_PUCHAR Xeptr;
365 const pcre_uchar *Xecode;
366 PCRE_PUCHAR Xmstart;
367 int Xoffset_top;
368 eptrblock *Xeptrb;
369 unsigned int Xrdepth;
370
371 /* Function local variables */
372
373 PCRE_PUCHAR Xcallpat;
374 #ifdef SUPPORT_UTF
375 PCRE_PUCHAR Xcharptr;
376 #endif
377 PCRE_PUCHAR Xdata;
378 PCRE_PUCHAR Xnext;
379 PCRE_PUCHAR Xpp;
380 PCRE_PUCHAR Xprev;
381 PCRE_PUCHAR Xsaved_eptr;
382
383 recursion_info Xnew_recursive;
384
385 BOOL Xcur_is_word;
386 BOOL Xcondition;
387 BOOL Xprev_is_word;
388
389 #ifdef SUPPORT_UCP
390 int Xprop_type;
391 int Xprop_value;
392 int Xprop_fail_result;
393 int Xoclength;
394 pcre_uchar Xocchars[6];
395 #endif
396
397 int Xcodelink;
398 int Xctype;
399 unsigned int Xfc;
400 int Xfi;
401 int Xlength;
402 int Xmax;
403 int Xmin;
404 int Xnumber;
405 int Xoffset;
406 int Xop;
407 int Xsave_capture_last;
408 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
409 int Xstacksave[REC_STACK_SAVE_MAX];
410
411 eptrblock Xnewptrb;
412
413 /* Where to jump back to */
414
415 int Xwhere;
416
417 } heapframe;
418
419 #endif
420
421
422 /***************************************************************************
423 ***************************************************************************/
424
425
426
427 /*************************************************
428 * Match from current position *
429 *************************************************/
430
431 /* This function is called recursively in many circumstances. Whenever it
432 returns a negative (error) response, the outer incarnation must also return the
433 same response. */
434
/* Two helpers for partial matching, each of which is used in several places.
When partial matching is enabled and the current position is beyond
md->start_used_ptr, the "hit end" flag is set; for hard partial matching
(md->partial > 1) PCRE_ERROR_PARTIAL is then returned at once. CHECK_PARTIAL()
also requires the current position to be at or beyond the end of the subject;
SCHECK_PARTIAL() omits that test, for use where the position is already known
to be past the end of the subject. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 &&\
      eptr >= md->end_subject &&\
      eptr > md->start_used_ptr)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 &&\
      eptr > md->start_used_ptr)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
456
457
458 /* Performance note: It might be tempting to extract commonly used fields from
459 the md structure (e.g. utf, end_subject) into individual variables to improve
460 performance. Tests using gcc on a SPARC disproved this; in the first case, it
461 made performance worse.
462
463 Arguments:
464 eptr pointer to current character in subject
465 ecode pointer to current position in compiled code
466 mstart pointer to the current match start position (can be modified
467 by encountering \K)
468 offset_top current top pointer
469 md pointer to "static" info for the match
470 eptrb pointer to chain of blocks containing eptr at start of
471 brackets - for testing for empty matches
472 rdepth the recursion depth
473
474 Returns: MATCH_MATCH if matched ) these values are >= 0
475 MATCH_NOMATCH if failed to match )
476 a negative MATCH_xxx value for PRUNE, SKIP, etc
477 a negative PCRE_ERROR_xxx value if aborted by an error condition
478 (e.g. stopped by repeated call or recursion limit)
479 */
480
481 static int
482 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
483 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
484 unsigned int rdepth)
485 {
486 /* These variables do not need to be preserved over recursion in this function,
487 so they can be ordinary variables in all cases. Mark some of them with
488 "register" because they are used a lot in loops. */
489
490 register int rrc; /* Returns from recursive calls */
491 register int i; /* Used for loops not involving calls to RMATCH() */
492 register unsigned int c; /* Character values not kept over RMATCH() calls */
493 register BOOL utf; /* Local copy of UTF flag for speed */
494
495 BOOL minimize, possessive; /* Quantifier options */
496 BOOL caseless;
497 int condcode;
498
499 /* When recursion is not being used, all "local" variables that have to be
500 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
501 frame on the stack here; subsequent instantiations are obtained from the heap
502 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
503 the top-level on the stack rather than malloc-ing them all gives a performance
504 boost in many cases where there is not much "recursion". */
505
506 #ifdef NO_RECURSE
507 heapframe *frame = (heapframe *)md->match_frames_base;
508
509 /* Copy in the original argument variables */
510
511 frame->Xeptr = eptr;
512 frame->Xecode = ecode;
513 frame->Xmstart = mstart;
514 frame->Xoffset_top = offset_top;
515 frame->Xeptrb = eptrb;
516 frame->Xrdepth = rdepth;
517
518 /* This is where control jumps back to in order to effect "recursion" */
519
520 HEAP_RECURSE:
521
522 /* Macros make the argument variables come from the current frame */
523
524 #define eptr frame->Xeptr
525 #define ecode frame->Xecode
526 #define mstart frame->Xmstart
527 #define offset_top frame->Xoffset_top
528 #define eptrb frame->Xeptrb
529 #define rdepth frame->Xrdepth
530
531 /* Ditto for the local variables */
532
533 #ifdef SUPPORT_UTF
534 #define charptr frame->Xcharptr
535 #endif
536 #define callpat frame->Xcallpat
537 #define codelink frame->Xcodelink
538 #define data frame->Xdata
539 #define next frame->Xnext
540 #define pp frame->Xpp
541 #define prev frame->Xprev
542 #define saved_eptr frame->Xsaved_eptr
543
544 #define new_recursive frame->Xnew_recursive
545
546 #define cur_is_word frame->Xcur_is_word
547 #define condition frame->Xcondition
548 #define prev_is_word frame->Xprev_is_word
549
550 #ifdef SUPPORT_UCP
551 #define prop_type frame->Xprop_type
552 #define prop_value frame->Xprop_value
553 #define prop_fail_result frame->Xprop_fail_result
554 #define oclength frame->Xoclength
555 #define occhars frame->Xocchars
556 #endif
557
558 #define ctype frame->Xctype
559 #define fc frame->Xfc
560 #define fi frame->Xfi
561 #define length frame->Xlength
562 #define max frame->Xmax
563 #define min frame->Xmin
564 #define number frame->Xnumber
565 #define offset frame->Xoffset
566 #define op frame->Xop
567 #define save_capture_last frame->Xsave_capture_last
568 #define save_offset1 frame->Xsave_offset1
569 #define save_offset2 frame->Xsave_offset2
570 #define save_offset3 frame->Xsave_offset3
571 #define stacksave frame->Xstacksave
572
573 #define newptrb frame->Xnewptrb
574
575 /* When recursion is being used, local variables are allocated on the stack and
576 get preserved during recursion in the normal way. In this environment, fi and
577 i, and fc and c, can be the same variables. */
578
579 #else /* NO_RECURSE not defined */
580 #define fi i
581 #define fc c
582
583 /* Many of the following variables are used only in small blocks of the code.
584 My normal style of coding would have declared them within each of those blocks.
585 However, in order to accommodate the version of this code that uses an external
586 "stack" implemented on the heap, it is easier to declare them all here, so the
587 declarations can be cut out in a block. The only declarations within blocks
588 below are for variables that do not have to be preserved over a recursive call
589 to RMATCH(). */
590
591 #ifdef SUPPORT_UTF
592 const pcre_uchar *charptr;
593 #endif
594 const pcre_uchar *callpat;
595 const pcre_uchar *data;
596 const pcre_uchar *next;
597 PCRE_PUCHAR pp;
598 const pcre_uchar *prev;
599 PCRE_PUCHAR saved_eptr;
600
601 recursion_info new_recursive;
602
603 BOOL cur_is_word;
604 BOOL condition;
605 BOOL prev_is_word;
606
607 #ifdef SUPPORT_UCP
608 int prop_type;
609 int prop_value;
610 int prop_fail_result;
611 int oclength;
612 pcre_uchar occhars[6];
613 #endif
614
615 int codelink;
616 int ctype;
617 int length;
618 int max;
619 int min;
620 int number;
621 int offset;
622 int op;
623 int save_capture_last;
624 int save_offset1, save_offset2, save_offset3;
625 int stacksave[REC_STACK_SAVE_MAX];
626
627 eptrblock newptrb;
628
629 /* There is a special fudge for calling match() in a way that causes it to
630 measure the size of its basic stack frame when the stack is being used for
631 recursion. The second argument (ecode) being NULL triggers this behaviour. It
632 cannot normally ever be NULL. The return is the negated value of the frame
633 size. */
634
635 if (ecode == NULL)
636 {
637 if (rdepth == 0)
638 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
639 else
640 {
641 int len = (char *)&rdepth - (char *)eptr;
642 return (len > 0)? -len : len;
643 }
644 }
645 #endif /* NO_RECURSE */
646
647 /* To save space on the stack and in the heap frame, I have doubled up on some
648 of the local variables that are used only in localised parts of the code, but
649 still need to be preserved over recursive calls of match(). These macros define
650 the alternative names that are used. */
651
652 #define allow_zero cur_is_word
653 #define cbegroup condition
654 #define code_offset codelink
655 #define condassert condition
656 #define matched_once prev_is_word
657 #define foc number
658 #define save_mark data
659
660 /* These statements are here to stop the compiler complaining about uninitialized
661 variables. */
662
663 #ifdef SUPPORT_UCP
664 prop_value = 0;
665 prop_fail_result = 0;
666 #endif
667
668
669 /* This label is used for tail recursion, which is used in a few cases even
670 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
671 used. Thanks to Ian Taylor for noticing this possibility and sending the
672 original patch. */
673
674 TAIL_RECURSE:
675
676 /* OK, now we can get on with the real code of the function. Recursive calls
677 are specified by the macro RMATCH and RRETURN is used to return. When
678 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
679 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
680 defined). However, RMATCH isn't like a function call because it's quite a
681 complicated macro. It has to be used in one particular way. This shouldn't,
682 however, impact performance when true recursion is being used. */
683
684 #ifdef SUPPORT_UTF
685 utf = md->utf; /* Local copy of the flag */
686 #else
687 utf = FALSE;
688 #endif
689
690 /* First check that we haven't called match() too many times, or that we
691 haven't exceeded the recursive call limit. */
692
693 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
694 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
695
696 /* At the start of a group with an unlimited repeat that may match an empty
697 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
698 done this way to save having to use another function argument, which would take
699 up space on the stack. See also MATCH_CONDASSERT below.
700
701 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
702 such remembered pointers, to be checked when we hit the closing ket, in order
703 to break infinite loops that match no characters. When match() is called in
704 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
705 NOT be used with tail recursion, because the memory block that is used is on
706 the stack, so a new one may be required for each match(). */
707
708 if (md->match_function_type == MATCH_CBEGROUP)
709 {
710 newptrb.epb_saved_eptr = eptr;
711 newptrb.epb_prev = eptrb;
712 eptrb = &newptrb;
713 md->match_function_type = 0;
714 }
715
716 /* Now start processing the opcodes. */
717
718 for (;;)
719 {
720 minimize = possessive = FALSE;
721 op = *ecode;
722
723 switch(op)
724 {
725 case OP_MARK:
726 md->nomatch_mark = ecode + 2;
727 md->mark = NULL; /* In case previously set by assertion */
728 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
729 eptrb, RM55);
730 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
731 md->mark == NULL) md->mark = ecode + 2;
732
733 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
734 argument, and we must check whether that argument matches this MARK's
735 argument. It is passed back in md->start_match_ptr (an overloading of that
736 variable). If it does match, we reset that variable to the current subject
737 position and return MATCH_SKIP. Otherwise, pass back the return code
738 unaltered. */
739
740 else if (rrc == MATCH_SKIP_ARG &&
741 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
742 {
743 md->start_match_ptr = eptr;
744 RRETURN(MATCH_SKIP);
745 }
746 RRETURN(rrc);
747
748 case OP_FAIL:
749 RRETURN(MATCH_NOMATCH);
750
751 /* COMMIT overrides PRUNE, SKIP, and THEN */
752
753 case OP_COMMIT:
754 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
755 eptrb, RM52);
756 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
757 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
758 rrc != MATCH_THEN)
759 RRETURN(rrc);
760 RRETURN(MATCH_COMMIT);
761
762 /* PRUNE overrides THEN */
763
764 case OP_PRUNE:
765 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
766 eptrb, RM51);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
768 RRETURN(MATCH_PRUNE);
769
770 case OP_PRUNE_ARG:
771 md->nomatch_mark = ecode + 2;
772 md->mark = NULL; /* In case previously set by assertion */
773 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM56);
775 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
776 md->mark == NULL) md->mark = ecode + 2;
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 RRETURN(MATCH_PRUNE);
779
780 /* SKIP overrides PRUNE and THEN */
781
782 case OP_SKIP:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM53);
785 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
786 RRETURN(rrc);
787 md->start_match_ptr = eptr; /* Pass back current position */
788 RRETURN(MATCH_SKIP);
789
790 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
791 nomatch_mark. There is a flag that disables this opcode when re-matching a
792 pattern that ended with a SKIP for which there was not a matching MARK. */
793
794 case OP_SKIP_ARG:
795 if (md->ignore_skip_arg)
796 {
797 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
798 break;
799 }
800 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
801 eptrb, RM57);
802 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
803 RRETURN(rrc);
804
805 /* Pass back the current skip name by overloading md->start_match_ptr and
806 returning the special MATCH_SKIP_ARG return code. This will either be
807 caught by a matching MARK, or get to the top, where it causes a rematch
808 with the md->ignore_skip_arg flag set. */
809
810 md->start_match_ptr = ecode + 2;
811 RRETURN(MATCH_SKIP_ARG);
812
813 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
814 the branch in which it occurs can be determined. Overload the start of
815 match pointer to do this. */
816
817 case OP_THEN:
818 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
819 eptrb, RM54);
820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
821 md->start_match_ptr = ecode;
822 RRETURN(MATCH_THEN);
823
824 case OP_THEN_ARG:
825 md->nomatch_mark = ecode + 2;
826 md->mark = NULL; /* In case previously set by assertion */
827 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
828 md, eptrb, RM58);
829 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
830 md->mark == NULL) md->mark = ecode + 2;
831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832 md->start_match_ptr = ecode;
833 RRETURN(MATCH_THEN);
834
835 /* Handle an atomic group that does not contain any capturing parentheses.
836 This can be handled like an assertion. Prior to 8.13, all atomic groups
837 were handled this way. In 8.13, the code was changed as below for ONCE, so
838 that backups pass through the group and thereby reset captured values.
839 However, this uses a lot more stack, so in 8.20, atomic groups that do not
840 contain any captures generate OP_ONCE_NC, which can be handled in the old,
841 less stack intensive way.
842
843 Check the alternative branches in turn - the matching won't pass the KET
844 for this kind of subpattern. If any one branch matches, we carry on as at
845 the end of a normal bracket, leaving the subject pointer, but resetting
846 the start-of-match value in case it was changed by \K. */
847
848 case OP_ONCE_NC:
849 prev = ecode;
850 saved_eptr = eptr;
851 save_mark = md->mark;
852 do
853 {
854 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
855 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
856 {
857 mstart = md->start_match_ptr;
858 break;
859 }
860 if (rrc == MATCH_THEN)
861 {
862 next = ecode + GET(ecode,1);
863 if (md->start_match_ptr < next &&
864 (*ecode == OP_ALT || *next == OP_ALT))
865 rrc = MATCH_NOMATCH;
866 }
867
868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
869 ecode += GET(ecode,1);
870 md->mark = save_mark;
871 }
872 while (*ecode == OP_ALT);
873
874 /* If hit the end of the group (which could be repeated), fail */
875
876 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
877
878 /* Continue as from after the group, updating the offsets high water
879 mark, since extracts may have been taken. */
880
881 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
882
883 offset_top = md->end_offset_top;
884 eptr = md->end_match_ptr;
885
886 /* For a non-repeating ket, just continue at this level. This also
887 happens for a repeating ket if no characters were matched in the group.
888 This is the forcible breaking of infinite loops as implemented in Perl
889 5.005. */
890
891 if (*ecode == OP_KET || eptr == saved_eptr)
892 {
893 ecode += 1+LINK_SIZE;
894 break;
895 }
896
897 /* The repeating kets try the rest of the pattern or restart from the
898 preceding bracket, in the appropriate order. The second "call" of match()
899 uses tail recursion, to avoid using another stack frame. */
900
901 if (*ecode == OP_KETRMIN)
902 {
903 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
905 ecode = prev;
906 goto TAIL_RECURSE;
907 }
908 else /* OP_KETRMAX */
909 {
910 md->match_function_type = MATCH_CBEGROUP;
911 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
912 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
913 ecode += 1 + LINK_SIZE;
914 goto TAIL_RECURSE;
915 }
916 /* Control never gets here */
917
918 /* Handle a capturing bracket, other than those that are possessive with an
919 unlimited repeat. If there is space in the offset vector, save the current
920 subject position in the working slot at the top of the vector. We mustn't
921 change the current values of the data slot, because they may be set from a
922 previous iteration of this group, and be referred to by a reference inside
923 the group. A failure to match might occur after the group has succeeded,
924 if something later on doesn't match. For this reason, we need to restore
925 the working value and also the values of the final offsets, in case they
926 were set by a previous iteration of the same bracket.
927
928 If there isn't enough space in the offset vector, treat this as if it were
929 a non-capturing bracket. Don't worry about setting the flag for the error
930 case here; that is handled in the code for KET. */
931
932 case OP_CBRA:
933 case OP_SCBRA:
934 number = GET2(ecode, 1+LINK_SIZE);
935 offset = number << 1;
936
937 #ifdef PCRE_DEBUG
938 printf("start bracket %d\n", number);
939 printf("subject=");
940 pchars(eptr, 16, TRUE, md);
941 printf("\n");
942 #endif
943
944 if (offset < md->offset_max)
945 {
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950 save_mark = md->mark;
951
952 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
953 md->offset_vector[md->offset_end - number] =
954 (int)(eptr - md->start_subject);
955
956 for (;;)
957 {
958 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
959 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
960 eptrb, RM1);
961 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
962
963 /* If we backed up to a THEN, check whether it is within the current
964 branch by comparing the address of the THEN that is passed back with
965 the end of the branch. If it is within the current branch, and the
966 branch is one of two or more alternatives (it either starts or ends
967 with OP_ALT), we have reached the limit of THEN's action, so convert
968 the return code to NOMATCH, which will cause normal backtracking to
969 happen from now on. Otherwise, THEN is passed back to an outer
970 alternative. This implements Perl's treatment of parenthesized groups,
971 where a group not containing | does not affect the current alternative,
972 that is, (X) is NOT the same as (X|(*F)). */
973
974 if (rrc == MATCH_THEN)
975 {
976 next = ecode + GET(ecode,1);
977 if (md->start_match_ptr < next &&
978 (*ecode == OP_ALT || *next == OP_ALT))
979 rrc = MATCH_NOMATCH;
980 }
981
982 /* Anything other than NOMATCH is passed back. */
983
984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
985 md->capture_last = save_capture_last;
986 ecode += GET(ecode, 1);
987 md->mark = save_mark;
988 if (*ecode != OP_ALT) break;
989 }
990
991 DPRINTF(("bracket %d failed\n", number));
992 md->offset_vector[offset] = save_offset1;
993 md->offset_vector[offset+1] = save_offset2;
994 md->offset_vector[md->offset_end - number] = save_offset3;
995
996 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
997
998 RRETURN(rrc);
999 }
1000
1001 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1002 as a non-capturing bracket. */
1003
1004 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006
1007 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1008
1009 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1010 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1011
1012 /* Non-capturing or atomic group, except for possessive with unlimited
1013 repeat and ONCE group with no captures. Loop for all the alternatives.
1014
1015 When we get to the final alternative within the brackets, we used to return
1016 the result of a recursive call to match() whatever happened so it was
1017 possible to reduce stack usage by turning this into a tail recursion,
1018 except in the case of a possibly empty group. However, now that there is
1019 the possibility of (*THEN) occurring in the final alternative, this
1020 optimization is no longer always possible.
1021
1022 We can optimize if we know there are no (*THEN)s in the pattern; at present
1023 this is the best that can be done.
1024
1025 MATCH_ONCE is returned when the end of an atomic group is successfully
1026 reached, but subsequent matching fails. It passes back up the tree (causing
1027 captured values to be reset) until the original atomic group level is
1028 reached. This is tested by comparing md->once_target with the start of the
1029 group. At this point, the return is converted into MATCH_NOMATCH so that
1030 previous backup points can be taken. */
1031
1032 case OP_ONCE:
1033 case OP_BRA:
1034 case OP_SBRA:
1035 DPRINTF(("start non-capturing bracket\n"));
1036
1037 for (;;)
1038 {
1039 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1040
1041 /* If this is not a possibly empty group, and there are no (*THEN)s in
1042 the pattern, and this is the final alternative, optimize as described
1043 above. */
1044
1045 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1046 {
1047 ecode += PRIV(OP_lengths)[*ecode];
1048 goto TAIL_RECURSE;
1049 }
1050
1051 /* In all other cases, we have to make another call to match(). */
1052
1053 save_mark = md->mark;
1054 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1055 RM2);
1056
1057 /* See comment in the code for capturing groups above about handling
1058 THEN. */
1059
1060 if (rrc == MATCH_THEN)
1061 {
1062 next = ecode + GET(ecode,1);
1063 if (md->start_match_ptr < next &&
1064 (*ecode == OP_ALT || *next == OP_ALT))
1065 rrc = MATCH_NOMATCH;
1066 }
1067
1068 if (rrc != MATCH_NOMATCH)
1069 {
1070 if (rrc == MATCH_ONCE)
1071 {
1072 const pcre_uchar *scode = ecode;
1073 if (*scode != OP_ONCE) /* If not at start, find it */
1074 {
1075 while (*scode == OP_ALT) scode += GET(scode, 1);
1076 scode -= GET(scode, 1);
1077 }
1078 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1079 }
1080 RRETURN(rrc);
1081 }
1082 ecode += GET(ecode, 1);
1083 md->mark = save_mark;
1084 if (*ecode != OP_ALT) break;
1085 }
1086
1087 RRETURN(MATCH_NOMATCH);
1088
1089 /* Handle possessive capturing brackets with an unlimited repeat. We come
1090 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1091 handled similarly to the normal case above. However, the matching is
1092 different. The end of these brackets will always be OP_KETRPOS, which
1093 returns MATCH_KETRPOS without going further in the pattern. By this means
1094 we can handle the group by iteration rather than recursion, thereby
1095 reducing the amount of stack needed. */
1096
1097 case OP_CBRAPOS:
1098 case OP_SCBRAPOS:
1099 allow_zero = FALSE;
1100
1101 POSSESSIVE_CAPTURE:
1102 number = GET2(ecode, 1+LINK_SIZE);
1103 offset = number << 1;
1104
1105 #ifdef PCRE_DEBUG
1106 printf("start possessive bracket %d\n", number);
1107 printf("subject=");
1108 pchars(eptr, 16, TRUE, md);
1109 printf("\n");
1110 #endif
1111
1112 if (offset < md->offset_max)
1113 {
1114 matched_once = FALSE;
1115 code_offset = (int)(ecode - md->start_code);
1116
1117 save_offset1 = md->offset_vector[offset];
1118 save_offset2 = md->offset_vector[offset+1];
1119 save_offset3 = md->offset_vector[md->offset_end - number];
1120 save_capture_last = md->capture_last;
1121
1122 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1123
1124 /* Each time round the loop, save the current subject position for use
1125 when the group matches. For MATCH_MATCH, the group has matched, so we
1126 restart it with a new subject starting position, remembering that we had
1127 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1128 usual. If we haven't matched any alternatives in any iteration, check to
1129 see if a previous iteration matched. If so, the group has matched;
1130 continue from afterwards. Otherwise it has failed; restore the previous
1131 capture values before returning NOMATCH. */
1132
1133 for (;;)
1134 {
1135 md->offset_vector[md->offset_end - number] =
1136 (int)(eptr - md->start_subject);
1137 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1138 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1139 eptrb, RM63);
1140 if (rrc == MATCH_KETRPOS)
1141 {
1142 offset_top = md->end_offset_top;
1143 eptr = md->end_match_ptr;
1144 ecode = md->start_code + code_offset;
1145 save_capture_last = md->capture_last;
1146 matched_once = TRUE;
1147 continue;
1148 }
1149
1150 /* See comment in the code for capturing groups above about handling
1151 THEN. */
1152
1153 if (rrc == MATCH_THEN)
1154 {
1155 next = ecode + GET(ecode,1);
1156 if (md->start_match_ptr < next &&
1157 (*ecode == OP_ALT || *next == OP_ALT))
1158 rrc = MATCH_NOMATCH;
1159 }
1160
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 md->capture_last = save_capture_last;
1163 ecode += GET(ecode, 1);
1164 if (*ecode != OP_ALT) break;
1165 }
1166
1167 if (!matched_once)
1168 {
1169 md->offset_vector[offset] = save_offset1;
1170 md->offset_vector[offset+1] = save_offset2;
1171 md->offset_vector[md->offset_end - number] = save_offset3;
1172 }
1173
1174 if (allow_zero || matched_once)
1175 {
1176 ecode += 1 + LINK_SIZE;
1177 break;
1178 }
1179
1180 RRETURN(MATCH_NOMATCH);
1181 }
1182
1183 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1184 as a non-capturing bracket. */
1185
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188
1189 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1190
1191 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1192 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1193
1194 /* Non-capturing possessive bracket with unlimited repeat. We come here
1195 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1196 without the capturing complication. It is written out separately for speed
1197 and cleanliness. */
1198
1199 case OP_BRAPOS:
1200 case OP_SBRAPOS:
1201 allow_zero = FALSE;
1202
1203 POSSESSIVE_NON_CAPTURE:
1204 matched_once = FALSE;
1205 code_offset = (int)(ecode - md->start_code);
1206
1207 for (;;)
1208 {
1209 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1210 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1211 eptrb, RM48);
1212 if (rrc == MATCH_KETRPOS)
1213 {
1214 offset_top = md->end_offset_top;
1215 eptr = md->end_match_ptr;
1216 ecode = md->start_code + code_offset;
1217 matched_once = TRUE;
1218 continue;
1219 }
1220
1221 /* See comment in the code for capturing groups above about handling
1222 THEN. */
1223
1224 if (rrc == MATCH_THEN)
1225 {
1226 next = ecode + GET(ecode,1);
1227 if (md->start_match_ptr < next &&
1228 (*ecode == OP_ALT || *next == OP_ALT))
1229 rrc = MATCH_NOMATCH;
1230 }
1231
1232 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1233 ecode += GET(ecode, 1);
1234 if (*ecode != OP_ALT) break;
1235 }
1236
1237 if (matched_once || allow_zero)
1238 {
1239 ecode += 1 + LINK_SIZE;
1240 break;
1241 }
1242 RRETURN(MATCH_NOMATCH);
1243
1244 /* Control never reaches here. */
1245
1246 /* Conditional group: compilation checked that there are no more than
1247 two branches. If the condition is false, skipping the first branch takes us
1248 past the end if there is only one branch, but that's OK because that is
1249 exactly what going to the ket would do. */
1250
1251 case OP_COND:
1252 case OP_SCOND:
1253 codelink = GET(ecode, 1);
1254
1255 /* Because of the way auto-callout works during compile, a callout item is
1256 inserted between OP_COND and an assertion condition. */
1257
1258 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1259 {
1260 if (PUBL(callout) != NULL)
1261 {
1262 PUBL(callout_block) cb;
1263 cb.version = 2; /* Version 2 of the callout block */
1264 cb.callout_number = ecode[LINK_SIZE+2];
1265 cb.offset_vector = md->offset_vector;
1266 #ifdef COMPILE_PCRE8
1267 cb.subject = (PCRE_SPTR)md->start_subject;
1268 #else
1269 cb.subject = (PCRE_SPTR16)md->start_subject;
1270 #endif
1271 cb.subject_length = (int)(md->end_subject - md->start_subject);
1272 cb.start_match = (int)(mstart - md->start_subject);
1273 cb.current_position = (int)(eptr - md->start_subject);
1274 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1275 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1276 cb.capture_top = offset_top/2;
1277 cb.capture_last = md->capture_last;
1278 cb.callout_data = md->callout_data;
1279 cb.mark = md->nomatch_mark;
1280 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1281 if (rrc < 0) RRETURN(rrc);
1282 }
1283 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1284 }
1285
1286 condcode = ecode[LINK_SIZE+1];
1287
1288 /* Now see what the actual condition is */
1289
1290 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1291 {
1292 if (md->recursive == NULL) /* Not recursing => FALSE */
1293 {
1294 condition = FALSE;
1295 ecode += GET(ecode, 1);
1296 }
1297 else
1298 {
1299 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1300 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1301
1302 /* If the test is for recursion into a specific subpattern, and it is
1303 false, but the test was set up by name, scan the table to see if the
1304 name refers to any other numbers, and test them. The condition is true
1305 if any one is set. */
1306
1307 if (!condition && condcode == OP_NRREF)
1308 {
1309 pcre_uchar *slotA = md->name_table;
1310 for (i = 0; i < md->name_count; i++)
1311 {
1312 if (GET2(slotA, 0) == recno) break;
1313 slotA += md->name_entry_size;
1314 }
1315
1316 /* Found a name for the number - there can be only one; duplicate
1317 names for different numbers are allowed, but not vice versa. First
1318 scan down for duplicates. */
1319
1320 if (i < md->name_count)
1321 {
1322 pcre_uchar *slotB = slotA;
1323 while (slotB > md->name_table)
1324 {
1325 slotB -= md->name_entry_size;
1326 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1327 {
1328 condition = GET2(slotB, 0) == md->recursive->group_num;
1329 if (condition) break;
1330 }
1331 else break;
1332 }
1333
1334 /* Scan up for duplicates */
1335
1336 if (!condition)
1337 {
1338 slotB = slotA;
1339 for (i++; i < md->name_count; i++)
1340 {
1341 slotB += md->name_entry_size;
1342 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1343 {
1344 condition = GET2(slotB, 0) == md->recursive->group_num;
1345 if (condition) break;
1346 }
1347 else break;
1348 }
1349 }
1350 }
1351 }
1352
1353 /* Choose branch according to the condition */
1354
1355 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1356 }
1357 }
1358
1359 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1360 {
1361 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1362 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1363
1364 /* If the numbered capture is unset, but the reference was by name,
1365 scan the table to see if the name refers to any other numbers, and test
1366 them. The condition is true if any one is set. This is tediously similar
1367 to the code above, but not close enough to try to amalgamate. */
1368
1369 if (!condition && condcode == OP_NCREF)
1370 {
1371 int refno = offset >> 1;
1372 pcre_uchar *slotA = md->name_table;
1373
1374 for (i = 0; i < md->name_count; i++)
1375 {
1376 if (GET2(slotA, 0) == refno) break;
1377 slotA += md->name_entry_size;
1378 }
1379
1380 /* Found a name for the number - there can be only one; duplicate names
1381 for different numbers are allowed, but not vice versa. First scan down
1382 for duplicates. */
1383
1384 if (i < md->name_count)
1385 {
1386 pcre_uchar *slotB = slotA;
1387 while (slotB > md->name_table)
1388 {
1389 slotB -= md->name_entry_size;
1390 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1391 {
1392 offset = GET2(slotB, 0) << 1;
1393 condition = offset < offset_top &&
1394 md->offset_vector[offset] >= 0;
1395 if (condition) break;
1396 }
1397 else break;
1398 }
1399
1400 /* Scan up for duplicates */
1401
1402 if (!condition)
1403 {
1404 slotB = slotA;
1405 for (i++; i < md->name_count; i++)
1406 {
1407 slotB += md->name_entry_size;
1408 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1409 {
1410 offset = GET2(slotB, 0) << 1;
1411 condition = offset < offset_top &&
1412 md->offset_vector[offset] >= 0;
1413 if (condition) break;
1414 }
1415 else break;
1416 }
1417 }
1418 }
1419 }
1420
1421 /* Choose branch according to the condition */
1422
1423 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1424 }
1425
1426 else if (condcode == OP_DEF) /* DEFINE - always false */
1427 {
1428 condition = FALSE;
1429 ecode += GET(ecode, 1);
1430 }
1431
1432 /* The condition is an assertion. Call match() to evaluate it - setting
1433 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1434 an assertion. */
1435
1436 else
1437 {
1438 md->match_function_type = MATCH_CONDASSERT;
1439 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1440 if (rrc == MATCH_MATCH)
1441 {
1442 if (md->end_offset_top > offset_top)
1443 offset_top = md->end_offset_top; /* Captures may have happened */
1444 condition = TRUE;
1445 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1446 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1447 }
1448
1449 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1450 assertion; it is therefore treated as NOMATCH. */
1451
1452 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1453 {
1454 RRETURN(rrc); /* Need braces because of following else */
1455 }
1456 else
1457 {
1458 condition = FALSE;
1459 ecode += codelink;
1460 }
1461 }
1462
1463 /* We are now at the branch that is to be obeyed. As there is only one, can
1464 use tail recursion to avoid using another stack frame, except when there is
1465 unlimited repeat of a possibly empty group. In the latter case, a recursive
1466 call to match() is always required, unless the second alternative doesn't
1467 exist, in which case we can just plough on. Note that, for compatibility
1468 with Perl, the | in a conditional group is NOT treated as creating two
1469 alternatives. If a THEN is encountered in the branch, it propagates out to
1470 the enclosing alternative (unless nested in a deeper set of alternatives,
1471 of course). */
1472
1473 if (condition || *ecode == OP_ALT)
1474 {
1475 if (op != OP_SCOND)
1476 {
1477 ecode += 1 + LINK_SIZE;
1478 goto TAIL_RECURSE;
1479 }
1480
1481 md->match_function_type = MATCH_CBEGROUP;
1482 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1483 RRETURN(rrc);
1484 }
1485
1486 /* Condition false & no alternative; continue after the group. */
1487
1488 else
1489 {
1490 ecode += 1 + LINK_SIZE;
1491 }
1492 break;
1493
1494
1495 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1496 to close any currently open capturing brackets. */
1497
1498 case OP_CLOSE:
1499 number = GET2(ecode, 1);
1500 offset = number << 1;
1501
1502 #ifdef PCRE_DEBUG
1503 printf("end bracket %d at *ACCEPT", number);
1504 printf("\n");
1505 #endif
1506
1507 md->capture_last = number;
1508 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1509 {
1510 md->offset_vector[offset] =
1511 md->offset_vector[md->offset_end - number];
1512 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1513 if (offset_top <= offset) offset_top = offset + 2;
1514 }
1515 ecode += 1 + IMM2_SIZE;
1516 break;
1517
1518
1519 /* End of the pattern, either real or forced. */
1520
1521 case OP_END:
1522 case OP_ACCEPT:
1523 case OP_ASSERT_ACCEPT:
1524
1525 /* If we have matched an empty string, fail if not in an assertion and not
1526 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1527 is set and we have matched at the start of the subject. In both cases,
1528 backtracking will then try other alternatives, if any. */
1529
1530 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1531 md->recursive == NULL &&
1532 (md->notempty ||
1533 (md->notempty_atstart &&
1534 mstart == md->start_subject + md->start_offset)))
1535 RRETURN(MATCH_NOMATCH);
1536
1537 /* Otherwise, we have a match. */
1538
1539 md->end_match_ptr = eptr; /* Record where we ended */
1540 md->end_offset_top = offset_top; /* and how many extracts were taken */
1541 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1542
1543 /* For some reason, the macros don't work properly if an expression is
1544 given as the argument to RRETURN when the heap is in use. */
1545
1546 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1547 RRETURN(rrc);
1548
1549 /* Assertion brackets. Check the alternative branches in turn - the
1550 matching won't pass the KET for an assertion. If any one branch matches,
1551 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1552 start of each branch to move the current point backwards, so the code at
1553 this level is identical to the lookahead case. When the assertion is part
1554 of a condition, we want to return immediately afterwards. The caller of
1555 this incarnation of the match() function will have set MATCH_CONDASSERT in
1556 md->match_function_type, and one of these opcodes will be the first opcode
1557 that is processed. We use a local variable that is preserved over calls to
1558 match() to remember this case. */
1559
1560 case OP_ASSERT:
1561 case OP_ASSERTBACK:
1562 save_mark = md->mark;
1563 if (md->match_function_type == MATCH_CONDASSERT)
1564 {
1565 condassert = TRUE;
1566 md->match_function_type = 0;
1567 }
1568 else condassert = FALSE;
1569
1570 do
1571 {
1572 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1574 {
1575 mstart = md->start_match_ptr; /* In case \K reset it */
1576 break;
1577 }
1578
1579 /* PCRE does not allow THEN or COMMIT to escape beyond an assertion; it
1580 is treated as NOMATCH. */
1581
1582 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1583 rrc != MATCH_COMMIT) RRETURN(rrc);
1584
1585 ecode += GET(ecode, 1);
1586 md->mark = save_mark;
1587 }
1588 while (*ecode == OP_ALT);
1589
1590 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1591
1592 /* If checking an assertion for a condition, return MATCH_MATCH. */
1593
1594 if (condassert) RRETURN(MATCH_MATCH);
1595
1596 /* Continue from after the assertion, updating the offsets high water
1597 mark, since extracts may have been taken during the assertion. */
1598
1599 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1600 ecode += 1 + LINK_SIZE;
1601 offset_top = md->end_offset_top;
1602 continue;
1603
1604 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1605 PRUNE, or COMMIT means we must assume failure without checking subsequent
1606 branches. */
1607
1608 case OP_ASSERT_NOT:
1609 case OP_ASSERTBACK_NOT:
1610 save_mark = md->mark;
1611 if (md->match_function_type == MATCH_CONDASSERT)
1612 {
1613 condassert = TRUE;
1614 md->match_function_type = 0;
1615 }
1616 else condassert = FALSE;
1617
1618 do
1619 {
1620 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1621 md->mark = save_mark;
1622 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1623 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1624 {
1625 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1626 break;
1627 }
1628
1629 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1630 as NOMATCH. */
1631
1632 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1633 ecode += GET(ecode,1);
1634 }
1635 while (*ecode == OP_ALT);
1636
1637 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1638
1639 ecode += 1 + LINK_SIZE;
1640 continue;
1641
1642 /* Move the subject pointer back. This occurs only at the start of
1643 each branch of a lookbehind assertion. If we are too close to the start to
1644 move back, this match function fails. When working with UTF-8 we move
1645 back a number of characters, not bytes. */
1646
1647 case OP_REVERSE:
1648 #ifdef SUPPORT_UTF
1649 if (utf)
1650 {
1651 i = GET(ecode, 1);
1652 while (i-- > 0)
1653 {
1654 eptr--;
1655 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1656 BACKCHAR(eptr);
1657 }
1658 }
1659 else
1660 #endif
1661
1662 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1663
1664 {
1665 eptr -= GET(ecode, 1);
1666 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1667 }
1668
1669 /* Save the earliest consulted character, then skip to next op code */
1670
1671 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1672 ecode += 1 + LINK_SIZE;
1673 break;
1674
1675 /* The callout item calls an external function, if one is provided, passing
1676 details of the match so far. This is mainly for debugging, though the
1677 function is able to force a failure. */
1678
1679 case OP_CALLOUT:
1680 if (PUBL(callout) != NULL)
1681 {
1682 PUBL(callout_block) cb;
1683 cb.version = 2; /* Version 2 of the callout block */
1684 cb.callout_number = ecode[1];
1685 cb.offset_vector = md->offset_vector;
1686 #ifdef COMPILE_PCRE8
1687 cb.subject = (PCRE_SPTR)md->start_subject;
1688 #else
1689 cb.subject = (PCRE_SPTR16)md->start_subject;
1690 #endif
1691 cb.subject_length = (int)(md->end_subject - md->start_subject);
1692 cb.start_match = (int)(mstart - md->start_subject);
1693 cb.current_position = (int)(eptr - md->start_subject);
1694 cb.pattern_position = GET(ecode, 2);
1695 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1696 cb.capture_top = offset_top/2;
1697 cb.capture_last = md->capture_last;
1698 cb.callout_data = md->callout_data;
1699 cb.mark = md->nomatch_mark;
1700 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1701 if (rrc < 0) RRETURN(rrc);
1702 }
1703 ecode += 2 + 2*LINK_SIZE;
1704 break;
1705
1706 /* Recursion either matches the current regex, or some subexpression. The
1707 offset data is the offset to the starting bracket from the start of the
1708 whole pattern. (This is so that it works from duplicated subpatterns.)
1709
1710 The state of the capturing groups is preserved over recursion, and
1711 re-instated afterwards. We don't know how many are started and not yet
1712 finished (offset_top records the completed total) so we just have to save
1713 all the potential data. There may be up to 65535 such values, which is too
1714 large to put on the stack, but using malloc for small numbers seems
1715 expensive. As a compromise, the stack is used when there are no more than
1716 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1717
1718 There are also other values that have to be saved. We use a chained
1719 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1720 for the original version of this logic. It has, however, been hacked around
1721 a lot, so he is not to blame for the current way it works. */
1722
1723 case OP_RECURSE:
1724 {
1725 recursion_info *ri;
1726 int recno;
1727
1728 callpat = md->start_code + GET(ecode, 1);
1729 recno = (callpat == md->start_code)? 0 :
1730 GET2(callpat, 1 + LINK_SIZE);
1731
1732 /* Check for repeating a recursion without advancing the subject pointer.
1733 This should catch convoluted mutual recursions. (Some simple cases are
1734 caught at compile time.) */
1735
1736 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1737 if (recno == ri->group_num && eptr == ri->subject_position)
1738 RRETURN(PCRE_ERROR_RECURSELOOP);
1739
1740 /* Add to "recursing stack" */
1741
1742 new_recursive.group_num = recno;
1743 new_recursive.subject_position = eptr;
1744 new_recursive.prevrec = md->recursive;
1745 md->recursive = &new_recursive;
1746
1747 /* Where to continue from afterwards */
1748
1749 ecode += 1 + LINK_SIZE;
1750
1751 /* Now save the offset data */
1752
1753 new_recursive.saved_max = md->offset_end;
1754 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1755 new_recursive.offset_save = stacksave;
1756 else
1757 {
1758 new_recursive.offset_save =
1759 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1760 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1761 }
1762 memcpy(new_recursive.offset_save, md->offset_vector,
1763 new_recursive.saved_max * sizeof(int));
1764
1765 /* OK, now we can do the recursion. After processing each alternative,
1766 restore the offset data. If there were nested recursions, md->recursive
1767 might be changed, so reset it before looping. */
1768
1769 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1770 cbegroup = (*callpat >= OP_SBRA);
1771 do
1772 {
1773 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1774 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1775 md, eptrb, RM6);
1776 memcpy(md->offset_vector, new_recursive.offset_save,
1777 new_recursive.saved_max * sizeof(int));
1778 md->recursive = new_recursive.prevrec;
1779 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1780 {
1781 DPRINTF(("Recursion matched\n"));
1782 if (new_recursive.offset_save != stacksave)
1783 (PUBL(free))(new_recursive.offset_save);
1784
1785 /* Set where we got to in the subject, and reset the start in case
1786 it was changed by \K. This *is* propagated back out of a recursion,
1787 for Perl compatibility. */
1788
1789 eptr = md->end_match_ptr;
1790 mstart = md->start_match_ptr;
1791 goto RECURSION_MATCHED; /* Exit loop; end processing */
1792 }
1793
1794 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1795 is treated as NOMATCH. */
1796
1797 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1798 rrc != MATCH_COMMIT)
1799 {
1800 DPRINTF(("Recursion gave error %d\n", rrc));
1801 if (new_recursive.offset_save != stacksave)
1802 (PUBL(free))(new_recursive.offset_save);
1803 RRETURN(rrc);
1804 }
1805
1806 md->recursive = &new_recursive;
1807 callpat += GET(callpat, 1);
1808 }
1809 while (*callpat == OP_ALT);
1810
1811 DPRINTF(("Recursion didn't match\n"));
1812 md->recursive = new_recursive.prevrec;
1813 if (new_recursive.offset_save != stacksave)
1814 (PUBL(free))(new_recursive.offset_save);
1815 RRETURN(MATCH_NOMATCH);
1816 }
1817
1818 RECURSION_MATCHED:
1819 break;
1820
1821 /* An alternation is the end of a branch; scan along to find the end of the
1822 bracketed group and go to there. */
1823
1824 case OP_ALT:
1825 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1826 break;
1827
1828 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1829 indicating that it may occur zero times. It may repeat infinitely, or not
1830 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1831 with fixed upper repeat limits are compiled as a number of copies, with the
1832 optional ones preceded by BRAZERO or BRAMINZERO. */
1833
1834 case OP_BRAZERO:
1835 next = ecode + 1;
1836 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1838 do next += GET(next, 1); while (*next == OP_ALT);
1839 ecode = next + 1 + LINK_SIZE;
1840 break;
1841
1842 case OP_BRAMINZERO:
1843 next = ecode + 1;
1844 do next += GET(next, 1); while (*next == OP_ALT);
1845 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1847 ecode++;
1848 break;
1849
1850 case OP_SKIPZERO:
1851 next = ecode+1;
1852 do next += GET(next,1); while (*next == OP_ALT);
1853 ecode = next + 1 + LINK_SIZE;
1854 break;
1855
1856 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1857 here; just jump to the group, with allow_zero set TRUE. */
1858
1859 case OP_BRAPOSZERO:
1860 op = *(++ecode);
1861 allow_zero = TRUE;
1862 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1863 goto POSSESSIVE_NON_CAPTURE;
1864
1865 /* End of a group, repeated or non-repeating. */
1866
1867 case OP_KET:
1868 case OP_KETRMIN:
1869 case OP_KETRMAX:
1870 case OP_KETRPOS:
1871 prev = ecode - GET(ecode, 1);
1872
1873 /* If this was a group that remembered the subject start, in order to break
1874 infinite repeats of empty string matches, retrieve the subject start from
1875 the chain. Otherwise, set it NULL. */
1876
1877 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1878 {
1879 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1880 eptrb = eptrb->epb_prev; /* Backup to previous group */
1881 }
1882 else saved_eptr = NULL;
1883
1884 /* If we are at the end of an assertion group or a non-capturing atomic
1885 group, stop matching and return MATCH_MATCH, but record the current high
1886 water mark for use by positive assertions. We also need to record the match
1887 start in case it was changed by \K. */
1888
1889 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1890 *prev == OP_ONCE_NC)
1891 {
1892 md->end_match_ptr = eptr; /* For ONCE_NC */
1893 md->end_offset_top = offset_top;
1894 md->start_match_ptr = mstart;
1895 RRETURN(MATCH_MATCH); /* Sets md->mark */
1896 }
1897
1898 /* For capturing groups we have to check the group number back at the start
1899 and if necessary complete handling an extraction by setting the offsets and
1900 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1901 into group 0, so it won't be picked up here. Instead, we catch it when the
1902 OP_END is reached. Other recursion is handled here. We just have to record
1903 the current subject position and start match pointer and give a MATCH
1904 return. */
1905
1906 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1907 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1908 {
1909 number = GET2(prev, 1+LINK_SIZE);
1910 offset = number << 1;
1911
1912 #ifdef PCRE_DEBUG
1913 printf("end bracket %d", number);
1914 printf("\n");
1915 #endif
1916
1917 /* Handle a recursively called group. */
1918
1919 if (md->recursive != NULL && md->recursive->group_num == number)
1920 {
1921 md->end_match_ptr = eptr;
1922 md->start_match_ptr = mstart;
1923 RRETURN(MATCH_MATCH);
1924 }
1925
1926 /* Deal with capturing */
1927
1928 md->capture_last = number;
1929 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1930 {
1931 /* If offset is greater than offset_top, it means that we are
1932 "skipping" a capturing group, and that group's offsets must be marked
1933 unset. In earlier versions of PCRE, all the offsets were unset at the
1934 start of matching, but this doesn't work because atomic groups and
1935 assertions can cause a value to be set that should later be unset.
1936 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1937 part of the atomic group, but this is not on the final matching path,
1938 so must be unset when 2 is set. (If there is no group 2, there is no
1939 problem, because offset_top will then be 2, indicating no capture.) */
1940
1941 if (offset > offset_top)
1942 {
1943 register int *iptr = md->offset_vector + offset_top;
1944 register int *iend = md->offset_vector + offset;
1945 while (iptr < iend) *iptr++ = -1;
1946 }
1947
1948 /* Now make the extraction */
1949
1950 md->offset_vector[offset] =
1951 md->offset_vector[md->offset_end - number];
1952 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1953 if (offset_top <= offset) offset_top = offset + 2;
1954 }
1955 }
1956
1957 /* For an ordinary non-repeating ket, just continue at this level. This
1958 also happens for a repeating ket if no characters were matched in the
1959 group. This is the forcible breaking of infinite loops as implemented in
1960 Perl 5.005. For a non-repeating atomic group that includes captures,
1961 establish a backup point by processing the rest of the pattern at a lower
1962 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1963 original OP_ONCE level, thereby bypassing intermediate backup points, but
1964 resetting any captures that happened along the way. */
1965
1966 if (*ecode == OP_KET || eptr == saved_eptr)
1967 {
1968 if (*prev == OP_ONCE)
1969 {
1970 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1972 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1973 RRETURN(MATCH_ONCE);
1974 }
1975 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1976 break;
1977 }
1978
1979 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1980 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1981 at a time from the outer level, thus saving stack. */
1982
1983 if (*ecode == OP_KETRPOS)
1984 {
1985 md->end_match_ptr = eptr;
1986 md->end_offset_top = offset_top;
1987 RRETURN(MATCH_KETRPOS);
1988 }
1989
1990 /* The normal repeating kets try the rest of the pattern or restart from
1991 the preceding bracket, in the appropriate order. In the second case, we can
1992 use tail recursion to avoid using another stack frame, unless we have an
1993 atomic group or an unlimited repeat of a group that can match an empty
1994 string. */
1995
1996 if (*ecode == OP_KETRMIN)
1997 {
1998 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1999 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2000 if (*prev == OP_ONCE)
2001 {
2002 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2004 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2005 RRETURN(MATCH_ONCE);
2006 }
2007 if (*prev >= OP_SBRA) /* Could match an empty string */
2008 {
2009 md->match_function_type = MATCH_CBEGROUP;
2010 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2011 RRETURN(rrc);
2012 }
2013 ecode = prev;
2014 goto TAIL_RECURSE;
2015 }
2016 else /* OP_KETRMAX */
2017 {
2018 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2019 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2020 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (*prev == OP_ONCE)
2023 {
2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev;
2027 RRETURN(MATCH_ONCE);
2028 }
2029 ecode += 1 + LINK_SIZE;
2030 goto TAIL_RECURSE;
2031 }
2032 /* Control never gets here */
2033
2034 /* Not multiline mode: start of subject assertion, unless notbol. */
2035
2036 case OP_CIRC:
2037 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2038
2039 /* Start of subject assertion */
2040
2041 case OP_SOD:
2042 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2043 ecode++;
2044 break;
2045
2046 /* Multiline mode: start of subject unless notbol, or after any newline. */
2047
2048 case OP_CIRCM:
2049 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2050 if (eptr != md->start_subject &&
2051 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2052 RRETURN(MATCH_NOMATCH);
2053 ecode++;
2054 break;
2055
2056 /* Start of match assertion */
2057
2058 case OP_SOM:
2059 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2060 ecode++;
2061 break;
2062
2063 /* Reset the start of match point */
2064
2065 case OP_SET_SOM:
2066 mstart = eptr;
2067 ecode++;
2068 break;
2069
2070 /* Multiline mode: assert before any newline, or before end of subject
2071 unless noteol is set. */
2072
2073 case OP_DOLLM:
2074 if (eptr < md->end_subject)
2075 {
2076 if (!IS_NEWLINE(eptr))
2077 {
2078 if (md->partial != 0 &&
2079 eptr + 1 >= md->end_subject &&
2080 NLBLOCK->nltype == NLTYPE_FIXED &&
2081 NLBLOCK->nllen == 2 &&
2082 *eptr == NLBLOCK->nl[0])
2083 {
2084 md->hitend = TRUE;
2085 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2086 }
2087 RRETURN(MATCH_NOMATCH);
2088 }
2089 }
2090 else
2091 {
2092 if (md->noteol) RRETURN(MATCH_NOMATCH);
2093 SCHECK_PARTIAL();
2094 }
2095 ecode++;
2096 break;
2097
2098 /* Not multiline mode: assert before a terminating newline or before end of
2099 subject unless noteol is set. */
2100
2101 case OP_DOLL:
2102 if (md->noteol) RRETURN(MATCH_NOMATCH);
2103 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2104
2105 /* ... else fall through for endonly */
2106
2107 /* End of subject assertion (\z) */
2108
2109 case OP_EOD:
2110 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2111 SCHECK_PARTIAL();
2112 ecode++;
2113 break;
2114
2115 /* End of subject or ending \n assertion (\Z) */
2116
2117 case OP_EODN:
2118 ASSERT_NL_OR_EOS:
2119 if (eptr < md->end_subject &&
2120 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2121 {
2122 if (md->partial != 0 &&
2123 eptr + 1 >= md->end_subject &&
2124 NLBLOCK->nltype == NLTYPE_FIXED &&
2125 NLBLOCK->nllen == 2 &&
2126 *eptr == NLBLOCK->nl[0])
2127 {
2128 md->hitend = TRUE;
2129 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2130 }
2131 RRETURN(MATCH_NOMATCH);
2132 }
2133
2134 /* Either at end of string or \n before end. */
2135
2136 SCHECK_PARTIAL();
2137 ecode++;
2138 break;
2139
2140 /* Word boundary assertions */
2141
2142 case OP_NOT_WORD_BOUNDARY:
2143 case OP_WORD_BOUNDARY:
2144 {
2145
2146 /* Find out if the previous and current characters are "word" characters.
2147 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2148 be "non-word" characters. Remember the earliest consulted character for
2149 partial matching. */
2150
2151 #ifdef SUPPORT_UTF
2152 if (utf)
2153 {
2154 /* Get status of previous character */
2155
2156 if (eptr == md->start_subject) prev_is_word = FALSE; else
2157 {
2158 PCRE_PUCHAR lastptr = eptr - 1;
2159 BACKCHAR(lastptr);
2160 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2161 GETCHAR(c, lastptr);
2162 #ifdef SUPPORT_UCP
2163 if (md->use_ucp)
2164 {
2165 if (c == '_') prev_is_word = TRUE; else
2166 {
2167 int cat = UCD_CATEGORY(c);
2168 prev_is_word = (cat == ucp_L || cat == ucp_N);
2169 }
2170 }
2171 else
2172 #endif
2173 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2174 }
2175
2176 /* Get status of next character */
2177
2178 if (eptr >= md->end_subject)
2179 {
2180 SCHECK_PARTIAL();
2181 cur_is_word = FALSE;
2182 }
2183 else
2184 {
2185 GETCHAR(c, eptr);
2186 #ifdef SUPPORT_UCP
2187 if (md->use_ucp)
2188 {
2189 if (c == '_') cur_is_word = TRUE; else
2190 {
2191 int cat = UCD_CATEGORY(c);
2192 cur_is_word = (cat == ucp_L || cat == ucp_N);
2193 }
2194 }
2195 else
2196 #endif
2197 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2198 }
2199 }
2200 else
2201 #endif
2202
2203 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2204 consistency with the behaviour of \w we do use it in this case. */
2205
2206 {
2207 /* Get status of previous character */
2208
2209 if (eptr == md->start_subject) prev_is_word = FALSE; else
2210 {
2211 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2212 #ifdef SUPPORT_UCP
2213 if (md->use_ucp)
2214 {
2215 c = eptr[-1];
2216 if (c == '_') prev_is_word = TRUE; else
2217 {
2218 int cat = UCD_CATEGORY(c);
2219 prev_is_word = (cat == ucp_L || cat == ucp_N);
2220 }
2221 }
2222 else
2223 #endif
2224 prev_is_word = MAX_255(eptr[-1])
2225 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2226 }
2227
2228 /* Get status of next character */
2229
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 cur_is_word = FALSE;
2234 }
2235 else
2236 #ifdef SUPPORT_UCP
2237 if (md->use_ucp)
2238 {
2239 c = *eptr;
2240 if (c == '_') cur_is_word = TRUE; else
2241 {
2242 int cat = UCD_CATEGORY(c);
2243 cur_is_word = (cat == ucp_L || cat == ucp_N);
2244 }
2245 }
2246 else
2247 #endif
2248 cur_is_word = MAX_255(*eptr)
2249 && ((md->ctypes[*eptr] & ctype_word) != 0);
2250 }
2251
2252 /* Now see if the situation is what we want */
2253
2254 if ((*ecode++ == OP_WORD_BOUNDARY)?
2255 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2256 RRETURN(MATCH_NOMATCH);
2257 }
2258 break;
2259
2260 /* Match any single character type except newline; have to take care with
2261 CRLF newlines and partial matching. */
2262
2263 case OP_ANY:
2264 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2265 if (md->partial != 0 &&
2266 eptr + 1 >= md->end_subject &&
2267 NLBLOCK->nltype == NLTYPE_FIXED &&
2268 NLBLOCK->nllen == 2 &&
2269 *eptr == NLBLOCK->nl[0])
2270 {
2271 md->hitend = TRUE;
2272 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2273 }
2274
2275 /* Fall through */
2276
2277 /* Match any single character whatsoever. */
2278
2279 case OP_ALLANY:
2280 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2281 { /* not be updated before SCHECK_PARTIAL. */
2282 SCHECK_PARTIAL();
2283 RRETURN(MATCH_NOMATCH);
2284 }
2285 eptr++;
2286 #ifdef SUPPORT_UTF
2287 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2288 #endif
2289 ecode++;
2290 break;
2291
2292 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2293 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2294
2295 case OP_ANYBYTE:
2296 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2297 { /* not be updated before SCHECK_PARTIAL. */
2298 SCHECK_PARTIAL();
2299 RRETURN(MATCH_NOMATCH);
2300 }
2301 eptr++;
2302 ecode++;
2303 break;
2304
2305 case OP_NOT_DIGIT:
2306 if (eptr >= md->end_subject)
2307 {
2308 SCHECK_PARTIAL();
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 GETCHARINCTEST(c, eptr);
2312 if (
2313 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2314 c < 256 &&
2315 #endif
2316 (md->ctypes[c] & ctype_digit) != 0
2317 )
2318 RRETURN(MATCH_NOMATCH);
2319 ecode++;
2320 break;
2321
2322 case OP_DIGIT:
2323 if (eptr >= md->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 RRETURN(MATCH_NOMATCH);
2327 }
2328 GETCHARINCTEST(c, eptr);
2329 if (
2330 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2331 c > 255 ||
2332 #endif
2333 (md->ctypes[c] & ctype_digit) == 0
2334 )
2335 RRETURN(MATCH_NOMATCH);
2336 ecode++;
2337 break;
2338
2339 case OP_NOT_WHITESPACE:
2340 if (eptr >= md->end_subject)
2341 {
2342 SCHECK_PARTIAL();
2343 RRETURN(MATCH_NOMATCH);
2344 }
2345 GETCHARINCTEST(c, eptr);
2346 if (
2347 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2348 c < 256 &&
2349 #endif
2350 (md->ctypes[c] & ctype_space) != 0
2351 )
2352 RRETURN(MATCH_NOMATCH);
2353 ecode++;
2354 break;
2355
2356 case OP_WHITESPACE:
2357 if (eptr >= md->end_subject)
2358 {
2359 SCHECK_PARTIAL();
2360 RRETURN(MATCH_NOMATCH);
2361 }
2362 GETCHARINCTEST(c, eptr);
2363 if (
2364 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2365 c > 255 ||
2366 #endif
2367 (md->ctypes[c] & ctype_space) == 0
2368 )
2369 RRETURN(MATCH_NOMATCH);
2370 ecode++;
2371 break;
2372
2373 case OP_NOT_WORDCHAR:
2374 if (eptr >= md->end_subject)
2375 {
2376 SCHECK_PARTIAL();
2377 RRETURN(MATCH_NOMATCH);
2378 }
2379 GETCHARINCTEST(c, eptr);
2380 if (
2381 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2382 c < 256 &&
2383 #endif
2384 (md->ctypes[c] & ctype_word) != 0
2385 )
2386 RRETURN(MATCH_NOMATCH);
2387 ecode++;
2388 break;
2389
2390 case OP_WORDCHAR:
2391 if (eptr >= md->end_subject)
2392 {
2393 SCHECK_PARTIAL();
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 GETCHARINCTEST(c, eptr);
2397 if (
2398 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2399 c > 255 ||
2400 #endif
2401 (md->ctypes[c] & ctype_word) == 0
2402 )
2403 RRETURN(MATCH_NOMATCH);
2404 ecode++;
2405 break;
2406
2407 case OP_ANYNL:
2408 if (eptr >= md->end_subject)
2409 {
2410 SCHECK_PARTIAL();
2411 RRETURN(MATCH_NOMATCH);
2412 }
2413 GETCHARINCTEST(c, eptr);
2414 switch(c)
2415 {
2416 default: RRETURN(MATCH_NOMATCH);
2417
2418 case 0x000d:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 }
2423 else if (*eptr == 0x0a) eptr++;
2424 break;
2425
2426 case 0x000a:
2427 break;
2428
2429 case 0x000b:
2430 case 0x000c:
2431 case 0x0085:
2432 case 0x2028:
2433 case 0x2029:
2434 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2435 break;
2436 }
2437 ecode++;
2438 break;
2439
2440 case OP_NOT_HSPACE:
2441 if (eptr >= md->end_subject)
2442 {
2443 SCHECK_PARTIAL();
2444 RRETURN(MATCH_NOMATCH);
2445 }
2446 GETCHARINCTEST(c, eptr);
2447 switch(c)
2448 {
2449 default: break;
2450 case 0x09: /* HT */
2451 case 0x20: /* SPACE */
2452 case 0xa0: /* NBSP */
2453 case 0x1680: /* OGHAM SPACE MARK */
2454 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2455 case 0x2000: /* EN QUAD */
2456 case 0x2001: /* EM QUAD */
2457 case 0x2002: /* EN SPACE */
2458 case 0x2003: /* EM SPACE */
2459 case 0x2004: /* THREE-PER-EM SPACE */
2460 case 0x2005: /* FOUR-PER-EM SPACE */
2461 case 0x2006: /* SIX-PER-EM SPACE */
2462 case 0x2007: /* FIGURE SPACE */
2463 case 0x2008: /* PUNCTUATION SPACE */
2464 case 0x2009: /* THIN SPACE */
2465 case 0x200A: /* HAIR SPACE */
2466 case 0x202f: /* NARROW NO-BREAK SPACE */
2467 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2468 case 0x3000: /* IDEOGRAPHIC SPACE */
2469 RRETURN(MATCH_NOMATCH);
2470 }
2471 ecode++;
2472 break;
2473
2474 case OP_HSPACE:
2475 if (eptr >= md->end_subject)
2476 {
2477 SCHECK_PARTIAL();
2478 RRETURN(MATCH_NOMATCH);
2479 }
2480 GETCHARINCTEST(c, eptr);
2481 switch(c)
2482 {
2483 default: RRETURN(MATCH_NOMATCH);
2484 case 0x09: /* HT */
2485 case 0x20: /* SPACE */
2486 case 0xa0: /* NBSP */
2487 case 0x1680: /* OGHAM SPACE MARK */
2488 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2489 case 0x2000: /* EN QUAD */
2490 case 0x2001: /* EM QUAD */
2491 case 0x2002: /* EN SPACE */
2492 case 0x2003: /* EM SPACE */
2493 case 0x2004: /* THREE-PER-EM SPACE */
2494 case 0x2005: /* FOUR-PER-EM SPACE */
2495 case 0x2006: /* SIX-PER-EM SPACE */
2496 case 0x2007: /* FIGURE SPACE */
2497 case 0x2008: /* PUNCTUATION SPACE */
2498 case 0x2009: /* THIN SPACE */
2499 case 0x200A: /* HAIR SPACE */
2500 case 0x202f: /* NARROW NO-BREAK SPACE */
2501 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2502 case 0x3000: /* IDEOGRAPHIC SPACE */
2503 break;
2504 }
2505 ecode++;
2506 break;
2507
2508 case OP_NOT_VSPACE:
2509 if (eptr >= md->end_subject)
2510 {
2511 SCHECK_PARTIAL();
2512 RRETURN(MATCH_NOMATCH);
2513 }
2514 GETCHARINCTEST(c, eptr);
2515 switch(c)
2516 {
2517 default: break;
2518 case 0x0a: /* LF */
2519 case 0x0b: /* VT */
2520 case 0x0c: /* FF */
2521 case 0x0d: /* CR */
2522 case 0x85: /* NEL */
2523 case 0x2028: /* LINE SEPARATOR */
2524 case 0x2029: /* PARAGRAPH SEPARATOR */
2525 RRETURN(MATCH_NOMATCH);
2526 }
2527 ecode++;
2528 break;
2529
2530 case OP_VSPACE:
2531 if (eptr >= md->end_subject)
2532 {
2533 SCHECK_PARTIAL();
2534 RRETURN(MATCH_NOMATCH);
2535 }
2536 GETCHARINCTEST(c, eptr);
2537 switch(c)
2538 {
2539 default: RRETURN(MATCH_NOMATCH);
2540 case 0x0a: /* LF */
2541 case 0x0b: /* VT */
2542 case 0x0c: /* FF */
2543 case 0x0d: /* CR */
2544 case 0x85: /* NEL */
2545 case 0x2028: /* LINE SEPARATOR */
2546 case 0x2029: /* PARAGRAPH SEPARATOR */
2547 break;
2548 }
2549 ecode++;
2550 break;
2551
2552 #ifdef SUPPORT_UCP
2553 /* Check the next character by Unicode property. We will get here only
2554 if the support is in the binary; otherwise a compile-time error occurs. */
2555
2556 case OP_PROP:
2557 case OP_NOTPROP:
2558 if (eptr >= md->end_subject)
2559 {
2560 SCHECK_PARTIAL();
2561 RRETURN(MATCH_NOMATCH);
2562 }
2563 GETCHARINCTEST(c, eptr);
2564 {
2565 const ucd_record *prop = GET_UCD(c);
2566
2567 switch(ecode[1])
2568 {
2569 case PT_ANY:
2570 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2571 break;
2572
2573 case PT_LAMP:
2574 if ((prop->chartype == ucp_Lu ||
2575 prop->chartype == ucp_Ll ||
2576 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2577 RRETURN(MATCH_NOMATCH);
2578 break;
2579
2580 case PT_GC:
2581 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2582 RRETURN(MATCH_NOMATCH);
2583 break;
2584
2585 case PT_PC:
2586 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2587 RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 case PT_SC:
2591 if ((ecode[2] != prop->script) == (op == OP_PROP))
2592 RRETURN(MATCH_NOMATCH);
2593 break;
2594
2595 /* These are specials */
2596
2597 case PT_ALNUM:
2598 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2599 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2600 RRETURN(MATCH_NOMATCH);
2601 break;
2602
2603 case PT_SPACE: /* Perl space */
2604 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2605 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2606 == (op == OP_NOTPROP))
2607 RRETURN(MATCH_NOMATCH);
2608 break;
2609
2610 case PT_PXSPACE: /* POSIX space */
2611 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2612 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2613 c == CHAR_FF || c == CHAR_CR)
2614 == (op == OP_NOTPROP))
2615 RRETURN(MATCH_NOMATCH);
2616 break;
2617
2618 case PT_WORD:
2619 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2620 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2621 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2622 RRETURN(MATCH_NOMATCH);
2623 break;
2624
2625 /* This should never occur */
2626
2627 default:
2628 RRETURN(PCRE_ERROR_INTERNAL);
2629 }
2630
2631 ecode += 3;
2632 }
2633 break;
2634
2635 /* Match an extended Unicode sequence. We will get here only if the support
2636 is in the binary; otherwise a compile-time error occurs. */
2637
2638 case OP_EXTUNI:
2639 if (eptr >= md->end_subject)
2640 {
2641 SCHECK_PARTIAL();
2642 RRETURN(MATCH_NOMATCH);
2643 }
2644 GETCHARINCTEST(c, eptr);
2645 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2646 while (eptr < md->end_subject)
2647 {
2648 int len = 1;
2649 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2650 if (UCD_CATEGORY(c) != ucp_M) break;
2651 eptr += len;
2652 }
2653 CHECK_PARTIAL();
2654 ecode++;
2655 break;
2656 #endif
2657
2658
2659 /* Match a back reference, possibly repeatedly. Look past the end of the
2660 item to see if there is repeat information following. The code is similar
2661 to that for character classes, but repeated for efficiency. Then obey
2662 similar code to character type repeats - written out again for speed.
2663 However, if the referenced string is the empty string, always treat
2664 it as matched, any number of times (otherwise there could be infinite
2665 loops). */
2666
2667 case OP_REF:
2668 case OP_REFI:
2669 caseless = op == OP_REFI;
2670 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2671 ecode += 1 + IMM2_SIZE;
2672
2673 /* If the reference is unset, there are two possibilities:
2674
2675 (a) In the default, Perl-compatible state, set the length negative;
2676 this ensures that every attempt at a match fails. We can't just fail
2677 here, because of the possibility of quantifiers with zero minima.
2678
2679 (b) If the JavaScript compatibility flag is set, set the length to zero
2680 so that the back reference matches an empty string.
2681
2682 Otherwise, set the length to the length of what was matched by the
2683 referenced subpattern. */
2684
2685 if (offset >= offset_top || md->offset_vector[offset] < 0)
2686 length = (md->jscript_compat)? 0 : -1;
2687 else
2688 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2689
2690 /* Set up for repetition, or handle the non-repeated case */
2691
2692 switch (*ecode)
2693 {
2694 case OP_CRSTAR:
2695 case OP_CRMINSTAR:
2696 case OP_CRPLUS:
2697 case OP_CRMINPLUS:
2698 case OP_CRQUERY:
2699 case OP_CRMINQUERY:
2700 c = *ecode++ - OP_CRSTAR;
2701 minimize = (c & 1) != 0;
2702 min = rep_min[c]; /* Pick up values from tables; */
2703 max = rep_max[c]; /* zero for max => infinity */
2704 if (max == 0) max = INT_MAX;
2705 break;
2706
2707 case OP_CRRANGE:
2708 case OP_CRMINRANGE:
2709 minimize = (*ecode == OP_CRMINRANGE);
2710 min = GET2(ecode, 1);
2711 max = GET2(ecode, 1 + IMM2_SIZE);
2712 if (max == 0) max = INT_MAX;
2713 ecode += 1 + 2 * IMM2_SIZE;
2714 break;
2715
2716 default: /* No repeat follows */
2717 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2718 {
2719 if (length == -2) eptr = md->end_subject; /* Partial match */
2720 CHECK_PARTIAL();
2721 RRETURN(MATCH_NOMATCH);
2722 }
2723 eptr += length;
2724 continue; /* With the main loop */
2725 }
2726
2727 /* Handle repeated back references. If the length of the reference is
2728 zero, just continue with the main loop. If the length is negative, it
2729 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2730 zero, we can continue at the same level without recursion. For any other
2731 minimum, carrying on will result in NOMATCH. */
2732
2733 if (length == 0) continue;
2734 if (length < 0 && min == 0) continue;
2735
2736 /* First, ensure the minimum number of matches are present. We get back
2737 the length of the reference string explicitly rather than passing the
2738 address of eptr, so that eptr can be a register variable. */
2739
2740 for (i = 1; i <= min; i++)
2741 {
2742 int slength;
2743 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2744 {
2745 if (slength == -2) eptr = md->end_subject; /* Partial match */
2746 CHECK_PARTIAL();
2747 RRETURN(MATCH_NOMATCH);
2748 }
2749 eptr += slength;
2750 }
2751
2752 /* If min = max, continue at the same level without recursion.
2753 They are not both allowed to be zero. */
2754
2755 if (min == max) continue;
2756
2757 /* If minimizing, keep trying and advancing the pointer */
2758
2759 if (minimize)
2760 {
2761 for (fi = min;; fi++)
2762 {
2763 int slength;
2764 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2765 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2766 if (fi >= max) RRETURN(MATCH_NOMATCH);
2767 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2768 {
2769 if (slength == -2) eptr = md->end_subject; /* Partial match */
2770 CHECK_PARTIAL();
2771 RRETURN(MATCH_NOMATCH);
2772 }
2773 eptr += slength;
2774 }
2775 /* Control never gets here */
2776 }
2777
2778 /* If maximizing, find the longest string and work backwards */
2779
2780 else
2781 {
2782 pp = eptr;
2783 for (i = min; i < max; i++)
2784 {
2785 int slength;
2786 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2787 {
2788 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2789 the soft partial matching case. */
2790
2791 if (slength == -2 && md->partial != 0 &&
2792 md->end_subject > md->start_used_ptr)
2793 {
2794 md->hitend = TRUE;
2795 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2796 }
2797 break;
2798 }
2799 eptr += slength;
2800 }
2801
2802 while (eptr >= pp)
2803 {
2804 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2805 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2806 eptr -= length;
2807 }
2808 RRETURN(MATCH_NOMATCH);
2809 }
2810 /* Control never gets here */
2811
2812 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2813 used when all the characters in the class have values in the range 0-255,
2814 and either the matching is caseful, or the characters are in the range
2815 0-127 when UTF-8 processing is enabled. The only difference between
2816 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2817 encountered.
2818
2819 First, look past the end of the item to see if there is repeat information
2820 following. Then obey similar code to character type repeats - written out
2821 again for speed. */
2822
2823 case OP_NCLASS:
2824 case OP_CLASS:
2825 {
2826 /* The data variable is saved across frames, so the byte map needs to
2827 be stored there. */
2828 #define BYTE_MAP ((pcre_uint8 *)data)
2829 data = ecode + 1; /* Save for matching */
2830 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2831
2832 switch (*ecode)
2833 {
2834 case OP_CRSTAR:
2835 case OP_CRMINSTAR:
2836 case OP_CRPLUS:
2837 case OP_CRMINPLUS:
2838 case OP_CRQUERY:
2839 case OP_CRMINQUERY:
2840 c = *ecode++ - OP_CRSTAR;
2841 minimize = (c & 1) != 0;
2842 min = rep_min[c]; /* Pick up values from tables; */
2843 max = rep_max[c]; /* zero for max => infinity */
2844 if (max == 0) max = INT_MAX;
2845 break;
2846
2847 case OP_CRRANGE:
2848 case OP_CRMINRANGE:
2849 minimize = (*ecode == OP_CRMINRANGE);
2850 min = GET2(ecode, 1);
2851 max = GET2(ecode, 1 + IMM2_SIZE);
2852 if (max == 0) max = INT_MAX;
2853 ecode += 1 + 2 * IMM2_SIZE;
2854 break;
2855
2856 default: /* No repeat follows */
2857 min = max = 1;
2858 break;
2859 }
2860
2861 /* First, ensure the minimum number of matches are present. */
2862
2863 #ifdef SUPPORT_UTF
2864 if (utf)
2865 {
2866 for (i = 1; i <= min; i++)
2867 {
2868 if (eptr >= md->end_subject)
2869 {
2870 SCHECK_PARTIAL();
2871 RRETURN(MATCH_NOMATCH);
2872 }
2873 GETCHARINC(c, eptr);
2874 if (c > 255)
2875 {
2876 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2877 }
2878 else
2879 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2880 }
2881 }
2882 else
2883 #endif
2884 /* Not UTF mode */
2885 {
2886 for (i = 1; i <= min; i++)
2887 {
2888 if (eptr >= md->end_subject)
2889 {
2890 SCHECK_PARTIAL();
2891 RRETURN(MATCH_NOMATCH);
2892 }
2893 c = *eptr++;
2894 #ifndef COMPILE_PCRE8
2895 if (c > 255)
2896 {
2897 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2898 }
2899 else
2900 #endif
2901 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2902 }
2903 }
2904
2905 /* If max == min we can continue with the main loop without the
2906 need to recurse. */
2907
2908 if (min == max) continue;
2909
2910 /* If minimizing, keep testing the rest of the expression and advancing
2911 the pointer while it matches the class. */
2912
2913 if (minimize)
2914 {
2915 #ifdef SUPPORT_UTF
2916 if (utf)
2917 {
2918 for (fi = min;; fi++)
2919 {
2920 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2922 if (fi >= max) RRETURN(MATCH_NOMATCH);
2923 if (eptr >= md->end_subject)
2924 {
2925 SCHECK_PARTIAL();
2926 RRETURN(MATCH_NOMATCH);
2927 }
2928 GETCHARINC(c, eptr);
2929 if (c > 255)
2930 {
2931 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2932 }
2933 else
2934 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2935 }
2936 }
2937 else
2938 #endif
2939 /* Not UTF mode */
2940 {
2941 for (fi = min;; fi++)
2942 {
2943 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2945 if (fi >= max) RRETURN(MATCH_NOMATCH);
2946 if (eptr >= md->end_subject)
2947 {
2948 SCHECK_PARTIAL();
2949 RRETURN(MATCH_NOMATCH);
2950 }
2951 c = *eptr++;
2952 #ifndef COMPILE_PCRE8
2953 if (c > 255)
2954 {
2955 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2956 }
2957 else
2958 #endif
2959 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2960 }
2961 }
2962 /* Control never gets here */
2963 }
2964
2965 /* If maximizing, find the longest possible run, then work backwards. */
2966
2967 else
2968 {
2969 pp = eptr;
2970
2971 #ifdef SUPPORT_UTF
2972 if (utf)
2973 {
2974 for (i = min; i < max; i++)
2975 {
2976 int len = 1;
2977 if (eptr >= md->end_subject)
2978 {
2979 SCHECK_PARTIAL();
2980 break;
2981 }
2982 GETCHARLEN(c, eptr, len);
2983 if (c > 255)
2984 {
2985 if (op == OP_CLASS) break;
2986 }
2987 else
2988 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2989 eptr += len;
2990 }
2991 for (;;)
2992 {
2993 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2994 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2995 if (eptr-- == pp) break; /* Stop if tried at original pos */
2996 BACKCHAR(eptr);
2997 }
2998 }
2999 else
3000 #endif
3001 /* Not UTF mode */
3002 {
3003 for (i = min; i < max; i++)
3004 {
3005 if (eptr >= md->end_subject)
3006 {
3007 SCHECK_PARTIAL();
3008 break;
3009 }
3010 c = *eptr;
3011 #ifndef COMPILE_PCRE8
3012 if (c > 255)
3013 {
3014 if (op == OP_CLASS) break;
3015 }
3016 else
3017 #endif
3018 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3019 eptr++;
3020 }
3021 while (eptr >= pp)
3022 {
3023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3025 eptr--;
3026 }
3027 }
3028
3029 RRETURN(MATCH_NOMATCH);
3030 }
3031 #undef BYTE_MAP
3032 }
3033 /* Control never gets here */
3034
3035
3036 /* Match an extended character class. This opcode is encountered only
3037 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3038 mode, because Unicode properties are supported in non-UTF-8 mode. */
3039
3040 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3041 case OP_XCLASS:
3042 {
3043 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3044 ecode += GET(ecode, 1); /* Advance past the item */
3045
3046 switch (*ecode)
3047 {
3048 case OP_CRSTAR:
3049 case OP_CRMINSTAR:
3050 case OP_CRPLUS:
3051 case OP_CRMINPLUS:
3052 case OP_CRQUERY:
3053 case OP_CRMINQUERY:
3054 c = *ecode++ - OP_CRSTAR;
3055 minimize = (c & 1) != 0;
3056 min = rep_min[c]; /* Pick up values from tables; */
3057 max = rep_max[c]; /* zero for max => infinity */
3058 if (max == 0) max = INT_MAX;
3059 break;
3060
3061 case OP_CRRANGE:
3062 case OP_CRMINRANGE:
3063 minimize = (*ecode == OP_CRMINRANGE);
3064 min = GET2(ecode, 1);
3065 max = GET2(ecode, 1 + IMM2_SIZE);
3066 if (max == 0) max = INT_MAX;
3067 ecode += 1 + 2 * IMM2_SIZE;
3068 break;
3069
3070 default: /* No repeat follows */
3071 min = max = 1;
3072 break;
3073 }
3074
3075 /* First, ensure the minimum number of matches are present. */
3076
3077 for (i = 1; i <= min; i++)
3078 {
3079 if (eptr >= md->end_subject)
3080 {
3081 SCHECK_PARTIAL();
3082 RRETURN(MATCH_NOMATCH);
3083 }
3084 GETCHARINCTEST(c, eptr);
3085 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3086 }
3087
3088 /* If max == min we can continue with the main loop without the
3089 need to recurse. */
3090
3091 if (min == max) continue;
3092
3093 /* If minimizing, keep testing the rest of the expression and advancing
3094 the pointer while it matches the class. */
3095
3096 if (minimize)
3097 {
3098 for (fi = min;; fi++)
3099 {
3100 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3101 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3102 if (fi >= max) RRETURN(MATCH_NOMATCH);
3103 if (eptr >= md->end_subject)
3104 {
3105 SCHECK_PARTIAL();
3106 RRETURN(MATCH_NOMATCH);
3107 }
3108 GETCHARINCTEST(c, eptr);
3109 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3110 }
3111 /* Control never gets here */
3112 }
3113
3114 /* If maximizing, find the longest possible run, then work backwards. */
3115
3116 else
3117 {
3118 pp = eptr;
3119 for (i = min; i < max; i++)
3120 {
3121 int len = 1;
3122 if (eptr >= md->end_subject)
3123 {
3124 SCHECK_PARTIAL();
3125 break;
3126 }
3127 #ifdef SUPPORT_UTF
3128 GETCHARLENTEST(c, eptr, len);
3129 #else
3130 c = *eptr;
3131 #endif
3132 if (!PRIV(xclass)(c, data, utf)) break;
3133 eptr += len;
3134 }
3135 for(;;)
3136 {
3137 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3138 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3139 if (eptr-- == pp) break; /* Stop if tried at original pos */
3140 #ifdef SUPPORT_UTF
3141 if (utf) BACKCHAR(eptr);
3142 #endif
3143 }
3144 RRETURN(MATCH_NOMATCH);
3145 }
3146
3147 /* Control never gets here */
3148 }
3149 #endif /* End of XCLASS */
3150
3151 /* Match a single character, casefully */
3152
3153 case OP_CHAR:
3154 #ifdef SUPPORT_UTF
3155 if (utf)
3156 {
3157 length = 1;
3158 ecode++;
3159 GETCHARLEN(fc, ecode, length);
3160 if (length > md->end_subject - eptr)
3161 {
3162 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3163 RRETURN(MATCH_NOMATCH);
3164 }
3165 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3166 }
3167 else
3168 #endif
3169 /* Not UTF mode */
3170 {
3171 if (md->end_subject - eptr < 1)
3172 {
3173 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3174 RRETURN(MATCH_NOMATCH);
3175 }
3176 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3177 ecode += 2;
3178 }
3179 break;
3180
3181 /* Match a single character, caselessly. If we are at the end of the
3182 subject, give up immediately. */
3183
3184 case OP_CHARI:
3185 if (eptr >= md->end_subject)
3186 {
3187 SCHECK_PARTIAL();
3188 RRETURN(MATCH_NOMATCH);
3189 }
3190
3191 #ifdef SUPPORT_UTF
3192 if (utf)
3193 {
3194 length = 1;
3195 ecode++;
3196 GETCHARLEN(fc, ecode, length);
3197
3198 /* If the pattern character's value is < 128, we have only one byte, and
3199 we know that its other case must also be one byte long, so we can use the
3200 fast lookup table. We know that there is at least one byte left in the
3201 subject. */
3202
3203 if (fc < 128)
3204 {
3205 if (md->lcc[fc]
3206 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3207 ecode++;
3208 eptr++;
3209 }
3210
3211 /* Otherwise we must pick up the subject character. Note that we cannot
3212 use the value of "length" to check for sufficient bytes left, because the
3213 other case of the character may have more or fewer bytes. */
3214
3215 else
3216 {
3217 unsigned int dc;
3218 GETCHARINC(dc, eptr);
3219 ecode += length;
3220
3221 /* If we have Unicode property support, we can use it to test the other
3222 case of the character, if there is one. */
3223
3224 if (fc != dc)
3225 {
3226 #ifdef SUPPORT_UCP
3227 if (dc != UCD_OTHERCASE(fc))
3228 #endif
3229 RRETURN(MATCH_NOMATCH);
3230 }
3231 }
3232 }
3233 else
3234 #endif /* SUPPORT_UTF */
3235
3236 /* Not UTF mode */
3237 {
3238 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3239 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3240 eptr++;
3241 ecode += 2;
3242 }
3243 break;
3244
3245 /* Match a single character repeatedly. */
3246
3247 case OP_EXACT:
3248 case OP_EXACTI:
3249 min = max = GET2(ecode, 1);
3250 ecode += 1 + IMM2_SIZE;
3251 goto REPEATCHAR;
3252
3253 case OP_POSUPTO:
3254 case OP_POSUPTOI:
3255 possessive = TRUE;
3256 /* Fall through */
3257
3258 case OP_UPTO:
3259 case OP_UPTOI:
3260 case OP_MINUPTO:
3261 case OP_MINUPTOI:
3262 min = 0;
3263 max = GET2(ecode, 1);
3264 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3265 ecode += 1 + IMM2_SIZE;
3266 goto REPEATCHAR;
3267
3268 case OP_POSSTAR:
3269 case OP_POSSTARI:
3270 possessive = TRUE;
3271 min = 0;
3272 max = INT_MAX;
3273 ecode++;
3274 goto REPEATCHAR;
3275
3276 case OP_POSPLUS:
3277 case OP_POSPLUSI:
3278 possessive = TRUE;
3279 min = 1;
3280 max = INT_MAX;
3281 ecode++;
3282 goto REPEATCHAR;
3283
3284 case OP_POSQUERY:
3285 case OP_POSQUERYI:
3286 possessive = TRUE;
3287 min = 0;
3288 max = 1;
3289 ecode++;
3290 goto REPEATCHAR;
3291
3292 case OP_STAR:
3293 case OP_STARI:
3294 case OP_MINSTAR:
3295 case OP_MINSTARI:
3296 case OP_PLUS:
3297 case OP_PLUSI:
3298 case OP_MINPLUS:
3299 case OP_MINPLUSI:
3300 case OP_QUERY:
3301 case OP_QUERYI:
3302 case OP_MINQUERY:
3303 case OP_MINQUERYI:
3304 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3305 minimize = (c & 1) != 0;
3306 min = rep_min[c]; /* Pick up values from tables; */
3307 max = rep_max[c]; /* zero for max => infinity */
3308 if (max == 0) max = INT_MAX;
3309
3310 /* Common code for all repeated single-character matches. */
3311
3312 REPEATCHAR:
3313 #ifdef SUPPORT_UTF
3314 if (utf)
3315 {
3316 length = 1;
3317 charptr = ecode;
3318 GETCHARLEN(fc, ecode, length);
3319 ecode += length;
3320
3321 /* Handle multibyte character matching specially here. There is
3322 support for caseless matching if UCP support is present. */
3323
3324 if (length > 1)
3325 {
3326 #ifdef SUPPORT_UCP
3327 unsigned int othercase;
3328 if (op >= OP_STARI && /* Caseless */
3329 (othercase = UCD_OTHERCASE(fc)) != fc)
3330 oclength = PRIV(ord2utf)(othercase, occhars);
3331 else oclength = 0;
3332 #endif /* SUPPORT_UCP */
3333
3334 for (i = 1; i <= min; i++)
3335 {
3336 if (eptr <= md->end_subject - length &&
3337 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3338 #ifdef SUPPORT_UCP
3339 else if (oclength > 0 &&
3340 eptr <= md->end_subject - oclength &&
3341 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3342 #endif /* SUPPORT_UCP */
3343 else
3344 {
3345 CHECK_PARTIAL();
3346 RRETURN(MATCH_NOMATCH);
3347 }
3348 }
3349
3350 if (min == max) continue;
3351
3352 if (minimize)
3353 {
3354 for (fi = min;; fi++)
3355 {
3356 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3358 if (fi >= max) RRETURN(MATCH_NOMATCH);
3359 if (eptr <= md->end_subject - length &&
3360 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3361 #ifdef SUPPORT_UCP
3362 else if (oclength > 0 &&
3363 eptr <= md->end_subject - oclength &&
3364 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3365 #endif /* SUPPORT_UCP */
3366 else
3367 {
3368 CHECK_PARTIAL();
3369 RRETURN(MATCH_NOMATCH);
3370 }
3371 }
3372 /* Control never gets here */
3373 }
3374
3375 else /* Maximize */
3376 {
3377 pp = eptr;
3378 for (i = min; i < max; i++)
3379 {
3380 if (eptr <= md->end_subject - length &&
3381 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3382 #ifdef SUPPORT_UCP
3383 else if (oclength > 0 &&
3384 eptr <= md->end_subject - oclength &&
3385 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3386 #endif /* SUPPORT_UCP */
3387 else
3388 {
3389 CHECK_PARTIAL();
3390 break;
3391 }
3392 }
3393
3394 if (possessive) continue;
3395
3396 for(;;)
3397 {
3398 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3399 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3401 #ifdef SUPPORT_UCP
3402 eptr--;
3403 BACKCHAR(eptr);
3404 #else /* without SUPPORT_UCP */
3405 eptr -= length;
3406 #endif /* SUPPORT_UCP */
3407 }
3408 }
3409 /* Control never gets here */
3410 }
3411
3412 /* If the length of a UTF-8 character is 1, we fall through here, and
3413 obey the code as for non-UTF-8 characters below, though in this case the
3414 value of fc will always be < 128. */
3415 }
3416 else
3417 #endif /* SUPPORT_UTF */
3418 /* When not in UTF-8 mode, load a single-byte character. */
3419 fc = *ecode++;
3420
3421 /* The value of fc at this point is always one character, though we may
3422 or may not be in UTF mode. The code is duplicated for the caseless and
3423 caseful cases, for speed, since matching characters is likely to be quite
3424 common. First, ensure the minimum number of matches are present. If min =
3425 max, continue at the same level without recursing. Otherwise, if
3426 minimizing, keep trying the rest of the expression and advancing one
3427 matching character if failing, up to the maximum. Alternatively, if
3428 maximizing, find the maximum number of characters and work backwards. */
3429
3430 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3431 max, eptr));
3432
3433 if (op >= OP_STARI) /* Caseless */
3434 {
3435 #ifdef COMPILE_PCRE8
3436 /* fc must be < 128 if UTF is enabled. */
3437 foc = md->fcc[fc];
3438 #else
3439 #ifdef SUPPORT_UTF
3440 #ifdef SUPPORT_UCP
3441 if (utf && fc > 127)
3442 foc = UCD_OTHERCASE(fc);
3443 #else
3444 if (utf && fc > 127)
3445 foc = fc;
3446 #endif /* SUPPORT_UCP */
3447 else
3448 #endif /* SUPPORT_UTF */
3449 foc = TABLE_GET(fc, md->fcc, fc);
3450 #endif /* COMPILE_PCRE8 */
3451
3452 for (i = 1; i <= min; i++)
3453 {
3454 if (eptr >= md->end_subject)
3455 {
3456 SCHECK_PARTIAL();
3457 RRETURN(MATCH_NOMATCH);
3458 }
3459 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3460 eptr++;
3461 }
3462 if (min == max) continue;
3463 if (minimize)
3464 {
3465 for (fi = min;; fi++)
3466 {
3467 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3468 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3469 if (fi >= max) RRETURN(MATCH_NOMATCH);
3470 if (eptr >= md->end_subject)
3471 {
3472 SCHECK_PARTIAL();
3473 RRETURN(MATCH_NOMATCH);
3474 }
3475 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3476 eptr++;
3477 }
3478 /* Control never gets here */
3479 }
3480 else /* Maximize */
3481 {
3482 pp = eptr;
3483 for (i = min; i < max; i++)
3484 {
3485 if (eptr >= md->end_subject)
3486 {
3487 SCHECK_PARTIAL();
3488 break;
3489 }
3490 if (fc != *eptr && foc != *eptr) break;
3491 eptr++;
3492 }
3493
3494 if (possessive) continue;
3495
3496 while (eptr >= pp)
3497 {
3498 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3499 eptr--;
3500 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3501 }
3502 RRETURN(MATCH_NOMATCH);
3503 }
3504 /* Control never gets here */
3505 }
3506
3507 /* Caseful comparisons (includes all multi-byte characters) */
3508
3509 else
3510 {
3511 for (i = 1; i <= min; i++)
3512 {
3513 if (eptr >= md->end_subject)
3514 {
3515 SCHECK_PARTIAL();
3516 RRETURN(MATCH_NOMATCH);
3517 }
3518 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3519 }
3520
3521 if (min == max) continue;
3522
3523 if (minimize)
3524 {
3525 for (fi = min;; fi++)
3526 {
3527 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3528 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3529 if (fi >= max) RRETURN(MATCH_NOMATCH);
3530 if (eptr >= md->end_subject)
3531 {
3532 SCHECK_PARTIAL();
3533 RRETURN(MATCH_NOMATCH);
3534 }
3535 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3536 }
3537 /* Control never gets here */
3538 }
3539 else /* Maximize */
3540 {
3541 pp = eptr;
3542 for (i = min; i < max; i++)
3543 {
3544 if (eptr >= md->end_subject)
3545 {
3546 SCHECK_PARTIAL();
3547 break;
3548 }
3549 if (fc != *eptr) break;
3550 eptr++;
3551 }
3552 if (possessive) continue;
3553
3554 while (eptr >= pp)
3555 {
3556 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3557 eptr--;
3558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3559 }
3560 RRETURN(MATCH_NOMATCH);
3561 }
3562 }
3563 /* Control never gets here */
3564
3565 /* Match a negated single one-byte character. The character we are
3566 checking can be multibyte. */
3567
3568 case OP_NOT:
3569 case OP_NOTI:
3570 if (eptr >= md->end_subject)
3571 {
3572 SCHECK_PARTIAL();
3573 RRETURN(MATCH_NOMATCH);
3574 }
3575 #ifdef SUPPORT_UTF
3576 if (utf)
3577 {
3578 register unsigned int ch, och;
3579
3580 ecode++;
3581 GETCHARINC(ch, ecode);
3582 GETCHARINC(c, eptr);
3583
3584 if (op == OP_NOT)
3585 {
3586 if (ch == c) RRETURN(MATCH_NOMATCH);
3587 }
3588 else
3589 {
3590 #ifdef SUPPORT_UCP
3591 if (ch > 127)
3592 och = UCD_OTHERCASE(ch);
3593 #else
3594 if (ch > 127)
3595 och = ch;
3596 #endif /* SUPPORT_UCP */
3597 else
3598 och = TABLE_GET(ch, md->fcc, ch);
3599 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3600 }
3601 }
3602 else
3603 #endif
3604 {
3605 register unsigned int ch = ecode[1];
3606 c = *eptr++;
3607 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3608 RRETURN(MATCH_NOMATCH);
3609 ecode += 2;
3610 }
3611 break;
3612
3613 /* Match a negated single one-byte character repeatedly. This is almost a
3614 repeat of the code for a repeated single character, but I haven't found a
3615 nice way of commoning these up that doesn't require a test of the
3616 positive/negative option for each character match. Maybe that wouldn't add
3617 very much to the time taken, but character matching *is* what this is all
3618 about... */
3619
3620 case OP_NOTEXACT:
3621 case OP_NOTEXACTI:
3622 min = max = GET2(ecode, 1);
3623 ecode += 1 + IMM2_SIZE;
3624 goto REPEATNOTCHAR;
3625
3626 case OP_NOTUPTO:
3627 case OP_NOTUPTOI:
3628 case OP_NOTMINUPTO:
3629 case OP_NOTMINUPTOI:
3630 min = 0;
3631 max = GET2(ecode, 1);
3632 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3633 ecode += 1 + IMM2_SIZE;
3634 goto REPEATNOTCHAR;
3635
3636 case OP_NOTPOSSTAR:
3637 case OP_NOTPOSSTARI:
3638 possessive = TRUE;
3639 min = 0;
3640 max = INT_MAX;
3641 ecode++;
3642 goto REPEATNOTCHAR;
3643
3644 case OP_NOTPOSPLUS:
3645 case OP_NOTPOSPLUSI:
3646 possessive = TRUE;
3647 min = 1;
3648 max = INT_MAX;
3649 ecode++;
3650 goto REPEATNOTCHAR;
3651
3652 case OP_NOTPOSQUERY:
3653 case OP_NOTPOSQUERYI:
3654 possessive = TRUE;
3655 min = 0;
3656 max = 1;
3657 ecode++;
3658 goto REPEATNOTCHAR;
3659
3660 case OP_NOTPOSUPTO:
3661 case OP_NOTPOSUPTOI:
3662 possessive = TRUE;
3663 min = 0;
3664 max = GET2(ecode, 1);
3665 ecode += 1 + IMM2_SIZE;
3666 goto REPEATNOTCHAR;
3667
3668 case OP_NOTSTAR:
3669 case OP_NOTSTARI:
3670 case OP_NOTMINSTAR:
3671 case OP_NOTMINSTARI:
3672 case OP_NOTPLUS:
3673 case OP_NOTPLUSI:
3674 case OP_NOTMINPLUS:
3675 case OP_NOTMINPLUSI:
3676 case OP_NOTQUERY:
3677 case OP_NOTQUERYI:
3678 case OP_NOTMINQUERY:
3679 case OP_NOTMINQUERYI:
3680 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3681 minimize = (c & 1) != 0;
3682 min = rep_min[c]; /* Pick up values from tables; */
3683 max = rep_max[c]; /* zero for max => infinity */
3684 if (max == 0) max = INT_MAX;
3685
3686 /* Common code for all repeated single-byte matches. */
3687
3688 REPEATNOTCHAR:
3689 GETCHARINCTEST(fc, ecode);
3690
3691 /* The code is duplicated for the caseless and caseful cases, for speed,
3692 since matching characters is likely to be quite common. First, ensure the
3693 minimum number of matches are present. If min = max, continue at the same
3694 level without recursing. Otherwise, if minimizing, keep trying the rest of
3695 the expression and advancing one matching character if failing, up to the
3696 maximum. Alternatively, if maximizing, find the maximum number of
3697 characters and work backwards. */
3698
3699 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3700 max, eptr));
3701
3702 if (op >= OP_NOTSTARI) /* Caseless */
3703 {
3704 #ifdef SUPPORT_UTF
3705 #ifdef SUPPORT_UCP
3706 if (utf && fc > 127)
3707 foc = UCD_OTHERCASE(fc);
3708 #else
3709 if (utf && fc > 127)
3710 foc = fc;
3711 #endif /* SUPPORT_UCP */
3712 else
3713 #endif /* SUPPORT_UTF */
3714 foc = TABLE_GET(fc, md->fcc, fc);
3715
3716 #ifdef SUPPORT_UTF
3717 if (utf)
3718 {
3719 register unsigned int d;
3720 for (i = 1; i <= min; i++)
3721 {
3722 if (eptr >= md->end_subject)
3723 {
3724 SCHECK_PARTIAL();
3725 RRETURN(MATCH_NOMATCH);
3726 }
3727 GETCHARINC(d, eptr);
3728 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3729 }
3730 }
3731 else
3732 #endif
3733 /* Not UTF mode */
3734 {
3735 for (i = 1; i <= min; i++)
3736 {
3737 if (eptr >= md->end_subject)
3738 {
3739 SCHECK_PARTIAL();
3740 RRETURN(MATCH_NOMATCH);
3741 }
3742 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3743 eptr++;
3744 }
3745 }
3746
3747 if (min == max) continue;
3748
3749 if (minimize)
3750 {
3751 #ifdef SUPPORT_UTF
3752 if (utf)
3753 {
3754 register unsigned int d;
3755 for (fi = min;; fi++)
3756 {
3757 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3759 if (fi >= max) RRETURN(MATCH_NOMATCH);
3760 if (eptr >= md->end_subject)
3761 {
3762 SCHECK_PARTIAL();
3763 RRETURN(MATCH_NOMATCH);
3764 }
3765 GETCHARINC(d, eptr);
3766 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3767 }
3768 }
3769 else
3770 #endif
3771 /* Not UTF mode */
3772 {
3773 for (fi = min;; fi++)
3774 {
3775 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3776 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3777 if (fi >= max) RRETURN(MATCH_NOMATCH);
3778 if (eptr >= md->end_subject)
3779 {
3780 SCHECK_PARTIAL();
3781 RRETURN(MATCH_NOMATCH);
3782 }
3783 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3784 eptr++;
3785 }
3786 }
3787 /* Control never gets here */
3788 }
3789
3790 /* Maximize case */
3791
3792 else
3793 {
3794 pp = eptr;
3795
3796 #ifdef SUPPORT_UTF
3797 if (utf)
3798 {
3799 register unsigned int d;
3800 for (i = min; i < max; i++)
3801 {
3802 int len = 1;
3803 if (eptr >= md->end_subject)
3804 {
3805 SCHECK_PARTIAL();
3806 break;
3807 }
3808 GETCHARLEN(d, eptr, len);
3809 if (fc == d || (unsigned int)foc == d) break;
3810 eptr += len;
3811 }
3812 if (possessive) continue;
3813 for(;;)
3814 {
3815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3817 if (eptr-- == pp) break; /* Stop if tried at original pos */
3818 BACKCHAR(eptr);
3819 }
3820 }
3821 else
3822 #endif
3823 /* Not UTF mode */
3824 {
3825 for (i = min; i < max; i++)
3826 {
3827 if (eptr >= md->end_subject)
3828 {
3829 SCHECK_PARTIAL();
3830 break;
3831 }
3832 if (fc == *eptr || foc == *eptr) break;
3833 eptr++;
3834 }
3835 if (possessive) continue;
3836 while (eptr >= pp)
3837 {
3838 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3840 eptr--;
3841 }
3842 }
3843
3844 RRETURN(MATCH_NOMATCH);
3845 }
3846 /* Control never gets here */
3847 }
3848
3849 /* Caseful comparisons */
3850
3851 else
3852 {
3853 #ifdef SUPPORT_UTF
3854 if (utf)
3855 {
3856 register unsigned int d;
3857 for (i = 1; i <= min; i++)
3858 {
3859 if (eptr >= md->end_subject)
3860 {
3861 SCHECK_PARTIAL();
3862 RRETURN(MATCH_NOMATCH);
3863 }
3864 GETCHARINC(d, eptr);
3865 if (fc == d) RRETURN(MATCH_NOMATCH);
3866 }
3867 }
3868 else
3869 #endif
3870 /* Not UTF mode */
3871 {
3872 for (i = 1; i <= min; i++)
3873 {
3874 if (eptr >= md->end_subject)
3875 {
3876 SCHECK_PARTIAL();
3877 RRETURN(MATCH_NOMATCH);
3878 }
3879 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3880 }
3881 }
3882
3883 if (min == max) continue;
3884
3885 if (minimize)
3886 {
3887 #ifdef SUPPORT_UTF
3888 if (utf)
3889 {
3890 register unsigned int d;
3891 for (fi = min;; fi++)
3892 {
3893 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3895 if (fi >= max) RRETURN(MATCH_NOMATCH);
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 RRETURN(MATCH_NOMATCH);
3900 }
3901 GETCHARINC(d, eptr);
3902 if (fc == d) RRETURN(MATCH_NOMATCH);
3903 }
3904 }
3905 else
3906 #endif
3907 /* Not UTF mode */
3908 {
3909 for (fi = min;; fi++)
3910 {
3911 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3912 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3913 if (fi >= max) RRETURN(MATCH_NOMATCH);
3914 if (eptr >= md->end_subject)
3915 {
3916 SCHECK_PARTIAL();
3917 RRETURN(MATCH_NOMATCH);
3918 }
3919 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3920 }
3921 }
3922 /* Control never gets here */
3923 }
3924
3925 /* Maximize case */
3926
3927 else
3928 {
3929 pp = eptr;
3930
3931 #ifdef SUPPORT_UTF
3932 if (utf)
3933 {
3934 register unsigned int d;
3935 for (i = min; i < max; i++)
3936 {
3937 int len = 1;
3938 if (eptr >= md->end_subject)
3939 {
3940 SCHECK_PARTIAL();
3941 break;
3942 }
3943 GETCHARLEN(d, eptr, len);
3944 if (fc == d) break;
3945 eptr += len;
3946 }
3947 if (possessive) continue;
3948 for(;;)
3949 {
3950 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3952 if (eptr-- == pp) break; /* Stop if tried at original pos */
3953 BACKCHAR(eptr);
3954 }
3955 }
3956 else
3957 #endif
3958 /* Not UTF mode */
3959 {
3960 for (i = min; i < max; i++)
3961 {
3962 if (eptr >= md->end_subject)
3963 {
3964 SCHECK_PARTIAL();
3965 break;
3966 }
3967 if (fc == *eptr) break;
3968 eptr++;
3969 }
3970 if (possessive) continue;
3971 while (eptr >= pp)
3972 {
3973 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3975 eptr--;
3976 }
3977 }
3978
3979 RRETURN(MATCH_NOMATCH);
3980 }
3981 }
3982 /* Control never gets here */
3983
3984 /* Match a single character type repeatedly; several different opcodes
3985 share code. This is very similar to the code for single characters, but we
3986 repeat it in the interests of efficiency. */
3987
3988 case OP_TYPEEXACT:
3989 min = max = GET2(ecode, 1);
3990 minimize = TRUE;
3991 ecode += 1 + IMM2_SIZE;
3992 goto REPEATTYPE;
3993
3994 case OP_TYPEUPTO:
3995 case OP_TYPEMINUPTO:
3996 min = 0;
3997 max = GET2(ecode, 1);
3998 minimize = *ecode == OP_TYPEMINUPTO;
3999 ecode += 1 + IMM2_SIZE;
4000 goto REPEATTYPE;
4001
4002 case OP_TYPEPOSSTAR:
4003 possessive = TRUE;
4004 min = 0;
4005 max = INT_MAX;
4006 ecode++;
4007 goto REPEATTYPE;
4008
4009 case OP_TYPEPOSPLUS:
4010 possessive = TRUE;
4011 min = 1;
4012 max = INT_MAX;
4013 ecode++;
4014 goto REPEATTYPE;
4015
4016 case OP_TYPEPOSQUERY:
4017 possessive = TRUE;
4018 min = 0;
4019 max = 1;
4020 ecode++;
4021 goto REPEATTYPE;
4022
4023 case OP_TYPEPOSUPTO:
4024 possessive = TRUE;
4025 min = 0;
4026 max = GET2(ecode, 1);
4027 ecode += 1 + IMM2_SIZE;
4028 goto REPEATTYPE;
4029
4030 case OP_TYPESTAR:
4031 case OP_TYPEMINSTAR:
4032 case OP_TYPEPLUS:
4033 case OP_TYPEMINPLUS:
4034 case OP_TYPEQUERY:
4035 case OP_TYPEMINQUERY:
4036 c = *ecode++ - OP_TYPESTAR;
4037 minimize = (c & 1) != 0;
4038 min = rep_min[c]; /* Pick up values from tables; */
4039 max = rep_max[c]; /* zero for max => infinity */
4040 if (max == 0) max = INT_MAX;
4041
4042 /* Common code for all repeated single character type matches. Note that
4043 in UTF-8 mode, '.' matches a character of any length, but for the other
4044 character types, the valid characters are all one-byte long. */
4045
4046 REPEATTYPE:
4047 ctype = *ecode++; /* Code for the character type */
4048
4049 #ifdef SUPPORT_UCP
4050 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4051 {
4052 prop_fail_result = ctype == OP_NOTPROP;
4053 prop_type = *ecode++;
4054 prop_value = *ecode++;
4055 }
4056 else prop_type = -1;
4057 #endif
4058
4059 /* First, ensure the minimum number of matches are present. Use inline
4060 code for maximizing the speed, and do the type test once at the start
4061 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4062 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4063 and single-bytes. */
4064
4065 if (min > 0)
4066 {
4067 #ifdef SUPPORT_UCP
4068 if (prop_type >= 0)
4069 {
4070 switch(prop_type)
4071 {
4072 case PT_ANY:
4073 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4074 for (i = 1; i <= min; i++)
4075 {
4076 if (eptr >= md->end_subject)
4077 {
4078 SCHECK_PARTIAL();
4079 RRETURN(MATCH_NOMATCH);
4080 }
4081 GETCHARINCTEST(c, eptr);
4082 }
4083 break;
4084
4085 case PT_LAMP:
4086 for (i = 1; i <= min; i++)
4087 {
4088 int chartype;
4089 if (eptr >= md->end_subject)
4090 {
4091 SCHECK_PARTIAL();
4092 RRETURN(MATCH_NOMATCH);
4093 }
4094 GETCHARINCTEST(c, eptr);
4095 chartype = UCD_CHARTYPE(c);
4096 if ((chartype == ucp_Lu ||
4097 chartype == ucp_Ll ||
4098 chartype == ucp_Lt) == prop_fail_result)
4099 RRETURN(MATCH_NOMATCH);
4100 }
4101 break;
4102
4103 case PT_GC:
4104 for (i = 1; i <= min; i++)
4105 {
4106 if (eptr >= md->end_subject)
4107 {
4108 SCHECK_PARTIAL();
4109 RRETURN(MATCH_NOMATCH);
4110 }
4111 GETCHARINCTEST(c, eptr);
4112 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4113 RRETURN(MATCH_NOMATCH);
4114 }
4115 break;
4116
4117 case PT_PC:
4118 for (i = 1; i <= min; i++)
4119 {
4120 if (eptr >= md->end_subject)
4121 {
4122 SCHECK_PARTIAL();
4123 RRETURN(MATCH_NOMATCH);
4124 }
4125 GETCHARINCTEST(c, eptr);
4126 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4127 RRETURN(MATCH_NOMATCH);
4128 }
4129 break;
4130
4131 case PT_SC:
4132 for (i = 1; i <= min; i++)
4133 {
4134 if (eptr >= md->end_subject)
4135 {
4136 SCHECK_PARTIAL();
4137 RRETURN(MATCH_NOMATCH);
4138 }
4139 GETCHARINCTEST(c, eptr);
4140 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4141 RRETURN(MATCH_NOMATCH);
4142 }
4143 break;
4144
4145 case PT_ALNUM:
4146 for (i = 1; i <= min; i++)
4147 {
4148 int category;
4149 if (eptr >= md->end_subject)
4150 {
4151 SCHECK_PARTIAL();
4152 RRETURN(MATCH_NOMATCH);
4153 }
4154 GETCHARINCTEST(c, eptr);
4155 category = UCD_CATEGORY(c);
4156 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4157 RRETURN(MATCH_NOMATCH);
4158 }
4159 break;
4160
4161 case PT_SPACE: /* Perl space */
4162 for (i = 1; i <= min; i++)
4163 {
4164 if (eptr >= md->end_subject)
4165 {
4166 SCHECK_PARTIAL();
4167 RRETURN(MATCH_NOMATCH);
4168 }
4169 GETCHARINCTEST(c, eptr);
4170 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4171 c == CHAR_FF || c == CHAR_CR)
4172 == prop_fail_result)
4173 RRETURN(MATCH_NOMATCH);
4174 }
4175 break;
4176
4177 case PT_PXSPACE: /* POSIX space */
4178 for (i = 1; i <= min; i++)
4179 {
4180 if (eptr >= md->end_subject)
4181 {
4182 SCHECK_PARTIAL();
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 GETCHARINCTEST(c, eptr);
4186 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4187 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4188 == prop_fail_result)
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 break;
4192
4193 case PT_WORD:
4194 for (i = 1; i <= min; i++)
4195 {
4196 int category;
4197 if (eptr >= md->end_subject)
4198 {
4199 SCHECK_PARTIAL();
4200 RRETURN(MATCH_NOMATCH);
4201 }
4202 GETCHARINCTEST(c, eptr);
4203 category = UCD_CATEGORY(c);
4204 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4205 == prop_fail_result)
4206 RRETURN(MATCH_NOMATCH);
4207 }
4208 break;
4209
4210 /* This should not occur */
4211
4212 default:
4213 RRETURN(PCRE_ERROR_INTERNAL);
4214 }
4215 }
4216
4217 /* Match extended Unicode sequences. We will get here only if the
4218 support is in the binary; otherwise a compile-time error occurs. */
4219
4220 else if (ctype == OP_EXTUNI)
4221 {
4222 for (i = 1; i <= min; i++)
4223 {
4224 if (eptr >= md->end_subject)
4225 {
4226 SCHECK_PARTIAL();
4227 RRETURN(MATCH_NOMATCH);
4228 }
4229 GETCHARINCTEST(c, eptr);
4230 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4231 while (eptr < md->end_subject)
4232 {
4233 int len = 1;
4234 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4235 if (UCD_CATEGORY(c) != ucp_M) break;
4236 eptr += len;
4237 }
4238 CHECK_PARTIAL();
4239 }
4240 }
4241
4242 else
4243 #endif /* SUPPORT_UCP */
4244
4245 /* Handle all other cases when the coding is UTF-8 */
4246
4247 #ifdef SUPPORT_UTF
4248 if (utf) switch(ctype)
4249 {
4250 case OP_ANY:
4251 for (i = 1; i <= min; i++)
4252 {
4253 if (eptr >= md->end_subject)
4254 {
4255 SCHECK_PARTIAL();
4256 RRETURN(MATCH_NOMATCH);
4257 }
4258 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4259 if (md->partial != 0 &&
4260 eptr + 1 >= md->end_subject &&
4261 NLBLOCK->nltype == NLTYPE_FIXED &&
4262 NLBLOCK->nllen == 2 &&
4263 *eptr == NLBLOCK->nl[0])
4264 {
4265 md->hitend = TRUE;
4266 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4267 }
4268 eptr++;
4269 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4270 }
4271 break;
4272
4273 case OP_ALLANY:
4274 for (i = 1; i <= min; i++)
4275 {
4276 if (eptr >= md->end_subject)
4277 {
4278 SCHECK_PARTIAL();
4279 RRETURN(MATCH_NOMATCH);
4280 }
4281 eptr++;
4282 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4283 }
4284 break;
4285
4286 case OP_ANYBYTE:
4287 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4288 eptr += min;
4289 break;
4290
4291 case OP_ANYNL:
4292 for (i = 1; i <= min; i++)
4293 {
4294 if (eptr >= md->end_subject)
4295 {
4296 SCHECK_PARTIAL();
4297 RRETURN(MATCH_NOMATCH);
4298 }
4299 GETCHARINC(c, eptr);
4300 switch(c)
4301 {
4302 default: RRETURN(MATCH_NOMATCH);
4303
4304 case 0x000d:
4305 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4306 break;
4307
4308 case 0x000a:
4309 break;
4310
4311 case 0x000b:
4312 case 0x000c:
4313 case 0x0085:
4314 case 0x2028:
4315 case 0x2029:
4316 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4317 break;
4318 }
4319 }
4320 break;
4321
4322 case OP_NOT_HSPACE:
4323 for (i = 1; i <= min; i++)
4324 {
4325 if (eptr >= md->end_subject)
4326 {
4327 SCHECK_PARTIAL();
4328 RRETURN(MATCH_NOMATCH);
4329 }
4330 GETCHARINC(c, eptr);
4331 switch(c)
4332 {
4333 default: break;
4334 case 0x09: /* HT */
4335 case 0x20: /* SPACE */
4336 case 0xa0: /* NBSP */
4337 case 0x1680: /* OGHAM SPACE MARK */
4338 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4339 case 0x2000: /* EN QUAD */
4340 case 0x2001: /* EM QUAD */
4341 case 0x2002: /* EN SPACE */
4342 case 0x2003: /* EM SPACE */
4343 case 0x2004: /* THREE-PER-EM SPACE */
4344 case 0x2005: /* FOUR-PER-EM SPACE */
4345 case 0x2006: /* SIX-PER-EM SPACE */
4346 case 0x2007: /* FIGURE SPACE */
4347 case 0x2008: /* PUNCTUATION SPACE */
4348 case 0x2009: /* THIN SPACE */
4349 case 0x200A: /* HAIR SPACE */
4350 case 0x202f: /* NARROW NO-BREAK SPACE */
4351 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4352 case 0x3000: /* IDEOGRAPHIC SPACE */
4353 RRETURN(MATCH_NOMATCH);
4354 }
4355 }
4356 break;
4357
4358 case OP_HSPACE:
4359 for (i = 1; i <= min; i++)
4360 {
4361 if (eptr >= md->end_subject)
4362 {
4363 SCHECK_PARTIAL();
4364 RRETURN(MATCH_NOMATCH);
4365 }
4366 GETCHARINC(c, eptr);
4367 switch(c)
4368 {
4369 default: RRETURN(MATCH_NOMATCH);
4370 case 0x09: /* HT */
4371 case 0x20: /* SPACE */
4372 case 0xa0: /* NBSP */
4373 case 0x1680: /* OGHAM SPACE MARK */
4374 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4375 case 0x2000: /* EN QUAD */
4376 case 0x2001: /* EM QUAD */
4377 case 0x2002: /* EN SPACE */
4378 case 0x2003: /* EM SPACE */
4379 case 0x2004: /* THREE-PER-EM SPACE */
4380 case 0x2005: /* FOUR-PER-EM SPACE */
4381 case 0x2006: /* SIX-PER-EM SPACE */
4382 case 0x2007: /* FIGURE SPACE */
4383 case 0x2008: /* PUNCTUATION SPACE */
4384 case 0x2009: /* THIN SPACE */
4385 case 0x200A: /* HAIR SPACE */
4386 case 0x202f: /* NARROW NO-BREAK SPACE */
4387 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4388 case 0x3000: /* IDEOGRAPHIC SPACE */
4389 break;
4390 }
4391 }
4392 break;
4393
4394 case OP_NOT_VSPACE:
4395 for (i = 1; i <= min; i++)
4396 {
4397 if (eptr >= md->end_subject)
4398 {
4399 SCHECK_PARTIAL();
4400 RRETURN(MATCH_NOMATCH);
4401 }
4402 GETCHARINC(c, eptr);
4403 switch(c)
4404 {
4405 default: break;
4406 case 0x0a: /* LF */
4407 case 0x0b: /* VT */
4408 case 0x0c: /* FF */
4409 case 0x0d: /* CR */
4410 case 0x85: /* NEL */
4411 case 0x2028: /* LINE SEPARATOR */
4412 case 0x2029: /* PARAGRAPH SEPARATOR */
4413 RRETURN(MATCH_NOMATCH);
4414 }
4415 }
4416 break;
4417
4418 case OP_VSPACE:
4419 for (i = 1; i <= min; i++)
4420 {
4421 if (eptr >= md->end_subject)
4422 {
4423 SCHECK_PARTIAL();
4424 RRETURN(MATCH_NOMATCH);
4425 }
4426 GETCHARINC(c, eptr);
4427 switch(c)
4428 {
4429 default: RRETURN(MATCH_NOMATCH);
4430 case 0x0a: /* LF */
4431 case 0x0b: /* VT */
4432 case 0x0c: /* FF */
4433 case 0x0d: /* CR */
4434 case 0x85: /* NEL */
4435 case 0x2028: /* LINE SEPARATOR */
4436 case 0x2029: /* PARAGRAPH SEPARATOR */
4437 break;
4438 }
4439 }
4440 break;
4441
4442 case OP_NOT_DIGIT:
4443 for (i = 1; i <= min; i++)
4444 {
4445 if (eptr >= md->end_subject)
4446 {
4447 SCHECK_PARTIAL();
4448 RRETURN(MATCH_NOMATCH);
4449 }
4450 GETCHARINC(c, eptr);
4451 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4452 RRETURN(MATCH_NOMATCH);
4453 }
4454 break;
4455
4456 case OP_DIGIT:
4457 for (i = 1; i <= min; i++)
4458 {
4459 if (eptr >= md->end_subject)
4460 {
4461 SCHECK_PARTIAL();
4462 RRETURN(MATCH_NOMATCH);
4463 }
4464 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4465 RRETURN(MATCH_NOMATCH);
4466 eptr++;
4467 /* No need to skip more bytes - we know it's a 1-byte character */
4468 }
4469 break;
4470
4471 case OP_NOT_WHITESPACE:
4472 for (i = 1; i <= min; i++)
4473 {
4474 if (eptr >= md->end_subject)
4475 {
4476 SCHECK_PARTIAL();
4477 RRETURN(MATCH_NOMATCH);
4478 }
4479 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4480 RRETURN(MATCH_NOMATCH);
4481 eptr++;
4482 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4483 }
4484 break;
4485
4486 case OP_WHITESPACE:
4487 for (i = 1; i <= min; i++)
4488 {
4489 if (eptr >= md->end_subject)
4490 {
4491 SCHECK_PARTIAL();
4492 RRETURN(MATCH_NOMATCH);
4493 }
4494 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4495 RRETURN(MATCH_NOMATCH);
4496 eptr++;
4497 /* No need to skip more bytes - we know it's a 1-byte character */
4498 }
4499 break;
4500
4501 case OP_NOT_WORDCHAR:
4502 for (i = 1; i <= min; i++)
4503 {
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 RRETURN(MATCH_NOMATCH);
4508 }
4509 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4510 RRETURN(MATCH_NOMATCH);
4511 eptr++;
4512 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4513 }
4514 break;
4515
4516 case OP_WORDCHAR:
4517 for (i = 1; i <= min; i++)
4518 {
4519 if (eptr >= md->end_subject)
4520 {
4521 SCHECK_PARTIAL();
4522 RRETURN(MATCH_NOMATCH);
4523 }
4524 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4525 RRETURN(MATCH_NOMATCH);
4526 eptr++;
4527 /* No need to skip more bytes - we know it's a 1-byte character */
4528 }
4529 break;
4530
4531 default:
4532 RRETURN(PCRE_ERROR_INTERNAL);
4533 } /* End switch(ctype) */
4534
4535 else
4536 #endif /* SUPPORT_UTF */
4537
4538 /* Code for the non-UTF-8 case for minimum matching of operators other
4539 than OP_PROP and OP_NOTPROP. */
4540
4541 switch(ctype)
4542 {
4543 case OP_ANY:
4544 for (i = 1; i <= min; i++)
4545 {
4546 if (eptr >= md->end_subject)
4547 {
4548 SCHECK_PARTIAL();
4549 RRETURN(MATCH_NOMATCH);
4550 }
4551 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4552 if (md->partial != 0 &&
4553 eptr + 1 >= md->end_subject &&
4554 NLBLOCK->nltype == NLTYPE_FIXED &&
4555 NLBLOCK->nllen == 2 &&
4556 *eptr == NLBLOCK->nl[0])
4557 {
4558 md->hitend = TRUE;
4559 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4560 }
4561 eptr++;
4562 }
4563 break;
4564
4565 case OP_ALLANY:
4566 if (eptr > md->end_subject - min)
4567 {
4568 SCHECK_PARTIAL();
4569 RRETURN(MATCH_NOMATCH);
4570 }
4571 eptr += min;
4572 break;
4573
4574 case OP_ANYBYTE:
4575 if (eptr > md->end_subject - min)
4576 {
4577 SCHECK_PARTIAL();
4578 RRETURN(MATCH_NOMATCH);
4579 }
4580 eptr += min;
4581 break;
4582
4583 case OP_ANYNL:
4584 for (i = 1; i <= min; i++)
4585 {
4586 if (eptr >= md->end_subject)
4587 {
4588 SCHECK_PARTIAL();
4589 RRETURN(MATCH_NOMATCH);
4590 }
4591 switch(*eptr++)
4592 {
4593 default: RRETURN(MATCH_NOMATCH);
4594
4595 case 0x000d:
4596 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4597 break;
4598
4599 case 0x000a:
4600 break;
4601
4602 case 0x000b:
4603 case 0x000c:
4604 case 0x0085:
4605 #ifdef COMPILE_PCRE16
4606 case 0x2028:
4607 case 0x2029:
4608 #endif
4609 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4610 break;
4611 }
4612 }
4613 break;
4614
4615 case OP_NOT_HSPACE:
4616 for (i = 1; i <= min; i++)
4617 {
4618 if (eptr >= md->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 switch(*eptr++)
4624 {
4625 default: break;
4626 case 0x09: /* HT */
4627 case 0x20: /* SPACE */
4628 case 0xa0: /* NBSP */
4629 #ifdef COMPILE_PCRE16
4630 case 0x1680: /* OGHAM SPACE MARK */
4631 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4632 case 0x2000: /* EN QUAD */
4633 case 0x2001: /* EM QUAD */
4634 case 0x2002: /* EN SPACE */
4635 case 0x2003: /* EM SPACE */
4636 case 0x2004: /* THREE-PER-EM SPACE */
4637 case 0x2005: /* FOUR-PER-EM SPACE */
4638 case 0x2006: /* SIX-PER-EM SPACE */
4639 case 0x2007: /* FIGURE SPACE */
4640 case 0x2008: /* PUNCTUATION SPACE */
4641 case 0x2009: /* THIN SPACE */
4642 case 0x200A: /* HAIR SPACE */
4643 case 0x202f: /* NARROW NO-BREAK SPACE */
4644 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4645 case 0x3000: /* IDEOGRAPHIC SPACE */
4646 #endif
4647 RRETURN(MATCH_NOMATCH);
4648 }
4649 }
4650 break;
4651
4652 case OP_HSPACE:
4653 for (i = 1; i <= min; i++)
4654 {
4655 if (eptr >= md->end_subject)
4656 {
4657 SCHECK_PARTIAL();
4658 RRETURN(MATCH_NOMATCH);
4659 }
4660 switch(*eptr++)
4661 {
4662 default: RRETURN(MATCH_NOMATCH);
4663 case 0x09: /* HT */
4664 case 0x20: /* SPACE */
4665 case 0xa0: /* NBSP */
4666 #ifdef COMPILE_PCRE16
4667 case 0x1680: /* OGHAM SPACE MARK */
4668 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4669 case 0x2000: /* EN QUAD */
4670 case 0x2001: /* EM QUAD */
4671 case 0x2002: /* EN SPACE */
4672 case 0x2003: /* EM SPACE */
4673 case 0x2004: /* THREE-PER-EM SPACE */
4674 case 0x2005: /* FOUR-PER-EM SPACE */
4675 case 0x2006: /* SIX-PER-EM SPACE */
4676 case 0x2007: /* FIGURE SPACE */
4677 case 0x2008: /* PUNCTUATION SPACE */
4678 case 0x2009: /* THIN SPACE */
4679 case 0x200A: /* HAIR SPACE */
4680 case 0x202f: /* NARROW NO-BREAK SPACE */
4681 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4682 case 0x3000: /* IDEOGRAPHIC SPACE */
4683 #endif
4684 break;
4685 }
4686 }
4687 break;
4688
4689 case OP_NOT_VSPACE:
4690 for (i = 1; i <= min; i++)
4691 {
4692 if (eptr >= md->end_subject)
4693 {
4694 SCHECK_PARTIAL();
4695 RRETURN(MATCH_NOMATCH);
4696 }
4697 switch(*eptr++)
4698 {
4699 default: break;
4700 case 0x0a: /* LF */
4701 case 0x0b: /* VT */
4702 case 0x0c: /* FF */
4703 case 0x0d: /* CR */
4704 case 0x85: /* NEL */
4705 #ifdef COMPILE_PCRE16
4706 case 0x2028: /* LINE SEPARATOR */
4707 case 0x2029: /* PARAGRAPH SEPARATOR */
4708 #endif
4709 RRETURN(MATCH_NOMATCH);
4710 }
4711 }
4712 break;
4713
4714 case OP_VSPACE:
4715 for (i = 1; i <= min; i++)
4716 {
4717 if (eptr >= md->end_subject)
4718 {
4719 SCHECK_PARTIAL();
4720 RRETURN(MATCH_NOMATCH);
4721 }
4722 switch(*eptr++)
4723 {
4724 default: RRETURN(MATCH_NOMATCH);
4725 case 0x0a: /* LF */
4726 case 0x0b: /* VT */
4727 case 0x0c: /* FF */
4728 case 0x0d: /* CR */
4729 case 0x85: /* NEL */
4730 #ifdef COMPILE_PCRE16
4731 case 0x2028: /* LINE SEPARATOR */
4732 case 0x2029: /* PARAGRAPH SEPARATOR */
4733 #endif
4734 break;
4735 }
4736 }
4737 break;
4738
4739 case OP_NOT_DIGIT:
4740 for (i = 1; i <= min; i++)
4741 {
4742 if (eptr >= md->end_subject)
4743 {
4744 SCHECK_PARTIAL();
4745 RRETURN(MATCH_NOMATCH);
4746 }
4747 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4748 RRETURN(MATCH_NOMATCH);
4749 eptr++;
4750 }
4751 break;
4752
4753 case OP_DIGIT:
4754 for (i = 1; i <= min; i++)
4755 {
4756 if (eptr >= md->end_subject)
4757 {
4758 SCHECK_PARTIAL();
4759 RRETURN(MATCH_NOMATCH);
4760 }
4761 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4762 RRETURN(MATCH_NOMATCH);
4763 eptr++;
4764 }
4765 break;
4766
4767 case OP_NOT_WHITESPACE:
4768 for (i = 1; i <= min; i++)
4769 {
4770 if (eptr >= md->end_subject)
4771 {
4772 SCHECK_PARTIAL();
4773 RRETURN(MATCH_NOMATCH);
4774 }
4775 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4776 RRETURN(MATCH_NOMATCH);
4777 eptr++;
4778 }
4779 break;
4780
4781 case OP_WHITESPACE:
4782 for (i = 1; i <= min; i++)
4783 {
4784 if (eptr >= md->end_subject)
4785 {
4786 SCHECK_PARTIAL();
4787 RRETURN(MATCH_NOMATCH);
4788 }
4789 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4790 RRETURN(MATCH_NOMATCH);
4791 eptr++;
4792 }
4793 break;
4794
4795 case OP_NOT_WORDCHAR:
4796 for (i = 1; i <= min; i++)
4797 {
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 RRETURN(MATCH_NOMATCH);
4802 }
4803 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4804 RRETURN(MATCH_NOMATCH);
4805 eptr++;
4806 }
4807 break;
4808
4809 case OP_WORDCHAR:
4810 for (i = 1; i <= min; i++)
4811 {
4812 if (eptr >= md->end_subject)
4813 {
4814 SCHECK_PARTIAL();
4815 RRETURN(MATCH_NOMATCH);
4816 }
4817 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4818 RRETURN(MATCH_NOMATCH);
4819 eptr++;
4820 }
4821 break;
4822
4823 default:
4824 RRETURN(PCRE_ERROR_INTERNAL);
4825 }
4826 }
4827
4828 /* If min = max, continue at the same level without recursing */
4829
4830 if (min == max) continue;
4831
4832 /* If minimizing, we have to test the rest of the pattern before each
4833 subsequent match. Again, separate the UTF-8 case for speed, and also
4834 separate the UCP cases. */
4835
4836 if (minimize)
4837 {
4838 #ifdef SUPPORT_UCP
4839 if (prop_type >= 0)
4840 {
4841 switch(prop_type)
4842 {
4843 case PT_ANY:
4844 for (fi = min;; fi++)
4845 {
4846 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4848 if (fi >= max) RRETURN(MATCH_NOMATCH);
4849 if (eptr >= md->end_subject)
4850 {
4851 SCHECK_PARTIAL();
4852 RRETURN(MATCH_NOMATCH);
4853 }
4854 GETCHARINCTEST(c, eptr);
4855 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4856 }
4857 /* Control never gets here */
4858
4859 case PT_LAMP:
4860 for (fi = min;; fi++)
4861 {
4862 int chartype;
4863 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4865 if (fi >= max) RRETURN(MATCH_NOMATCH);
4866 if (eptr >= md->end_subject)
4867 {
4868 SCHECK_PARTIAL();
4869 RRETURN(MATCH_NOMATCH);
4870 }
4871 GETCHARINCTEST(c, eptr);
4872 chartype = UCD_CHARTYPE(c);
4873 if ((chartype == ucp_Lu ||
4874 chartype == ucp_Ll ||
4875 chartype == ucp_Lt) == prop_fail_result)
4876 RRETURN(MATCH_NOMATCH);
4877 }
4878 /* Control never gets here */
4879
4880 case PT_GC:
4881 for (fi = min;; fi++)
4882 {
4883 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4885 if (fi >= max) RRETURN(MATCH_NOMATCH);
4886 if (eptr >= md->end_subject)
4887 {
4888 SCHECK_PARTIAL();
4889 RRETURN(MATCH_NOMATCH);
4890 }
4891 GETCHARINCTEST(c, eptr);
4892 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4893 RRETURN(MATCH_NOMATCH);
4894 }
4895 /* Control never gets here */
4896
4897 case PT_PC:
4898 for (fi = min;; fi++)
4899 {
4900 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4901 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4902 if (fi >= max) RRETURN(MATCH_NOMATCH);
4903 if (eptr >= md->end_subject)
4904 {
4905 SCHECK_PARTIAL();
4906 RRETURN(MATCH_NOMATCH);
4907 }
4908 GETCHARINCTEST(c, eptr);
4909 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4910 RRETURN(MATCH_NOMATCH);
4911 }
4912 /* Control never gets here */
4913
4914 case PT_SC:
4915 for (fi = min;; fi++)
4916 {
4917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919 if (fi >= max) RRETURN(MATCH_NOMATCH);
4920 if (eptr >= md->end_subject)
4921 {
4922 SCHECK_PARTIAL();
4923 RRETURN(MATCH_NOMATCH);
4924 }
4925 GETCHARINCTEST(c, eptr);
4926 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4927 RRETURN(MATCH_NOMATCH);
4928 }
4929 /* Control never gets here */
4930
4931 case PT_ALNUM:
4932 for (fi = min;; fi++)
4933 {
4934 int category;
4935 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4937 if (fi >= max) RRETURN(MATCH_NOMATCH);
4938 if (eptr >= md->end_subject)
4939 {
4940 SCHECK_PARTIAL();
4941 RRETURN(MATCH_NOMATCH);
4942 }
4943 GETCHARINCTEST(c, eptr);
4944 category = UCD_CATEGORY(c);
4945 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4946 RRETURN(MATCH_NOMATCH);
4947 }
4948 /* Control never gets here */
4949
4950 case PT_SPACE: /* Perl space */
4951 for (fi = min;; fi++)
4952 {
4953 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4955 if (fi >= max) RRETURN(MATCH_NOMATCH);
4956 if (eptr >= md->end_subject)
4957 {
4958 SCHECK_PARTIAL();
4959 RRETURN(MATCH_NOMATCH);
4960 }
4961 GETCHARINCTEST(c, eptr);
4962 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4963 c == CHAR_FF || c == CHAR_CR)
4964 == prop_fail_result)
4965 RRETURN(MATCH_NOMATCH);
4966 }
4967 /* Control never gets here */
4968
4969 case PT_PXSPACE: /* POSIX space */
4970 for (fi = min;; fi++)
4971 {
4972 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4974 if (fi >= max) RRETURN(MATCH_NOMATCH);
4975 if (eptr >= md->end_subject)
4976 {
4977 SCHECK_PARTIAL();
4978 RRETURN(MATCH_NOMATCH);
4979 }
4980 GETCHARINCTEST(c, eptr);
4981 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4982 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4983 == prop_fail_result)
4984 RRETURN(MATCH_NOMATCH);
4985 }
4986 /* Control never gets here */
4987
4988 case PT_WORD:
4989 for (fi = min;; fi++)
4990 {
4991 int category;
4992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4994 if (fi >= max) RRETURN(MATCH_NOMATCH);
4995 if (eptr >= md->end_subject)
4996 {
4997 SCHECK_PARTIAL();
4998 RRETURN(MATCH_NOMATCH);
4999 }
5000 GETCHARINCTEST(c, eptr);
5001 category = UCD_CATEGORY(c);
5002 if ((category == ucp_L ||
5003 category == ucp_N ||
5004 c == CHAR_UNDERSCORE)
5005 == prop_fail_result)
5006 RRETURN(MATCH_NOMATCH);
5007 }
5008 /* Control never gets here */
5009
5010 /* This should never occur */
5011
5012 default:
5013 RRETURN(PCRE_ERROR_INTERNAL);
5014 }
5015 }
5016
5017 /* Match extended Unicode sequences. We will get here only if the
5018 support is in the binary; otherwise a compile-time error occurs. */
5019
5020 else if (ctype == OP_EXTUNI)
5021 {
5022 for (fi = min;; fi++)
5023 {
5024 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5026 if (fi >= max) RRETURN(MATCH_NOMATCH);
5027 if (eptr >= md->end_subject)
5028 {
5029 SCHECK_PARTIAL();
5030 RRETURN(MATCH_NOMATCH);
5031 }
5032 GETCHARINCTEST(c, eptr);
5033 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
5034 while (eptr < md->end_subject)
5035 {
5036 int len = 1;
5037 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5038 if (UCD_CATEGORY(c) != ucp_M) break;
5039 eptr += len;
5040 }
5041 CHECK_PARTIAL();
5042 }
5043 }
5044 else
5045 #endif /* SUPPORT_UCP */
5046
5047 #ifdef SUPPORT_UTF
5048 if (utf)
5049 {
5050 for (fi = min;; fi++)
5051 {
5052 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5054 if (fi >= max) RRETURN(MATCH_NOMATCH);
5055 if (eptr >= md->end_subject)
5056 {
5057 SCHECK_PARTIAL();
5058 RRETURN(MATCH_NOMATCH);
5059 }
5060 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5061 RRETURN(MATCH_NOMATCH);
5062 GETCHARINC(c, eptr);
5063 switch(ctype)
5064 {
5065 case OP_ANY: /* This is the non-NL case */
5066 if (md->partial != 0 && /* Take care with CRLF partial */
5067 eptr >= md->end_subject &&
5068 NLBLOCK->nltype == NLTYPE_FIXED &&
5069 NLBLOCK->nllen == 2 &&
5070 c == NLBLOCK->nl[0])
5071 {
5072 md->hitend = TRUE;
5073 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5074 }
5075 break;
5076
5077 case OP_ALLANY:
5078 case OP_ANYBYTE:
5079 break;
5080
5081 case OP_ANYNL:
5082 switch(c)
5083 {
5084 default: RRETURN(MATCH_NOMATCH);
5085 case 0x000d:
5086 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5087 break;
5088 case 0x000a:
5089 break;
5090
5091 case 0x000b:
5092 case 0x000c:
5093 case 0x0085:
5094 case 0x2028:
5095 case 0x2029:
5096 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5097 break;
5098 }
5099 break;
5100
5101 case OP_NOT_HSPACE:
5102 switch(c)
5103 {
5104 default: break;
5105 case 0x09: /* HT */
5106 case 0x20: /* SPACE */
5107 case 0xa0: /* NBSP */
5108 case 0x1680: /* OGHAM SPACE MARK */
5109 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5110 case 0x2000: /* EN QUAD */
5111 case 0x2001: /* EM QUAD */
5112 case 0x2002: /* EN SPACE */
5113 case 0x2003: /* EM SPACE */
5114 case 0x2004: /* THREE-PER-EM SPACE */
5115 case 0x2005: /* FOUR-PER-EM SPACE */
5116 case 0x2006: /* SIX-PER-EM SPACE */
5117 case 0x2007: /* FIGURE SPACE */
5118 case 0x2008: /* PUNCTUATION SPACE */
5119 case 0x2009: /* THIN SPACE */
5120 case 0x200A: /* HAIR SPACE */
5121 case 0x202f: /* NARROW NO-BREAK SPACE */
5122 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5123 case 0x3000: /* IDEOGRAPHIC SPACE */
5124 RRETURN(MATCH_NOMATCH);
5125 }
5126 break;
5127
5128 case OP_HSPACE:
5129 switch(c)
5130 {
5131 default: RRETURN(MATCH_NOMATCH);
5132 case 0x09: /* HT */
5133 case 0x20: /* SPACE */
5134 case 0xa0: /* NBSP */
5135 case 0x1680: /* OGHAM SPACE MARK */
5136 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5137 case 0x2000: /* EN QUAD */
5138 case 0x2001: /* EM QUAD */
5139 case 0x2002: /* EN SPACE */
5140 case 0x2003: /* EM SPACE */
5141 case 0x2004: /* THREE-PER-EM SPACE */
5142 case 0x2005: /* FOUR-PER-EM SPACE */
5143 case 0x2006: /* SIX-PER-EM SPACE */
5144 case 0x2007: /* FIGURE SPACE */
5145 case 0x2008: /* PUNCTUATION SPACE */
5146 case 0x2009: /* THIN SPACE */
5147 case 0x200A: /* HAIR SPACE */
5148 case 0x202f: /* NARROW NO-BREAK SPACE */
5149 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5150 case 0x3000: /* IDEOGRAPHIC SPACE */
5151 break;
5152 }
5153 break;
5154
5155 case OP_NOT_VSPACE:
5156 switch(c)
5157 {
5158 default: break;
5159 case 0x0a: /* LF */
5160 case 0x0b: /* VT */
5161 case 0x0c: /* FF */
5162 case 0x0d: /* CR */
5163 case 0x85: /* NEL */
5164 case 0x2028: /* LINE SEPARATOR */
5165 case 0x2029: /* PARAGRAPH SEPARATOR */
5166 RRETURN(MATCH_NOMATCH);
5167 }
5168 break;
5169
5170 case OP_VSPACE:
5171 switch(c)
5172 {
5173 default: RRETURN(MATCH_NOMATCH);
5174 case 0x0a: /* LF */
5175 case 0x0b: /* VT */
5176 case 0x0c: /* FF */
5177 case 0x0d: /* CR */
5178 case 0x85: /* NEL */
5179 case 0x2028: /* LINE SEPARATOR */
5180 case 0x2029: /* PARAGRAPH SEPARATOR */
5181 break;
5182 }
5183 break;
5184
5185 case OP_NOT_DIGIT:
5186 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5187 RRETURN(MATCH_NOMATCH);
5188 break;
5189
5190 case OP_DIGIT:
5191 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5192 RRETURN(MATCH_NOMATCH);
5193 break;
5194
5195 case OP_NOT_WHITESPACE:
5196 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5197 RRETURN(MATCH_NOMATCH);
5198 break;
5199
5200 case OP_WHITESPACE:
5201 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5202 RRETURN(MATCH_NOMATCH);
5203 break;
5204
5205 case OP_NOT_WORDCHAR:
5206 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5207 RRETURN(MATCH_NOMATCH);
5208 break;
5209
5210 case OP_WORDCHAR:
5211 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5212 RRETURN(MATCH_NOMATCH);
5213 break;
5214
5215 default:
5216 RRETURN(PCRE_ERROR_INTERNAL);
5217 }
5218 }
5219 }
5220 else
5221 #endif
5222 /* Not UTF mode */
5223 {
5224 for (fi = min;; fi++)
5225 {
5226 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5228 if (fi >= max) RRETURN(MATCH_NOMATCH);
5229 if (eptr >= md->end_subject)
5230 {
5231 SCHECK_PARTIAL();
5232 RRETURN(MATCH_NOMATCH);
5233 }
5234 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5235 RRETURN(MATCH_NOMATCH);
5236 c = *eptr++;
5237 switch(ctype)
5238 {
5239 case OP_ANY: /* This is the non-NL case */
5240 if (md->partial != 0 && /* Take care with CRLF partial */
5241 eptr >= md->end_subject &&
5242 NLBLOCK->nltype == NLTYPE_FIXED &&
5243 NLBLOCK->nllen == 2 &&
5244 c == NLBLOCK->nl[0])
5245 {
5246 md->hitend = TRUE;
5247 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5248 }
5249 break;
5250
5251 case OP_ALLANY:
5252 case OP_ANYBYTE:
5253 break;
5254
5255 case OP_ANYNL:
5256 switch(c)
5257 {
5258 default: RRETURN(MATCH_NOMATCH);
5259 case 0x000d:
5260 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5261 break;
5262
5263 case 0x000a:
5264 break;
5265
5266 case 0x000b:
5267 case 0x000c:
5268 case 0x0085:
5269 #ifdef COMPILE_PCRE16
5270 case 0x2028:
5271 case 0x2029:
5272 #endif
5273 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5274 break;
5275 }
5276 break;
5277
5278 case OP_NOT_HSPACE:
5279 switch(c)
5280 {
5281 default: break;
5282 case 0x09: /* HT */
5283 case 0x20: /* SPACE */
5284 case 0xa0: /* NBSP */
5285 #ifdef COMPILE_PCRE16
5286 case 0x1680: /* OGHAM SPACE MARK */
5287 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5288 case 0x2000: /* EN QUAD */
5289 case 0x2001: /* EM QUAD */
5290 case 0x2002: /* EN SPACE */
5291 case 0x2003: /* EM SPACE */
5292 case 0x2004: /* THREE-PER-EM SPACE */
5293 case 0x2005: /* FOUR-PER-EM SPACE */
5294 case 0x2006: /* SIX-PER-EM SPACE */
5295 case 0x2007: /* FIGURE SPACE */
5296 case 0x2008: /* PUNCTUATION SPACE */
5297 case 0x2009: /* THIN SPACE */
5298 case 0x200A: /* HAIR SPACE */
5299 case 0x202f: /* NARROW NO-BREAK SPACE */
5300 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5301 case 0x3000: /* IDEOGRAPHIC SPACE */
5302 #endif
5303 RRETURN(MATCH_NOMATCH);
5304 }
5305 break;
5306
5307 case OP_HSPACE:
5308 switch(c)
5309 {
5310 default: RRETURN(MATCH_NOMATCH);
5311 case 0x09: /* HT */
5312 case 0x20: /* SPACE */
5313 case 0xa0: /* NBSP */
5314 #ifdef COMPILE_PCRE16
5315 case 0x1680: /* OGHAM SPACE MARK */
5316 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5317 case 0x2000: /* EN QUAD */
5318 case 0x2001: /* EM QUAD */
5319 case 0x2002: /* EN SPACE */
5320 case 0x2003: /* EM SPACE */
5321 case 0x2004: /* THREE-PER-EM SPACE */
5322 case 0x2005: /* FOUR-PER-EM SPACE */
5323 case 0x2006: /* SIX-PER-EM SPACE */
5324 case 0x2007: /* FIGURE SPACE */
5325 case 0x2008: /* PUNCTUATION SPACE */
5326 case 0x2009: /* THIN SPACE */
5327 case 0x200A: /* HAIR SPACE */
5328 case 0x202f: /* NARROW NO-BREAK SPACE */
5329 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5330 case 0x3000: /* IDEOGRAPHIC SPACE */
5331 #endif
5332 break;
5333 }
5334 break;
5335
5336 case OP_NOT_VSPACE:
5337 switch(c)
5338 {
5339 default: break;
5340 case 0x0a: /* LF */
5341 case 0x0b: /* VT */
5342 case 0x0c: /* FF */
5343 case 0x0d: /* CR */
5344 case 0x85: /* NEL */
5345 #ifdef COMPILE_PCRE16
5346 case 0x2028: /* LINE SEPARATOR */
5347 case 0x2029: /* PARAGRAPH SEPARATOR */
5348 #endif
5349 RRETURN(MATCH_NOMATCH);
5350 }
5351 break;
5352
5353 case OP_VSPACE:
5354 switch(c)
5355 {
5356 default: RRETURN(MATCH_NOMATCH);
5357 case 0x0a: /* LF */
5358 case 0x0b: /* VT */
5359 case 0x0c: /* FF */
5360 case 0x0d: /* CR */
5361 case 0x85: /* NEL */
5362 #ifdef COMPILE_PCRE16
5363 case 0x2028: /* LINE SEPARATOR */
5364 case 0x2029: /* PARAGRAPH SEPARATOR */
5365 #endif
5366 break;
5367 }
5368 break;
5369
5370 case OP_NOT_DIGIT:
5371 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5372 break;
5373
5374 case OP_DIGIT:
5375 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5376 break;
5377
5378 case OP_NOT_WHITESPACE:
5379 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5380 break;
5381
5382 case OP_WHITESPACE:
5383 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5384 break;
5385
5386 case OP_NOT_WORDCHAR:
5387 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5388 break;
5389
5390 case OP_WORDCHAR:
5391 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5392 break;
5393
5394 default:
5395 RRETURN(PCRE_ERROR_INTERNAL);
5396 }
5397 }
5398 }
5399 /* Control never gets here */
5400 }
5401
5402 /* If maximizing, it is worth using inline code for speed, doing the type
5403 test once at the start (i.e. keep it out of the loop). Again, keep the
5404 UTF-8 and UCP stuff separate. */
5405
5406 else
5407 {
5408 pp = eptr; /* Remember where we started */
5409
5410 #ifdef SUPPORT_UCP
5411 if (prop_type >= 0)
5412 {
5413 switch(prop_type)
5414 {
5415 case PT_ANY:
5416 for (i = min; i < max; i++)
5417 {
5418 int len = 1;
5419 if (eptr >= md->end_subject)
5420 {
5421 SCHECK_PARTIAL();
5422 break;
5423 }
5424 GETCHARLENTEST(c, eptr, len);
5425 if (prop_fail_result) break;
5426 eptr+= len;
5427 }
5428 break;
5429
5430 case PT_LAMP:
5431 for (i = min; i < max; i++)
5432 {
5433 int chartype;
5434 int len = 1;
5435 if (eptr >= md->end_subject)
5436 {
5437 SCHECK_PARTIAL();
5438 break;
5439 }
5440 GETCHARLENTEST(c, eptr, len);
5441 chartype = UCD_CHARTYPE(c);
5442 if ((chartype == ucp_Lu ||
5443 chartype == ucp_Ll ||
5444 chartype == ucp_Lt) == prop_fail_result)
5445 break;
5446 eptr+= len;
5447 }
5448 break;
5449
5450 case PT_GC:
5451 for (i = min; i < max; i++)
5452 {
5453 int len = 1;
5454 if (eptr >= md->end_subject)
5455 {
5456 SCHECK_PARTIAL();
5457 break;
5458 }
5459 GETCHARLENTEST(c, eptr, len);
5460 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5461 eptr+= len;
5462 }
5463 break;
5464
5465 case PT_PC:
5466 for (i = min; i < max; i++)
5467 {
5468 int len = 1;
5469 if (eptr >= md->end_subject)
5470 {
5471 SCHECK_PARTIAL();
5472 break;
5473 }
5474 GETCHARLENTEST(c, eptr, len);
5475 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5476 eptr+= len;
5477 }
5478 break;
5479
5480 case PT_SC:
5481 for (i = min; i < max; i++)
5482 {
5483 int len = 1;
5484 if (eptr >= md->end_subject)
5485 {
5486 SCHECK_PARTIAL();
5487 break;
5488 }
5489 GETCHARLENTEST(c, eptr, len);
5490 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5491 eptr+= len;
5492 }
5493 break;
5494
5495 case PT_ALNUM:
5496 for (i = min; i < max; i++)
5497 {
5498 int category;
5499 int len = 1;
5500 if (eptr >= md->end_subject)
5501 {
5502 SCHECK_PARTIAL();
5503 break;
5504 }
5505 GETCHARLENTEST(c, eptr, len);
5506 category = UCD_CATEGORY(c);
5507 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5508 break;
5509 eptr+= len;
5510 }
5511 break;
5512
5513 case PT_SPACE: /* Perl space */
5514 for (i = min; i < max; i++)
5515 {
5516 int len = 1;
5517 if (eptr >= md->end_subject)
5518 {
5519 SCHECK_PARTIAL();
5520 break;
5521 }
5522 GETCHARLENTEST(c, eptr, len);
5523 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5524 c == CHAR_FF || c == CHAR_CR)
5525 == prop_fail_result)
5526 break;
5527 eptr+= len;
5528 }
5529 break;
5530
5531 case PT_PXSPACE: /* POSIX space */
5532 for (i = min; i < max; i++)
5533 {
5534 int len = 1;
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 GETCHARLENTEST(c, eptr, len);
5541 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5542 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5543 == prop_fail_result)
5544 break;
5545 eptr+= len;
5546 }
5547 break;
5548
5549 case PT_WORD:
5550 for (i = min; i < max; i++)
5551 {
5552 int category;
5553 int len = 1;
5554 if (eptr >= md->end_subject)
5555 {
5556 SCHECK_PARTIAL();
5557 break;
5558 }
5559 GETCHARLENTEST(c, eptr, len);
5560 category = UCD_CATEGORY(c);
5561 if ((category == ucp_L || category == ucp_N ||
5562 c == CHAR_UNDERSCORE) == prop_fail_result)
5563 break;
5564 eptr+= len;
5565 }
5566 break;
5567
5568 default:
5569 RRETURN(PCRE_ERROR_INTERNAL);
5570 }
5571
5572 /* eptr is now past the end of the maximum run */
5573
5574 if (possessive) continue;
5575 for(;;)
5576 {
5577 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5579 if (eptr-- == pp) break; /* Stop if tried at original pos */
5580 if (utf) BACKCHAR(eptr);
5581 }
5582 }
5583
5584 /* Match extended Unicode sequences. We will get here only if the
5585 support is in the binary; otherwise a compile-time error occurs. */
5586
5587 else if (ctype == OP_EXTUNI)
5588 {
5589 for (i = min; i < max; i++)
5590 {
5591 int len = 1;
5592 if (eptr >= md->end_subject)
5593 {
5594 SCHECK_PARTIAL();
5595 break;
5596 }
5597 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5598 if (UCD_CATEGORY(c) == ucp_M) break;
5599 eptr += len;
5600 while (eptr < md->end_subject)
5601 {
5602 len = 1;
5603 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5604 if (UCD_CATEGORY(c) != ucp_M) break;
5605 eptr += len;
5606 }
5607 CHECK_PARTIAL();
5608 }
5609
5610 /* eptr is now past the end of the maximum run */
5611
5612 if (possessive) continue;
5613
5614 for(;;)
5615 {
5616 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5617 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5618 if (eptr-- == pp) break; /* Stop if tried at original pos */
5619 for (;;) /* Move back over one extended */
5620 {
5621 if (!utf) c = *eptr; else
5622 {
5623 BACKCHAR(eptr);
5624 GETCHAR(c, eptr);
5625 }
5626 if (UCD_CATEGORY(c) != ucp_M) break;
5627 eptr--;
5628 }
5629 }
5630 }
5631
5632 else
5633 #endif /* SUPPORT_UCP */
5634
5635 #ifdef SUPPORT_UTF
5636 if (utf)
5637 {
5638 switch(ctype)
5639 {
5640 case OP_ANY:
5641 if (max < INT_MAX)
5642 {
5643 for (i = min; i < max; i++)
5644 {
5645 if (eptr >= md->end_subject)
5646 {
5647 SCHECK_PARTIAL();
5648 break;
5649 }
5650 if (IS_NEWLINE(eptr)) break;
5651 if (md->partial != 0 && /* Take care with CRLF partial */
5652 eptr + 1 >= md->end_subject &&
5653 NLBLOCK->nltype == NLTYPE_FIXED &&
5654 NLBLOCK->nllen == 2 &&
5655 *eptr == NLBLOCK->nl[0])
5656 {
5657 md->hitend = TRUE;
5658 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5659 }
5660 eptr++;
5661 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5662 }
5663 }
5664
5665 /* Handle unlimited UTF-8 repeat */
5666
5667 else
5668 {
5669 for (i = min; i < max; i++)
5670 {
5671 if (eptr >= md->end_subject)
5672 {
5673 SCHECK_PARTIAL();
5674 break;
5675 }
5676 if (IS_NEWLINE(eptr)) break;
5677 if (md->partial != 0 && /* Take care with CRLF partial */
5678 eptr + 1 >= md->end_subject &&
5679 NLBLOCK->nltype == NLTYPE_FIXED &&
5680 NLBLOCK->nllen == 2 &&
5681 *eptr == NLBLOCK->nl[0])
5682 {
5683 md->hitend = TRUE;
5684 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5685 }
5686 eptr++;
5687 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5688 }
5689 }
5690 break;
5691
5692 case OP_ALLANY:
5693 if (max < INT_MAX)
5694 {
5695 for (i = min; i < max; i++)
5696 {
5697 if (eptr >= md->end_subject)
5698 {
5699 SCHECK_PARTIAL();
5700 break;
5701 }
5702 eptr++;
5703 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5704 }
5705 }
5706 else
5707 {
5708 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5709 SCHECK_PARTIAL();
5710 }
5711 break;
5712
5713 /* The byte case is the same as non-UTF8 */
5714
5715 case OP_ANYBYTE:
5716 c = max - min;
5717 if (c > (unsigned int)(md->end_subject - eptr))
5718 {
5719 eptr = md->end_subject;
5720 SCHECK_PARTIAL();
5721 }
5722 else eptr += c;
5723 break;
5724
5725 case OP_ANYNL:
5726 for (i = min; i < max; i++)
5727 {
5728 int len = 1;
5729 if (eptr >= md->end_subject)
5730 {
5731 SCHECK_PARTIAL();
5732 break;
5733 }
5734 GETCHARLEN(c, eptr, len);
5735 if (c == 0x000d)
5736 {
5737 if (++eptr >= md->end_subject) break;
5738 if (*eptr == 0x000a) eptr++;
5739 }
5740 else
5741 {
5742 if (c != 0x000a &&
5743 (md->bsr_anycrlf ||
5744 (c != 0x000b && c != 0x000c &&
5745 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5746 break;
5747 eptr += len;
5748 }
5749 }
5750 break;
5751
5752 case OP_NOT_HSPACE:
5753 case OP_HSPACE:
5754 for (i = min; i < max; i++)
5755 {
5756 BOOL gotspace;
5757 int len = 1;
5758 if (eptr >= md->end_subject)
5759 {
5760 SCHECK_PARTIAL();
5761 break;
5762 }
5763 GETCHARLEN(c, eptr, len);
5764 switch(c)
5765 {
5766 default: gotspace = FALSE; break;
5767 case 0x09: /* HT */
5768 case 0x20: /* SPACE */
5769 case 0xa0: /* NBSP */
5770 case 0x1680: /* OGHAM SPACE MARK */
5771 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5772 case 0x2000: /* EN QUAD */
5773 case 0x2001: /* EM QUAD */
5774 case 0x2002: /* EN SPACE */
5775 case 0x2003: /* EM SPACE */
5776 case 0x2004: /* THREE-PER-EM SPACE */
5777 case 0x2005: /* FOUR-PER-EM SPACE */
5778 case 0x2006: /* SIX-PER-EM SPACE */
5779 case 0x2007: /* FIGURE SPACE */
5780 case 0x2008: /* PUNCTUATION SPACE */
5781 case 0x2009: /* THIN SPACE */
5782 case 0x200A: /* HAIR SPACE */
5783 case 0x202f: /* NARROW NO-BREAK SPACE */
5784 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5785 case 0x3000: /* IDEOGRAPHIC SPACE */
5786 gotspace = TRUE;
5787 break;
5788 }
5789 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5790 eptr += len;
5791 }
5792 break;
5793
5794 case OP_NOT_VSPACE:
5795 case OP_VSPACE:
5796 for (i = min; i < max; i++)
5797 {
5798 BOOL gotspace;
5799 int len = 1;
5800 if (eptr >= md->end_subject)
5801 {
5802 SCHECK_PARTIAL();
5803 break;
5804 }
5805 GETCHARLEN(c, eptr, len);
5806 switch(c)
5807 {
5808 default: gotspace = FALSE; break;
5809 case 0x0a: /* LF */
5810 case 0x0b: /* VT */
5811 case 0x0c: /* FF */
5812 case 0x0d: /* CR */
5813 case 0x85: /* NEL */
5814 case 0x2028: /* LINE SEPARATOR */
5815 case 0x2029: /* PARAGRAPH SEPARATOR */
5816 gotspace = TRUE;
5817 break;
5818 }
5819 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5820 eptr += len;
5821 }
5822 break;
5823
5824 case OP_NOT_DIGIT:
5825 for (i = min; i < max; i++)
5826 {
5827 int len = 1;
5828 if (eptr >= md->end_subject)
5829 {
5830 SCHECK_PARTIAL();
5831 break;
5832 }
5833 GETCHARLEN(c, eptr, len);
5834 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5835 eptr+= len;
5836 }
5837 break;
5838
5839 case OP_DIGIT:
5840 for (i = min; i < max; i++)
5841 {
5842 int len = 1;
5843 if (eptr >= md->end_subject)
5844 {
5845 SCHECK_PARTIAL();
5846 break;
5847 }
5848 GETCHARLEN(c, eptr, len);
5849 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5850 eptr+= len;
5851 }
5852 break;
5853
5854 case OP_NOT_WHITESPACE:
5855 for (i = min; i < max; i++)
5856 {
5857 int len = 1;
5858 if (eptr >= md->end_subject)
5859 {
5860 SCHECK_PARTIAL();
5861 break;
5862 }
5863 GETCHARLEN(c, eptr, len);
5864 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5865 eptr+= len;
5866 }
5867 break;
5868
5869 case OP_WHITESPACE:
5870 for (i = min; i < max; i++)
5871 {
5872 int len = 1;
5873 if (eptr >= md->end_subject)
5874 {
5875 SCHECK_PARTIAL();
5876 break;
5877 }
5878 GETCHARLEN(c, eptr, len);
5879 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5880 eptr+= len;
5881 }
5882 break;
5883
5884 case OP_NOT_WORDCHAR:
5885 for (i = min; i < max; i++)
5886 {
5887 int len = 1;
5888 if (eptr >= md->end_subject)
5889 {
5890 SCHECK_PARTIAL();
5891 break;
5892 }
5893 GETCHARLEN(c, eptr, len);
5894 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5895 eptr+= len;
5896 }
5897 break;
5898
5899 case OP_WORDCHAR:
5900 for (i = min; i < max; i++)
5901 {
5902 int len = 1;
5903 if (eptr >= md->end_subject)
5904 {
5905 SCHECK_PARTIAL();
5906 break;
5907 }
5908 GETCHARLEN(c, eptr, len);
5909 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5910 eptr+= len;
5911 }
5912 break;
5913
5914 default:
5915 RRETURN(PCRE_ERROR_INTERNAL);
5916 }
5917
5918 /* eptr is now past the end of the maximum run. If possessive, we are
5919 done (no backing up). Otherwise, match at this position; anything other
5920 than no match is immediately returned. For nomatch, back up one
5921 character, unless we are matching \R and the last thing matched was
5922 \r\n, in which case, back up two bytes. */
5923
5924 if (possessive) continue;
5925 for(;;)
5926 {
5927 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5928 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5929 if (eptr-- == pp) break; /* Stop if tried at original pos */
5930 BACKCHAR(eptr);
5931 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5932 eptr[-1] == '\r') eptr--;
5933 }
5934 }
5935 else
5936 #endif /* SUPPORT_UTF */
5937 /* Not UTF mode */
5938 {
5939 switch(ctype)
5940 {
5941 case OP_ANY:
5942 for (i = min; i < max; i++)
5943 {
5944 if (eptr >= md->end_subject)
5945 {
5946 SCHECK_PARTIAL();
5947 break;
5948 }
5949 if (IS_NEWLINE(eptr)) break;
5950 if (md->partial != 0 && /* Take care with CRLF partial */
5951 eptr + 1 >= md->end_subject &&
5952 NLBLOCK->nltype == NLTYPE_FIXED &&
5953 NLBLOCK->nllen == 2 &&
5954 *eptr == NLBLOCK->nl[0])
5955 {
5956 md->hitend = TRUE;
5957 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5958 }
5959 eptr++;
5960 }
5961 break;
5962
5963 case OP_ALLANY:
5964 case OP_ANYBYTE:
5965 c = max - min;
5966 if (c > (unsigned int)(md->end_subject - eptr))
5967 {
5968 eptr = md->end_subject;
5969 SCHECK_PARTIAL();
5970 }
5971 else eptr += c;
5972 break;
5973
5974 case OP_ANYNL:
5975 for (i = min; i < max; i++)
5976 {
5977 if (eptr >= md->end_subject)
5978 {
5979 SCHECK_PARTIAL();
5980 break;
5981 }
5982 c = *eptr;
5983 if (c == 0x000d)
5984 {
5985 if (++eptr >= md->end_subject) break;
5986 if (*eptr == 0x000a) eptr++;
5987 }
5988 else
5989 {
5990 if (c != 0x000a && (md->bsr_anycrlf ||
5991 (c != 0x000b && c != 0x000c && c != 0x0085
5992 #ifdef COMPILE_PCRE16
5993 && c != 0x2028 && c != 0x2029
5994 #endif
5995 ))) break;
5996 eptr++;
5997 }
5998 }
5999 break;
6000
6001 case OP_NOT_HSPACE:
6002 for (i = min; i < max; i++)
6003 {
6004 if (eptr >= md->end_subject)
6005 {
6006 SCHECK_PARTIAL();
6007 break;
6008 }
6009 c = *eptr;
6010 if (c == 0x09 || c == 0x20 || c == 0xa0
6011 #ifdef COMPILE_PCRE16
6012 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6013 || c == 0x202f || c == 0x205f || c == 0x3000
6014 #endif
6015 ) break;
6016 eptr++;
6017 }
6018 break;
6019
6020 case OP_HSPACE:
6021 for (i = min; i < max; i++)
6022 {
6023 if (eptr >= md->end_subject)
6024 {
6025 SCHECK_PARTIAL();
6026 break;
6027 }
6028 c = *eptr;
6029 if (c != 0x09 && c != 0x20 && c != 0xa0
6030 #ifdef COMPILE_PCRE16
6031 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6032 && c != 0x202f && c != 0x205f && c != 0x3000
6033 #endif
6034 ) break;
6035 eptr++;
6036 }
6037 break;
6038
6039 case OP_NOT_VSPACE:
6040 for (i = min; i < max; i++)
6041 {
6042 if (eptr >= md->end_subject)
6043 {
6044 SCHECK_PARTIAL();
6045 break;
6046 }
6047 c = *eptr;
6048 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6049 #ifdef COMPILE_PCRE16
6050 || c == 0x2028 || c == 0x2029
6051 #endif
6052 ) break;
6053 eptr++;
6054 }
6055 break;
6056
6057 case OP_VSPACE:
6058 for (i = min; i < max; i++)
6059 {
6060 if (eptr >= md->end_subject)
6061 {
6062 SCHECK_PARTIAL();
6063 break;
6064 }
6065 c = *eptr;
6066 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6067 #ifdef COMPILE_PCRE16
6068 && c != 0x2028 && c != 0x2029
6069 #endif
6070 ) break;
6071 eptr++;
6072 }
6073 break;
6074
6075 case OP_NOT_DIGIT:
6076 for (i = min; i < max; i++)
6077 {
6078 if (eptr >= md->end_subject)
6079 {
6080 SCHECK_PARTIAL();
6081 break;
6082 }
6083 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6084 eptr++;
6085 }
6086 break;
6087
6088 case OP_DIGIT:
6089 for (i = min; i < max; i++)
6090 {
6091 if (eptr >= md->end_subject)
6092 {
6093 SCHECK_PARTIAL();
6094 break;
6095 }
6096 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6097 eptr++;
6098 }
6099 break;
6100
6101 case OP_NOT_WHITESPACE:
6102 for (i = min; i < max; i++)
6103 {
6104 if (eptr >= md->end_subject)
6105 {
6106 SCHECK_PARTIAL();
6107 break;
6108 }
6109 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6110 eptr++;
6111 }
6112 break;
6113
6114 case OP_WHITESPACE:
6115 for (i = min; i < max; i++)
6116 {
6117 if (eptr >= md->end_subject)
6118 {
6119 SCHECK_PARTIAL();
6120 break;
6121 }
6122 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6123 eptr++;
6124 }
6125 break;
6126
6127 case OP_NOT_WORDCHAR:
6128 for (i = min; i < max; i++)
6129 {
6130 if (eptr >= md->end_subject)
6131 {
6132 SCHECK_PARTIAL();
6133 break;
6134 }
6135 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6136 eptr++;
6137 }
6138 break;
6139
6140 case OP_WORDCHAR:
6141 for (i = min; i < max; i++)
6142 {
6143 if (eptr >= md->end_subject)
6144 {
6145 SCHECK_PARTIAL();
6146 break;
6147 }
6148 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6149 eptr++;
6150 }
6151 break;
6152
6153 default:
6154 RRETURN(PCRE_ERROR_INTERNAL);
6155 }
6156
6157 /* eptr is now past the end of the maximum run. If possessive, we are
6158 done (no backing up). Otherwise, match at this position; anything other
6159 than no match is immediately returned. For nomatch, back up one
6160 character (byte), unless we are matching \R and the last thing matched
6161 was \r\n, in which case, back up two bytes. */
6162
6163 if (possessive) continue;
6164 while (eptr >= pp)
6165 {
6166 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6167 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6168 eptr--;
6169 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6170 eptr[-1] == '\r') eptr--;
6171 }
6172 }
6173
6174 /* Get here if we can't make it match with any permitted repetitions */
6175
6176 RRETURN(MATCH_NOMATCH);
6177 }
6178 /* Control never gets here */
6179
6180 /* There's been some horrible disaster. Arrival here can only mean there is
6181 something seriously wrong in the code above or the OP_xxx definitions. */
6182
6183 default:
6184 DPRINTF(("Unknown opcode %d\n", *ecode));
6185 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6186 }
6187
6188 /* Do not stick any code in here without much thought; it is assumed
6189 that "continue" in the code above comes out to here to repeat the main
6190 loop. */
6191
6192 } /* End of main loop */
6193 /* Control never reaches here */
6194
6195
6196 /* When compiling to use the heap rather than the stack for recursive calls to
6197 match(), the RRETURN() macro jumps here. The number that is saved in
6198 frame->Xwhere indicates which label we actually want to return to. */
6199
6200 #ifdef NO_RECURSE
6201 #define LBL(val) case val: goto L_RM##val;
6202 HEAP_RETURN:
6203 switch (frame->Xwhere)
6204 {
6205 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6206 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6207 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6208 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6209 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6210 LBL(65) LBL(66)
6211 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6212 LBL(21)
6213 #endif
6214 #ifdef SUPPORT_UTF
6215 LBL(16) LBL(18) LBL(20)
6216 LBL(22) LBL(23) LBL(28) LBL(30)
6217 LBL(32) LBL(34) LBL(42) LBL(46)
6218 #ifdef SUPPORT_UCP
6219 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6220 LBL(59) LBL(60) LBL(61) LBL(62)
6221 #endif /* SUPPORT_UCP */
6222 #endif /* SUPPORT_UTF */
6223 default:
6224 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6225
6226 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6227
6228 return PCRE_ERROR_INTERNAL;
6229 }
6230 #undef LBL
6231 #endif /* NO_RECURSE */
6232 }
6233
6234
6235 /***************************************************************************
6236 ****************************************************************************
6237 RECURSION IN THE match() FUNCTION
6238
6239 Undefine all the macros that were defined above to handle this. */
6240
6241 #ifdef NO_RECURSE
6242 #undef eptr
6243 #undef ecode
6244 #undef mstart
6245 #undef offset_top
6246 #undef eptrb
6247 #undef flags
6248
6249 #undef callpat
6250 #undef charptr
6251 #undef data
6252 #undef next
6253 #undef pp
6254 #undef prev
6255 #undef saved_eptr
6256
6257 #undef new_recursive
6258
6259 #undef cur_is_word
6260 #undef condition
6261 #undef prev_is_word
6262
6263 #undef ctype
6264 #undef length
6265 #undef max
6266 #undef min
6267 #undef number
6268 #undef offset
6269 #undef op
6270 #undef save_capture_last
6271 #undef save_offset1
6272 #undef save_offset2
6273 #undef save_offset3
6274 #undef stacksave
6275
6276 #undef newptrb
6277
6278 #endif
6279
6280 /* fc and fi are defined as macros both with and without NO_RECURSE, so they are undefined unconditionally. */
6281
6282 #undef fc
6283 #undef fi
6284
6285 /***************************************************************************
6286 ***************************************************************************/
6287
6288
6289 #ifdef NO_RECURSE
6290 /*************************************************
6291 * Release allocated heap frames *
6292 *************************************************/
6293
6294 /* This function releases all the allocated frames. The base frame is on the
6295 machine stack, and so must not be freed.
6296
6297 Argument: the address of the base frame
6298 Returns: nothing
6299 */
6300
6301 static void
6302 release_match_heapframes (heapframe *frame_base)
6303 {
6304 heapframe *nextframe = frame_base->Xnextframe;
6305 while (nextframe != NULL)
6306 {
6307 heapframe *oldframe = nextframe;
6308 nextframe = nextframe->Xnextframe;
6309 (PUBL(stack_free))(oldframe);
6310 }
6311 }
6312 #endif
6313
6314
6315 /*************************************************
6316 * Execute a Regular Expression *
6317 *************************************************/
6318
6319 /* This function applies a compiled re to a subject string and picks out
6320 portions of the string if it matches. Two elements in the vector are set for
6321 each substring: the offsets to the start and end of the substring.
6322
6323 Arguments:
6324 argument_re points to the compiled expression
6325 extra_data points to extra data or is NULL
6326 subject points to the subject string
6327 length length of subject string (may contain binary zeros)
6328 start_offset where to start in the subject string
6329 options option bits
6330 offsets points to a vector of ints to be filled in with offsets
6331 offsetcount the number of elements in the vector
6332
6333 Returns: > 0 => success; value is the number of elements filled in
6334 = 0 => success, but offsets is not big enough
6335 -1 => failed to match
6336 < -1 => some kind of unexpected problem
6337 */
6338
6339 #ifdef COMPILE_PCRE8
6340 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6341 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6342 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6343 int offsetcount)
6344 #else
6345 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6346 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6347 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6348 int offsetcount)
6349 #endif
6350 {
6351 int rc, ocount, arg_offset_max;
6352 int newline;
6353 BOOL using_temporary_offsets = FALSE;
6354 BOOL anchored;
6355 BOOL startline;
6356 BOOL firstline;
6357 BOOL utf;
6358 BOOL has_first_char = FALSE;
6359 BOOL has_req_char = FALSE;
6360 pcre_uchar first_char = 0;
6361 pcre_uchar first_char2 = 0;
6362 pcre_uchar req_char = 0;
6363 pcre_uchar req_char2 = 0;
6364 match_data match_block;
6365 match_data *md = &match_block;
6366 const pcre_uint8 *tables;
6367 const pcre_uint8 *start_bits = NULL;
6368 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6369 PCRE_PUCHAR end_subject;
6370 PCRE_PUCHAR start_partial = NULL;
6371 PCRE_PUCHAR req_char_ptr = start_match - 1;
6372
6373 const pcre_study_data *study;
6374 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6375
6376 #ifdef NO_RECURSE
6377 heapframe frame_zero;
6378 frame_zero.Xprevframe = NULL; /* Marks the top level */
6379 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6380 md->match_frames_base = &frame_zero;
6381 #endif
6382
6383 /* Check for the special magic call that measures the size of the stack used
6384 per recursive call of match(). Without the funny casting for sizeof, a Windows
6385 compiler gave this error: "unary minus operator applied to unsigned type,
6386 result still unsigned". Hopefully the cast fixes that. */
6387
6388 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6389 start_offset == -999)
6390 #ifdef NO_RECURSE
6391 return -((int)sizeof(heapframe));
6392 #else
6393 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6394 #endif
6395
6396 /* Plausibility checks */
6397
6398 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6399 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6400 return PCRE_ERROR_NULL;
6401 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6402 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6403
6404 /* Check that the first field in the block is the magic number. If it is not,
6405 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6406 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6407 means that the pattern is likely compiled with different endianness. */
6408
6409 if (re->magic_number != MAGIC_NUMBER)
6410 return re->magic_number == REVERSED_MAGIC_NUMBER?
6411 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6412 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6413
6414 /* These two settings are used in the code for checking a UTF-8 string that
6415 follows immediately afterwards. Other values in the md block are used only
6416 during "normal" pcre_exec() processing, not when the JIT support is in use,
6417 so they are set up later. */
6418
6419 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6420 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6421 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6422 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6423
6424 /* Check a UTF-8 string if required. Pass back the character offset and error
6425 code for an invalid string if a results vector is available. */
6426
6427 #ifdef SUPPORT_UTF
6428 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6429 {
6430 int erroroffset;
6431 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6432 if (errorcode != 0)
6433 {
6434 if (offsetcount >= 2)
6435 {
6436 offsets[0] = erroroffset;
6437 offsets[1] = errorcode;
6438 }
6439 #ifdef COMPILE_PCRE16
6440 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6441 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6442 #else
6443 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6444 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6445 #endif
6446 }
6447
6448 /* Check that a start_offset points to the start of a UTF character. */
6449 if (start_offset > 0 && start_offset < length &&
6450 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6451 return PCRE_ERROR_BADUTF8_OFFSET;
6452 }
6453 #endif
6454
6455 /* If the pattern was successfully studied with JIT support, run the JIT
6456 executable instead of the rest of this function. Most options must be set at
6457 compile time for the JIT code to be usable. Fallback to the normal code path if
6458 an unsupported flag is set. */
6459
6460 #ifdef SUPPORT_JIT
6461 if (extra_data != NULL
6462 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6463 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6464 && extra_data->executable_jit != NULL
6465 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6466 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6467 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6468 {
6469 rc = PRIV(jit_exec)(re, extra_data->executable_jit,
6470 (const pcre_uchar *)subject, length, start_offset, options,
6471 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6472 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount,
6473 ((extra_data->flags & PCRE_EXTRA_MARK) != 0) ? extra_data->mark : NULL);
6474
6475 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6476 mode is not compiled. In this case we simply fallback to interpreter. */
6477
6478 if (rc != PCRE_ERROR_NULL) return rc;
6479 }
6480 #endif
6481
6482 /* Carry on with non-JIT matching. This information is for finding all the
6483 numbers associated with a given name, for condition testing. */
6484
6485 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6486 md->name_count = re->name_count;
6487 md->name_entry_size = re->name_entry_size;
6488
6489 /* Fish out the optional data from the extra_data structure, first setting
6490 the default values. */
6491
6492 study = NULL;
6493 md->match_limit = MATCH_LIMIT;
6494 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6495 md->callout_data = NULL;
6496
6497 /* The table pointer is always in native byte order. */
6498
6499 tables = re->tables;
6500
6501 if (extra_data != NULL)
6502 {
6503 register unsigned int flags = extra_data->flags;
6504 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6505 study = (const pcre_study_data *)extra_data->study_data;
6506 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6507 md->match_limit = extra_data->match_limit;
6508 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6509 md->match_limit_recursion = extra_data->match_limit_recursion;
6510 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6511 md->callout_data = extra_data->callout_data;
6512 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6513 }
6514
6515 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6516 is a feature that makes it possible to save compiled regex and re-use them
6517 in other programs later. */
6518
6519 if (tables == NULL) tables = PRIV(default_tables);
6520
6521 /* Set up other data */
6522
6523 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6524 startline = (re->flags & PCRE_STARTLINE) != 0;
6525 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6526
6527 /* The code starts after the real_pcre block and the capture name table. */
6528
6529 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6530 re->name_count * re->name_entry_size;
6531
6532 md->start_subject = (PCRE_PUCHAR)subject;
6533 md->start_offset = start_offset;
6534 md->end_subject = md->start_subject + length;
6535 end_subject = md->end_subject;
6536
6537 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6538 md->use_ucp = (re->options & PCRE_UCP) != 0;
6539 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6540 md->ignore_skip_arg = FALSE;
6541
6542 /* Some options are unpacked into BOOL variables in the hope that testing
6543 them will be faster than individual option bits. */
6544
6545 md->notbol = (options & PCRE_NOTBOL) != 0;
6546 md->noteol = (options & PCRE_NOTEOL) != 0;
6547 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6548 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6549
6550 md->hitend = FALSE;
6551 md->mark = md->nomatch_mark = NULL; /* In case never set */
6552
6553 md->recursive = NULL; /* No recursion at top level */
6554 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6555
6556 md->lcc = tables + lcc_offset;
6557 md->fcc = tables + fcc_offset;
6558 md->ctypes = tables + ctypes_offset;
6559
6560 /* Handle different \R options. */
6561
6562 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6563 {
6564 case 0:
6565 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6566 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6567 else
6568 #ifdef BSR_ANYCRLF
6569 md->bsr_anycrlf = TRUE;
6570 #else
6571 md->bsr_anycrlf = FALSE;
6572 #endif
6573 break;
6574
6575 case PCRE_BSR_ANYCRLF:
6576 md->bsr_anycrlf = TRUE;
6577 break;
6578
6579 case PCRE_BSR_UNICODE:
6580 md->bsr_anycrlf = FALSE;
6581 break;
6582
6583 default: return PCRE_ERROR_BADNEWLINE;
6584 }
6585
6586 /* Handle different types of newline. The three bits give eight cases. If
6587 nothing is set at run time, whatever was used at compile time applies. */
6588
6589 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6590 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6591 {
6592 case 0: newline = NEWLINE; break; /* Compile-time default */
6593 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6594 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6595 case PCRE_NEWLINE_CR+
6596 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6597 case PCRE_NEWLINE_ANY: newline = -1; break;
6598 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6599 default: return PCRE_ERROR_BADNEWLINE;
6600 }
6601
6602 if (newline == -2)
6603 {
6604 md->nltype = NLTYPE_ANYCRLF;
6605 }
6606 else if (newline < 0)
6607 {
6608 md->nltype = NLTYPE_ANY;
6609 }
6610 else
6611 {
6612 md->nltype = NLTYPE_FIXED;
6613 if (newline > 255)
6614 {
6615 md->nllen = 2;
6616 md->nl[0] = (newline >> 8) & 255;
6617 md->nl[1] = newline & 255;
6618 }
6619 else
6620 {
6621 md->nllen = 1;
6622 md->nl[0] = newline;
6623 }
6624 }
6625
6626 /* Partial matching was originally supported only for a restricted set of
6627 regexes; from release 8.00 there are no restrictions, but the bits are still
6628 defined (though never set). So there's no harm in leaving this code. */
6629
6630 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6631 return PCRE_ERROR_BADPARTIAL;
6632
6633 /* If the expression has got more back references than the offsets supplied can
6634 hold, we get a temporary chunk of working store to use during the matching.
6635 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6636 of 3. */
6637
6638 ocount = offsetcount - (offsetcount % 3);
6639 arg_offset_max = (2*ocount)/3;
6640
6641 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6642 {
6643 ocount = re->top_backref * 3 + 3;
6644 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6645 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6646 using_temporary_offsets = TRUE;
6647 DPRINTF(("Got memory to hold back references\n"));
6648 }
6649 else md->offset_vector = offsets;
6650
6651 md->offset_end = ocount;
6652 md->offset_max = (2*ocount)/3;
6653 md->offset_overflow = FALSE;
6654 md->capture_last = -1;
6655
6656 /* Reset the working variable associated with each extraction. These should
6657 never be used unless previously set, but they get saved and restored, and so we
6658 initialize them to avoid reading uninitialized locations. Also, unset the
6659 offsets for the matched string. This is really just for tidiness with callouts,
6660 in case they inspect these fields. */
6661
6662 if (md->offset_vector != NULL)
6663 {
6664 register int *iptr = md->offset_vector + ocount;
6665 register int *iend = iptr - re->top_bracket;
6666 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6667 while (--iptr >= iend) *iptr = -1;
6668 md->offset_vector[0] = md->offset_vector[1] = -1;
6669 }
6670
6671 /* Set up the first character to match, if available. The first_char value is
6672 never set for an anchored regular expression, but the anchoring may be forced
6673 at run time, so we have to test for anchoring. The first char may be unset for
6674 an unanchored pattern, of course. If there's no first char and the pattern was
6675 studied, there may be a bitmap of possible first characters. */
6676
6677 if (!anchored)
6678 {
6679 if ((re->flags & PCRE_FIRSTSET) != 0)
6680 {
6681 has_first_char = TRUE;
6682 first_char = first_char2 = (pcre_uchar)(re->first_char);
6683 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6684 {
6685 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6686 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6687 if (utf && first_char > 127)
6688 first_char2 = UCD_OTHERCASE(first_char);
6689 #endif
6690 }
6691 }
6692 else
6693 if (!startline && study != NULL &&
6694 (study->flags & PCRE_STUDY_MAPPED) != 0)
6695 start_bits = study->start_bits;
6696 }
6697
6698 /* For anchored or unanchored matches, there may be a "last known required
6699 character" set. */
6700
6701 if ((re->flags & PCRE_REQCHSET) != 0)
6702 {
6703 has_req_char = TRUE;
6704 req_char = req_char2 = (pcre_uchar)(re->req_char);
6705 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6706 {
6707 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6708 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6709 if (utf && req_char > 127)
6710 req_char2 = UCD_OTHERCASE(req_char);
6711 #endif
6712 }
6713 }
6714
6715
6716 /* ==========================================================================*/
6717
6718 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6719 the loop runs just once. */
6720
6721 for(;;)
6722 {
6723 PCRE_PUCHAR save_end_subject = end_subject;
6724 PCRE_PUCHAR new_start_match;
6725
6726 /* If firstline is TRUE, the start of the match is constrained to the first
6727 line of a multiline string. That is, the match must be before or at the first
6728 newline. Implement this by temporarily adjusting end_subject so that we stop
6729 scanning at a newline. If the match fails at the newline, later code breaks
6730 this loop. */
6731
6732 if (firstline)
6733 {
6734 PCRE_PUCHAR t = start_match;
6735 #ifdef SUPPORT_UTF
6736 if (utf)
6737 {
6738 while (t < md->end_subject && !IS_NEWLINE(t))
6739 {
6740 t++;
6741 ACROSSCHAR(t < end_subject, *t, t++);
6742 }
6743 }
6744 else