/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 934 - (show annotations)
Sat Feb 25 12:30:36 2012 UTC (7 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 218923 byte(s)
Error occurred while calculating annotation data.
Stop (*COMMIT) escaping from a recursive subroutine call.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
/* NOTE(review): each table has six entries; the values are consistent with the
quantifiers *, *?, +, +?, ?, ?? in that order, but the code that indexes these
tables is not in this part of the file - confirm against their users. */
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
/* Clamp the count to what is left of the subject. md is only dereferenced when
is_subject is TRUE. The pointer difference is narrowed explicitly because
length is an int. */
119 if (is_subject && length > md->end_subject - p) length = (int)(md->end_subject - p);
120 while (length-- > 0)
/* isprint() is only defined for arguments representable as unsigned char (or
EOF), so any wider code unit is always shown in hex instead of being passed to
it. */
121 if ((c = *(p++)) <= 0xff && isprint(c)) printf("%c", (int)c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: >= 0 the number of subject bytes matched
144 -1 no match
145 -2 partial match; always given if at end subject
146 */
147
148 static int
149 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
150 BOOL caseless)
151 {
/* eptr_start remembers where matching began so that the distance advanced can
be returned at the end. p walks the previously captured text, which starts at
offset_vector[offset] within the subject. */
152 PCRE_PUCHAR eptr_start = eptr;
153 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
154
155 #ifdef PCRE_DEBUG
156 if (eptr >= md->end_subject)
157 printf("matching subject <null>");
158 else
159 {
160 printf("matching subject ");
161 pchars(eptr, length, TRUE, md);
162 }
163 printf(" against backref ");
164 pchars(p, length, FALSE, md);
165 printf("\n");
166 #endif
167
168 /* Always fail if reference not set (and not JavaScript compatible - in that
169 case the length is passed as zero). */
170
171 if (length < 0) return -1;
172
173 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
174 properly if Unicode properties are supported. Otherwise, we can check only
175 ASCII characters. */
176
177 if (caseless)
178 {
179 #ifdef SUPPORT_UTF
180 #ifdef SUPPORT_UCP
181 if (md->utf)
182 {
183 /* Match characters up to the end of the reference. NOTE: the number of
184 bytes matched may differ, because there are some characters whose upper and
185 lower case versions code as different numbers of bytes. For example, U+023A
186 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
187 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
188 the latter. It is important, therefore, to check the length along the
189 reference, not along the subject (earlier code did this wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 int c, d;
195 if (eptr >= md->end_subject) return -2; /* Partial match */
196 GETCHARINC(c, eptr);
197 GETCHARINC(d, p);
198 if (c != d && c != UCD_OTHERCASE(d)) return -1;
199 }
200 }
201 else
202 #endif
203 #endif
/* When SUPPORT_UTF and SUPPORT_UCP are both defined, the braced block below is
the else-branch of the md->utf test above. In any other build the test and the
"else" are compiled out, so the block runs for every caseless comparison. */
204
205 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
206 is no UCP support. */
207 {
208 while (length-- > 0)
209 {
210 if (eptr >= md->end_subject) return -2; /* Partial match */
211 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
212 p++;
213 eptr++;
214 }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 while (length-- > 0)
224 {
225 if (eptr >= md->end_subject) return -2; /* Partial match */
226 if (*p++ != *eptr++) return -1;
227 }
228 }
229
/* Everything compared equal: report how far eptr advanced. In the UTF caseless
branch this need not equal length, for the reason given in that branch. */
230 return (int)(eptr - eptr_start);
231 }
232
233
234
235 /***************************************************************************
236 ****************************************************************************
237 RECURSION IN THE match() FUNCTION
238
239 The match() function is highly recursive, though not every recursive call
240 increases the recursive depth. Nevertheless, some regular expressions can cause
241 it to recurse to a great depth. I was writing for Unix, so I just let it call
242 itself recursively. This uses the stack for saving everything that has to be
243 saved for a recursive call. On Unix, the stack can be large, and this works
244 fine.
245
246 It turns out that on some non-Unix-like systems there are problems with
247 programs that use a lot of stack. (This despite the fact that every last chip
248 has oodles of memory these days, and techniques for extending the stack have
249 been known for decades.) So....
250
251 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
252 calls by keeping local variables that need to be preserved in blocks of memory
253 obtained from malloc() instead of on the stack. Macros are used to
254 achieve this so that the actual code doesn't look very different to what it
255 always used to.
256
257 The original heap-recursive code used longjmp(). However, it seems that this
258 can be very slow on some operating systems. Following a suggestion from Stan
259 Switzer, the use of longjmp() has been abolished, at the cost of having to
260 provide a unique number for each call to RMATCH. There is no way of generating
261 a sequence of numbers at compile time in C. I have given them names, to make
262 them stand out more clearly.
263
264 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
265 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
266 tests. Furthermore, not using longjmp() means that local dynamic variables
267 don't have indeterminate values; this has meant that the frame size can be
268 reduced because the result can be "passed back" by straight setting of the
269 variable instead of being passed in the frame.
270 ****************************************************************************
271 ***************************************************************************/
272
273 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
274 below must be updated in sync. */
275
/* One of these names is passed as the final ("rw") argument of each RMATCH
call. The NO_RECURSE version of RMATCH stores the value in frame->Xwhere and
pastes the name into the resume label L_<name>; the ordinary recursive version
ignores the argument. Numbering starts at 1. */
276 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
277 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
278 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
279 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
280 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
281 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
282 RM61, RM62, RM63, RM64, RM65, RM66 };
283
284 /* These versions of the macros use the stack, as normal. There are debugging
285 versions and production versions. Note that the "rw" argument of RMATCH isn't
286 actually used in this definition. */
287
288 #ifndef NO_RECURSE
289 #define REGISTER register
290
/* Debugging variants: a genuine recursive call to match(), bracketed by printf
tracing of the source line. The result is left in rrc. mstart and rdepth are
not macro arguments; they are picked up from the scope in which the macro is
expanded, and the depth passed down is rdepth+1. */
291 #ifdef PCRE_DEBUG
292 #define RMATCH(ra,rb,rc,rd,re,rw) \
293 { \
294 printf("match() called in line %d\n", __LINE__); \
295 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
296 printf("to line %d\n", __LINE__); \
297 }
/* Debugging RRETURN: report the value and source line, then return it. Note
that "ra" is expanded twice. */
298 #define RRETURN(ra) \
299 { \
300 printf("match() returned %d from line %d ", ra, __LINE__); \
301 return ra; \
302 }
303 #else
/* Production variants: the same recursive call and return, without tracing.
Here RMATCH expands to a single assignment expression with no braces and no
trailing semicolon. */
304 #define RMATCH(ra,rb,rc,rd,re,rw) \
305 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
306 #define RRETURN(ra) return ra
307 #endif
308
309 #else
310
311
312 /* These versions of the macros manage a private stack on the heap. Note that
313 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
314 argument of match(), which never changes. */
315
316 #define REGISTER
317
/* Heap version of RMATCH. It reuses the frame already chained after the
current one, or obtains a new one through PUBL(stack_malloc) and links it in
(returning PCRE_ERROR_NOMEMORY if that fails). It then records the resume point
"rw" in the current frame's Xwhere, copies the call arguments and the current
mstart into the new frame, makes the new frame current, and jumps to
HEAP_RECURSE. The label L_<rw> that follows marks where execution continues
once the "call" has returned; the dispatch that jumps to it (at HEAP_RETURN) is
not in this part of the file. */
318 #define RMATCH(ra,rb,rc,rd,re,rw)\
319 {\
320 heapframe *newframe = frame->Xnextframe;\
321 if (newframe == NULL)\
322 {\
323 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
324 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
325 newframe->Xnextframe = NULL;\
326 frame->Xnextframe = newframe;\
327 }\
328 frame->Xwhere = rw;\
329 newframe->Xeptr = ra;\
330 newframe->Xecode = rb;\
331 newframe->Xmstart = mstart;\
332 newframe->Xoffset_top = rc;\
333 newframe->Xeptrb = re;\
334 newframe->Xrdepth = frame->Xrdepth + 1;\
335 newframe->Xprevframe = frame;\
336 frame = newframe;\
337 DPRINTF(("restarting from line %d\n", __LINE__));\
338 goto HEAP_RECURSE;\
339 L_##rw:\
340 DPRINTF(("jumped back to line %d\n", __LINE__));\
341 }
342
/* Heap version of RRETURN. It steps back to the previous frame; if there is
one, the result is handed over in rrc and control jumps to HEAP_RETURN,
otherwise the function really returns. Nothing is freed here: the frame that
was just left stays linked through Xnextframe, which is what lets RMATCH reuse
it. */
343 #define RRETURN(ra)\
344 {\
345 heapframe *oldframe = frame;\
346 frame = oldframe->Xprevframe;\
347 if (frame != NULL)\
348 {\
349 rrc = ra;\
350 goto HEAP_RETURN;\
351 }\
352 return ra;\
353 }
354
355
356 /* Structure for remembering the local variables in a private frame */
357
/* One frame of the private "stack" used when NO_RECURSE is defined. Inside
match() each X-prefixed member is reached through a macro of the same name
without the X (for example, "length" expands to frame->Xlength). */
358 typedef struct heapframe {
/* Chain links. RMATCH sets Xprevframe to the calling frame and RRETURN follows
it back; Xnextframe holds the frame that RMATCH will reuse for the next "call"
from this level, or NULL if none has been allocated yet. */
359 struct heapframe *Xprevframe;
360 struct heapframe *Xnextframe;
361
362 /* Function arguments that may change */
363
364 PCRE_PUCHAR Xeptr;
365 const pcre_uchar *Xecode;
366 PCRE_PUCHAR Xmstart;
367 int Xoffset_top;
368 eptrblock *Xeptrb;
369 unsigned int Xrdepth;
370
371 /* Function local variables */
372
373 PCRE_PUCHAR Xcallpat;
374 #ifdef SUPPORT_UTF
375 PCRE_PUCHAR Xcharptr;
376 #endif
377 PCRE_PUCHAR Xdata;
378 PCRE_PUCHAR Xnext;
379 PCRE_PUCHAR Xpp;
380 PCRE_PUCHAR Xprev;
381 PCRE_PUCHAR Xsaved_eptr;
382
383 recursion_info Xnew_recursive;
384
385 BOOL Xcur_is_word;
386 BOOL Xcondition;
387 BOOL Xprev_is_word;
388
389 #ifdef SUPPORT_UCP
390 int Xprop_type;
391 int Xprop_value;
392 int Xprop_fail_result;
393 int Xoclength;
394 pcre_uchar Xocchars[6];
395 #endif
396
397 int Xcodelink;
398 int Xctype;
399 unsigned int Xfc;
400 int Xfi;
401 int Xlength;
402 int Xmax;
403 int Xmin;
404 int Xnumber;
405 int Xoffset;
406 int Xop;
407 int Xsave_capture_last;
408 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
409 int Xstacksave[REC_STACK_SAVE_MAX];
410
411 eptrblock Xnewptrb;
412
413 /* Where to jump back to */
414
/* Set by RMATCH to its "rw" argument (one of the RMn values) just before it
switches to the new frame. */
415 int Xwhere;
416
417 } heapframe;
418
419 #endif
420
421
422 /***************************************************************************
423 ***************************************************************************/
424
425
426
427 /*************************************************
428 * Match from current position *
429 *************************************************/
430
431 /* This function is called recursively in many circumstances. Whenever it
432 returns a negative (error) response, the outer incarnation must also return the
433 same response. */
434
435 /* These macros pack up tests that are used for partial matching, and which
436 appear several times in the code. We set the "hit end" flag if the pointer is
437 at the end of the subject and also past the start of the subject (i.e.
438 something has been matched). For hard partial matching, we then return
439 immediately. The second one is used when we already know we are past the end of
440 the subject. */
441
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0) and eptr
is at or beyond the end of the subject and beyond md->start_used_ptr, set
md->hitend; if md->partial is greater than 1, also return PCRE_ERROR_PARTIAL at
once. NOTE: the expansion is a bare "if" statement, not a do/while(0) block, so
a use must not be followed directly by an "else". */
442 #define CHECK_PARTIAL()\
443 if (md->partial != 0 && eptr >= md->end_subject && \
444 eptr > md->start_used_ptr) \
445 { \
446 md->hitend = TRUE; \
447 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
448 }
449
/* SCHECK_PARTIAL: the same, but without the end-of-subject test, for callers
that have already established that condition. The same bare-"if" caveat
applies. */
450 #define SCHECK_PARTIAL()\
451 if (md->partial != 0 && eptr > md->start_used_ptr) \
452 { \
453 md->hitend = TRUE; \
454 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
455 }
456
457
458 /* Performance note: It might be tempting to extract commonly used fields from
459 the md structure (e.g. utf, end_subject) into individual variables to improve
460 performance. Tests using gcc on a SPARC disproved this; in the first case, it
461 made performance worse.
462
463 Arguments:
464 eptr pointer to current character in subject
465 ecode pointer to current position in compiled code
466 mstart pointer to the current match start position (can be modified
467 by encountering \K)
468 offset_top current top pointer
469 md pointer to "static" info for the match
470 eptrb pointer to chain of blocks containing eptr at start of
471 brackets - for testing for empty matches
472 rdepth the recursion depth
473
474 Returns: MATCH_MATCH if matched ) these values are >= 0
475 MATCH_NOMATCH if failed to match )
476 a negative MATCH_xxx value for PRUNE, SKIP, etc
477 a negative PCRE_ERROR_xxx value if aborted by an error condition
478 (e.g. stopped by repeated call or recursion limit)
479 */
480
481 static int
482 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
483 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
484 unsigned int rdepth)
485 {
486 /* These variables do not need to be preserved over recursion in this function,
487 so they can be ordinary variables in all cases. Mark some of them with
488 "register" because they are used a lot in loops. */
489
490 register int rrc; /* Returns from recursive calls */
491 register int i; /* Used for loops not involving calls to RMATCH() */
492 register unsigned int c; /* Character values not kept over RMATCH() calls */
493 register BOOL utf; /* Local copy of UTF flag for speed */
494
495 BOOL minimize, possessive; /* Quantifier options */
496 BOOL caseless;
497 int condcode;
498
499 /* When recursion is not being used, all "local" variables that have to be
500 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
501 frame on the stack here; subsequent instantiations are obtained from the heap
502 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
503 the top-level on the stack rather than malloc-ing them all gives a performance
504 boost in many cases where there is not much "recursion". */
505
506 #ifdef NO_RECURSE
507 heapframe *frame = (heapframe *)md->match_frames_base;
508
509 /* Copy in the original argument variables */
510
511 frame->Xeptr = eptr;
512 frame->Xecode = ecode;
513 frame->Xmstart = mstart;
514 frame->Xoffset_top = offset_top;
515 frame->Xeptrb = eptrb;
516 frame->Xrdepth = rdepth;
517
518 /* This is where control jumps back to to effect "recursion" */
519
520 HEAP_RECURSE:
521
522 /* Macros make the argument variables come from the current frame */
523
524 #define eptr frame->Xeptr
525 #define ecode frame->Xecode
526 #define mstart frame->Xmstart
527 #define offset_top frame->Xoffset_top
528 #define eptrb frame->Xeptrb
529 #define rdepth frame->Xrdepth
530
531 /* Ditto for the local variables */
532
533 #ifdef SUPPORT_UTF
534 #define charptr frame->Xcharptr
535 #endif
536 #define callpat frame->Xcallpat
537 #define codelink frame->Xcodelink
538 #define data frame->Xdata
539 #define next frame->Xnext
540 #define pp frame->Xpp
541 #define prev frame->Xprev
542 #define saved_eptr frame->Xsaved_eptr
543
544 #define new_recursive frame->Xnew_recursive
545
546 #define cur_is_word frame->Xcur_is_word
547 #define condition frame->Xcondition
548 #define prev_is_word frame->Xprev_is_word
549
550 #ifdef SUPPORT_UCP
551 #define prop_type frame->Xprop_type
552 #define prop_value frame->Xprop_value
553 #define prop_fail_result frame->Xprop_fail_result
554 #define oclength frame->Xoclength
555 #define occhars frame->Xocchars
556 #endif
557
558 #define ctype frame->Xctype
559 #define fc frame->Xfc
560 #define fi frame->Xfi
561 #define length frame->Xlength
562 #define max frame->Xmax
563 #define min frame->Xmin
564 #define number frame->Xnumber
565 #define offset frame->Xoffset
566 #define op frame->Xop
567 #define save_capture_last frame->Xsave_capture_last
568 #define save_offset1 frame->Xsave_offset1
569 #define save_offset2 frame->Xsave_offset2
570 #define save_offset3 frame->Xsave_offset3
571 #define stacksave frame->Xstacksave
572
573 #define newptrb frame->Xnewptrb
574
575 /* When recursion is being used, local variables are allocated on the stack and
576 get preserved during recursion in the normal way. In this environment, fi and
577 i, and fc and c, can be the same variables. */
578
579 #else /* NO_RECURSE not defined */
580 #define fi i
581 #define fc c
582
583 /* Many of the following variables are used only in small blocks of the code.
584 My normal style of coding would have declared them within each of those blocks.
585 However, in order to accommodate the version of this code that uses an external
586 "stack" implemented on the heap, it is easier to declare them all here, so the
587 declarations can be cut out in a block. The only declarations within blocks
588 below are for variables that do not have to be preserved over a recursive call
589 to RMATCH(). */
590
591 #ifdef SUPPORT_UTF
592 const pcre_uchar *charptr;
593 #endif
594 const pcre_uchar *callpat;
595 const pcre_uchar *data;
596 const pcre_uchar *next;
597 PCRE_PUCHAR pp;
598 const pcre_uchar *prev;
599 PCRE_PUCHAR saved_eptr;
600
601 recursion_info new_recursive;
602
603 BOOL cur_is_word;
604 BOOL condition;
605 BOOL prev_is_word;
606
607 #ifdef SUPPORT_UCP
608 int prop_type;
609 int prop_value;
610 int prop_fail_result;
611 int oclength;
612 pcre_uchar occhars[6];
613 #endif
614
615 int codelink;
616 int ctype;
617 int length;
618 int max;
619 int min;
620 int number;
621 int offset;
622 int op;
623 int save_capture_last;
624 int save_offset1, save_offset2, save_offset3;
625 int stacksave[REC_STACK_SAVE_MAX];
626
627 eptrblock newptrb;
628
629 /* There is a special fudge for calling match() in a way that causes it to
630 measure the size of its basic stack frame when the stack is being used for
631 recursion. The second argument (ecode) being NULL triggers this behaviour. It
632 cannot normally ever be NULL. The return is the negated value of the frame
633 size. */
634
635 if (ecode == NULL)
636 {
637 if (rdepth == 0)
638 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
639 else
640 {
641 int len = (char *)&rdepth - (char *)eptr;
642 return (len > 0)? -len : len;
643 }
644 }
645 #endif /* NO_RECURSE */
646
647 /* To save space on the stack and in the heap frame, I have doubled up on some
648 of the local variables that are used only in localised parts of the code, but
649 still need to be preserved over recursive calls of match(). These macros define
650 the alternative names that are used. */
651
652 #define allow_zero cur_is_word
653 #define cbegroup condition
654 #define code_offset codelink
655 #define condassert condition
656 #define matched_once prev_is_word
657 #define foc number
658 #define save_mark data
659
660 /* These statements are here to stop the compiler complaining about uninitialized
661 variables. */
662
663 #ifdef SUPPORT_UCP
664 prop_value = 0;
665 prop_fail_result = 0;
666 #endif
667
668
669 /* This label is used for tail recursion, which is used in a few cases even
670 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
671 used. Thanks to Ian Taylor for noticing this possibility and sending the
672 original patch. */
673
674 TAIL_RECURSE:
675
676 /* OK, now we can get on with the real code of the function. Recursive calls
677 are specified by the macro RMATCH and RRETURN is used to return. When
678 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
679 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
680 defined). However, RMATCH isn't like a function call because it's quite a
681 complicated macro. It has to be used in one particular way. This shouldn't,
682 however, impact performance when true recursion is being used. */
683
684 #ifdef SUPPORT_UTF
685 utf = md->utf; /* Local copy of the flag */
686 #else
687 utf = FALSE;
688 #endif
689
690 /* First check that we haven't called match() too many times, or that we
691 haven't exceeded the recursive call limit. */
692
693 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
694 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
695
696 /* At the start of a group with an unlimited repeat that may match an empty
697 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
698 done this way to save having to use another function argument, which would take
699 up space on the stack. See also MATCH_CONDASSERT below.
700
701 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
702 such remembered pointers, to be checked when we hit the closing ket, in order
703 to break infinite loops that match no characters. When match() is called in
704 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
705 NOT be used with tail recursion, because the memory block that is used is on
706 the stack, so a new one may be required for each match(). */
707
708 if (md->match_function_type == MATCH_CBEGROUP)
709 {
710 newptrb.epb_saved_eptr = eptr;
711 newptrb.epb_prev = eptrb;
712 eptrb = &newptrb;
713 md->match_function_type = 0;
714 }
715
716 /* Now start processing the opcodes. */
717
718 for (;;)
719 {
720 minimize = possessive = FALSE;
721 op = *ecode;
722
723 switch(op)
724 {
725 case OP_MARK:
726 md->nomatch_mark = ecode + 2;
727 md->mark = NULL; /* In case previously set by assertion */
728 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
729 eptrb, RM55);
730 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
731 md->mark == NULL) md->mark = ecode + 2;
732
733 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
734 argument, and we must check whether that argument matches this MARK's
735 argument. It is passed back in md->start_match_ptr (an overloading of that
736 variable). If it does match, we reset that variable to the current subject
737 position and return MATCH_SKIP. Otherwise, pass back the return code
738 unaltered. */
739
740 else if (rrc == MATCH_SKIP_ARG &&
741 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
742 {
743 md->start_match_ptr = eptr;
744 RRETURN(MATCH_SKIP);
745 }
746 RRETURN(rrc);
747
748 case OP_FAIL:
749 RRETURN(MATCH_NOMATCH);
750
751 /* COMMIT overrides PRUNE, SKIP, and THEN */
752
753 case OP_COMMIT:
754 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
755 eptrb, RM52);
756 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
757 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
758 rrc != MATCH_THEN)
759 RRETURN(rrc);
760 RRETURN(MATCH_COMMIT);
761
762 /* PRUNE overrides THEN */
763
764 case OP_PRUNE:
765 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
766 eptrb, RM51);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
768 RRETURN(MATCH_PRUNE);
769
770 case OP_PRUNE_ARG:
771 md->nomatch_mark = ecode + 2;
772 md->mark = NULL; /* In case previously set by assertion */
773 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM56);
775 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
776 md->mark == NULL) md->mark = ecode + 2;
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 RRETURN(MATCH_PRUNE);
779
780 /* SKIP overrides PRUNE and THEN */
781
782 case OP_SKIP:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM53);
785 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
786 RRETURN(rrc);
787 md->start_match_ptr = eptr; /* Pass back current position */
788 RRETURN(MATCH_SKIP);
789
790 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
791 nomatch_mark. There is a flag that disables this opcode when re-matching a
792 pattern that ended with a SKIP for which there was not a matching MARK. */
793
794 case OP_SKIP_ARG:
795 if (md->ignore_skip_arg)
796 {
797 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
798 break;
799 }
800 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
801 eptrb, RM57);
802 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
803 RRETURN(rrc);
804
805 /* Pass back the current skip name by overloading md->start_match_ptr and
806 returning the special MATCH_SKIP_ARG return code. This will either be
807 caught by a matching MARK, or get to the top, where it causes a rematch
808 with the md->ignore_skip_arg flag set. */
809
810 md->start_match_ptr = ecode + 2;
811 RRETURN(MATCH_SKIP_ARG);
812
813 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
814 the branch in which it occurs can be determined. Overload the start of
815 match pointer to do this. */
816
817 case OP_THEN:
818 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
819 eptrb, RM54);
820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
821 md->start_match_ptr = ecode;
822 RRETURN(MATCH_THEN);
823
824 case OP_THEN_ARG:
825 md->nomatch_mark = ecode + 2;
826 md->mark = NULL; /* In case previously set by assertion */
827 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
828 md, eptrb, RM58);
829 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
830 md->mark == NULL) md->mark = ecode + 2;
831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832 md->start_match_ptr = ecode;
833 RRETURN(MATCH_THEN);
834
835 /* Handle an atomic group that does not contain any capturing parentheses.
836 This can be handled like an assertion. Prior to 8.13, all atomic groups
837 were handled this way. In 8.13, the code was changed as below for ONCE, so
838 that backups pass through the group and thereby reset captured values.
839 However, this uses a lot more stack, so in 8.20, atomic groups that do not
840 contain any captures generate OP_ONCE_NC, which can be handled in the old,
841 less stack intensive way.
842
843 Check the alternative branches in turn - the matching won't pass the KET
844 for this kind of subpattern. If any one branch matches, we carry on as at
845 the end of a normal bracket, leaving the subject pointer, but resetting
846 the start-of-match value in case it was changed by \K. */
847
848 case OP_ONCE_NC:
849 prev = ecode;
850 saved_eptr = eptr;
851 save_mark = md->mark;
852 do
853 {
854 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
855 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
856 {
857 mstart = md->start_match_ptr;
858 break;
859 }
860 if (rrc == MATCH_THEN)
861 {
862 next = ecode + GET(ecode,1);
863 if (md->start_match_ptr < next &&
864 (*ecode == OP_ALT || *next == OP_ALT))
865 rrc = MATCH_NOMATCH;
866 }
867
868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
869 ecode += GET(ecode,1);
870 md->mark = save_mark;
871 }
872 while (*ecode == OP_ALT);
873
874 /* If hit the end of the group (which could be repeated), fail */
875
876 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
877
878 /* Continue as from after the group, updating the offsets high water
879 mark, since extracts may have been taken. */
880
881 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
882
883 offset_top = md->end_offset_top;
884 eptr = md->end_match_ptr;
885
886 /* For a non-repeating ket, just continue at this level. This also
887 happens for a repeating ket if no characters were matched in the group.
888 This is the forcible breaking of infinite loops as implemented in Perl
889 5.005. */
890
891 if (*ecode == OP_KET || eptr == saved_eptr)
892 {
893 ecode += 1+LINK_SIZE;
894 break;
895 }
896
897 /* The repeating kets try the rest of the pattern or restart from the
898 preceding bracket, in the appropriate order. The second "call" of match()
899 uses tail recursion, to avoid using another stack frame. */
900
901 if (*ecode == OP_KETRMIN)
902 {
903 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
905 ecode = prev;
906 goto TAIL_RECURSE;
907 }
908 else /* OP_KETRMAX */
909 {
910 md->match_function_type = MATCH_CBEGROUP;
911 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
912 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
913 ecode += 1 + LINK_SIZE;
914 goto TAIL_RECURSE;
915 }
916 /* Control never gets here */
917
918 /* Handle a capturing bracket, other than those that are possessive with an
919 unlimited repeat. If there is space in the offset vector, save the current
920 subject position in the working slot at the top of the vector. We mustn't
921 change the current values of the data slot, because they may be set from a
922 previous iteration of this group, and be referred to by a reference inside
923 the group. A failure to match might occur after the group has succeeded,
924 if something later on doesn't match. For this reason, we need to restore
925 the working value and also the values of the final offsets, in case they
926 were set by a previous iteration of the same bracket.
927
928 If there isn't enough space in the offset vector, treat this as if it were
929 a non-capturing bracket. Don't worry about setting the flag for the error
930 case here; that is handled in the code for KET. */
931
932 case OP_CBRA:
933 case OP_SCBRA:
934 number = GET2(ecode, 1+LINK_SIZE);
935 offset = number << 1;
936
937 #ifdef PCRE_DEBUG
938 printf("start bracket %d\n", number);
939 printf("subject=");
940 pchars(eptr, 16, TRUE, md);
941 printf("\n");
942 #endif
943
944 if (offset < md->offset_max)
945 {
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950 save_mark = md->mark;
951
952 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
953 md->offset_vector[md->offset_end - number] =
954 (int)(eptr - md->start_subject);
955
956 for (;;)
957 {
958 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
959 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
960 eptrb, RM1);
961 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
962
963 /* If we backed up to a THEN, check whether it is within the current
964 branch by comparing the address of the THEN that is passed back with
965 the end of the branch. If it is within the current branch, and the
966 branch is one of two or more alternatives (it either starts or ends
967 with OP_ALT), we have reached the limit of THEN's action, so convert
968 the return code to NOMATCH, which will cause normal backtracking to
969 happen from now on. Otherwise, THEN is passed back to an outer
970 alternative. This implements Perl's treatment of parenthesized groups,
971 where a group not containing | does not affect the current alternative,
972 that is, (X) is NOT the same as (X|(*F)). */
973
974 if (rrc == MATCH_THEN)
975 {
976 next = ecode + GET(ecode,1);
977 if (md->start_match_ptr < next &&
978 (*ecode == OP_ALT || *next == OP_ALT))
979 rrc = MATCH_NOMATCH;
980 }
981
982 /* Anything other than NOMATCH is passed back. */
983
984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
985 md->capture_last = save_capture_last;
986 ecode += GET(ecode, 1);
987 md->mark = save_mark;
988 if (*ecode != OP_ALT) break;
989 }
990
991 DPRINTF(("bracket %d failed\n", number));
992 md->offset_vector[offset] = save_offset1;
993 md->offset_vector[offset+1] = save_offset2;
994 md->offset_vector[md->offset_end - number] = save_offset3;
995
996 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
997
998 RRETURN(rrc);
999 }
1000
1001 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1002 as a non-capturing bracket. */
1003
1004 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006
1007 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1008
1009 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1010 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1011
1012 /* Non-capturing or atomic group, except for possessive with unlimited
1013 repeat and ONCE group with no captures. Loop for all the alternatives.
1014
1015 When we get to the final alternative within the brackets, we used to return
1016 the result of a recursive call to match() whatever happened so it was
1017 possible to reduce stack usage by turning this into a tail recursion,
1018 except in the case of a possibly empty group. However, now that there is
1019 the possibility of (*THEN) occurring in the final alternative, this
1020 optimization is no longer always possible.
1021
1022 We can optimize if we know there are no (*THEN)s in the pattern; at present
1023 this is the best that can be done.
1024
1025 MATCH_ONCE is returned when the end of an atomic group is successfully
1026 reached, but subsequent matching fails. It passes back up the tree (causing
1027 captured values to be reset) until the original atomic group level is
1028 reached. This is tested by comparing md->once_target with the start of the
1029 group. At this point, the return is converted into MATCH_NOMATCH so that
1030 previous backup points can be taken. */
1031
1032 case OP_ONCE:
1033 case OP_BRA:
1034 case OP_SBRA:
1035 DPRINTF(("start non-capturing bracket\n"));
1036
1037 for (;;)
1038 {
1039 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1040
1041 /* If this is not a possibly empty group, and there are no (*THEN)s in
1042 the pattern, and this is the final alternative, optimize as described
1043 above. */
1044
1045 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1046 {
1047 ecode += PRIV(OP_lengths)[*ecode];
1048 goto TAIL_RECURSE;
1049 }
1050
1051 /* In all other cases, we have to make another call to match(). */
1052
1053 save_mark = md->mark;
1054 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1055 RM2);
1056
1057 /* See comment in the code for capturing groups above about handling
1058 THEN. */
1059
1060 if (rrc == MATCH_THEN)
1061 {
1062 next = ecode + GET(ecode,1);
1063 if (md->start_match_ptr < next &&
1064 (*ecode == OP_ALT || *next == OP_ALT))
1065 rrc = MATCH_NOMATCH;
1066 }
1067
1068 if (rrc != MATCH_NOMATCH)
1069 {
1070 if (rrc == MATCH_ONCE)
1071 {
1072 const pcre_uchar *scode = ecode;
1073 if (*scode != OP_ONCE) /* If not at start, find it */
1074 {
1075 while (*scode == OP_ALT) scode += GET(scode, 1);
1076 scode -= GET(scode, 1);
1077 }
1078 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1079 }
1080 RRETURN(rrc);
1081 }
1082 ecode += GET(ecode, 1);
1083 md->mark = save_mark;
1084 if (*ecode != OP_ALT) break;
1085 }
1086
1087 RRETURN(MATCH_NOMATCH);
1088
1089 /* Handle possessive capturing brackets with an unlimited repeat. We come
1090 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1091 handled similarly to the normal case above. However, the matching is
1092 different. The end of these brackets will always be OP_KETRPOS, which
1093 returns MATCH_KETRPOS without going further in the pattern. By this means
1094 we can handle the group by iteration rather than recursion, thereby
1095 reducing the amount of stack needed. */
1096
1097 case OP_CBRAPOS:
1098 case OP_SCBRAPOS:
1099 allow_zero = FALSE;
1100
1101 POSSESSIVE_CAPTURE:
1102 number = GET2(ecode, 1+LINK_SIZE);
1103 offset = number << 1;
1104
1105 #ifdef PCRE_DEBUG
1106 printf("start possessive bracket %d\n", number);
1107 printf("subject=");
1108 pchars(eptr, 16, TRUE, md);
1109 printf("\n");
1110 #endif
1111
1112 if (offset < md->offset_max)
1113 {
1114 matched_once = FALSE;
1115 code_offset = (int)(ecode - md->start_code);
1116
1117 save_offset1 = md->offset_vector[offset];
1118 save_offset2 = md->offset_vector[offset+1];
1119 save_offset3 = md->offset_vector[md->offset_end - number];
1120 save_capture_last = md->capture_last;
1121
1122 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1123
1124 /* Each time round the loop, save the current subject position for use
1125 when the group matches. For MATCH_MATCH, the group has matched, so we
1126 restart it with a new subject starting position, remembering that we had
1127 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1128 usual. If we haven't matched any alternatives in any iteration, check to
1129 see if a previous iteration matched. If so, the group has matched;
1130 continue from afterwards. Otherwise it has failed; restore the previous
1131 capture values before returning NOMATCH. */
1132
1133 for (;;)
1134 {
1135 md->offset_vector[md->offset_end - number] =
1136 (int)(eptr - md->start_subject);
1137 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1138 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1139 eptrb, RM63);
1140 if (rrc == MATCH_KETRPOS)
1141 {
1142 offset_top = md->end_offset_top;
1143 eptr = md->end_match_ptr;
1144 ecode = md->start_code + code_offset;
1145 save_capture_last = md->capture_last;
1146 matched_once = TRUE;
1147 continue;
1148 }
1149
1150 /* See comment in the code for capturing groups above about handling
1151 THEN. */
1152
1153 if (rrc == MATCH_THEN)
1154 {
1155 next = ecode + GET(ecode,1);
1156 if (md->start_match_ptr < next &&
1157 (*ecode == OP_ALT || *next == OP_ALT))
1158 rrc = MATCH_NOMATCH;
1159 }
1160
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 md->capture_last = save_capture_last;
1163 ecode += GET(ecode, 1);
1164 if (*ecode != OP_ALT) break;
1165 }
1166
1167 if (!matched_once)
1168 {
1169 md->offset_vector[offset] = save_offset1;
1170 md->offset_vector[offset+1] = save_offset2;
1171 md->offset_vector[md->offset_end - number] = save_offset3;
1172 }
1173
1174 if (allow_zero || matched_once)
1175 {
1176 ecode += 1 + LINK_SIZE;
1177 break;
1178 }
1179
1180 RRETURN(MATCH_NOMATCH);
1181 }
1182
1183 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1184 as a non-capturing bracket. */
1185
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188
1189 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1190
1191 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1192 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1193
1194 /* Non-capturing possessive bracket with unlimited repeat. We come here
1195 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1196 without the capturing complication. It is written out separately for speed
1197 and cleanliness. */
1198
1199 case OP_BRAPOS:
1200 case OP_SBRAPOS:
1201 allow_zero = FALSE;
1202
1203 POSSESSIVE_NON_CAPTURE:
1204 matched_once = FALSE;
1205 code_offset = (int)(ecode - md->start_code);
1206
1207 for (;;)
1208 {
1209 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1210 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1211 eptrb, RM48);
1212 if (rrc == MATCH_KETRPOS)
1213 {
1214 offset_top = md->end_offset_top;
1215 eptr = md->end_match_ptr;
1216 ecode = md->start_code + code_offset;
1217 matched_once = TRUE;
1218 continue;
1219 }
1220
1221 /* See comment in the code for capturing groups above about handling
1222 THEN. */
1223
1224 if (rrc == MATCH_THEN)
1225 {
1226 next = ecode + GET(ecode,1);
1227 if (md->start_match_ptr < next &&
1228 (*ecode == OP_ALT || *next == OP_ALT))
1229 rrc = MATCH_NOMATCH;
1230 }
1231
1232 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1233 ecode += GET(ecode, 1);
1234 if (*ecode != OP_ALT) break;
1235 }
1236
1237 if (matched_once || allow_zero)
1238 {
1239 ecode += 1 + LINK_SIZE;
1240 break;
1241 }
1242 RRETURN(MATCH_NOMATCH);
1243
1244 /* Control never reaches here. */
1245
1246 /* Conditional group: compilation checked that there are no more than
1247 two branches. If the condition is false, skipping the first branch takes us
1248 past the end if there is only one branch, but that's OK because that is
1249 exactly what going to the ket would do. */
1250
1251 case OP_COND:
1252 case OP_SCOND:
1253 codelink = GET(ecode, 1);
1254
1255 /* Because of the way auto-callout works during compile, a callout item is
1256 inserted between OP_COND and an assertion condition. */
1257
1258 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1259 {
1260 if (PUBL(callout) != NULL)
1261 {
1262 PUBL(callout_block) cb;
1263 cb.version = 2; /* Version 2 of the callout block */
1264 cb.callout_number = ecode[LINK_SIZE+2];
1265 cb.offset_vector = md->offset_vector;
1266 #ifdef COMPILE_PCRE8
1267 cb.subject = (PCRE_SPTR)md->start_subject;
1268 #else
1269 cb.subject = (PCRE_SPTR16)md->start_subject;
1270 #endif
1271 cb.subject_length = (int)(md->end_subject - md->start_subject);
1272 cb.start_match = (int)(mstart - md->start_subject);
1273 cb.current_position = (int)(eptr - md->start_subject);
1274 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1275 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1276 cb.capture_top = offset_top/2;
1277 cb.capture_last = md->capture_last;
1278 cb.callout_data = md->callout_data;
1279 cb.mark = md->nomatch_mark;
1280 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1281 if (rrc < 0) RRETURN(rrc);
1282 }
1283 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1284 }
1285
1286 condcode = ecode[LINK_SIZE+1];
1287
1288 /* Now see what the actual condition is */
1289
1290 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1291 {
1292 if (md->recursive == NULL) /* Not recursing => FALSE */
1293 {
1294 condition = FALSE;
1295 ecode += GET(ecode, 1);
1296 }
1297 else
1298 {
1299 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1300 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1301
1302 /* If the test is for recursion into a specific subpattern, and it is
1303 false, but the test was set up by name, scan the table to see if the
1304 name refers to any other numbers, and test them. The condition is true
1305 if any one is set. */
1306
1307 if (!condition && condcode == OP_NRREF)
1308 {
1309 pcre_uchar *slotA = md->name_table;
1310 for (i = 0; i < md->name_count; i++)
1311 {
1312 if (GET2(slotA, 0) == recno) break;
1313 slotA += md->name_entry_size;
1314 }
1315
1316 /* Found a name for the number - there can be only one; duplicate
1317 names for different numbers are allowed, but not vice versa. First
1318 scan down for duplicates. */
1319
1320 if (i < md->name_count)
1321 {
1322 pcre_uchar *slotB = slotA;
1323 while (slotB > md->name_table)
1324 {
1325 slotB -= md->name_entry_size;
1326 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1327 {
1328 condition = GET2(slotB, 0) == md->recursive->group_num;
1329 if (condition) break;
1330 }
1331 else break;
1332 }
1333
1334 /* Scan up for duplicates */
1335
1336 if (!condition)
1337 {
1338 slotB = slotA;
1339 for (i++; i < md->name_count; i++)
1340 {
1341 slotB += md->name_entry_size;
1342 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1343 {
1344 condition = GET2(slotB, 0) == md->recursive->group_num;
1345 if (condition) break;
1346 }
1347 else break;
1348 }
1349 }
1350 }
1351 }
1352
1353 /* Choose branch according to the condition */
1354
1355 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1356 }
1357 }
1358
1359 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1360 {
1361 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1362 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1363
1364 /* If the numbered capture is unset, but the reference was by name,
1365 scan the table to see if the name refers to any other numbers, and test
1366 them. The condition is true if any one is set. This is tediously similar
1367 to the code above, but not close enough to try to amalgamate. */
1368
1369 if (!condition && condcode == OP_NCREF)
1370 {
1371 int refno = offset >> 1;
1372 pcre_uchar *slotA = md->name_table;
1373
1374 for (i = 0; i < md->name_count; i++)
1375 {
1376 if (GET2(slotA, 0) == refno) break;
1377 slotA += md->name_entry_size;
1378 }
1379
1380 /* Found a name for the number - there can be only one; duplicate names
1381 for different numbers are allowed, but not vice versa. First scan down
1382 for duplicates. */
1383
1384 if (i < md->name_count)
1385 {
1386 pcre_uchar *slotB = slotA;
1387 while (slotB > md->name_table)
1388 {
1389 slotB -= md->name_entry_size;
1390 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1391 {
1392 offset = GET2(slotB, 0) << 1;
1393 condition = offset < offset_top &&
1394 md->offset_vector[offset] >= 0;
1395 if (condition) break;
1396 }
1397 else break;
1398 }
1399
1400 /* Scan up for duplicates */
1401
1402 if (!condition)
1403 {
1404 slotB = slotA;
1405 for (i++; i < md->name_count; i++)
1406 {
1407 slotB += md->name_entry_size;
1408 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1409 {
1410 offset = GET2(slotB, 0) << 1;
1411 condition = offset < offset_top &&
1412 md->offset_vector[offset] >= 0;
1413 if (condition) break;
1414 }
1415 else break;
1416 }
1417 }
1418 }
1419 }
1420
1421 /* Choose branch according to the condition */
1422
1423 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1424 }
1425
1426 else if (condcode == OP_DEF) /* DEFINE - always false */
1427 {
1428 condition = FALSE;
1429 ecode += GET(ecode, 1);
1430 }
1431
1432 /* The condition is an assertion. Call match() to evaluate it - setting
1433 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1434 an assertion. */
1435
1436 else
1437 {
1438 md->match_function_type = MATCH_CONDASSERT;
1439 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1440 if (rrc == MATCH_MATCH)
1441 {
1442 if (md->end_offset_top > offset_top)
1443 offset_top = md->end_offset_top; /* Captures may have happened */
1444 condition = TRUE;
1445 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1446 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1447 }
1448
1449 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1450 assertion; it is therefore treated as NOMATCH. */
1451
1452 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1453 {
1454 RRETURN(rrc); /* Need braces because of following else */
1455 }
1456 else
1457 {
1458 condition = FALSE;
1459 ecode += codelink;
1460 }
1461 }
1462
1463 /* We are now at the branch that is to be obeyed. As there is only one, can
1464 use tail recursion to avoid using another stack frame, except when there is
1465 unlimited repeat of a possibly empty group. In the latter case, a recursive
1466 call to match() is always required, unless the second alternative doesn't
1467 exist, in which case we can just plough on. Note that, for compatibility
1468 with Perl, the | in a conditional group is NOT treated as creating two
1469 alternatives. If a THEN is encountered in the branch, it propagates out to
1470 the enclosing alternative (unless nested in a deeper set of alternatives,
1471 of course). */
1472
1473 if (condition || *ecode == OP_ALT)
1474 {
1475 if (op != OP_SCOND)
1476 {
1477 ecode += 1 + LINK_SIZE;
1478 goto TAIL_RECURSE;
1479 }
1480
1481 md->match_function_type = MATCH_CBEGROUP;
1482 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1483 RRETURN(rrc);
1484 }
1485
1486 /* Condition false & no alternative; continue after the group. */
1487
1488 else
1489 {
1490 ecode += 1 + LINK_SIZE;
1491 }
1492 break;
1493
1494
1495 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1496 to close any currently open capturing brackets. */
1497
1498 case OP_CLOSE:
1499 number = GET2(ecode, 1);
1500 offset = number << 1;
1501
1502 #ifdef PCRE_DEBUG
1503 printf("end bracket %d at *ACCEPT", number);
1504 printf("\n");
1505 #endif
1506
1507 md->capture_last = number;
1508 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1509 {
1510 md->offset_vector[offset] =
1511 md->offset_vector[md->offset_end - number];
1512 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1513 if (offset_top <= offset) offset_top = offset + 2;
1514 }
1515 ecode += 1 + IMM2_SIZE;
1516 break;
1517
1518
1519 /* End of the pattern, either real or forced. */
1520
1521 case OP_END:
1522 case OP_ACCEPT:
1523 case OP_ASSERT_ACCEPT:
1524
1525 /* If we have matched an empty string, fail if not in an assertion and not
1526 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1527 is set and we have matched at the start of the subject. In both cases,
1528 backtracking will then try other alternatives, if any. */
1529
1530 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1531 md->recursive == NULL &&
1532 (md->notempty ||
1533 (md->notempty_atstart &&
1534 mstart == md->start_subject + md->start_offset)))
1535 RRETURN(MATCH_NOMATCH);
1536
1537 /* Otherwise, we have a match. */
1538
1539 md->end_match_ptr = eptr; /* Record where we ended */
1540 md->end_offset_top = offset_top; /* and how many extracts were taken */
1541 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1542
1543 /* For some reason, the macros don't work properly if an expression is
1544 given as the argument to RRETURN when the heap is in use. */
1545
1546 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1547 RRETURN(rrc);
1548
1549 /* Assertion brackets. Check the alternative branches in turn - the
1550 matching won't pass the KET for an assertion. If any one branch matches,
1551 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1552 start of each branch to move the current point backwards, so the code at
1553 this level is identical to the lookahead case. When the assertion is part
1554 of a condition, we want to return immediately afterwards. The caller of
1555 this incarnation of the match() function will have set MATCH_CONDASSERT in
1556 md->match_function_type, and one of these opcodes will be the first opcode
1557 that is processed. We use a local variable that is preserved over calls to
1558 match() to remember this case. */
1559
1560 case OP_ASSERT:
1561 case OP_ASSERTBACK:
1562 save_mark = md->mark;
1563 if (md->match_function_type == MATCH_CONDASSERT)
1564 {
1565 condassert = TRUE;
1566 md->match_function_type = 0;
1567 }
1568 else condassert = FALSE;
1569
1570 do
1571 {
1572 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1574 {
1575 mstart = md->start_match_ptr; /* In case \K reset it */
1576 break;
1577 }
1578
1579 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1580 as NOMATCH. */
1581
1582 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1583 ecode += GET(ecode, 1);
1584 md->mark = save_mark;
1585 }
1586 while (*ecode == OP_ALT);
1587
1588 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1589
1590 /* If checking an assertion for a condition, return MATCH_MATCH. */
1591
1592 if (condassert) RRETURN(MATCH_MATCH);
1593
1594 /* Continue from after the assertion, updating the offsets high water
1595 mark, since extracts may have been taken during the assertion. */
1596
1597 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1598 ecode += 1 + LINK_SIZE;
1599 offset_top = md->end_offset_top;
1600 continue;
1601
1602 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1603 PRUNE, or COMMIT means we must assume failure without checking subsequent
1604 branches. */
1605
1606 case OP_ASSERT_NOT:
1607 case OP_ASSERTBACK_NOT:
1608 save_mark = md->mark;
1609 if (md->match_function_type == MATCH_CONDASSERT)
1610 {
1611 condassert = TRUE;
1612 md->match_function_type = 0;
1613 }
1614 else condassert = FALSE;
1615
1616 do
1617 {
1618 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1619 md->mark = save_mark;
1620 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1621 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1622 {
1623 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1624 break;
1625 }
1626
1627 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1628 as NOMATCH. */
1629
1630 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1631 ecode += GET(ecode,1);
1632 }
1633 while (*ecode == OP_ALT);
1634
1635 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1636
1637 ecode += 1 + LINK_SIZE;
1638 continue;
1639
1640 /* Move the subject pointer back. This occurs only at the start of
1641 each branch of a lookbehind assertion. If we are too close to the start to
1642 move back, this match function fails. When working with UTF-8 we move
1643 back a number of characters, not bytes. */
1644
1645 case OP_REVERSE:
1646 #ifdef SUPPORT_UTF
1647 if (utf)
1648 {
1649 i = GET(ecode, 1);
1650 while (i-- > 0)
1651 {
1652 eptr--;
1653 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1654 BACKCHAR(eptr);
1655 }
1656 }
1657 else
1658 #endif
1659
1660 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1661
1662 {
1663 eptr -= GET(ecode, 1);
1664 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1665 }
1666
1667 /* Save the earliest consulted character, then skip to next op code */
1668
1669 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1670 ecode += 1 + LINK_SIZE;
1671 break;
1672
1673 /* The callout item calls an external function, if one is provided, passing
1674 details of the match so far. This is mainly for debugging, though the
1675 function is able to force a failure. */
1676
1677 case OP_CALLOUT:
1678 if (PUBL(callout) != NULL)
1679 {
1680 PUBL(callout_block) cb;
1681 cb.version = 2; /* Version 2 of the callout block */
1682 cb.callout_number = ecode[1];
1683 cb.offset_vector = md->offset_vector;
1684 #ifdef COMPILE_PCRE8
1685 cb.subject = (PCRE_SPTR)md->start_subject;
1686 #else
1687 cb.subject = (PCRE_SPTR16)md->start_subject;
1688 #endif
1689 cb.subject_length = (int)(md->end_subject - md->start_subject);
1690 cb.start_match = (int)(mstart - md->start_subject);
1691 cb.current_position = (int)(eptr - md->start_subject);
1692 cb.pattern_position = GET(ecode, 2);
1693 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1694 cb.capture_top = offset_top/2;
1695 cb.capture_last = md->capture_last;
1696 cb.callout_data = md->callout_data;
1697 cb.mark = md->nomatch_mark;
1698 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1699 if (rrc < 0) RRETURN(rrc);
1700 }
1701 ecode += 2 + 2*LINK_SIZE;
1702 break;
1703
1704 /* Recursion either matches the current regex, or some subexpression. The
1705 offset data is the offset to the starting bracket from the start of the
1706 whole pattern. (This is so that it works from duplicated subpatterns.)
1707
1708 The state of the capturing groups is preserved over recursion, and
1709 re-instated afterwards. We don't know how many are started and not yet
1710 finished (offset_top records the completed total) so we just have to save
1711 all the potential data. There may be up to 65535 such values, which is too
1712 large to put on the stack, but using malloc for small numbers seems
1713 expensive. As a compromise, the stack is used when there are no more than
1714 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1715
1716 There are also other values that have to be saved. We use a chained
1717 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1718 for the original version of this logic. It has, however, been hacked around
1719 a lot, so he is not to blame for the current way it works. */
1720
1721 case OP_RECURSE:
1722 {
1723 recursion_info *ri;
1724 int recno;
1725
1726 callpat = md->start_code + GET(ecode, 1);
1727 recno = (callpat == md->start_code)? 0 :
1728 GET2(callpat, 1 + LINK_SIZE);
1729
1730 /* Check for repeating a recursion without advancing the subject pointer.
1731 This should catch convoluted mutual recursions. (Some simple cases are
1732 caught at compile time.) */
1733
1734 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1735 if (recno == ri->group_num && eptr == ri->subject_position)
1736 RRETURN(PCRE_ERROR_RECURSELOOP);
1737
1738 /* Add to "recursing stack" */
1739
1740 new_recursive.group_num = recno;
1741 new_recursive.subject_position = eptr;
1742 new_recursive.prevrec = md->recursive;
1743 md->recursive = &new_recursive;
1744
1745 /* Where to continue from afterwards */
1746
1747 ecode += 1 + LINK_SIZE;
1748
1749 /* Now save the offset data */
1750
1751 new_recursive.saved_max = md->offset_end;
1752 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1753 new_recursive.offset_save = stacksave;
1754 else
1755 {
1756 new_recursive.offset_save =
1757 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1758 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1759 }
1760 memcpy(new_recursive.offset_save, md->offset_vector,
1761 new_recursive.saved_max * sizeof(int));
1762
1763 /* OK, now we can do the recursion. After processing each alternative,
1764 restore the offset data. If there were nested recursions, md->recursive
1765 might be changed, so reset it before looping. */
1766
1767 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1768 cbegroup = (*callpat >= OP_SBRA);
1769 do
1770 {
1771 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1772 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1773 md, eptrb, RM6);
1774 memcpy(md->offset_vector, new_recursive.offset_save,
1775 new_recursive.saved_max * sizeof(int));
1776 md->recursive = new_recursive.prevrec;
1777 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1778 {
1779 DPRINTF(("Recursion matched\n"));
1780 if (new_recursive.offset_save != stacksave)
1781 (PUBL(free))(new_recursive.offset_save);
1782
1783 /* Set where we got to in the subject, and reset the start in case
1784 it was changed by \K. This *is* propagated back out of a recursion,
1785 for Perl compatibility. */
1786
1787 eptr = md->end_match_ptr;
1788 mstart = md->start_match_ptr;
1789 goto RECURSION_MATCHED; /* Exit loop; end processing */
1790 }
1791
1792 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1793 is treated as NOMATCH. */
1794
1795 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1796 rrc != MATCH_COMMIT)
1797 {
1798 DPRINTF(("Recursion gave error %d\n", rrc));
1799 if (new_recursive.offset_save != stacksave)
1800 (PUBL(free))(new_recursive.offset_save);
1801 RRETURN(rrc);
1802 }
1803
1804 md->recursive = &new_recursive;
1805 callpat += GET(callpat, 1);
1806 }
1807 while (*callpat == OP_ALT);
1808
1809 DPRINTF(("Recursion didn't match\n"));
1810 md->recursive = new_recursive.prevrec;
1811 if (new_recursive.offset_save != stacksave)
1812 (PUBL(free))(new_recursive.offset_save);
1813 RRETURN(MATCH_NOMATCH);
1814 }
1815
1816 RECURSION_MATCHED:
1817 break;
1818
1819 /* An alternation is the end of a branch; scan along to find the end of the
1820 bracketed group and go to there. */
1821
1822 case OP_ALT:
1823 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1824 break;
1825
1826 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1827 indicating that it may occur zero times. It may repeat infinitely, or not
1828 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1829 with fixed upper repeat limits are compiled as a number of copies, with the
1830 optional ones preceded by BRAZERO or BRAMINZERO. */
1831
1832 case OP_BRAZERO:
1833 next = ecode + 1;
1834 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1836 do next += GET(next, 1); while (*next == OP_ALT);
1837 ecode = next + 1 + LINK_SIZE;
1838 break;
1839
1840 case OP_BRAMINZERO:
1841 next = ecode + 1;
1842 do next += GET(next, 1); while (*next == OP_ALT);
1843 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1844 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1845 ecode++;
1846 break;
1847
1848 case OP_SKIPZERO:
1849 next = ecode+1;
1850 do next += GET(next,1); while (*next == OP_ALT);
1851 ecode = next + 1 + LINK_SIZE;
1852 break;
1853
1854 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1855 here; just jump to the group, with allow_zero set TRUE. */
1856
1857 case OP_BRAPOSZERO:
1858 op = *(++ecode);
1859 allow_zero = TRUE;
1860 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1861 goto POSSESSIVE_NON_CAPTURE;
1862
1863 /* End of a group, repeated or non-repeating. */
1864
1865 case OP_KET:
1866 case OP_KETRMIN:
1867 case OP_KETRMAX:
1868 case OP_KETRPOS:
1869 prev = ecode - GET(ecode, 1);
1870
1871 /* If this was a group that remembered the subject start, in order to break
1872 infinite repeats of empty string matches, retrieve the subject start from
1873 the chain. Otherwise, set it NULL. */
1874
1875 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1876 {
1877 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1878 eptrb = eptrb->epb_prev; /* Backup to previous group */
1879 }
1880 else saved_eptr = NULL;
1881
1882 /* If we are at the end of an assertion group or a non-capturing atomic
1883 group, stop matching and return MATCH_MATCH, but record the current high
1884 water mark for use by positive assertions. We also need to record the match
1885 start in case it was changed by \K. */
1886
1887 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1888 *prev == OP_ONCE_NC)
1889 {
1890 md->end_match_ptr = eptr; /* For ONCE_NC */
1891 md->end_offset_top = offset_top;
1892 md->start_match_ptr = mstart;
1893 RRETURN(MATCH_MATCH); /* Sets md->mark */
1894 }
1895
1896 /* For capturing groups we have to check the group number back at the start
1897 and if necessary complete handling an extraction by setting the offsets and
1898 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1899 into group 0, so it won't be picked up here. Instead, we catch it when the
1900 OP_END is reached. Other recursion is handled here. We just have to record
1901 the current subject position and start match pointer and give a MATCH
1902 return. */
1903
1904 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1905 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1906 {
1907 number = GET2(prev, 1+LINK_SIZE);
1908 offset = number << 1;
1909
1910 #ifdef PCRE_DEBUG
1911 printf("end bracket %d", number);
1912 printf("\n");
1913 #endif
1914
1915 /* Handle a recursively called group. */
1916
1917 if (md->recursive != NULL && md->recursive->group_num == number)
1918 {
1919 md->end_match_ptr = eptr;
1920 md->start_match_ptr = mstart;
1921 RRETURN(MATCH_MATCH);
1922 }
1923
1924 /* Deal with capturing */
1925
1926 md->capture_last = number;
1927 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1928 {
1929 /* If offset is greater than offset_top, it means that we are
1930 "skipping" a capturing group, and that group's offsets must be marked
1931 unset. In earlier versions of PCRE, all the offsets were unset at the
1932 start of matching, but this doesn't work because atomic groups and
1933 assertions can cause a value to be set that should later be unset.
1934 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1935 part of the atomic group, but this is not on the final matching path,
1936 so must be unset when 2 is set. (If there is no group 2, there is no
1937 problem, because offset_top will then be 2, indicating no capture.) */
1938
1939 if (offset > offset_top)
1940 {
1941 register int *iptr = md->offset_vector + offset_top;
1942 register int *iend = md->offset_vector + offset;
1943 while (iptr < iend) *iptr++ = -1;
1944 }
1945
1946 /* Now make the extraction */
1947
1948 md->offset_vector[offset] =
1949 md->offset_vector[md->offset_end - number];
1950 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1951 if (offset_top <= offset) offset_top = offset + 2;
1952 }
1953 }
1954
1955 /* For an ordinary non-repeating ket, just continue at this level. This
1956 also happens for a repeating ket if no characters were matched in the
1957 group. This is the forcible breaking of infinite loops as implemented in
1958 Perl 5.005. For a non-repeating atomic group that includes captures,
1959 establish a backup point by processing the rest of the pattern at a lower
1960 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1961 original OP_ONCE level, thereby bypassing intermediate backup points, but
1962 resetting any captures that happened along the way. */
1963
1964 if (*ecode == OP_KET || eptr == saved_eptr)
1965 {
1966 if (*prev == OP_ONCE)
1967 {
1968 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1970 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1971 RRETURN(MATCH_ONCE);
1972 }
1973 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1974 break;
1975 }
1976
1977 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1978 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1979 at a time from the outer level, thus saving stack. */
1980
1981 if (*ecode == OP_KETRPOS)
1982 {
1983 md->end_match_ptr = eptr;
1984 md->end_offset_top = offset_top;
1985 RRETURN(MATCH_KETRPOS);
1986 }
1987
1988 /* The normal repeating kets try the rest of the pattern or restart from
1989 the preceding bracket, in the appropriate order. In the second case, we can
1990 use tail recursion to avoid using another stack frame, unless we have an
1991 atomic group or an unlimited repeat of a group that can match an empty
1992 string. */
1993
1994 if (*ecode == OP_KETRMIN)
1995 {
1996 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1997 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1998 if (*prev == OP_ONCE)
1999 {
2000 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2002 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2003 RRETURN(MATCH_ONCE);
2004 }
2005 if (*prev >= OP_SBRA) /* Could match an empty string */
2006 {
2007 md->match_function_type = MATCH_CBEGROUP;
2008 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2009 RRETURN(rrc);
2010 }
2011 ecode = prev;
2012 goto TAIL_RECURSE;
2013 }
2014 else /* OP_KETRMAX */
2015 {
2016 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2017 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2018 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2019 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020 if (*prev == OP_ONCE)
2021 {
2022 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2024 md->once_target = prev;
2025 RRETURN(MATCH_ONCE);
2026 }
2027 ecode += 1 + LINK_SIZE;
2028 goto TAIL_RECURSE;
2029 }
2030 /* Control never gets here */
2031
2032 /* Not multiline mode: start of subject assertion, unless notbol. */
2033
2034 case OP_CIRC:
2035 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2036
2037 /* Start of subject assertion */
2038
2039 case OP_SOD:
2040 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2041 ecode++;
2042 break;
2043
2044 /* Multiline mode: start of subject unless notbol, or after any newline. */
2045
2046 case OP_CIRCM:
2047 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2048 if (eptr != md->start_subject &&
2049 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2050 RRETURN(MATCH_NOMATCH);
2051 ecode++;
2052 break;
2053
2054 /* Start of match assertion */
2055
2056 case OP_SOM:
2057 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2058 ecode++;
2059 break;
2060
2061 /* Reset the start of match point */
2062
2063 case OP_SET_SOM:
2064 mstart = eptr;
2065 ecode++;
2066 break;
2067
2068 /* Multiline mode: assert before any newline, or before end of subject
2069 unless noteol is set. */
2070
2071 case OP_DOLLM:
2072 if (eptr < md->end_subject)
2073 {
2074 if (!IS_NEWLINE(eptr))
2075 {
2076 if (md->partial != 0 &&
2077 eptr + 1 >= md->end_subject &&
2078 NLBLOCK->nltype == NLTYPE_FIXED &&
2079 NLBLOCK->nllen == 2 &&
2080 *eptr == NLBLOCK->nl[0])
2081 {
2082 md->hitend = TRUE;
2083 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2084 }
2085 RRETURN(MATCH_NOMATCH);
2086 }
2087 }
2088 else
2089 {
2090 if (md->noteol) RRETURN(MATCH_NOMATCH);
2091 SCHECK_PARTIAL();
2092 }
2093 ecode++;
2094 break;
2095
2096 /* Not multiline mode: assert before a terminating newline or before end of
2097 subject unless noteol is set. */
2098
2099 case OP_DOLL:
2100 if (md->noteol) RRETURN(MATCH_NOMATCH);
2101 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2102
2103 /* ... else fall through for endonly */
2104
2105 /* End of subject assertion (\z) */
2106
2107 case OP_EOD:
2108 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2109 SCHECK_PARTIAL();
2110 ecode++;
2111 break;
2112
2113 /* End of subject or ending \n assertion (\Z) */
2114
2115 case OP_EODN:
2116 ASSERT_NL_OR_EOS:
2117 if (eptr < md->end_subject &&
2118 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2119 {
2120 if (md->partial != 0 &&
2121 eptr + 1 >= md->end_subject &&
2122 NLBLOCK->nltype == NLTYPE_FIXED &&
2123 NLBLOCK->nllen == 2 &&
2124 *eptr == NLBLOCK->nl[0])
2125 {
2126 md->hitend = TRUE;
2127 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2128 }
2129 RRETURN(MATCH_NOMATCH);
2130 }
2131
2132 /* Either at end of string or \n before end. */
2133
2134 SCHECK_PARTIAL();
2135 ecode++;
2136 break;
2137
2138 /* Word boundary assertions */
2139
2140 case OP_NOT_WORD_BOUNDARY:
2141 case OP_WORD_BOUNDARY:
2142 {
2143
2144 /* Find out if the previous and current characters are "word" characters.
2145 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2146 be "non-word" characters. Remember the earliest consulted character for
2147 partial matching. */
2148
2149 #ifdef SUPPORT_UTF
2150 if (utf)
2151 {
2152 /* Get status of previous character */
2153
2154 if (eptr == md->start_subject) prev_is_word = FALSE; else
2155 {
2156 PCRE_PUCHAR lastptr = eptr - 1;
2157 BACKCHAR(lastptr);
2158 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2159 GETCHAR(c, lastptr);
2160 #ifdef SUPPORT_UCP
2161 if (md->use_ucp)
2162 {
2163 if (c == '_') prev_is_word = TRUE; else
2164 {
2165 int cat = UCD_CATEGORY(c);
2166 prev_is_word = (cat == ucp_L || cat == ucp_N);
2167 }
2168 }
2169 else
2170 #endif
2171 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2172 }
2173
2174 /* Get status of next character */
2175
2176 if (eptr >= md->end_subject)
2177 {
2178 SCHECK_PARTIAL();
2179 cur_is_word = FALSE;
2180 }
2181 else
2182 {
2183 GETCHAR(c, eptr);
2184 #ifdef SUPPORT_UCP
2185 if (md->use_ucp)
2186 {
2187 if (c == '_') cur_is_word = TRUE; else
2188 {
2189 int cat = UCD_CATEGORY(c);
2190 cur_is_word = (cat == ucp_L || cat == ucp_N);
2191 }
2192 }
2193 else
2194 #endif
2195 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2196 }
2197 }
2198 else
2199 #endif
2200
2201 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2202 consistency with the behaviour of \w we do use it in this case. */
2203
2204 {
2205 /* Get status of previous character */
2206
2207 if (eptr == md->start_subject) prev_is_word = FALSE; else
2208 {
2209 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2210 #ifdef SUPPORT_UCP
2211 if (md->use_ucp)
2212 {
2213 c = eptr[-1];
2214 if (c == '_') prev_is_word = TRUE; else
2215 {
2216 int cat = UCD_CATEGORY(c);
2217 prev_is_word = (cat == ucp_L || cat == ucp_N);
2218 }
2219 }
2220 else
2221 #endif
2222 prev_is_word = MAX_255(eptr[-1])
2223 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2224 }
2225
2226 /* Get status of next character */
2227
2228 if (eptr >= md->end_subject)
2229 {
2230 SCHECK_PARTIAL();
2231 cur_is_word = FALSE;
2232 }
2233 else
2234 #ifdef SUPPORT_UCP
2235 if (md->use_ucp)
2236 {
2237 c = *eptr;
2238 if (c == '_') cur_is_word = TRUE; else
2239 {
2240 int cat = UCD_CATEGORY(c);
2241 cur_is_word = (cat == ucp_L || cat == ucp_N);
2242 }
2243 }
2244 else
2245 #endif
2246 cur_is_word = MAX_255(*eptr)
2247 && ((md->ctypes[*eptr] & ctype_word) != 0);
2248 }
2249
2250 /* Now see if the situation is what we want */
2251
2252 if ((*ecode++ == OP_WORD_BOUNDARY)?
2253 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2254 RRETURN(MATCH_NOMATCH);
2255 }
2256 break;
2257
2258 /* Match any single character type except newline; have to take care with
2259 CRLF newlines and partial matching. */
2260
2261 case OP_ANY:
2262 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2263 if (md->partial != 0 &&
2264 eptr + 1 >= md->end_subject &&
2265 NLBLOCK->nltype == NLTYPE_FIXED &&
2266 NLBLOCK->nllen == 2 &&
2267 *eptr == NLBLOCK->nl[0])
2268 {
2269 md->hitend = TRUE;
2270 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2271 }
2272
2273 /* Fall through */
2274
2275 /* Match any single character whatsoever. */
2276
2277 case OP_ALLANY:
2278 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2279 { /* not be updated before SCHECK_PARTIAL. */
2280 SCHECK_PARTIAL();
2281 RRETURN(MATCH_NOMATCH);
2282 }
2283 eptr++;
2284 #ifdef SUPPORT_UTF
2285 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2286 #endif
2287 ecode++;
2288 break;
2289
2290 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2291 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2292
2293 case OP_ANYBYTE:
2294 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2295 { /* not be updated before SCHECK_PARTIAL. */
2296 SCHECK_PARTIAL();
2297 RRETURN(MATCH_NOMATCH);
2298 }
2299 eptr++;
2300 ecode++;
2301 break;
2302
2303 case OP_NOT_DIGIT:
2304 if (eptr >= md->end_subject)
2305 {
2306 SCHECK_PARTIAL();
2307 RRETURN(MATCH_NOMATCH);
2308 }
2309 GETCHARINCTEST(c, eptr);
2310 if (
2311 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2312 c < 256 &&
2313 #endif
2314 (md->ctypes[c] & ctype_digit) != 0
2315 )
2316 RRETURN(MATCH_NOMATCH);
2317 ecode++;
2318 break;
2319
2320 case OP_DIGIT:
2321 if (eptr >= md->end_subject)
2322 {
2323 SCHECK_PARTIAL();
2324 RRETURN(MATCH_NOMATCH);
2325 }
2326 GETCHARINCTEST(c, eptr);
2327 if (
2328 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2329 c > 255 ||
2330 #endif
2331 (md->ctypes[c] & ctype_digit) == 0
2332 )
2333 RRETURN(MATCH_NOMATCH);
2334 ecode++;
2335 break;
2336
2337 case OP_NOT_WHITESPACE:
2338 if (eptr >= md->end_subject)
2339 {
2340 SCHECK_PARTIAL();
2341 RRETURN(MATCH_NOMATCH);
2342 }
2343 GETCHARINCTEST(c, eptr);
2344 if (
2345 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2346 c < 256 &&
2347 #endif
2348 (md->ctypes[c] & ctype_space) != 0
2349 )
2350 RRETURN(MATCH_NOMATCH);
2351 ecode++;
2352 break;
2353
2354 case OP_WHITESPACE:
2355 if (eptr >= md->end_subject)
2356 {
2357 SCHECK_PARTIAL();
2358 RRETURN(MATCH_NOMATCH);
2359 }
2360 GETCHARINCTEST(c, eptr);
2361 if (
2362 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2363 c > 255 ||
2364 #endif
2365 (md->ctypes[c] & ctype_space) == 0
2366 )
2367 RRETURN(MATCH_NOMATCH);
2368 ecode++;
2369 break;
2370
2371 case OP_NOT_WORDCHAR:
2372 if (eptr >= md->end_subject)
2373 {
2374 SCHECK_PARTIAL();
2375 RRETURN(MATCH_NOMATCH);
2376 }
2377 GETCHARINCTEST(c, eptr);
2378 if (
2379 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2380 c < 256 &&
2381 #endif
2382 (md->ctypes[c] & ctype_word) != 0
2383 )
2384 RRETURN(MATCH_NOMATCH);
2385 ecode++;
2386 break;
2387
2388 case OP_WORDCHAR:
2389 if (eptr >= md->end_subject)
2390 {
2391 SCHECK_PARTIAL();
2392 RRETURN(MATCH_NOMATCH);
2393 }
2394 GETCHARINCTEST(c, eptr);
2395 if (
2396 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2397 c > 255 ||
2398 #endif
2399 (md->ctypes[c] & ctype_word) == 0
2400 )
2401 RRETURN(MATCH_NOMATCH);
2402 ecode++;
2403 break;
2404
2405 case OP_ANYNL:
2406 if (eptr >= md->end_subject)
2407 {
2408 SCHECK_PARTIAL();
2409 RRETURN(MATCH_NOMATCH);
2410 }
2411 GETCHARINCTEST(c, eptr);
2412 switch(c)
2413 {
2414 default: RRETURN(MATCH_NOMATCH);
2415
2416 case 0x000d:
2417 if (eptr >= md->end_subject)
2418 {
2419 SCHECK_PARTIAL();
2420 }
2421 else if (*eptr == 0x0a) eptr++;
2422 break;
2423
2424 case 0x000a:
2425 break;
2426
2427 case 0x000b:
2428 case 0x000c:
2429 case 0x0085:
2430 case 0x2028:
2431 case 0x2029:
2432 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2433 break;
2434 }
2435 ecode++;
2436 break;
2437
2438 case OP_NOT_HSPACE:
2439 if (eptr >= md->end_subject)
2440 {
2441 SCHECK_PARTIAL();
2442 RRETURN(MATCH_NOMATCH);
2443 }
2444 GETCHARINCTEST(c, eptr);
2445 switch(c)
2446 {
2447 default: break;
2448 case 0x09: /* HT */
2449 case 0x20: /* SPACE */
2450 case 0xa0: /* NBSP */
2451 case 0x1680: /* OGHAM SPACE MARK */
2452 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2453 case 0x2000: /* EN QUAD */
2454 case 0x2001: /* EM QUAD */
2455 case 0x2002: /* EN SPACE */
2456 case 0x2003: /* EM SPACE */
2457 case 0x2004: /* THREE-PER-EM SPACE */
2458 case 0x2005: /* FOUR-PER-EM SPACE */
2459 case 0x2006: /* SIX-PER-EM SPACE */
2460 case 0x2007: /* FIGURE SPACE */
2461 case 0x2008: /* PUNCTUATION SPACE */
2462 case 0x2009: /* THIN SPACE */
2463 case 0x200A: /* HAIR SPACE */
2464 case 0x202f: /* NARROW NO-BREAK SPACE */
2465 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2466 case 0x3000: /* IDEOGRAPHIC SPACE */
2467 RRETURN(MATCH_NOMATCH);
2468 }
2469 ecode++;
2470 break;
2471
2472 case OP_HSPACE:
2473 if (eptr >= md->end_subject)
2474 {
2475 SCHECK_PARTIAL();
2476 RRETURN(MATCH_NOMATCH);
2477 }
2478 GETCHARINCTEST(c, eptr);
2479 switch(c)
2480 {
2481 default: RRETURN(MATCH_NOMATCH);
2482 case 0x09: /* HT */
2483 case 0x20: /* SPACE */
2484 case 0xa0: /* NBSP */
2485 case 0x1680: /* OGHAM SPACE MARK */
2486 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2487 case 0x2000: /* EN QUAD */
2488 case 0x2001: /* EM QUAD */
2489 case 0x2002: /* EN SPACE */
2490 case 0x2003: /* EM SPACE */
2491 case 0x2004: /* THREE-PER-EM SPACE */
2492 case 0x2005: /* FOUR-PER-EM SPACE */
2493 case 0x2006: /* SIX-PER-EM SPACE */
2494 case 0x2007: /* FIGURE SPACE */
2495 case 0x2008: /* PUNCTUATION SPACE */
2496 case 0x2009: /* THIN SPACE */
2497 case 0x200A: /* HAIR SPACE */
2498 case 0x202f: /* NARROW NO-BREAK SPACE */
2499 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2500 case 0x3000: /* IDEOGRAPHIC SPACE */
2501 break;
2502 }
2503 ecode++;
2504 break;
2505
2506 case OP_NOT_VSPACE:
2507 if (eptr >= md->end_subject)
2508 {
2509 SCHECK_PARTIAL();
2510 RRETURN(MATCH_NOMATCH);
2511 }
2512 GETCHARINCTEST(c, eptr);
2513 switch(c)
2514 {
2515 default: break;
2516 case 0x0a: /* LF */
2517 case 0x0b: /* VT */
2518 case 0x0c: /* FF */
2519 case 0x0d: /* CR */
2520 case 0x85: /* NEL */
2521 case 0x2028: /* LINE SEPARATOR */
2522 case 0x2029: /* PARAGRAPH SEPARATOR */
2523 RRETURN(MATCH_NOMATCH);
2524 }
2525 ecode++;
2526 break;
2527
2528 case OP_VSPACE:
2529 if (eptr >= md->end_subject)
2530 {
2531 SCHECK_PARTIAL();
2532 RRETURN(MATCH_NOMATCH);
2533 }
2534 GETCHARINCTEST(c, eptr);
2535 switch(c)
2536 {
2537 default: RRETURN(MATCH_NOMATCH);
2538 case 0x0a: /* LF */
2539 case 0x0b: /* VT */
2540 case 0x0c: /* FF */
2541 case 0x0d: /* CR */
2542 case 0x85: /* NEL */
2543 case 0x2028: /* LINE SEPARATOR */
2544 case 0x2029: /* PARAGRAPH SEPARATOR */
2545 break;
2546 }
2547 ecode++;
2548 break;
2549
2550 #ifdef SUPPORT_UCP
2551 /* Check the next character by Unicode property. We will get here only
2552 if the support is in the binary; otherwise a compile-time error occurs. */
2553
2554 case OP_PROP:
2555 case OP_NOTPROP:
2556 if (eptr >= md->end_subject)
2557 {
2558 SCHECK_PARTIAL();
2559 RRETURN(MATCH_NOMATCH);
2560 }
2561 GETCHARINCTEST(c, eptr);
2562 {
2563 const ucd_record *prop = GET_UCD(c);
2564
2565 switch(ecode[1])
2566 {
2567 case PT_ANY:
2568 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2569 break;
2570
2571 case PT_LAMP:
2572 if ((prop->chartype == ucp_Lu ||
2573 prop->chartype == ucp_Ll ||
2574 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2575 RRETURN(MATCH_NOMATCH);
2576 break;
2577
2578 case PT_GC:
2579 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2580 RRETURN(MATCH_NOMATCH);
2581 break;
2582
2583 case PT_PC:
2584 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2585 RRETURN(MATCH_NOMATCH);
2586 break;
2587
2588 case PT_SC:
2589 if ((ecode[2] != prop->script) == (op == OP_PROP))
2590 RRETURN(MATCH_NOMATCH);
2591 break;
2592
2593 /* These are specials */
2594
2595 case PT_ALNUM:
2596 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2597 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2598 RRETURN(MATCH_NOMATCH);
2599 break;
2600
2601 case PT_SPACE: /* Perl space */
2602 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2603 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2604 == (op == OP_NOTPROP))
2605 RRETURN(MATCH_NOMATCH);
2606 break;
2607
2608 case PT_PXSPACE: /* POSIX space */
2609 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2610 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2611 c == CHAR_FF || c == CHAR_CR)
2612 == (op == OP_NOTPROP))
2613 RRETURN(MATCH_NOMATCH);
2614 break;
2615
2616 case PT_WORD:
2617 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2618 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2619 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2620 RRETURN(MATCH_NOMATCH);
2621 break;
2622
2623 /* This should never occur */
2624
2625 default:
2626 RRETURN(PCRE_ERROR_INTERNAL);
2627 }
2628
2629 ecode += 3;
2630 }
2631 break;
2632
2633 /* Match an extended Unicode sequence. We will get here only if the support
2634 is in the binary; otherwise a compile-time error occurs. */
2635
2636 case OP_EXTUNI:
2637 if (eptr >= md->end_subject)
2638 {
2639 SCHECK_PARTIAL();
2640 RRETURN(MATCH_NOMATCH);
2641 }
2642 GETCHARINCTEST(c, eptr);
2643 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2644 while (eptr < md->end_subject)
2645 {
2646 int len = 1;
2647 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2648 if (UCD_CATEGORY(c) != ucp_M) break;
2649 eptr += len;
2650 }
2651 CHECK_PARTIAL();
2652 ecode++;
2653 break;
2654 #endif
2655
2656
2657 /* Match a back reference, possibly repeatedly. Look past the end of the
2658 item to see if there is repeat information following. The code is similar
2659 to that for character classes, but repeated for efficiency. Then obey
2660 similar code to character type repeats - written out again for speed.
2661 However, if the referenced string is the empty string, always treat
2662 it as matched, any number of times (otherwise there could be infinite
2663 loops). */
2664
2665 case OP_REF:
2666 case OP_REFI:
2667 caseless = op == OP_REFI;
2668 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2669 ecode += 1 + IMM2_SIZE;
2670
2671 /* If the reference is unset, there are two possibilities:
2672
2673 (a) In the default, Perl-compatible state, set the length negative;
2674 this ensures that every attempt at a match fails. We can't just fail
2675 here, because of the possibility of quantifiers with zero minima.
2676
2677 (b) If the JavaScript compatibility flag is set, set the length to zero
2678 so that the back reference matches an empty string.
2679
2680 Otherwise, set the length to the length of what was matched by the
2681 referenced subpattern. */
2682
2683 if (offset >= offset_top || md->offset_vector[offset] < 0)
2684 length = (md->jscript_compat)? 0 : -1;
2685 else
2686 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2687
2688 /* Set up for repetition, or handle the non-repeated case */
2689
2690 switch (*ecode)
2691 {
2692 case OP_CRSTAR:
2693 case OP_CRMINSTAR:
2694 case OP_CRPLUS:
2695 case OP_CRMINPLUS:
2696 case OP_CRQUERY:
2697 case OP_CRMINQUERY:
2698 c = *ecode++ - OP_CRSTAR;
2699 minimize = (c & 1) != 0;
2700 min = rep_min[c]; /* Pick up values from tables; */
2701 max = rep_max[c]; /* zero for max => infinity */
2702 if (max == 0) max = INT_MAX;
2703 break;
2704
2705 case OP_CRRANGE:
2706 case OP_CRMINRANGE:
2707 minimize = (*ecode == OP_CRMINRANGE);
2708 min = GET2(ecode, 1);
2709 max = GET2(ecode, 1 + IMM2_SIZE);
2710 if (max == 0) max = INT_MAX;
2711 ecode += 1 + 2 * IMM2_SIZE;
2712 break;
2713
2714 default: /* No repeat follows */
2715 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2716 {
2717 if (length == -2) eptr = md->end_subject; /* Partial match */
2718 CHECK_PARTIAL();
2719 RRETURN(MATCH_NOMATCH);
2720 }
2721 eptr += length;
2722 continue; /* With the main loop */
2723 }
2724
2725 /* Handle repeated back references. If the length of the reference is
2726 zero, just continue with the main loop. If the length is negative, it
2727 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2728 zero, we can continue at the same level without recursion. For any other
2729 minimum, carrying on will result in NOMATCH. */
2730
2731 if (length == 0) continue;
2732 if (length < 0 && min == 0) continue;
2733
2734 /* First, ensure the minimum number of matches are present. We get back
2735 the length of the reference string explicitly rather than passing the
2736 address of eptr, so that eptr can be a register variable. */
2737
2738 for (i = 1; i <= min; i++)
2739 {
2740 int slength;
2741 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2742 {
2743 if (slength == -2) eptr = md->end_subject; /* Partial match */
2744 CHECK_PARTIAL();
2745 RRETURN(MATCH_NOMATCH);
2746 }
2747 eptr += slength;
2748 }
2749
2750 /* If min = max, continue at the same level without recursion.
2751 They are not both allowed to be zero. */
2752
2753 if (min == max) continue;
2754
2755 /* If minimizing, keep trying and advancing the pointer */
2756
2757 if (minimize)
2758 {
2759 for (fi = min;; fi++)
2760 {
2761 int slength;
2762 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2763 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2764 if (fi >= max) RRETURN(MATCH_NOMATCH);
2765 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2766 {
2767 if (slength == -2) eptr = md->end_subject; /* Partial match */
2768 CHECK_PARTIAL();
2769 RRETURN(MATCH_NOMATCH);
2770 }
2771 eptr += slength;
2772 }
2773 /* Control never gets here */
2774 }
2775
2776 /* If maximizing, find the longest string and work backwards */
2777
2778 else
2779 {
2780 pp = eptr;
2781 for (i = min; i < max; i++)
2782 {
2783 int slength;
2784 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2785 {
2786 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2787 the soft partial matching case. */
2788
2789 if (slength == -2 && md->partial != 0 &&
2790 md->end_subject > md->start_used_ptr)
2791 {
2792 md->hitend = TRUE;
2793 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2794 }
2795 break;
2796 }
2797 eptr += slength;
2798 }
2799
2800 while (eptr >= pp)
2801 {
2802 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2804 eptr -= length;
2805 }
2806 RRETURN(MATCH_NOMATCH);
2807 }
2808 /* Control never gets here */
2809
2810 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2811 used when all the characters in the class have values in the range 0-255,
2812 and either the matching is caseful, or the characters are in the range
2813 0-127 when UTF-8 processing is enabled. The only difference between
2814 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2815 encountered.
2816
2817 First, look past the end of the item to see if there is repeat information
2818 following. Then obey similar code to character type repeats - written out
2819 again for speed. */
2820
2821 case OP_NCLASS:
2822 case OP_CLASS:
2823 {
2824 /* The data variable is saved across frames, so the byte map needs to
2825 be stored there. */
2826 #define BYTE_MAP ((pcre_uint8 *)data)
2827 data = ecode + 1; /* Save for matching */
2828 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2829
2830 switch (*ecode)
2831 {
2832 case OP_CRSTAR:
2833 case OP_CRMINSTAR:
2834 case OP_CRPLUS:
2835 case OP_CRMINPLUS:
2836 case OP_CRQUERY:
2837 case OP_CRMINQUERY:
2838 c = *ecode++ - OP_CRSTAR;
2839 minimize = (c & 1) != 0;
2840 min = rep_min[c]; /* Pick up values from tables; */
2841 max = rep_max[c]; /* zero for max => infinity */
2842 if (max == 0) max = INT_MAX;
2843 break;
2844
2845 case OP_CRRANGE:
2846 case OP_CRMINRANGE:
2847 minimize = (*ecode == OP_CRMINRANGE);
2848 min = GET2(ecode, 1);
2849 max = GET2(ecode, 1 + IMM2_SIZE);
2850 if (max == 0) max = INT_MAX;
2851 ecode += 1 + 2 * IMM2_SIZE;
2852 break;
2853
2854 default: /* No repeat follows */
2855 min = max = 1;
2856 break;
2857 }
2858
2859 /* First, ensure the minimum number of matches are present. */
2860
2861 #ifdef SUPPORT_UTF
2862 if (utf)
2863 {
2864 for (i = 1; i <= min; i++)
2865 {
2866 if (eptr >= md->end_subject)
2867 {
2868 SCHECK_PARTIAL();
2869 RRETURN(MATCH_NOMATCH);
2870 }
2871 GETCHARINC(c, eptr);
2872 if (c > 255)
2873 {
2874 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2875 }
2876 else
2877 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2878 }
2879 }
2880 else
2881 #endif
2882 /* Not UTF mode */
2883 {
2884 for (i = 1; i <= min; i++)
2885 {
2886 if (eptr >= md->end_subject)
2887 {
2888 SCHECK_PARTIAL();
2889 RRETURN(MATCH_NOMATCH);
2890 }
2891 c = *eptr++;
2892 #ifndef COMPILE_PCRE8
2893 if (c > 255)
2894 {
2895 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2896 }
2897 else
2898 #endif
2899 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2900 }
2901 }
2902
2903 /* If max == min we can continue with the main loop without the
2904 need to recurse. */
2905
2906 if (min == max) continue;
2907
2908 /* If minimizing, keep testing the rest of the expression and advancing
2909 the pointer while it matches the class. */
2910
2911 if (minimize)
2912 {
2913 #ifdef SUPPORT_UTF
2914 if (utf)
2915 {
2916 for (fi = min;; fi++)
2917 {
2918 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2920 if (fi >= max) RRETURN(MATCH_NOMATCH);
2921 if (eptr >= md->end_subject)
2922 {
2923 SCHECK_PARTIAL();
2924 RRETURN(MATCH_NOMATCH);
2925 }
2926 GETCHARINC(c, eptr);
2927 if (c > 255)
2928 {
2929 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2930 }
2931 else
2932 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2933 }
2934 }
2935 else
2936 #endif
2937 /* Not UTF mode */
2938 {
2939 for (fi = min;; fi++)
2940 {
2941 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2943 if (fi >= max) RRETURN(MATCH_NOMATCH);
2944 if (eptr >= md->end_subject)
2945 {
2946 SCHECK_PARTIAL();
2947 RRETURN(MATCH_NOMATCH);
2948 }
2949 c = *eptr++;
2950 #ifndef COMPILE_PCRE8
2951 if (c > 255)
2952 {
2953 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2954 }
2955 else
2956 #endif
2957 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2958 }
2959 }
2960 /* Control never gets here */
2961 }
2962
2963 /* If maximizing, find the longest possible run, then work backwards. */
2964
2965 else
2966 {
2967 pp = eptr;
2968
2969 #ifdef SUPPORT_UTF
2970 if (utf)
2971 {
2972 for (i = min; i < max; i++)
2973 {
2974 int len = 1;
2975 if (eptr >= md->end_subject)
2976 {
2977 SCHECK_PARTIAL();
2978 break;
2979 }
2980 GETCHARLEN(c, eptr, len);
2981 if (c > 255)
2982 {
2983 if (op == OP_CLASS) break;
2984 }
2985 else
2986 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2987 eptr += len;
2988 }
2989 for (;;)
2990 {
2991 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2992 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2993 if (eptr-- == pp) break; /* Stop if tried at original pos */
2994 BACKCHAR(eptr);
2995 }
2996 }
2997 else
2998 #endif
2999 /* Not UTF mode */
3000 {
3001 for (i = min; i < max; i++)
3002 {
3003 if (eptr >= md->end_subject)
3004 {
3005 SCHECK_PARTIAL();
3006 break;
3007 }
3008 c = *eptr;
3009 #ifndef COMPILE_PCRE8
3010 if (c > 255)
3011 {
3012 if (op == OP_CLASS) break;
3013 }
3014 else
3015 #endif
3016 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3017 eptr++;
3018 }
3019 while (eptr >= pp)
3020 {
3021 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3022 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3023 eptr--;
3024 }
3025 }
3026
3027 RRETURN(MATCH_NOMATCH);
3028 }
3029 #undef BYTE_MAP
3030 }
3031 /* Control never gets here */
3032
3033
3034 /* Match an extended character class. This opcode is encountered only
3035 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3036 mode, because Unicode properties are supported in non-UTF-8 mode. */
3037
3038 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3039 case OP_XCLASS:
3040 {
3041 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3042 ecode += GET(ecode, 1); /* Advance past the item */
3043
3044 switch (*ecode)
3045 {
3046 case OP_CRSTAR:
3047 case OP_CRMINSTAR:
3048 case OP_CRPLUS:
3049 case OP_CRMINPLUS:
3050 case OP_CRQUERY:
3051 case OP_CRMINQUERY:
3052 c = *ecode++ - OP_CRSTAR;
3053 minimize = (c & 1) != 0;
3054 min = rep_min[c]; /* Pick up values from tables; */
3055 max = rep_max[c]; /* zero for max => infinity */
3056 if (max == 0) max = INT_MAX;
3057 break;
3058
3059 case OP_CRRANGE:
3060 case OP_CRMINRANGE:
3061 minimize = (*ecode == OP_CRMINRANGE);
3062 min = GET2(ecode, 1);
3063 max = GET2(ecode, 1 + IMM2_SIZE);
3064 if (max == 0) max = INT_MAX;
3065 ecode += 1 + 2 * IMM2_SIZE;
3066 break;
3067
3068 default: /* No repeat follows */
3069 min = max = 1;
3070 break;
3071 }
3072
3073 /* First, ensure the minimum number of matches are present. */
3074
3075 for (i = 1; i <= min; i++)
3076 {
3077 if (eptr >= md->end_subject)
3078 {
3079 SCHECK_PARTIAL();
3080 RRETURN(MATCH_NOMATCH);
3081 }
3082 GETCHARINCTEST(c, eptr);
3083 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3084 }
3085
3086 /* If max == min we can continue with the main loop without the
3087 need to recurse. */
3088
3089 if (min == max) continue;
3090
3091 /* If minimizing, keep testing the rest of the expression and advancing
3092 the pointer while it matches the class. */
3093
3094 if (minimize)
3095 {
3096 for (fi = min;; fi++)
3097 {
3098 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3099 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3100 if (fi >= max) RRETURN(MATCH_NOMATCH);
3101 if (eptr >= md->end_subject)
3102 {
3103 SCHECK_PARTIAL();
3104 RRETURN(MATCH_NOMATCH);
3105 }
3106 GETCHARINCTEST(c, eptr);
3107 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3108 }
3109 /* Control never gets here */
3110 }
3111
3112 /* If maximizing, find the longest possible run, then work backwards. */
3113
3114 else
3115 {
3116 pp = eptr;
3117 for (i = min; i < max; i++)
3118 {
3119 int len = 1;
3120 if (eptr >= md->end_subject)
3121 {
3122 SCHECK_PARTIAL();
3123 break;
3124 }
3125 #ifdef SUPPORT_UTF
3126 GETCHARLENTEST(c, eptr, len);
3127 #else
3128 c = *eptr;
3129 #endif
3130 if (!PRIV(xclass)(c, data, utf)) break;
3131 eptr += len;
3132 }
3133 for(;;)
3134 {
3135 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3136 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3137 if (eptr-- == pp) break; /* Stop if tried at original pos */
3138 #ifdef SUPPORT_UTF
3139 if (utf) BACKCHAR(eptr);
3140 #endif
3141 }
3142 RRETURN(MATCH_NOMATCH);
3143 }
3144
3145 /* Control never gets here */
3146 }
3147 #endif /* End of XCLASS */
3148
3149 /* Match a single character, casefully */
3150
3151 case OP_CHAR:
3152 #ifdef SUPPORT_UTF
3153 if (utf)
3154 {
3155 length = 1;
3156 ecode++;
3157 GETCHARLEN(fc, ecode, length);
3158 if (length > md->end_subject - eptr)
3159 {
3160 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3161 RRETURN(MATCH_NOMATCH);
3162 }
3163 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3164 }
3165 else
3166 #endif
3167 /* Not UTF mode */
3168 {
3169 if (md->end_subject - eptr < 1)
3170 {
3171 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3172 RRETURN(MATCH_NOMATCH);
3173 }
3174 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3175 ecode += 2;
3176 }
3177 break;
3178
3179 /* Match a single character, caselessly. If we are at the end of the
3180 subject, give up immediately. */
3181
3182 case OP_CHARI:
3183 if (eptr >= md->end_subject)
3184 {
3185 SCHECK_PARTIAL();
3186 RRETURN(MATCH_NOMATCH);
3187 }
3188
3189 #ifdef SUPPORT_UTF
3190 if (utf)
3191 {
3192 length = 1;
3193 ecode++;
3194 GETCHARLEN(fc, ecode, length);
3195
3196 /* If the pattern character's value is < 128, we have only one byte, and
3197 we know that its other case must also be one byte long, so we can use the
3198 fast lookup table. We know that there is at least one byte left in the
3199 subject. */
3200
3201 if (fc < 128)
3202 {
3203 if (md->lcc[fc]
3204 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3205 ecode++;
3206 eptr++;
3207 }
3208
3209 /* Otherwise we must pick up the subject character. Note that we cannot
3210 use the value of "length" to check for sufficient bytes left, because the
3211 other case of the character may have more or fewer bytes. */
3212
3213 else
3214 {
3215 unsigned int dc;
3216 GETCHARINC(dc, eptr);
3217 ecode += length;
3218
3219 /* If we have Unicode property support, we can use it to test the other
3220 case of the character, if there is one. */
3221
3222 if (fc != dc)
3223 {
3224 #ifdef SUPPORT_UCP
3225 if (dc != UCD_OTHERCASE(fc))
3226 #endif
3227 RRETURN(MATCH_NOMATCH);
3228 }
3229 }
3230 }
3231 else
3232 #endif /* SUPPORT_UTF */
3233
3234 /* Not UTF mode */
3235 {
3236 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3237 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3238 eptr++;
3239 ecode += 2;
3240 }
3241 break;
3242
3243 /* Match a single character repeatedly. */
3244
3245 case OP_EXACT:
3246 case OP_EXACTI:
3247 min = max = GET2(ecode, 1);
3248 ecode += 1 + IMM2_SIZE;
3249 goto REPEATCHAR;
3250
3251 case OP_POSUPTO:
3252 case OP_POSUPTOI:
3253 possessive = TRUE;
3254 /* Fall through */
3255
3256 case OP_UPTO:
3257 case OP_UPTOI:
3258 case OP_MINUPTO:
3259 case OP_MINUPTOI:
3260 min = 0;
3261 max = GET2(ecode, 1);
3262 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3263 ecode += 1 + IMM2_SIZE;
3264 goto REPEATCHAR;
3265
3266 case OP_POSSTAR:
3267 case OP_POSSTARI:
3268 possessive = TRUE;
3269 min = 0;
3270 max = INT_MAX;
3271 ecode++;
3272 goto REPEATCHAR;
3273
3274 case OP_POSPLUS:
3275 case OP_POSPLUSI:
3276 possessive = TRUE;
3277 min = 1;
3278 max = INT_MAX;
3279 ecode++;
3280 goto REPEATCHAR;
3281
3282 case OP_POSQUERY:
3283 case OP_POSQUERYI:
3284 possessive = TRUE;
3285 min = 0;
3286 max = 1;
3287 ecode++;
3288 goto REPEATCHAR;
3289
3290 case OP_STAR:
3291 case OP_STARI:
3292 case OP_MINSTAR:
3293 case OP_MINSTARI:
3294 case OP_PLUS:
3295 case OP_PLUSI:
3296 case OP_MINPLUS:
3297 case OP_MINPLUSI:
3298 case OP_QUERY:
3299 case OP_QUERYI:
3300 case OP_MINQUERY:
3301 case OP_MINQUERYI:
3302 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3303 minimize = (c & 1) != 0;
3304 min = rep_min[c]; /* Pick up values from tables; */
3305 max = rep_max[c]; /* zero for max => infinity */
3306 if (max == 0) max = INT_MAX;
3307
3308 /* Common code for all repeated single-character matches. */
3309
3310 REPEATCHAR:
3311 #ifdef SUPPORT_UTF
3312 if (utf)
3313 {
3314 length = 1;
3315 charptr = ecode;
3316 GETCHARLEN(fc, ecode, length);
3317 ecode += length;
3318
3319 /* Handle multibyte character matching specially here. There is
3320 support for caseless matching if UCP support is present. */
3321
3322 if (length > 1)
3323 {
3324 #ifdef SUPPORT_UCP
3325 unsigned int othercase;
3326 if (op >= OP_STARI && /* Caseless */
3327 (othercase = UCD_OTHERCASE(fc)) != fc)
3328 oclength = PRIV(ord2utf)(othercase, occhars);
3329 else oclength = 0;
3330 #endif /* SUPPORT_UCP */
3331
3332 for (i = 1; i <= min; i++)
3333 {
3334 if (eptr <= md->end_subject - length &&
3335 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3336 #ifdef SUPPORT_UCP
3337 else if (oclength > 0 &&
3338 eptr <= md->end_subject - oclength &&
3339 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3340 #endif /* SUPPORT_UCP */
3341 else
3342 {
3343 CHECK_PARTIAL();
3344 RRETURN(MATCH_NOMATCH);
3345 }
3346 }
3347
3348 if (min == max) continue;
3349
3350 if (minimize)
3351 {
3352 for (fi = min;; fi++)
3353 {
3354 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 if (fi >= max) RRETURN(MATCH_NOMATCH);
3357 if (eptr <= md->end_subject - length &&
3358 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3359 #ifdef SUPPORT_UCP
3360 else if (oclength > 0 &&
3361 eptr <= md->end_subject - oclength &&
3362 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3363 #endif /* SUPPORT_UCP */
3364 else
3365 {
3366 CHECK_PARTIAL();
3367 RRETURN(MATCH_NOMATCH);
3368 }
3369 }
3370 /* Control never gets here */
3371 }
3372
3373 else /* Maximize */
3374 {
3375 pp = eptr;
3376 for (i = min; i < max; i++)
3377 {
3378 if (eptr <= md->end_subject - length &&
3379 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3380 #ifdef SUPPORT_UCP
3381 else if (oclength > 0 &&
3382 eptr <= md->end_subject - oclength &&
3383 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3384 #endif /* SUPPORT_UCP */
3385 else
3386 {
3387 CHECK_PARTIAL();
3388 break;
3389 }
3390 }
3391
3392 if (possessive) continue;
3393
3394 for(;;)
3395 {
3396 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3397 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3398 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3399 #ifdef SUPPORT_UCP
3400 eptr--;
3401 BACKCHAR(eptr);
3402 #else /* without SUPPORT_UCP */
3403 eptr -= length;
3404 #endif /* SUPPORT_UCP */
3405 }
3406 }
3407 /* Control never gets here */
3408 }
3409
3410 /* If the length of a UTF-8 character is 1, we fall through here, and
3411 obey the code as for non-UTF-8 characters below, though in this case the
3412 value of fc will always be < 128. */
3413 }
3414 else
3415 #endif /* SUPPORT_UTF */
3416 /* When not in UTF-8 mode, load a single-byte character. */
3417 fc = *ecode++;
3418
3419 /* The value of fc at this point is always one character, though we may
3420 or may not be in UTF mode. The code is duplicated for the caseless and
3421 caseful cases, for speed, since matching characters is likely to be quite
3422 common. First, ensure the minimum number of matches are present. If min =
3423 max, continue at the same level without recursing. Otherwise, if
3424 minimizing, keep trying the rest of the expression and advancing one
3425 matching character if failing, up to the maximum. Alternatively, if
3426 maximizing, find the maximum number of characters and work backwards. */
3427
3428 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3429 max, eptr));
3430
3431 if (op >= OP_STARI) /* Caseless */
3432 {
3433 #ifdef COMPILE_PCRE8
3434 /* fc must be < 128 if UTF is enabled. */
3435 foc = md->fcc[fc];
3436 #else
3437 #ifdef SUPPORT_UTF
3438 #ifdef SUPPORT_UCP
3439 if (utf && fc > 127)
3440 foc = UCD_OTHERCASE(fc);
3441 #else
3442 if (utf && fc > 127)
3443 foc = fc;
3444 #endif /* SUPPORT_UCP */
3445 else
3446 #endif /* SUPPORT_UTF */
3447 foc = TABLE_GET(fc, md->fcc, fc);
3448 #endif /* COMPILE_PCRE8 */
3449
3450 for (i = 1; i <= min; i++)
3451 {
3452 if (eptr >= md->end_subject)
3453 {
3454 SCHECK_PARTIAL();
3455 RRETURN(MATCH_NOMATCH);
3456 }
3457 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3458 eptr++;
3459 }
3460 if (min == max) continue;
3461 if (minimize)
3462 {
3463 for (fi = min;; fi++)
3464 {
3465 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467 if (fi >= max) RRETURN(MATCH_NOMATCH);
3468 if (eptr >= md->end_subject)
3469 {
3470 SCHECK_PARTIAL();
3471 RRETURN(MATCH_NOMATCH);
3472 }
3473 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3474 eptr++;
3475 }
3476 /* Control never gets here */
3477 }
3478 else /* Maximize */
3479 {
3480 pp = eptr;
3481 for (i = min; i < max; i++)
3482 {
3483 if (eptr >= md->end_subject)
3484 {
3485 SCHECK_PARTIAL();
3486 break;
3487 }
3488 if (fc != *eptr && foc != *eptr) break;
3489 eptr++;
3490 }
3491
3492 if (possessive) continue;
3493
3494 while (eptr >= pp)
3495 {
3496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3497 eptr--;
3498 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3499 }
3500 RRETURN(MATCH_NOMATCH);
3501 }
3502 /* Control never gets here */
3503 }
3504
3505 /* Caseful comparisons (includes all multi-byte characters) */
3506
3507 else
3508 {
3509 for (i = 1; i <= min; i++)
3510 {
3511 if (eptr >= md->end_subject)
3512 {
3513 SCHECK_PARTIAL();
3514 RRETURN(MATCH_NOMATCH);
3515 }
3516 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3517 }
3518
3519 if (min == max) continue;
3520
3521 if (minimize)
3522 {
3523 for (fi = min;; fi++)
3524 {
3525 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3527 if (fi >= max) RRETURN(MATCH_NOMATCH);
3528 if (eptr >= md->end_subject)
3529 {
3530 SCHECK_PARTIAL();
3531 RRETURN(MATCH_NOMATCH);
3532 }
3533 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3534 }
3535 /* Control never gets here */
3536 }
3537 else /* Maximize */
3538 {
3539 pp = eptr;
3540 for (i = min; i < max; i++)
3541 {
3542 if (eptr >= md->end_subject)
3543 {
3544 SCHECK_PARTIAL();
3545 break;
3546 }
3547 if (fc != *eptr) break;
3548 eptr++;
3549 }
3550 if (possessive) continue;
3551
3552 while (eptr >= pp)
3553 {
3554 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3555 eptr--;
3556 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3557 }
3558 RRETURN(MATCH_NOMATCH);
3559 }
3560 }
3561 /* Control never gets here */
3562
3563 /* Match a negated single one-byte character. The character we are
3564 checking can be multibyte. */
3565
3566 case OP_NOT:
3567 case OP_NOTI:
3568 if (eptr >= md->end_subject)
3569 {
3570 SCHECK_PARTIAL();
3571 RRETURN(MATCH_NOMATCH);
3572 }
3573 #ifdef SUPPORT_UTF
3574 if (utf)
3575 {
3576 register unsigned int ch, och;
3577
3578 ecode++;
3579 GETCHARINC(ch, ecode);
3580 GETCHARINC(c, eptr);
3581
3582 if (op == OP_NOT)
3583 {
3584 if (ch == c) RRETURN(MATCH_NOMATCH);
3585 }
3586 else
3587 {
3588 #ifdef SUPPORT_UCP
3589 if (ch > 127)
3590 och = UCD_OTHERCASE(ch);
3591 #else
3592 if (ch > 127)
3593 och = ch;
3594 #endif /* SUPPORT_UCP */
3595 else
3596 och = TABLE_GET(ch, md->fcc, ch);
3597 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3598 }
3599 }
3600 else
3601 #endif
3602 {
3603 register unsigned int ch = ecode[1];
3604 c = *eptr++;
3605 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3606 RRETURN(MATCH_NOMATCH);
3607 ecode += 2;
3608 }
3609 break;
3610
3611 /* Match a negated single one-byte character repeatedly. This is almost a
3612 repeat of the code for a repeated single character, but I haven't found a
3613 nice way of commoning these up that doesn't require a test of the
3614 positive/negative option for each character match. Maybe that wouldn't add
3615 very much to the time taken, but character matching *is* what this is all
3616 about... */
3617
3618 case OP_NOTEXACT:
3619 case OP_NOTEXACTI:
3620 min = max = GET2(ecode, 1);
3621 ecode += 1 + IMM2_SIZE;
3622 goto REPEATNOTCHAR;
3623
3624 case OP_NOTUPTO:
3625 case OP_NOTUPTOI:
3626 case OP_NOTMINUPTO:
3627 case OP_NOTMINUPTOI:
3628 min = 0;
3629 max = GET2(ecode, 1);
3630 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3631 ecode += 1 + IMM2_SIZE;
3632 goto REPEATNOTCHAR;
3633
3634 case OP_NOTPOSSTAR:
3635 case OP_NOTPOSSTARI:
3636 possessive = TRUE;
3637 min = 0;
3638 max = INT_MAX;
3639 ecode++;
3640 goto REPEATNOTCHAR;
3641
3642 case OP_NOTPOSPLUS:
3643 case OP_NOTPOSPLUSI:
3644 possessive = TRUE;
3645 min = 1;
3646 max = INT_MAX;
3647 ecode++;
3648 goto REPEATNOTCHAR;
3649
3650 case OP_NOTPOSQUERY:
3651 case OP_NOTPOSQUERYI:
3652 possessive = TRUE;
3653 min = 0;
3654 max = 1;
3655 ecode++;
3656 goto REPEATNOTCHAR;
3657
3658 case OP_NOTPOSUPTO:
3659 case OP_NOTPOSUPTOI:
3660 possessive = TRUE;
3661 min = 0;
3662 max = GET2(ecode, 1);
3663 ecode += 1 + IMM2_SIZE;
3664 goto REPEATNOTCHAR;
3665
3666 case OP_NOTSTAR:
3667 case OP_NOTSTARI:
3668 case OP_NOTMINSTAR:
3669 case OP_NOTMINSTARI:
3670 case OP_NOTPLUS:
3671 case OP_NOTPLUSI:
3672 case OP_NOTMINPLUS:
3673 case OP_NOTMINPLUSI:
3674 case OP_NOTQUERY:
3675 case OP_NOTQUERYI:
3676 case OP_NOTMINQUERY:
3677 case OP_NOTMINQUERYI:
3678 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3679 minimize = (c & 1) != 0;
3680 min = rep_min[c]; /* Pick up values from tables; */
3681 max = rep_max[c]; /* zero for max => infinity */
3682 if (max == 0) max = INT_MAX;
3683
3684 /* Common code for all repeated single-byte matches. */
3685
3686 REPEATNOTCHAR:
3687 GETCHARINCTEST(fc, ecode);
3688
3689 /* The code is duplicated for the caseless and caseful cases, for speed,
3690 since matching characters is likely to be quite common. First, ensure the
3691 minimum number of matches are present. If min = max, continue at the same
3692 level without recursing. Otherwise, if minimizing, keep trying the rest of
3693 the expression and advancing one matching character if failing, up to the
3694 maximum. Alternatively, if maximizing, find the maximum number of
3695 characters and work backwards. */
3696
3697 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3698 max, eptr));
3699
3700 if (op >= OP_NOTSTARI) /* Caseless */
3701 {
3702 #ifdef SUPPORT_UTF
3703 #ifdef SUPPORT_UCP
3704 if (utf && fc > 127)
3705 foc = UCD_OTHERCASE(fc);
3706 #else
3707 if (utf && fc > 127)
3708 foc = fc;
3709 #endif /* SUPPORT_UCP */
3710 else
3711 #endif /* SUPPORT_UTF */
3712 foc = TABLE_GET(fc, md->fcc, fc);
3713
3714 #ifdef SUPPORT_UTF
3715 if (utf)
3716 {
3717 register unsigned int d;
3718 for (i = 1; i <= min; i++)
3719 {
3720 if (eptr >= md->end_subject)
3721 {
3722 SCHECK_PARTIAL();
3723 RRETURN(MATCH_NOMATCH);
3724 }
3725 GETCHARINC(d, eptr);
3726 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3727 }
3728 }
3729 else
3730 #endif
3731 /* Not UTF mode */
3732 {
3733 for (i = 1; i <= min; i++)
3734 {
3735 if (eptr >= md->end_subject)
3736 {
3737 SCHECK_PARTIAL();
3738 RRETURN(MATCH_NOMATCH);
3739 }
3740 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3741 eptr++;
3742 }
3743 }
3744
3745 if (min == max) continue;
3746
3747 if (minimize)
3748 {
3749 #ifdef SUPPORT_UTF
3750 if (utf)
3751 {
3752 register unsigned int d;
3753 for (fi = min;; fi++)
3754 {
3755 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3756 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3757 if (fi >= max) RRETURN(MATCH_NOMATCH);
3758 if (eptr >= md->end_subject)
3759 {
3760 SCHECK_PARTIAL();
3761 RRETURN(MATCH_NOMATCH);
3762 }
3763 GETCHARINC(d, eptr);
3764 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3765 }
3766 }
3767 else
3768 #endif
3769 /* Not UTF mode */
3770 {
3771 for (fi = min;; fi++)
3772 {
3773 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3775 if (fi >= max) RRETURN(MATCH_NOMATCH);
3776 if (eptr >= md->end_subject)
3777 {
3778 SCHECK_PARTIAL();
3779 RRETURN(MATCH_NOMATCH);
3780 }
3781 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3782 eptr++;
3783 }
3784 }
3785 /* Control never gets here */
3786 }
3787
3788 /* Maximize case */
3789
3790 else
3791 {
3792 pp = eptr;
3793
3794 #ifdef SUPPORT_UTF
3795 if (utf)
3796 {
3797 register unsigned int d;
3798 for (i = min; i < max; i++)
3799 {
3800 int len = 1;
3801 if (eptr >= md->end_subject)
3802 {
3803 SCHECK_PARTIAL();
3804 break;
3805 }
3806 GETCHARLEN(d, eptr, len);
3807 if (fc == d || (unsigned int)foc == d) break;
3808 eptr += len;
3809 }
3810 if (possessive) continue;
3811 for(;;)
3812 {
3813 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3815 if (eptr-- == pp) break; /* Stop if tried at original pos */
3816 BACKCHAR(eptr);
3817 }
3818 }
3819 else
3820 #endif
3821 /* Not UTF mode */
3822 {
3823 for (i = min; i < max; i++)
3824 {
3825 if (eptr >= md->end_subject)
3826 {
3827 SCHECK_PARTIAL();
3828 break;
3829 }
3830 if (fc == *eptr || foc == *eptr) break;
3831 eptr++;
3832 }
3833 if (possessive) continue;
3834 while (eptr >= pp)
3835 {
3836 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3838 eptr--;
3839 }
3840 }
3841
3842 RRETURN(MATCH_NOMATCH);
3843 }
3844 /* Control never gets here */
3845 }
3846
3847 /* Caseful comparisons */
3848
3849 else
3850 {
3851 #ifdef SUPPORT_UTF
3852 if (utf)
3853 {
3854 register unsigned int d;
3855 for (i = 1; i <= min; i++)
3856 {
3857 if (eptr >= md->end_subject)
3858 {
3859 SCHECK_PARTIAL();
3860 RRETURN(MATCH_NOMATCH);
3861 }
3862 GETCHARINC(d, eptr);
3863 if (fc == d) RRETURN(MATCH_NOMATCH);
3864 }
3865 }
3866 else
3867 #endif
3868 /* Not UTF mode */
3869 {
3870 for (i = 1; i <= min; i++)
3871 {
3872 if (eptr >= md->end_subject)
3873 {
3874 SCHECK_PARTIAL();
3875 RRETURN(MATCH_NOMATCH);
3876 }
3877 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3878 }
3879 }
3880
3881 if (min == max) continue;
3882
3883 if (minimize)
3884 {
3885 #ifdef SUPPORT_UTF
3886 if (utf)
3887 {
3888 register unsigned int d;
3889 for (fi = min;; fi++)
3890 {
3891 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3893 if (fi >= max) RRETURN(MATCH_NOMATCH);
3894 if (eptr >= md->end_subject)
3895 {
3896 SCHECK_PARTIAL();
3897 RRETURN(MATCH_NOMATCH);
3898 }
3899 GETCHARINC(d, eptr);
3900 if (fc == d) RRETURN(MATCH_NOMATCH);
3901 }
3902 }
3903 else
3904 #endif
3905 /* Not UTF mode */
3906 {
3907 for (fi = min;; fi++)
3908 {
3909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3911 if (fi >= max) RRETURN(MATCH_NOMATCH);
3912 if (eptr >= md->end_subject)
3913 {
3914 SCHECK_PARTIAL();
3915 RRETURN(MATCH_NOMATCH);
3916 }
3917 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3918 }
3919 }
3920 /* Control never gets here */
3921 }
3922
3923 /* Maximize case */
3924
3925 else
3926 {
3927 pp = eptr;
3928
3929 #ifdef SUPPORT_UTF
3930 if (utf)
3931 {
3932 register unsigned int d;
3933 for (i = min; i < max; i++)
3934 {
3935 int len = 1;
3936 if (eptr >= md->end_subject)
3937 {
3938 SCHECK_PARTIAL();
3939 break;
3940 }
3941 GETCHARLEN(d, eptr, len);
3942 if (fc == d) break;
3943 eptr += len;
3944 }
3945 if (possessive) continue;
3946 for(;;)
3947 {
3948 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3950 if (eptr-- == pp) break; /* Stop if tried at original pos */
3951 BACKCHAR(eptr);
3952 }
3953 }
3954 else
3955 #endif
3956 /* Not UTF mode */
3957 {
3958 for (i = min; i < max; i++)
3959 {
3960 if (eptr >= md->end_subject)
3961 {
3962 SCHECK_PARTIAL();
3963 break;
3964 }
3965 if (fc == *eptr) break;
3966 eptr++;
3967 }
3968 if (possessive) continue;
3969 while (eptr >= pp)
3970 {
3971 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3973 eptr--;
3974 }
3975 }
3976
3977 RRETURN(MATCH_NOMATCH);
3978 }
3979 }
3980 /* Control never gets here */
3981
3982 /* Match a single character type repeatedly; several different opcodes
3983 share code. This is very similar to the code for single characters, but we
3984 repeat it in the interests of efficiency. */
3985
3986 case OP_TYPEEXACT:
3987 min = max = GET2(ecode, 1);
3988 minimize = TRUE;
3989 ecode += 1 + IMM2_SIZE;
3990 goto REPEATTYPE;
3991
3992 case OP_TYPEUPTO:
3993 case OP_TYPEMINUPTO:
3994 min = 0;
3995 max = GET2(ecode, 1);
3996 minimize = *ecode == OP_TYPEMINUPTO;
3997 ecode += 1 + IMM2_SIZE;
3998 goto REPEATTYPE;
3999
4000 case OP_TYPEPOSSTAR:
4001 possessive = TRUE;
4002 min = 0;
4003 max = INT_MAX;
4004 ecode++;
4005 goto REPEATTYPE;
4006
4007 case OP_TYPEPOSPLUS:
4008 possessive = TRUE;
4009 min = 1;
4010 max = INT_MAX;
4011 ecode++;
4012 goto REPEATTYPE;
4013
4014 case OP_TYPEPOSQUERY:
4015 possessive = TRUE;
4016 min = 0;
4017 max = 1;
4018 ecode++;
4019 goto REPEATTYPE;
4020
4021 case OP_TYPEPOSUPTO:
4022 possessive = TRUE;
4023 min = 0;
4024 max = GET2(ecode, 1);
4025 ecode += 1 + IMM2_SIZE;
4026 goto REPEATTYPE;
4027
4028 case OP_TYPESTAR:
4029 case OP_TYPEMINSTAR:
4030 case OP_TYPEPLUS:
4031 case OP_TYPEMINPLUS:
4032 case OP_TYPEQUERY:
4033 case OP_TYPEMINQUERY:
4034 c = *ecode++ - OP_TYPESTAR;
4035 minimize = (c & 1) != 0;
4036 min = rep_min[c]; /* Pick up values from tables; */
4037 max = rep_max[c]; /* zero for max => infinity */
4038 if (max == 0) max = INT_MAX;
4039
4040 /* Common code for all repeated single character type matches. Note that
4041 in UTF-8 mode, '.' matches a character of any length, but for the other
4042 character types, the valid characters are all one-byte long. */
4043
4044 REPEATTYPE:
4045 ctype = *ecode++; /* Code for the character type */
4046
4047 #ifdef SUPPORT_UCP
4048 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4049 {
4050 prop_fail_result = ctype == OP_NOTPROP;
4051 prop_type = *ecode++;
4052 prop_value = *ecode++;
4053 }
4054 else prop_type = -1;
4055 #endif
4056
4057 /* First, ensure the minimum number of matches are present. Use inline
4058 code for maximizing the speed, and do the type test once at the start
4059 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4060 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4061 and single-bytes. */
4062
4063 if (min > 0)
4064 {
4065 #ifdef SUPPORT_UCP
4066 if (prop_type >= 0)
4067 {
4068 switch(prop_type)
4069 {
4070 case PT_ANY:
4071 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4072 for (i = 1; i <= min; i++)
4073 {
4074 if (eptr >= md->end_subject)
4075 {
4076 SCHECK_PARTIAL();
4077 RRETURN(MATCH_NOMATCH);
4078 }
4079 GETCHARINCTEST(c, eptr);
4080 }
4081 break;
4082
4083 case PT_LAMP:
4084 for (i = 1; i <= min; i++)
4085 {
4086 int chartype;
4087 if (eptr >= md->end_subject)
4088 {
4089 SCHECK_PARTIAL();
4090 RRETURN(MATCH_NOMATCH);
4091 }
4092 GETCHARINCTEST(c, eptr);
4093 chartype = UCD_CHARTYPE(c);
4094 if ((chartype == ucp_Lu ||
4095 chartype == ucp_Ll ||
4096 chartype == ucp_Lt) == prop_fail_result)
4097 RRETURN(MATCH_NOMATCH);
4098 }
4099 break;
4100
4101 case PT_GC:
4102 for (i = 1; i <= min; i++)
4103 {
4104 if (eptr >= md->end_subject)
4105 {
4106 SCHECK_PARTIAL();
4107 RRETURN(MATCH_NOMATCH);
4108 }
4109 GETCHARINCTEST(c, eptr);
4110 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4111 RRETURN(MATCH_NOMATCH);
4112 }
4113 break;
4114
4115 case PT_PC:
4116 for (i = 1; i <= min; i++)
4117 {
4118 if (eptr >= md->end_subject)
4119 {
4120 SCHECK_PARTIAL();
4121 RRETURN(MATCH_NOMATCH);
4122 }
4123 GETCHARINCTEST(c, eptr);
4124 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4125 RRETURN(MATCH_NOMATCH);
4126 }
4127 break;
4128
4129 case PT_SC:
4130 for (i = 1; i <= min; i++)
4131 {
4132 if (eptr >= md->end_subject)
4133 {
4134 SCHECK_PARTIAL();
4135 RRETURN(MATCH_NOMATCH);
4136 }
4137 GETCHARINCTEST(c, eptr);
4138 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4139 RRETURN(MATCH_NOMATCH);
4140 }
4141 break;
4142
4143 case PT_ALNUM:
4144 for (i = 1; i <= min; i++)
4145 {
4146 int category;
4147 if (eptr >= md->end_subject)
4148 {
4149 SCHECK_PARTIAL();
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 GETCHARINCTEST(c, eptr);
4153 category = UCD_CATEGORY(c);
4154 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4155 RRETURN(MATCH_NOMATCH);
4156 }
4157 break;
4158
4159 case PT_SPACE: /* Perl space */
4160 for (i = 1; i <= min; i++)
4161 {
4162 if (eptr >= md->end_subject)
4163 {
4164 SCHECK_PARTIAL();
4165 RRETURN(MATCH_NOMATCH);
4166 }
4167 GETCHARINCTEST(c, eptr);
4168 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4169 c == CHAR_FF || c == CHAR_CR)
4170 == prop_fail_result)
4171 RRETURN(MATCH_NOMATCH);
4172 }
4173 break;
4174
4175 case PT_PXSPACE: /* POSIX space */
4176 for (i = 1; i <= min; i++)
4177 {
4178 if (eptr >= md->end_subject)
4179 {
4180 SCHECK_PARTIAL();
4181 RRETURN(MATCH_NOMATCH);
4182 }
4183 GETCHARINCTEST(c, eptr);
4184 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4185 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4186 == prop_fail_result)
4187 RRETURN(MATCH_NOMATCH);
4188 }
4189 break;
4190
4191 case PT_WORD:
4192 for (i = 1; i <= min; i++)
4193 {
4194 int category;
4195 if (eptr >= md->end_subject)
4196 {
4197 SCHECK_PARTIAL();
4198 RRETURN(MATCH_NOMATCH);
4199 }
4200 GETCHARINCTEST(c, eptr);
4201 category = UCD_CATEGORY(c);
4202 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4203 == prop_fail_result)
4204 RRETURN(MATCH_NOMATCH);
4205 }
4206 break;
4207
4208 /* This should not occur */
4209
4210 default:
4211 RRETURN(PCRE_ERROR_INTERNAL);
4212 }
4213 }
4214
4215 /* Match extended Unicode sequences. We will get here only if the
4216 support is in the binary; otherwise a compile-time error occurs. */
4217
4218 else if (ctype == OP_EXTUNI)
4219 {
4220 for (i = 1; i <= min; i++)
4221 {
4222 if (eptr >= md->end_subject)
4223 {
4224 SCHECK_PARTIAL();
4225 RRETURN(MATCH_NOMATCH);
4226 }
4227 GETCHARINCTEST(c, eptr);
4228 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4229 while (eptr < md->end_subject)
4230 {
4231 int len = 1;
4232 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4233 if (UCD_CATEGORY(c) != ucp_M) break;
4234 eptr += len;
4235 }
4236 CHECK_PARTIAL();
4237 }
4238 }
4239
4240 else
4241 #endif /* SUPPORT_UCP */
4242
4243 /* Handle all other cases when the coding is UTF-8 */
4244
4245 #ifdef SUPPORT_UTF
4246 if (utf) switch(ctype)
4247 {
4248 case OP_ANY:
4249 for (i = 1; i <= min; i++)
4250 {
4251 if (eptr >= md->end_subject)
4252 {
4253 SCHECK_PARTIAL();
4254 RRETURN(MATCH_NOMATCH);
4255 }
4256 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4257 if (md->partial != 0 &&
4258 eptr + 1 >= md->end_subject &&
4259 NLBLOCK->nltype == NLTYPE_FIXED &&
4260 NLBLOCK->nllen == 2 &&
4261 *eptr == NLBLOCK->nl[0])
4262 {
4263 md->hitend = TRUE;
4264 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4265 }
4266 eptr++;
4267 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4268 }
4269 break;
4270
4271 case OP_ALLANY:
4272 for (i = 1; i <= min; i++)
4273 {
4274 if (eptr >= md->end_subject)
4275 {
4276 SCHECK_PARTIAL();
4277 RRETURN(MATCH_NOMATCH);
4278 }
4279 eptr++;
4280 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4281 }
4282 break;
4283
4284 case OP_ANYBYTE:
4285 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4286 eptr += min;
4287 break;
4288
4289 case OP_ANYNL:
4290 for (i = 1; i <= min; i++)
4291 {
4292 if (eptr >= md->end_subject)
4293 {
4294 SCHECK_PARTIAL();
4295 RRETURN(MATCH_NOMATCH);
4296 }
4297 GETCHARINC(c, eptr);
4298 switch(c)
4299 {
4300 default: RRETURN(MATCH_NOMATCH);
4301
4302 case 0x000d:
4303 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4304 break;
4305
4306 case 0x000a:
4307 break;
4308
4309 case 0x000b:
4310 case 0x000c:
4311 case 0x0085:
4312 case 0x2028:
4313 case 0x2029:
4314 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4315 break;
4316 }
4317 }
4318 break;
4319
4320 case OP_NOT_HSPACE:
4321 for (i = 1; i <= min; i++)
4322 {
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 RRETURN(MATCH_NOMATCH);
4327 }
4328 GETCHARINC(c, eptr);
4329 switch(c)
4330 {
4331 default: break;
4332 case 0x09: /* HT */
4333 case 0x20: /* SPACE */
4334 case 0xa0: /* NBSP */
4335 case 0x1680: /* OGHAM SPACE MARK */
4336 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4337 case 0x2000: /* EN QUAD */
4338 case 0x2001: /* EM QUAD */
4339 case 0x2002: /* EN SPACE */
4340 case 0x2003: /* EM SPACE */
4341 case 0x2004: /* THREE-PER-EM SPACE */
4342 case 0x2005: /* FOUR-PER-EM SPACE */
4343 case 0x2006: /* SIX-PER-EM SPACE */
4344 case 0x2007: /* FIGURE SPACE */
4345 case 0x2008: /* PUNCTUATION SPACE */
4346 case 0x2009: /* THIN SPACE */
4347 case 0x200A: /* HAIR SPACE */
4348 case 0x202f: /* NARROW NO-BREAK SPACE */
4349 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4350 case 0x3000: /* IDEOGRAPHIC SPACE */
4351 RRETURN(MATCH_NOMATCH);
4352 }
4353 }
4354 break;
4355
4356 case OP_HSPACE:
4357 for (i = 1; i <= min; i++)
4358 {
4359 if (eptr >= md->end_subject)
4360 {
4361 SCHECK_PARTIAL();
4362 RRETURN(MATCH_NOMATCH);
4363 }
4364 GETCHARINC(c, eptr);
4365 switch(c)
4366 {
4367 default: RRETURN(MATCH_NOMATCH);
4368 case 0x09: /* HT */
4369 case 0x20: /* SPACE */
4370 case 0xa0: /* NBSP */
4371 case 0x1680: /* OGHAM SPACE MARK */
4372 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4373 case 0x2000: /* EN QUAD */
4374 case 0x2001: /* EM QUAD */
4375 case 0x2002: /* EN SPACE */
4376 case 0x2003: /* EM SPACE */
4377 case 0x2004: /* THREE-PER-EM SPACE */
4378 case 0x2005: /* FOUR-PER-EM SPACE */
4379 case 0x2006: /* SIX-PER-EM SPACE */
4380 case 0x2007: /* FIGURE SPACE */
4381 case 0x2008: /* PUNCTUATION SPACE */
4382 case 0x2009: /* THIN SPACE */
4383 case 0x200A: /* HAIR SPACE */
4384 case 0x202f: /* NARROW NO-BREAK SPACE */
4385 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4386 case 0x3000: /* IDEOGRAPHIC SPACE */
4387 break;
4388 }
4389 }
4390 break;
4391
4392 case OP_NOT_VSPACE:
4393 for (i = 1; i <= min; i++)
4394 {
4395 if (eptr >= md->end_subject)
4396 {
4397 SCHECK_PARTIAL();
4398 RRETURN(MATCH_NOMATCH);
4399 }
4400 GETCHARINC(c, eptr);
4401 switch(c)
4402 {
4403 default: break;
4404 case 0x0a: /* LF */
4405 case 0x0b: /* VT */
4406 case 0x0c: /* FF */
4407 case 0x0d: /* CR */
4408 case 0x85: /* NEL */
4409 case 0x2028: /* LINE SEPARATOR */
4410 case 0x2029: /* PARAGRAPH SEPARATOR */
4411 RRETURN(MATCH_NOMATCH);
4412 }
4413 }
4414 break;
4415
4416 case OP_VSPACE:
4417 for (i = 1; i <= min; i++)
4418 {
4419 if (eptr >= md->end_subject)
4420 {
4421 SCHECK_PARTIAL();
4422 RRETURN(MATCH_NOMATCH);
4423 }
4424 GETCHARINC(c, eptr);
4425 switch(c)
4426 {
4427 default: RRETURN(MATCH_NOMATCH);
4428 case 0x0a: /* LF */
4429 case 0x0b: /* VT */
4430 case 0x0c: /* FF */
4431 case 0x0d: /* CR */
4432 case 0x85: /* NEL */
4433 case 0x2028: /* LINE SEPARATOR */
4434 case 0x2029: /* PARAGRAPH SEPARATOR */
4435 break;
4436 }
4437 }
4438 break;
4439
4440 case OP_NOT_DIGIT:
4441 for (i = 1; i <= min; i++)
4442 {
4443 if (eptr >= md->end_subject)
4444 {
4445 SCHECK_PARTIAL();
4446 RRETURN(MATCH_NOMATCH);
4447 }
4448 GETCHARINC(c, eptr);
4449 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4450 RRETURN(MATCH_NOMATCH);
4451 }
4452 break;
4453
4454 case OP_DIGIT:
4455 for (i = 1; i <= min; i++)
4456 {
4457 if (eptr >= md->end_subject)
4458 {
4459 SCHECK_PARTIAL();
4460 RRETURN(MATCH_NOMATCH);
4461 }
4462 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4463 RRETURN(MATCH_NOMATCH);
4464 eptr++;
4465 /* No need to skip more bytes - we know it's a 1-byte character */
4466 }
4467 break;
4468
4469 case OP_NOT_WHITESPACE:
4470 for (i = 1; i <= min; i++)
4471 {
4472 if (eptr >= md->end_subject)
4473 {
4474 SCHECK_PARTIAL();
4475 RRETURN(MATCH_NOMATCH);
4476 }
4477 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4478 RRETURN(MATCH_NOMATCH);
4479 eptr++;
4480 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4481 }
4482 break;
4483
4484 case OP_WHITESPACE:
4485 for (i = 1; i <= min; i++)
4486 {
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 RRETURN(MATCH_NOMATCH);
4491 }
4492 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4493 RRETURN(MATCH_NOMATCH);
4494 eptr++;
4495 /* No need to skip more bytes - we know it's a 1-byte character */
4496 }
4497 break;
4498
4499 case OP_NOT_WORDCHAR:
4500 for (i = 1; i <= min; i++)
4501 {
4502 if (eptr >= md->end_subject)
4503 {
4504 SCHECK_PARTIAL();
4505 RRETURN(MATCH_NOMATCH);
4506 }
4507 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4508 RRETURN(MATCH_NOMATCH);
4509 eptr++;
4510 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4511 }
4512 break;
4513
4514 case OP_WORDCHAR:
4515 for (i = 1; i <= min; i++)
4516 {
4517 if (eptr >= md->end_subject)
4518 {
4519 SCHECK_PARTIAL();
4520 RRETURN(MATCH_NOMATCH);
4521 }
4522 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4523 RRETURN(MATCH_NOMATCH);
4524 eptr++;
4525 /* No need to skip more bytes - we know it's a 1-byte character */
4526 }
4527 break;
4528
4529 default:
4530 RRETURN(PCRE_ERROR_INTERNAL);
4531 } /* End switch(ctype) */
4532
4533 else
4534 #endif /* SUPPORT_UTF */
4535
4536 /* Code for the non-UTF-8 case for minimum matching of operators other
4537 than OP_PROP and OP_NOTPROP. */
4538
4539 switch(ctype)
4540 {
4541 case OP_ANY:
4542 for (i = 1; i <= min; i++)
4543 {
4544 if (eptr >= md->end_subject)
4545 {
4546 SCHECK_PARTIAL();
4547 RRETURN(MATCH_NOMATCH);
4548 }
4549 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4550 if (md->partial != 0 &&
4551 eptr + 1 >= md->end_subject &&
4552 NLBLOCK->nltype == NLTYPE_FIXED &&
4553 NLBLOCK->nllen == 2 &&
4554 *eptr == NLBLOCK->nl[0])
4555 {
4556 md->hitend = TRUE;
4557 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4558 }
4559 eptr++;
4560 }
4561 break;
4562
4563 case OP_ALLANY:
4564 if (eptr > md->end_subject - min)
4565 {
4566 SCHECK_PARTIAL();
4567 RRETURN(MATCH_NOMATCH);
4568 }
4569 eptr += min;
4570 break;
4571
4572 case OP_ANYBYTE:
4573 if (eptr > md->end_subject - min)
4574 {
4575 SCHECK_PARTIAL();
4576 RRETURN(MATCH_NOMATCH);
4577 }
4578 eptr += min;
4579 break;
4580
4581 case OP_ANYNL:
4582 for (i = 1; i <= min; i++)
4583 {
4584 if (eptr >= md->end_subject)
4585 {
4586 SCHECK_PARTIAL();
4587 RRETURN(MATCH_NOMATCH);
4588 }
4589 switch(*eptr++)
4590 {
4591 default: RRETURN(MATCH_NOMATCH);
4592
4593 case 0x000d:
4594 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4595 break;
4596
4597 case 0x000a:
4598 break;
4599
4600 case 0x000b:
4601 case 0x000c:
4602 case 0x0085:
4603 #ifdef COMPILE_PCRE16
4604 case 0x2028:
4605 case 0x2029:
4606 #endif
4607 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4608 break;
4609 }
4610 }
4611 break;
4612
4613 case OP_NOT_HSPACE:
4614 for (i = 1; i <= min; i++)
4615 {
4616 if (eptr >= md->end_subject)
4617 {
4618 SCHECK_PARTIAL();
4619 RRETURN(MATCH_NOMATCH);
4620 }
4621 switch(*eptr++)
4622 {
4623 default: break;
4624 case 0x09: /* HT */
4625 case 0x20: /* SPACE */
4626 case 0xa0: /* NBSP */
4627 #ifdef COMPILE_PCRE16
4628 case 0x1680: /* OGHAM SPACE MARK */
4629 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4630 case 0x2000: /* EN QUAD */
4631 case 0x2001: /* EM QUAD */
4632 case 0x2002: /* EN SPACE */
4633 case 0x2003: /* EM SPACE */
4634 case 0x2004: /* THREE-PER-EM SPACE */
4635 case 0x2005: /* FOUR-PER-EM SPACE */
4636 case 0x2006: /* SIX-PER-EM SPACE */
4637 case 0x2007: /* FIGURE SPACE */
4638 case 0x2008: /* PUNCTUATION SPACE */
4639 case 0x2009: /* THIN SPACE */
4640 case 0x200A: /* HAIR SPACE */
4641 case 0x202f: /* NARROW NO-BREAK SPACE */
4642 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4643 case 0x3000: /* IDEOGRAPHIC SPACE */
4644 #endif
4645 RRETURN(MATCH_NOMATCH);
4646 }
4647 }
4648 break;
4649
4650 case OP_HSPACE:
4651 for (i = 1; i <= min; i++)
4652 {
4653 if (eptr >= md->end_subject)
4654 {
4655 SCHECK_PARTIAL();
4656 RRETURN(MATCH_NOMATCH);
4657 }
4658 switch(*eptr++)
4659 {
4660 default: RRETURN(MATCH_NOMATCH);
4661 case 0x09: /* HT */
4662 case 0x20: /* SPACE */
4663 case 0xa0: /* NBSP */
4664 #ifdef COMPILE_PCRE16
4665 case 0x1680: /* OGHAM SPACE MARK */
4666 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4667 case 0x2000: /* EN QUAD */
4668 case 0x2001: /* EM QUAD */
4669 case 0x2002: /* EN SPACE */
4670 case 0x2003: /* EM SPACE */
4671 case 0x2004: /* THREE-PER-EM SPACE */
4672 case 0x2005: /* FOUR-PER-EM SPACE */
4673 case 0x2006: /* SIX-PER-EM SPACE */
4674 case 0x2007: /* FIGURE SPACE */
4675 case 0x2008: /* PUNCTUATION SPACE */
4676 case 0x2009: /* THIN SPACE */
4677 case 0x200A: /* HAIR SPACE */
4678 case 0x202f: /* NARROW NO-BREAK SPACE */
4679 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4680 case 0x3000: /* IDEOGRAPHIC SPACE */
4681 #endif
4682 break;
4683 }
4684 }
4685 break;
4686
4687 case OP_NOT_VSPACE:
4688 for (i = 1; i <= min; i++)
4689 {
4690 if (eptr >= md->end_subject)
4691 {
4692 SCHECK_PARTIAL();
4693 RRETURN(MATCH_NOMATCH);
4694 }
4695 switch(*eptr++)
4696 {
4697 default: break;
4698 case 0x0a: /* LF */
4699 case 0x0b: /* VT */
4700 case 0x0c: /* FF */
4701 case 0x0d: /* CR */
4702 case 0x85: /* NEL */
4703 #ifdef COMPILE_PCRE16
4704 case 0x2028: /* LINE SEPARATOR */
4705 case 0x2029: /* PARAGRAPH SEPARATOR */
4706 #endif
4707 RRETURN(MATCH_NOMATCH);
4708 }
4709 }
4710 break;
4711
4712 case OP_VSPACE:
4713 for (i = 1; i <= min; i++)
4714 {
4715 if (eptr >= md->end_subject)
4716 {
4717 SCHECK_PARTIAL();
4718 RRETURN(MATCH_NOMATCH);
4719 }
4720 switch(*eptr++)
4721 {
4722 default: RRETURN(MATCH_NOMATCH);
4723 case 0x0a: /* LF */
4724 case 0x0b: /* VT */
4725 case 0x0c: /* FF */
4726 case 0x0d: /* CR */
4727 case 0x85: /* NEL */
4728 #ifdef COMPILE_PCRE16
4729 case 0x2028: /* LINE SEPARATOR */
4730 case 0x2029: /* PARAGRAPH SEPARATOR */
4731 #endif
4732 break;
4733 }
4734 }
4735 break;
4736
4737 case OP_NOT_DIGIT:
4738 for (i = 1; i <= min; i++)
4739 {
4740 if (eptr >= md->end_subject)
4741 {
4742 SCHECK_PARTIAL();
4743 RRETURN(MATCH_NOMATCH);
4744 }
4745 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4746 RRETURN(MATCH_NOMATCH);
4747 eptr++;
4748 }
4749 break;
4750
4751 case OP_DIGIT:
4752 for (i = 1; i <= min; i++)
4753 {
4754 if (eptr >= md->end_subject)
4755 {
4756 SCHECK_PARTIAL();
4757 RRETURN(MATCH_NOMATCH);
4758 }
4759 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4760 RRETURN(MATCH_NOMATCH);
4761 eptr++;
4762 }
4763 break;
4764
4765 case OP_NOT_WHITESPACE:
4766 for (i = 1; i <= min; i++)
4767 {
4768 if (eptr >= md->end_subject)
4769 {
4770 SCHECK_PARTIAL();
4771 RRETURN(MATCH_NOMATCH);
4772 }
4773 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4774 RRETURN(MATCH_NOMATCH);
4775 eptr++;
4776 }
4777 break;
4778
4779 case OP_WHITESPACE:
4780 for (i = 1; i <= min; i++)
4781 {
4782 if (eptr >= md->end_subject)
4783 {
4784 SCHECK_PARTIAL();
4785 RRETURN(MATCH_NOMATCH);
4786 }
4787 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4788 RRETURN(MATCH_NOMATCH);
4789 eptr++;
4790 }
4791 break;
4792
4793 case OP_NOT_WORDCHAR:
4794 for (i = 1; i <= min; i++)
4795 {
4796 if (eptr >= md->end_subject)
4797 {
4798 SCHECK_PARTIAL();
4799 RRETURN(MATCH_NOMATCH);
4800 }
4801 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4802 RRETURN(MATCH_NOMATCH);
4803 eptr++;
4804 }
4805 break;
4806
4807 case OP_WORDCHAR:
4808 for (i = 1; i <= min; i++)
4809 {
4810 if (eptr >= md->end_subject)
4811 {
4812 SCHECK_PARTIAL();
4813 RRETURN(MATCH_NOMATCH);
4814 }
4815 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4816 RRETURN(MATCH_NOMATCH);
4817 eptr++;
4818 }
4819 break;
4820
4821 default:
4822 RRETURN(PCRE_ERROR_INTERNAL);
4823 }
4824 }
4825
4826 /* If min = max, continue at the same level without recursing */
4827
4828 if (min == max) continue;
4829
4830 /* If minimizing, we have to test the rest of the pattern before each
4831 subsequent match. Again, separate the UTF-8 case for speed, and also
4832 separate the UCP cases. */
4833
4834 if (minimize)
4835 {
4836 #ifdef SUPPORT_UCP
4837 if (prop_type >= 0)
4838 {
4839 switch(prop_type)
4840 {
4841 case PT_ANY:
4842 for (fi = min;; fi++)
4843 {
4844 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4846 if (fi >= max) RRETURN(MATCH_NOMATCH);
4847 if (eptr >= md->end_subject)
4848 {
4849 SCHECK_PARTIAL();
4850 RRETURN(MATCH_NOMATCH);
4851 }
4852 GETCHARINCTEST(c, eptr);
4853 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4854 }
4855 /* Control never gets here */
4856
4857 case PT_LAMP:
4858 for (fi = min;; fi++)
4859 {
4860 int chartype;
4861 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4863 if (fi >= max) RRETURN(MATCH_NOMATCH);
4864 if (eptr >= md->end_subject)
4865 {
4866 SCHECK_PARTIAL();
4867 RRETURN(MATCH_NOMATCH);
4868 }
4869 GETCHARINCTEST(c, eptr);
4870 chartype = UCD_CHARTYPE(c);
4871 if ((chartype == ucp_Lu ||
4872 chartype == ucp_Ll ||
4873 chartype == ucp_Lt) == prop_fail_result)
4874 RRETURN(MATCH_NOMATCH);
4875 }
4876 /* Control never gets here */
4877
4878 case PT_GC:
4879 for (fi = min;; fi++)
4880 {
4881 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4882 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4883 if (fi >= max) RRETURN(MATCH_NOMATCH);
4884 if (eptr >= md->end_subject)
4885 {
4886 SCHECK_PARTIAL();
4887 RRETURN(MATCH_NOMATCH);
4888 }
4889 GETCHARINCTEST(c, eptr);
4890 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4891 RRETURN(MATCH_NOMATCH);
4892 }
4893 /* Control never gets here */
4894
4895 case PT_PC:
4896 for (fi = min;; fi++)
4897 {
4898 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4899 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4900 if (fi >= max) RRETURN(MATCH_NOMATCH);
4901 if (eptr >= md->end_subject)
4902 {
4903 SCHECK_PARTIAL();
4904 RRETURN(MATCH_NOMATCH);
4905 }
4906 GETCHARINCTEST(c, eptr);
4907 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 /* Control never gets here */
4911
4912 case PT_SC:
4913 for (fi = min;; fi++)
4914 {
4915 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4917 if (fi >= max) RRETURN(MATCH_NOMATCH);
4918 if (eptr >= md->end_subject)
4919 {
4920 SCHECK_PARTIAL();
4921 RRETURN(MATCH_NOMATCH);
4922 }
4923 GETCHARINCTEST(c, eptr);
4924 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4925 RRETURN(MATCH_NOMATCH);
4926 }
4927 /* Control never gets here */
4928
4929 case PT_ALNUM:
4930 for (fi = min;; fi++)
4931 {
4932 int category;
4933 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4935 if (fi >= max) RRETURN(MATCH_NOMATCH);
4936 if (eptr >= md->end_subject)
4937 {
4938 SCHECK_PARTIAL();
4939 RRETURN(MATCH_NOMATCH);
4940 }
4941 GETCHARINCTEST(c, eptr);
4942 category = UCD_CATEGORY(c);
4943 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4944 RRETURN(MATCH_NOMATCH);
4945 }
4946 /* Control never gets here */
4947
4948 case PT_SPACE: /* Perl space */
4949 for (fi = min;; fi++)
4950 {
4951 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4953 if (fi >= max) RRETURN(MATCH_NOMATCH);
4954 if (eptr >= md->end_subject)
4955 {
4956 SCHECK_PARTIAL();
4957 RRETURN(MATCH_NOMATCH);
4958 }
4959 GETCHARINCTEST(c, eptr);
4960 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4961 c == CHAR_FF || c == CHAR_CR)
4962 == prop_fail_result)
4963 RRETURN(MATCH_NOMATCH);
4964 }
4965 /* Control never gets here */
4966
4967 case PT_PXSPACE: /* POSIX space */
4968 for (fi = min;; fi++)
4969 {
4970 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4972 if (fi >= max) RRETURN(MATCH_NOMATCH);
4973 if (eptr >= md->end_subject)
4974 {
4975 SCHECK_PARTIAL();
4976 RRETURN(MATCH_NOMATCH);
4977 }
4978 GETCHARINCTEST(c, eptr);
4979 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4980 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4981 == prop_fail_result)
4982 RRETURN(MATCH_NOMATCH);
4983 }
4984 /* Control never gets here */
4985
4986 case PT_WORD:
4987 for (fi = min;; fi++)
4988 {
4989 int category;
4990 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4992 if (fi >= max) RRETURN(MATCH_NOMATCH);
4993 if (eptr >= md->end_subject)
4994 {
4995 SCHECK_PARTIAL();
4996 RRETURN(MATCH_NOMATCH);
4997 }
4998 GETCHARINCTEST(c, eptr);
4999 category = UCD_CATEGORY(c);
5000 if ((category == ucp_L ||
5001 category == ucp_N ||
5002 c == CHAR_UNDERSCORE)
5003 == prop_fail_result)
5004 RRETURN(MATCH_NOMATCH);
5005 }
5006 /* Control never gets here */
5007
5008 /* This should never occur */
5009
5010 default:
5011 RRETURN(PCRE_ERROR_INTERNAL);
5012 }
5013 }
5014
5015 /* Match extended Unicode sequences. We will get here only if the
5016 support is in the binary; otherwise a compile-time error occurs. */
5017
5018 else if (ctype == OP_EXTUNI)
5019 {
5020 for (fi = min;; fi++)
5021 {
5022 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5024 if (fi >= max) RRETURN(MATCH_NOMATCH);
5025 if (eptr >= md->end_subject)
5026 {
5027 SCHECK_PARTIAL();
5028 RRETURN(MATCH_NOMATCH);
5029 }
5030 GETCHARINCTEST(c, eptr);
5031 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
5032 while (eptr < md->end_subject)
5033 {
5034 int len = 1;
5035 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5036 if (UCD_CATEGORY(c) != ucp_M) break;
5037 eptr += len;
5038 }
5039 CHECK_PARTIAL();
5040 }
5041 }
5042 else
5043 #endif /* SUPPORT_UCP */
5044
5045 #ifdef SUPPORT_UTF
5046 if (utf)
5047 {
5048 for (fi = min;; fi++)
5049 {
5050 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5052 if (fi >= max) RRETURN(MATCH_NOMATCH);
5053 if (eptr >= md->end_subject)
5054 {
5055 SCHECK_PARTIAL();
5056 RRETURN(MATCH_NOMATCH);
5057 }
5058 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5059 RRETURN(MATCH_NOMATCH);
5060 GETCHARINC(c, eptr);
5061 switch(ctype)
5062 {
5063 case OP_ANY: /* This is the non-NL case */
5064 if (md->partial != 0 && /* Take care with CRLF partial */
5065 eptr >= md->end_subject &&
5066 NLBLOCK->nltype == NLTYPE_FIXED &&
5067 NLBLOCK->nllen == 2 &&
5068 c == NLBLOCK->nl[0])
5069 {
5070 md->hitend = TRUE;
5071 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5072 }
5073 break;
5074
5075 case OP_ALLANY:
5076 case OP_ANYBYTE:
5077 break;
5078
5079 case OP_ANYNL:
5080 switch(c)
5081 {
5082 default: RRETURN(MATCH_NOMATCH);
5083 case 0x000d:
5084 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5085 break;
5086 case 0x000a:
5087 break;
5088
5089 case 0x000b:
5090 case 0x000c:
5091 case 0x0085:
5092 case 0x2028:
5093 case 0x2029:
5094 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5095 break;
5096 }
5097 break;
5098
5099 case OP_NOT_HSPACE:
5100 switch(c)
5101 {
5102 default: break;
5103 case 0x09: /* HT */
5104 case 0x20: /* SPACE */
5105 case 0xa0: /* NBSP */
5106 case 0x1680: /* OGHAM SPACE MARK */
5107 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5108 case 0x2000: /* EN QUAD */
5109 case 0x2001: /* EM QUAD */
5110 case 0x2002: /* EN SPACE */
5111 case 0x2003: /* EM SPACE */
5112 case 0x2004: /* THREE-PER-EM SPACE */
5113 case 0x2005: /* FOUR-PER-EM SPACE */
5114 case 0x2006: /* SIX-PER-EM SPACE */
5115 case 0x2007: /* FIGURE SPACE */
5116 case 0x2008: /* PUNCTUATION SPACE */
5117 case 0x2009: /* THIN SPACE */
5118 case 0x200A: /* HAIR SPACE */
5119 case 0x202f: /* NARROW NO-BREAK SPACE */
5120 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5121 case 0x3000: /* IDEOGRAPHIC SPACE */
5122 RRETURN(MATCH_NOMATCH);
5123 }
5124 break;
5125
5126 case OP_HSPACE:
5127 switch(c)
5128 {
5129 default: RRETURN(MATCH_NOMATCH);
5130 case 0x09: /* HT */
5131 case 0x20: /* SPACE */
5132 case 0xa0: /* NBSP */
5133 case 0x1680: /* OGHAM SPACE MARK */
5134 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5135 case 0x2000: /* EN QUAD */
5136 case 0x2001: /* EM QUAD */
5137 case 0x2002: /* EN SPACE */
5138 case 0x2003: /* EM SPACE */
5139 case 0x2004: /* THREE-PER-EM SPACE */
5140 case 0x2005: /* FOUR-PER-EM SPACE */
5141 case 0x2006: /* SIX-PER-EM SPACE */
5142 case 0x2007: /* FIGURE SPACE */
5143 case 0x2008: /* PUNCTUATION SPACE */
5144 case 0x2009: /* THIN SPACE */
5145 case 0x200A: /* HAIR SPACE */
5146 case 0x202f: /* NARROW NO-BREAK SPACE */
5147 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5148 case 0x3000: /* IDEOGRAPHIC SPACE */
5149 break;
5150 }
5151 break;
5152
5153 case OP_NOT_VSPACE:
5154 switch(c)
5155 {
5156 default: break;
5157 case 0x0a: /* LF */
5158 case 0x0b: /* VT */
5159 case 0x0c: /* FF */
5160 case 0x0d: /* CR */
5161 case 0x85: /* NEL */
5162 case 0x2028: /* LINE SEPARATOR */
5163 case 0x2029: /* PARAGRAPH SEPARATOR */
5164 RRETURN(MATCH_NOMATCH);
5165 }
5166 break;
5167
5168 case OP_VSPACE:
5169 switch(c)
5170 {
5171 default: RRETURN(MATCH_NOMATCH);
5172 case 0x0a: /* LF */
5173 case 0x0b: /* VT */
5174 case 0x0c: /* FF */
5175 case 0x0d: /* CR */
5176 case 0x85: /* NEL */
5177 case 0x2028: /* LINE SEPARATOR */
5178 case 0x2029: /* PARAGRAPH SEPARATOR */
5179 break;
5180 }
5181 break;
5182
5183 case OP_NOT_DIGIT:
5184 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5185 RRETURN(MATCH_NOMATCH);
5186 break;
5187
5188 case OP_DIGIT:
5189 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5190 RRETURN(MATCH_NOMATCH);
5191 break;
5192
5193 case OP_NOT_WHITESPACE:
5194 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5195 RRETURN(MATCH_NOMATCH);
5196 break;
5197
5198 case OP_WHITESPACE:
5199 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5200 RRETURN(MATCH_NOMATCH);
5201 break;
5202
5203 case OP_NOT_WORDCHAR:
5204 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5205 RRETURN(MATCH_NOMATCH);
5206 break;
5207
5208 case OP_WORDCHAR:
5209 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5210 RRETURN(MATCH_NOMATCH);
5211 break;
5212
5213 default:
5214 RRETURN(PCRE_ERROR_INTERNAL);
5215 }
5216 }
5217 }
5218 else
5219 #endif
5220 /* Not UTF mode */
5221 {
5222 for (fi = min;; fi++)
5223 {
5224 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5226 if (fi >= max) RRETURN(MATCH_NOMATCH);
5227 if (eptr >= md->end_subject)
5228 {
5229 SCHECK_PARTIAL();
5230 RRETURN(MATCH_NOMATCH);
5231 }
5232 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5233 RRETURN(MATCH_NOMATCH);
5234 c = *eptr++;
5235 switch(ctype)
5236 {
5237 case OP_ANY: /* This is the non-NL case */
5238 if (md->partial != 0 && /* Take care with CRLF partial */
5239 eptr >= md->end_subject &&
5240 NLBLOCK->nltype == NLTYPE_FIXED &&
5241 NLBLOCK->nllen == 2 &&
5242 c == NLBLOCK->nl[0])
5243 {
5244 md->hitend = TRUE;
5245 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5246 }
5247 break;
5248
5249 case OP_ALLANY:
5250 case OP_ANYBYTE:
5251 break;
5252
5253 case OP_ANYNL:
5254 switch(c)
5255 {
5256 default: RRETURN(MATCH_NOMATCH);
5257 case 0x000d:
5258 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5259 break;
5260
5261 case 0x000a:
5262 break;
5263
5264 case 0x000b:
5265 case 0x000c:
5266 case 0x0085:
5267 #ifdef COMPILE_PCRE16
5268 case 0x2028:
5269 case 0x2029:
5270 #endif
5271 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5272 break;
5273 }
5274 break;
5275
5276 case OP_NOT_HSPACE:
5277 switch(c)
5278 {
5279 default: break;
5280 case 0x09: /* HT */
5281 case 0x20: /* SPACE */
5282 case 0xa0: /* NBSP */
5283 #ifdef COMPILE_PCRE16
5284 case 0x1680: /* OGHAM SPACE MARK */
5285 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5286 case 0x2000: /* EN QUAD */
5287 case 0x2001: /* EM QUAD */
5288 case 0x2002: /* EN SPACE */
5289 case 0x2003: /* EM SPACE */
5290 case 0x2004: /* THREE-PER-EM SPACE */
5291 case 0x2005: /* FOUR-PER-EM SPACE */
5292 case 0x2006: /* SIX-PER-EM SPACE */
5293 case 0x2007: /* FIGURE SPACE */
5294 case 0x2008: /* PUNCTUATION SPACE */
5295 case 0x2009: /* THIN SPACE */
5296 case 0x200A: /* HAIR SPACE */
5297 case 0x202f: /* NARROW NO-BREAK SPACE */
5298 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5299 case 0x3000: /* IDEOGRAPHIC SPACE */
5300 #endif
5301 RRETURN(MATCH_NOMATCH);
5302 }
5303 break;
5304
5305 case OP_HSPACE:
5306 switch(c)
5307 {
5308 default: RRETURN(MATCH_NOMATCH);
5309 case 0x09: /* HT */
5310 case 0x20: /* SPACE */
5311 case 0xa0: /* NBSP */
5312 #ifdef COMPILE_PCRE16
5313 case 0x1680: /* OGHAM SPACE MARK */
5314 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5315 case 0x2000: /* EN QUAD */
5316 case 0x2001: /* EM QUAD */
5317 case 0x2002: /* EN SPACE */
5318 case 0x2003: /* EM SPACE */
5319 case 0x2004: /* THREE-PER-EM SPACE */
5320 case 0x2005: /* FOUR-PER-EM SPACE */
5321 case 0x2006: /* SIX-PER-EM SPACE */
5322 case 0x2007: /* FIGURE SPACE */
5323 case 0x2008: /* PUNCTUATION SPACE */
5324 case 0x2009: /* THIN SPACE */
5325 case 0x200A: /* HAIR SPACE */
5326 case 0x202f: /* NARROW NO-BREAK SPACE */
5327 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5328 case 0x3000: /* IDEOGRAPHIC SPACE */
5329 #endif
5330 break;
5331 }
5332 break;
5333
5334 case OP_NOT_VSPACE:
5335 switch(c)
5336 {
5337 default: break;
5338 case 0x0a: /* LF */
5339 case 0x0b: /* VT */
5340 case 0x0c: /* FF */
5341 case 0x0d: /* CR */
5342 case 0x85: /* NEL */
5343 #ifdef COMPILE_PCRE16
5344 case 0x2028: /* LINE SEPARATOR */
5345 case 0x2029: /* PARAGRAPH SEPARATOR */
5346 #endif
5347 RRETURN(MATCH_NOMATCH);
5348 }
5349 break;
5350
5351 case OP_VSPACE:
5352 switch(c)
5353 {
5354 default: RRETURN(MATCH_NOMATCH);
5355 case 0x0a: /* LF */
5356 case 0x0b: /* VT */
5357 case 0x0c: /* FF */
5358 case 0x0d: /* CR */
5359 case 0x85: /* NEL */
5360 #ifdef COMPILE_PCRE16
5361 case 0x2028: /* LINE SEPARATOR */
5362 case 0x2029: /* PARAGRAPH SEPARATOR */
5363 #endif
5364 break;
5365 }
5366 break;
5367
5368 case OP_NOT_DIGIT:
5369 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5370 break;
5371
5372 case OP_DIGIT:
5373 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5374 break;
5375
5376 case OP_NOT_WHITESPACE:
5377 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5378 break;
5379
5380 case OP_WHITESPACE:
5381 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5382 break;
5383
5384 case OP_NOT_WORDCHAR:
5385 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5386 break;
5387
5388 case OP_WORDCHAR:
5389 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5390 break;
5391
5392 default:
5393 RRETURN(PCRE_ERROR_INTERNAL);
5394 }
5395 }
5396 }
5397 /* Control never gets here */
5398 }
5399
5400 /* If maximizing, it is worth using inline code for speed, doing the type
5401 test once at the start (i.e. keep it out of the loop). Again, keep the
5402 UTF-8 and UCP stuff separate. */
5403
5404 else
5405 {
5406 pp = eptr; /* Remember where we started */
5407
5408 #ifdef SUPPORT_UCP
5409 if (prop_type >= 0)
5410 {
5411 switch(prop_type)
5412 {
5413 case PT_ANY:
5414 for (i = min; i < max; i++)
5415 {
5416 int len = 1;
5417 if (eptr >= md->end_subject)
5418 {
5419 SCHECK_PARTIAL();
5420 break;
5421 }
5422 GETCHARLENTEST(c, eptr, len);
5423 if (prop_fail_result) break;
5424 eptr+= len;
5425 }
5426 break;
5427
5428 case PT_LAMP:
5429 for (i = min; i < max; i++)
5430 {
5431 int chartype;
5432 int len = 1;
5433 if (eptr >= md->end_subject)
5434 {
5435 SCHECK_PARTIAL();
5436 break;
5437 }
5438 GETCHARLENTEST(c, eptr, len);
5439 chartype = UCD_CHARTYPE(c);
5440 if ((chartype == ucp_Lu ||
5441 chartype == ucp_Ll ||
5442 chartype == ucp_Lt) == prop_fail_result)
5443 break;
5444 eptr+= len;
5445 }
5446 break;
5447
5448 case PT_GC:
5449 for (i = min; i < max; i++)
5450 {
5451 int len = 1;
5452 if (eptr >= md->end_subject)
5453 {
5454 SCHECK_PARTIAL();
5455 break;
5456 }
5457 GETCHARLENTEST(c, eptr, len);
5458 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5459 eptr+= len;
5460 }
5461 break;
5462
5463 case PT_PC:
5464 for (i = min; i < max; i++)
5465 {
5466 int len = 1;
5467 if (eptr >= md->end_subject)
5468 {
5469 SCHECK_PARTIAL();
5470 break;
5471 }
5472 GETCHARLENTEST(c, eptr, len);
5473 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5474 eptr+= len;
5475 }
5476 break;
5477
5478 case PT_SC:
5479 for (i = min; i < max; i++)
5480 {
5481 int len = 1;
5482 if (eptr >= md->end_subject)
5483 {
5484 SCHECK_PARTIAL();
5485 break;
5486 }
5487 GETCHARLENTEST(c, eptr, len);
5488 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5489 eptr+= len;
5490 }
5491 break;
5492
5493 case PT_ALNUM:
5494 for (i = min; i < max; i++)
5495 {
5496 int category;
5497 int len = 1;
5498 if (eptr >= md->end_subject)
5499 {
5500 SCHECK_PARTIAL();
5501 break;
5502 }
5503 GETCHARLENTEST(c, eptr, len);
5504 category = UCD_CATEGORY(c);
5505 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5506 break;
5507 eptr+= len;
5508 }
5509 break;
5510
5511 case PT_SPACE: /* Perl space */
5512 for (i = min; i < max; i++)
5513 {
5514 int len = 1;
5515 if (eptr >= md->end_subject)
5516 {
5517 SCHECK_PARTIAL();
5518 break;
5519 }
5520 GETCHARLENTEST(c, eptr, len);
5521 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5522 c == CHAR_FF || c == CHAR_CR)
5523 == prop_fail_result)
5524 break;
5525 eptr+= len;
5526 }
5527 break;
5528
5529 case PT_PXSPACE: /* POSIX space */
5530 for (i = min; i < max; i++)
5531 {
5532 int len = 1;
5533 if (eptr >= md->end_subject)
5534 {
5535 SCHECK_PARTIAL();
5536 break;
5537 }
5538 GETCHARLENTEST(c, eptr, len);
5539 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5540 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5541 == prop_fail_result)
5542 break;
5543 eptr+= len;
5544 }
5545 break;
5546
5547 case PT_WORD:
5548 for (i = min; i < max; i++)
5549 {
5550 int category;
5551 int len = 1;
5552 if (eptr >= md->end_subject)
5553 {
5554 SCHECK_PARTIAL();
5555 break;
5556 }
5557 GETCHARLENTEST(c, eptr, len);
5558 category = UCD_CATEGORY(c);
5559 if ((category == ucp_L || category == ucp_N ||
5560 c == CHAR_UNDERSCORE) == prop_fail_result)
5561 break;
5562 eptr+= len;
5563 }
5564 break;
5565
5566 default:
5567 RRETURN(PCRE_ERROR_INTERNAL);
5568 }
5569
5570 /* eptr is now past the end of the maximum run */
5571
5572 if (possessive) continue;
5573 for(;;)
5574 {
5575 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5577 if (eptr-- == pp) break; /* Stop if tried at original pos */
5578 if (utf) BACKCHAR(eptr);
5579 }
5580 }
5581
5582 /* Match extended Unicode sequences. We will get here only if the
5583 support is in the binary; otherwise a compile-time error occurs. */
5584
5585 else if (ctype == OP_EXTUNI)
5586 {
5587 for (i = min; i < max; i++)
5588 {
5589 int len = 1;
5590 if (eptr >= md->end_subject)
5591 {
5592 SCHECK_PARTIAL();
5593 break;
5594 }
5595 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5596 if (UCD_CATEGORY(c) == ucp_M) break;
5597 eptr += len;
5598 while (eptr < md->end_subject)
5599 {
5600 len = 1;
5601 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5602 if (UCD_CATEGORY(c) != ucp_M) break;
5603 eptr += len;
5604 }
5605 CHECK_PARTIAL();
5606 }
5607
5608 /* eptr is now past the end of the maximum run */
5609
5610 if (possessive) continue;
5611
5612 for(;;)
5613 {
5614 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5615 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5616 if (eptr-- == pp) break; /* Stop if tried at original pos */
5617 for (;;) /* Move back over one extended */
5618 {
5619 if (!utf) c = *eptr; else
5620 {
5621 BACKCHAR(eptr);
5622 GETCHAR(c, eptr);
5623 }
5624 if (UCD_CATEGORY(c) != ucp_M) break;
5625 eptr--;
5626 }
5627 }
5628 }
5629
5630 else
5631 #endif /* SUPPORT_UCP */
5632
5633 #ifdef SUPPORT_UTF
5634 if (utf)
5635 {
5636 switch(ctype)
5637 {
5638 case OP_ANY:
5639 if (max < INT_MAX)
5640 {
5641 for (i = min; i < max; i++)
5642 {
5643 if (eptr >= md->end_subject)
5644 {
5645 SCHECK_PARTIAL();
5646 break;
5647 }
5648 if (IS_NEWLINE(eptr)) break;
5649 if (md->partial != 0 && /* Take care with CRLF partial */
5650 eptr + 1 >= md->end_subject &&
5651 NLBLOCK->nltype == NLTYPE_FIXED &&
5652 NLBLOCK->nllen == 2 &&
5653 *eptr == NLBLOCK->nl[0])
5654 {
5655 md->hitend = TRUE;
5656 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5657 }
5658 eptr++;
5659 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5660 }
5661 }
5662
5663 /* Handle unlimited UTF-8 repeat */
5664
5665 else
5666 {
5667 for (i = min; i < max; i++)
5668 {
5669 if (eptr >= md->end_subject)
5670 {
5671 SCHECK_PARTIAL();
5672 break;
5673 }
5674 if (IS_NEWLINE(eptr)) break;
5675 if (md->partial != 0 && /* Take care with CRLF partial */
5676 eptr + 1 >= md->end_subject &&
5677 NLBLOCK->nltype == NLTYPE_FIXED &&
5678 NLBLOCK->nllen == 2 &&
5679 *eptr == NLBLOCK->nl[0])
5680 {
5681 md->hitend = TRUE;
5682 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5683 }
5684 eptr++;
5685 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5686 }
5687 }
5688 break;
5689
5690 case OP_ALLANY:
5691 if (max < INT_MAX)
5692 {
5693 for (i = min; i < max; i++)
5694 {
5695 if (eptr >= md->end_subject)
5696 {
5697 SCHECK_PARTIAL();
5698 break;
5699 }
5700 eptr++;
5701 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5702 }
5703 }
5704 else
5705 {
5706 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5707 SCHECK_PARTIAL();
5708 }
5709 break;
5710
5711 /* The byte case is the same as non-UTF8 */
5712
5713 case OP_ANYBYTE:
5714 c = max - min;
5715 if (c > (unsigned int)(md->end_subject - eptr))
5716 {
5717 eptr = md->end_subject;
5718 SCHECK_PARTIAL();
5719 }
5720 else eptr += c;
5721 break;
5722
5723 case OP_ANYNL:
5724 for (i = min; i < max; i++)
5725 {
5726 int len = 1;
5727 if (eptr >= md->end_subject)
5728 {
5729 SCHECK_PARTIAL();
5730 break;
5731 }
5732 GETCHARLEN(c, eptr, len);
5733 if (c == 0x000d)
5734 {
5735 if (++eptr >= md->end_subject) break;
5736 if (*eptr == 0x000a) eptr++;
5737 }
5738 else
5739 {
5740 if (c != 0x000a &&
5741 (md->bsr_anycrlf ||
5742 (c != 0x000b && c != 0x000c &&
5743 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5744 break;
5745 eptr += len;
5746 }
5747 }
5748 break;
5749
5750 case OP_NOT_HSPACE:
5751 case OP_HSPACE:
5752 for (i = min; i < max; i++)
5753 {
5754 BOOL gotspace;
5755 int len = 1;
5756 if (eptr >= md->end_subject)
5757 {
5758 SCHECK_PARTIAL();
5759 break;
5760 }
5761 GETCHARLEN(c, eptr, len);
5762 switch(c)
5763 {
5764 default: gotspace = FALSE; break;
5765 case 0x09: /* HT */
5766 case 0x20: /* SPACE */
5767 case 0xa0: /* NBSP */
5768 case 0x1680: /* OGHAM SPACE MARK */
5769 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5770 case 0x2000: /* EN QUAD */
5771 case 0x2001: /* EM QUAD */
5772 case 0x2002: /* EN SPACE */
5773 case 0x2003: /* EM SPACE */
5774 case 0x2004: /* THREE-PER-EM SPACE */
5775 case 0x2005: /* FOUR-PER-EM SPACE */
5776 case 0x2006: /* SIX-PER-EM SPACE */
5777 case 0x2007: /* FIGURE SPACE */
5778 case 0x2008: /* PUNCTUATION SPACE */
5779 case 0x2009: /* THIN SPACE */
5780 case 0x200A: /* HAIR SPACE */
5781 case 0x202f: /* NARROW NO-BREAK SPACE */
5782 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5783 case 0x3000: /* IDEOGRAPHIC SPACE */
5784 gotspace = TRUE;
5785 break;
5786 }
5787 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5788 eptr += len;
5789 }
5790 break;
5791
5792 case OP_NOT_VSPACE:
5793 case OP_VSPACE:
5794 for (i = min; i < max; i++)
5795 {
5796 BOOL gotspace;
5797 int len = 1;
5798 if (eptr >= md->end_subject)
5799 {
5800 SCHECK_PARTIAL();
5801 break;
5802 }
5803 GETCHARLEN(c, eptr, len);
5804 switch(c)
5805 {
5806 default: gotspace = FALSE; break;
5807 case 0x0a: /* LF */
5808 case 0x0b: /* VT */
5809 case 0x0c: /* FF */
5810 case 0x0d: /* CR */
5811 case 0x85: /* NEL */
5812 case 0x2028: /* LINE SEPARATOR */
5813 case 0x2029: /* PARAGRAPH SEPARATOR */
5814 gotspace = TRUE;
5815 break;
5816 }
5817 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5818 eptr += len;
5819 }
5820 break;
5821
5822 case OP_NOT_DIGIT:
5823 for (i = min; i < max; i++)
5824 {
5825 int len = 1;
5826 if (eptr >= md->end_subject)
5827 {
5828 SCHECK_PARTIAL();
5829 break;
5830 }
5831 GETCHARLEN(c, eptr, len);
5832 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5833 eptr+= len;
5834 }
5835 break;
5836
5837 case OP_DIGIT:
5838 for (i = min; i < max; i++)
5839 {
5840 int len = 1;
5841 if (eptr >= md->end_subject)
5842 {
5843 SCHECK_PARTIAL();
5844 break;
5845 }
5846 GETCHARLEN(c, eptr, len);
5847 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5848 eptr+= len;
5849 }
5850 break;
5851
5852 case OP_NOT_WHITESPACE:
5853 for (i = min; i < max; i++)
5854 {
5855 int len = 1;
5856 if (eptr >= md->end_subject)
5857 {
5858 SCHECK_PARTIAL();
5859 break;
5860 }
5861 GETCHARLEN(c, eptr, len);
5862 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5863 eptr+= len;
5864 }
5865 break;
5866
5867 case OP_WHITESPACE:
5868 for (i = min; i < max; i++)
5869 {
5870 int len = 1;
5871 if (eptr >= md->end_subject)
5872 {
5873 SCHECK_PARTIAL();
5874 break;
5875 }
5876 GETCHARLEN(c, eptr, len);
5877 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5878 eptr+= len;
5879 }
5880 break;
5881
5882 case OP_NOT_WORDCHAR:
5883 for (i = min; i < max; i++)
5884 {
5885 int len = 1;
5886 if (eptr >= md->end_subject)
5887 {
5888 SCHECK_PARTIAL();
5889 break;
5890 }
5891 GETCHARLEN(c, eptr, len);
5892 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5893 eptr+= len;
5894 }
5895 break;
5896
5897 case OP_WORDCHAR:
5898 for (i = min; i < max; i++)
5899 {
5900 int len = 1;
5901 if (eptr >= md->end_subject)
5902 {
5903 SCHECK_PARTIAL();
5904 break;
5905 }
5906 GETCHARLEN(c, eptr, len);
5907 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5908 eptr+= len;
5909 }
5910 break;
5911
5912 default:
5913 RRETURN(PCRE_ERROR_INTERNAL);
5914 }
5915
5916 /* eptr is now past the end of the maximum run. If possessive, we are
5917 done (no backing up). Otherwise, match at this position; anything other
5918 than no match is immediately returned. For nomatch, back up one
5919 character, unless we are matching \R and the last thing matched was
5920 \r\n, in which case, back up two bytes. */
5921
5922 if (possessive) continue;
5923 for(;;)
5924 {
5925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5927 if (eptr-- == pp) break; /* Stop if tried at original pos */
5928 BACKCHAR(eptr);
5929 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5930 eptr[-1] == '\r') eptr--;
5931 }
5932 }
5933 else
5934 #endif /* SUPPORT_UTF */
5935 /* Not UTF mode */
5936 {
5937 switch(ctype)
5938 {
5939 case OP_ANY:
5940 for (i = min; i < max; i++)
5941 {
5942 if (eptr >= md->end_subject)
5943 {
5944 SCHECK_PARTIAL();
5945 break;
5946 }
5947 if (IS_NEWLINE(eptr)) break;
5948 if (md->partial != 0 && /* Take care with CRLF partial */
5949 eptr + 1 >= md->end_subject &&
5950 NLBLOCK->nltype == NLTYPE_FIXED &&
5951 NLBLOCK->nllen == 2 &&
5952 *eptr == NLBLOCK->nl[0])
5953 {
5954 md->hitend = TRUE;
5955 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5956 }
5957 eptr++;
5958 }
5959 break;
5960
5961 case OP_ALLANY:
5962 case OP_ANYBYTE:
5963 c = max - min;
5964 if (c > (unsigned int)(md->end_subject - eptr))
5965 {
5966 eptr = md->end_subject;
5967 SCHECK_PARTIAL();
5968 }
5969 else eptr += c;
5970 break;
5971
5972 case OP_ANYNL:
5973 for (i = min; i < max; i++)
5974 {
5975 if (eptr >= md->end_subject)
5976 {
5977 SCHECK_PARTIAL();
5978 break;
5979 }
5980 c = *eptr;
5981 if (c == 0x000d)
5982 {
5983 if (++eptr >= md->end_subject) break;
5984 if (*eptr == 0x000a) eptr++;
5985 }
5986 else
5987 {
5988 if (c != 0x000a && (md->bsr_anycrlf ||
5989 (c != 0x000b && c != 0x000c && c != 0x0085
5990 #ifdef COMPILE_PCRE16
5991 && c != 0x2028 && c != 0x2029
5992 #endif
5993 ))) break;
5994 eptr++;
5995 }
5996 }
5997 break;
5998
5999 case OP_NOT_HSPACE:
6000 for (i = min; i < max; i++)
6001 {
6002 if (eptr >= md->end_subject)
6003 {
6004 SCHECK_PARTIAL();
6005 break;
6006 }
6007 c = *eptr;
6008 if (c == 0x09 || c == 0x20 || c == 0xa0
6009 #ifdef COMPILE_PCRE16
6010 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
6011 || c == 0x202f || c == 0x205f || c == 0x3000
6012 #endif
6013 ) break;
6014 eptr++;
6015 }
6016 break;
6017
6018 case OP_HSPACE:
6019 for (i = min; i < max; i++)
6020 {
6021 if (eptr >= md->end_subject)
6022 {
6023 SCHECK_PARTIAL();
6024 break;
6025 }
6026 c = *eptr;
6027 if (c != 0x09 && c != 0x20 && c != 0xa0
6028 #ifdef COMPILE_PCRE16
6029 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
6030 && c != 0x202f && c != 0x205f && c != 0x3000
6031 #endif
6032 ) break;
6033 eptr++;
6034 }
6035 break;
6036
6037 case OP_NOT_VSPACE:
6038 for (i = min; i < max; i++)
6039 {
6040 if (eptr >= md->end_subject)
6041 {
6042 SCHECK_PARTIAL();
6043 break;
6044 }
6045 c = *eptr;
6046 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
6047 #ifdef COMPILE_PCRE16
6048 || c == 0x2028 || c == 0x2029
6049 #endif
6050 ) break;
6051 eptr++;
6052 }
6053 break;
6054
6055 case OP_VSPACE:
6056 for (i = min; i < max; i++)
6057 {
6058 if (eptr >= md->end_subject)
6059 {
6060 SCHECK_PARTIAL();
6061 break;
6062 }
6063 c = *eptr;
6064 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
6065 #ifdef COMPILE_PCRE16
6066 && c != 0x2028 && c != 0x2029
6067 #endif
6068 ) break;
6069 eptr++;
6070 }
6071 break;
6072
6073 case OP_NOT_DIGIT:
6074 for (i = min; i < max; i++)
6075 {
6076 if (eptr >= md->end_subject)
6077 {
6078 SCHECK_PARTIAL();
6079 break;
6080 }
6081 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6082 eptr++;
6083 }
6084 break;
6085
6086 case OP_DIGIT:
6087 for (i = min; i < max; i++)
6088 {
6089 if (eptr >= md->end_subject)
6090 {
6091 SCHECK_PARTIAL();
6092 break;
6093 }
6094 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6095 eptr++;
6096 }
6097 break;
6098
6099 case OP_NOT_WHITESPACE:
6100 for (i = min; i < max; i++)
6101 {
6102 if (eptr >= md->end_subject)
6103 {
6104 SCHECK_PARTIAL();
6105 break;
6106 }
6107 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6108 eptr++;
6109 }
6110 break;
6111
6112 case OP_WHITESPACE:
6113 for (i = min; i < max; i++)
6114 {
6115 if (eptr >= md->end_subject)
6116 {
6117 SCHECK_PARTIAL();
6118 break;
6119 }
6120 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6121 eptr++;
6122 }
6123 break;
6124
6125 case OP_NOT_WORDCHAR:
6126 for (i = min; i < max; i++)
6127 {
6128 if (eptr >= md->end_subject)
6129 {
6130 SCHECK_PARTIAL();
6131 break;
6132 }
6133 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6134 eptr++;
6135 }
6136 break;
6137
6138 case OP_WORDCHAR:
6139 for (i = min; i < max; i++)
6140 {
6141 if (eptr >= md->end_subject)
6142 {
6143 SCHECK_PARTIAL();
6144 break;
6145 }
6146 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6147 eptr++;
6148 }
6149 break;
6150
6151 default:
6152 RRETURN(PCRE_ERROR_INTERNAL);
6153 }
6154
6155 /* eptr is now past the end of the maximum run. If possessive, we are
6156 done (no backing up). Otherwise, match at this position; anything other
6157 than no match is immediately returned. For nomatch, back up one
6158 character (byte), unless we are matching \R and the last thing matched
6159 was \r\n, in which case, back up two bytes. */
6160
6161 if (possessive) continue;
6162 while (eptr >= pp)
6163 {
6164 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6165 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6166 eptr--;
6167 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6168 eptr[-1] == '\r') eptr--;
6169 }
6170 }
6171
6172 /* Get here if we can't make it match with any permitted repetitions */
6173
6174 RRETURN(MATCH_NOMATCH);
6175 }
6176 /* Control never gets here */
6177
6178 /* There's been some horrible disaster. Arrival here can only mean there is
6179 something seriously wrong in the code above or the OP_xxx definitions. */
6180
6181 default:
6182 DPRINTF(("Unknown opcode %d\n", *ecode));
6183 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6184 }
6185
6186 /* Do not stick any code in here without much thought; it is assumed
6187 that "continue" in the code above comes out to here to repeat the main
6188 loop. */
6189
6190 } /* End of main loop */
6191 /* Control never reaches here */
6192
6193
6194 /* When compiling to use the heap rather than the stack for recursive calls to
6195 match(), the RRETURN() macro jumps here. The number that is saved in
6196 frame->Xwhere indicates which label we actually want to return to. */
6197
6198 #ifdef NO_RECURSE
6199 #define LBL(val) case val: goto L_RM##val;
6200 HEAP_RETURN:
6201 switch (frame->Xwhere)
6202 {
6203 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6204 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6205 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6206 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6207 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6208 LBL(65) LBL(66)
6209 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6210 LBL(21)
6211 #endif
6212 #ifdef SUPPORT_UTF
6213 LBL(16) LBL(18) LBL(20)
6214 LBL(22) LBL(23) LBL(28) LBL(30)
6215 LBL(32) LBL(34) LBL(42) LBL(46)
6216 #ifdef SUPPORT_UCP
6217 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6218 LBL(59) LBL(60) LBL(61) LBL(62)
6219 #endif /* SUPPORT_UCP */
6220 #endif /* SUPPORT_UTF */
6221 default:
6222 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6223
6224 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6225
6226 return PCRE_ERROR_INTERNAL;
6227 }
6228 #undef LBL
6229 #endif /* NO_RECURSE */
6230 }
6231
6232
6233 /***************************************************************************
6234 ****************************************************************************
6235 RECURSION IN THE match() FUNCTION
6236
6237 Undefine all the macros that were defined above to handle this. */
6238
/* The names removed inside this conditional section are only undefined when
NO_RECURSE is set. NOTE(review): presumably each one was #defined earlier in
the file for use inside match(); those definitions are not visible from here,
so confirm against the top of match(). */
6239 #ifdef NO_RECURSE
6240 #undef eptr
6241 #undef ecode
6242 #undef mstart
6243 #undef offset_top
6244 #undef eptrb
6245 #undef flags
6246
6247 #undef callpat
6248 #undef charptr
6249 #undef data
6250 #undef next
6251 #undef pp
6252 #undef prev
6253 #undef saved_eptr
6254
6255 #undef new_recursive
6256
6257 #undef cur_is_word
6258 #undef condition
6259 #undef prev_is_word
6260
6261 #undef ctype
6262 #undef length
6263 #undef max
6264 #undef min
6265 #undef number
6266 #undef offset
6267 #undef op
6268 #undef save_capture_last
6269 #undef save_offset1
6270 #undef save_offset2
6271 #undef save_offset3
6272 #undef stacksave
6273
6274 #undef newptrb
6275
6276 #endif /* NO_RECURSE */
6277
6278 /* These two are defined as macros in both cases (that is, whether or not
NO_RECURSE is set), so they are undefined outside the conditional section. */
6279
6280 #undef fc
6281 #undef fi
6282
6283 /***************************************************************************
6284 ***************************************************************************/
6285
6286
6287 #ifdef NO_RECURSE
6288 /*************************************************
6289 * Release allocated heap frames *
6290 *************************************************/
6291
6292 /* This function releases all the allocated frames. The base frame is on the
6293 machine stack, and so must not be freed.
6294
6295 Argument: the address of the base frame
6296 Returns: nothing
6297 */
6298
6299 static void
6300 release_match_heapframes (heapframe *frame_base)
6301 {
6302 heapframe *nextframe = frame_base->Xnextframe;
6303 while (nextframe != NULL)
6304 {
6305 heapframe *oldframe = nextframe;
6306 nextframe = nextframe->Xnextframe;
6307 (PUBL(stack_free))(oldframe);
6308 }
6309 }
6310 #endif
6311
6312
6313 /*************************************************
6314 * Execute a Regular Expression *
6315 *************************************************/
6316
6317 /* This function applies a compiled re to a subject string and picks out
6318 portions of the string if it matches. Two elements in the vector are set for
6319 each substring: the offsets to the start and end of the substring.
6320
6321 Arguments:
6322 argument_re points to the compiled expression
6323 extra_data points to extra data or is NULL
6324 subject points to the subject string
6325 length length of subject string (may contain binary zeros)
6326 start_offset where to start in the subject string
6327 options option bits
6328 offsets points to a vector of ints to be filled in with offsets
6329 offsetcount the number of elements in the vector
6330
6331 Returns: > 0 => success; value is the number of elements filled in
6332 = 0 => success, but offsets is not big enough
6333 -1 => failed to match
6334 < -1 => some kind of unexpected problem
6335 */
6336
6337 #ifdef COMPILE_PCRE8
6338 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6339 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6340 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6341 int offsetcount)
6342 #else
6343 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6344 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6345 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6346 int offsetcount)
6347 #endif
6348 {
6349 int rc, ocount, arg_offset_max;
6350 int newline;
6351 BOOL using_temporary_offsets = FALSE;
6352 BOOL anchored;
6353 BOOL startline;
6354 BOOL firstline;
6355 BOOL utf;
6356 BOOL has_first_char = FALSE;
6357 BOOL has_req_char = FALSE;
6358 pcre_uchar first_char = 0;
6359 pcre_uchar first_char2 = 0;
6360 pcre_uchar req_char = 0;
6361 pcre_uchar req_char2 = 0;
6362 match_data match_block;
6363 match_data *md = &match_block;
6364 const pcre_uint8 *tables;
6365 const pcre_uint8 *start_bits = NULL;
6366 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6367 PCRE_PUCHAR end_subject;
6368 PCRE_PUCHAR start_partial = NULL;
6369 PCRE_PUCHAR req_char_ptr = start_match - 1;
6370
6371 const pcre_study_data *study;
6372 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6373
6374 #ifdef NO_RECURSE
6375 heapframe frame_zero;
6376 frame_zero.Xprevframe = NULL; /* Marks the top level */
6377 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6378 md->match_frames_base = &frame_zero;
6379 #endif
6380
6381 /* Check for the special magic call that measures the size of the stack used
6382 per recursive call of match(). Without the funny casting for sizeof, a Windows
6383 compiler gave this error: "unary minus operator applied to unsigned type,
6384 result still unsigned". Hopefully the cast fixes that. */
6385
6386 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6387 start_offset == -999)
6388 #ifdef NO_RECURSE
6389 return -((int)sizeof(heapframe));
6390 #else
6391 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6392 #endif
6393
6394 /* Plausibility checks */
6395
6396 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6397 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6398 return PCRE_ERROR_NULL;
6399 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6400 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6401
6402 /* Check that the first field in the block is the magic number. If it is not,
6403 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6404 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6405 means that the pattern is likely compiled with different endianness. */
6406
6407 if (re->magic_number != MAGIC_NUMBER)
6408 return re->magic_number == REVERSED_MAGIC_NUMBER?
6409 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6410 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6411
6412 /* These two settings are used in the code for checking a UTF-8 string that
6413 follows immediately afterwards. Other values in the md block are used only
6414 during "normal" pcre_exec() processing, not when the JIT support is in use,
6415 so they are set up later. */
6416
6417 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6418 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6419 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6420 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6421
6422 /* Check a UTF-8 string if required. Pass back the character offset and error
6423 code for an invalid string if a results vector is available. */
6424
6425 #ifdef SUPPORT_UTF
6426 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6427 {
6428 int erroroffset;
6429 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6430 if (errorcode != 0)
6431 {
6432 if (offsetcount >= 2)
6433 {
6434 offsets[0] = erroroffset;
6435 offsets[1] = errorcode;
6436 }
6437 #ifdef COMPILE_PCRE16
6438 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6439 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6440 #else
6441 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6442 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6443 #endif
6444 }
6445
6446 /* Check that a start_offset points to the start of a UTF character. */
6447 if (start_offset > 0 && start_offset < length &&
6448 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6449 return PCRE_ERROR_BADUTF8_OFFSET;
6450 }
6451 #endif
6452
6453 /* If the pattern was successfully studied with JIT support, run the JIT
6454 executable instead of the rest of this function. Most options must be set at
6455 compile time for the JIT code to be usable. Fall back to the normal code path if
6456 an unsupported flag is set. */
6457
6458 #ifdef SUPPORT_JIT
6459 if (extra_data != NULL
6460 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6461 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6462 && extra_data->executable_jit != NULL
6463 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6464 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6465 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6466 {
6467 rc = PRIV(jit_exec)(re, extra_data->executable_jit,
6468 (const pcre_uchar *)subject, length, start_offset, options,
6469 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6470 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount,
6471 ((extra_data->flags & PCRE_EXTRA_MARK) != 0) ? extra_data->mark : NULL);
6472
6473 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6474 mode is not compiled. In this case we simply fall back to the interpreter. */
6475
6476 if (rc != PCRE_ERROR_NULL) return rc;
6477 }
6478 #endif
6479
6480 /* Carry on with non-JIT matching. This information is for finding all the
6481 numbers associated with a given name, for condition testing. */
6482
6483 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6484 md->name_count = re->name_count;
6485 md->name_entry_size = re->name_entry_size;
6486
6487 /* Fish out the optional data from the extra_data structure, first setting
6488 the default values. */
6489
6490 study = NULL;
6491 md->match_limit = MATCH_LIMIT;
6492 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6493 md->callout_data = NULL;
6494
6495 /* The table pointer is always in native byte order. */
6496
6497 tables = re->tables;
6498
6499 if (extra_data != NULL)
6500 {
6501 register unsigned int flags = extra_data->flags;
6502 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6503 study = (const pcre_study_data *)extra_data->study_data;
6504 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6505 md->match_limit = extra_data->match_limit;
6506 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6507 md->match_limit_recursion = extra_data->match_limit_recursion;
6508 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6509 md->callout_data = extra_data->callout_data;
6510 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6511 }
6512
6513 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6514 is a feature that makes it possible to save compiled regex and re-use them
6515 in other programs later. */
6516
6517 if (tables == NULL) tables = PRIV(default_tables);
6518
6519 /* Set up other data */
6520
6521 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6522 startline = (re->flags & PCRE_STARTLINE) != 0;
6523 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6524
6525 /* The code starts after the real_pcre block and the capture name table. */
6526
6527 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6528 re->name_count * re->name_entry_size;
6529
6530 md->start_subject = (PCRE_PUCHAR)subject;
6531 md->start_offset = start_offset;
6532 md->end_subject = md->start_subject + length;
6533 end_subject = md->end_subject;
6534
6535 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6536 md->use_ucp = (re->options & PCRE_UCP) != 0;
6537 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6538 md->ignore_skip_arg = FALSE;
6539
6540 /* Some options are unpacked into BOOL variables in the hope that testing
6541 them will be faster than individual option bits. */
6542
6543 md->notbol = (options & PCRE_NOTBOL) != 0;
6544 md->noteol = (options & PCRE_NOTEOL) != 0;
6545 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6546 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6547
6548 md->hitend = FALSE;
6549 md->mark = md->nomatch_mark = NULL; /* In case never set */
6550
6551 md->recursive = NULL; /* No recursion at top level */
6552 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6553
6554 md->lcc = tables + lcc_offset;
6555 md->fcc = tables + fcc_offset;
6556 md->ctypes = tables + ctypes_offset;
6557
6558 /* Handle different \R options. */
6559
6560 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6561 {
6562 case 0:
6563 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6564 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6565 else
6566 #ifdef BSR_ANYCRLF
6567 md->bsr_anycrlf = TRUE;
6568 #else
6569 md->bsr_anycrlf = FALSE;
6570 #endif
6571 break;
6572
6573 case PCRE_BSR_ANYCRLF:
6574 md->bsr_anycrlf = TRUE;
6575 break;
6576
6577 case PCRE_BSR_UNICODE:
6578 md->bsr_anycrlf = FALSE;
6579 break;
6580
6581 default: return PCRE_ERROR_BADNEWLINE;
6582 }
6583
6584 /* Handle different types of newline. The three bits give eight cases. If
6585 nothing is set at run time, whatever was used at compile time applies. */
6586
6587 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6588 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6589 {
6590 case 0: newline = NEWLINE; break; /* Compile-time default */
6591 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6592 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6593 case PCRE_NEWLINE_CR+
6594 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6595 case PCRE_NEWLINE_ANY: newline = -1; break;
6596 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6597 default: return PCRE_ERROR_BADNEWLINE;
6598 }
6599
6600 if (newline == -2)
6601 {
6602 md->nltype = NLTYPE_ANYCRLF;
6603 }
6604 else if (newline < 0)
6605 {
6606 md->nltype = NLTYPE_ANY;
6607 }
6608 else
6609 {
6610 md->nltype = NLTYPE_FIXED;
6611 if (newline > 255)
6612 {
6613 md->nllen = 2;
6614 md->nl[0] = (newline >> 8) & 255;
6615 md->nl[1] = newline & 255;
6616 }
6617 else
6618 {
6619 md->nllen = 1;
6620 md->nl[0] = newline;
6621 }
6622 }
6623
6624 /* Partial matching was originally supported only for a restricted set of
6625 regexes; from release 8.00 there are no restrictions, but the bits are still
6626 defined (though never set). So there's no harm in leaving this code. */
6627
6628 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6629 return PCRE_ERROR_BADPARTIAL;
6630
6631 /* If the expression has got more back references than the offsets supplied can
6632 hold, we get a temporary chunk of working store to use during the matching.
6633 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6634 of 3. */
6635
6636 ocount = offsetcount - (offsetcount % 3);
6637 arg_offset_max = (2*ocount)/3;
6638
6639 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6640 {
6641 ocount = re->top_backref * 3 + 3;
6642 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6643 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6644 using_temporary_offsets = TRUE;
6645 DPRINTF(("Got memory to hold back references\n"));
6646 }
6647 else md->offset_vector = offsets;
6648
6649 md->offset_end = ocount;
6650 md->offset_max = (2*ocount)/3;
6651 md->offset_overflow = FALSE;
6652 md->capture_last = -1;
6653
6654 /* Reset the working variable associated with each extraction. These should
6655 never be used unless previously set, but they get saved and restored, and so we
6656 initialize them to avoid reading uninitialized locations. Also, unset the
6657 offsets for the matched string. This is really just for tidiness with callouts,
6658 in case they inspect these fields. */
6659
6660 if (md->offset_vector != NULL)
6661 {
6662 register int *iptr = md->offset_vector + ocount;
6663 register int *iend = iptr - re->top_bracket;
6664 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6665 while (--iptr >= iend) *iptr = -1;
6666 md->offset_vector[0] = md->offset_vector[1] = -1;
6667 }
6668
6669 /* Set up the first character to match, if available. The first_char value is
6670 never set for an anchored regular expression, but the anchoring may be forced
6671 at run time, so we have to test for anchoring. The first char may be unset for
6672 an unanchored pattern, of course. If there's no first char and the pattern was
6673 studied, there may be a bitmap of possible first characters. */
6674
6675 if (!anchored)
6676 {
6677 if ((re->flags & PCRE_FIRSTSET) != 0)
6678 {
6679 has_first_char = TRUE;
6680 first_char = first_char2 = (pcre_uchar)(re->first_char);
6681 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6682 {
6683 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6684 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6685 if (utf && first_char > 127)
6686 first_char2 = UCD_OTHERCASE(first_char);
6687 #endif
6688 }
6689 }
6690 else
6691 if (!startline && study != NULL &&
6692 (study->flags & PCRE_STUDY_MAPPED) != 0)
6693 start_bits = study->start_bits;
6694 }
6695
6696 /* For anchored or unanchored matches, there may be a "last known required
6697 character" set. */
6698
6699 if ((re->flags & PCRE_REQCHSET) != 0)
6700 {
6701 has_req_char = TRUE;
6702 req_char = req_char2 = (pcre_uchar)(re->req_char);
6703 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6704 {
6705 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6706 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6707 if (utf && req_char > 127)
6708 req_char2 = UCD_OTHERCASE(req_char);
6709 #endif
6710 }
6711 }
6712
6713
6714 /* ==========================================================================*/
6715
6716 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6717 the loop runs just once. */
6718
6719 for(;;)
6720 {
6721 PCRE_PUCHAR save_end_subject = end_subject;
6722 PCRE_PUCHAR new_start_match;
6723
6724 /* If firstline is TRUE, the start of the match is constrained to the first
6725 line of a multiline string. That is, the match must be before or at the first
6726 newline. Implement this by temporarily adjusting end_subject so that we stop
6727 scanning at a newline. If the match fails at the newline, later code breaks
6728 this loop. */
6729
6730 if (firstline)
6731 {
6732 PCRE_PUCHAR t = start_match;
6733 #ifdef SUPPORT_UTF
6734 if (utf)
6735 {
6736 while (t < md->end_subject && !IS_NEWLINE(t))
6737 {
6738 t++;
6739 ACROSSCHAR(t < end_subject, *t, t++);
6740 }
6741 }
6742 else
6743 #endif
6744