/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 893 - (show annotations)
Thu Jan 19 17:15:11 2012 UTC (7 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 212968 byte(s)
Error occurred while calculating annotation data.
Experimental stack size determination.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
/* Parallel tables giving the lower and upper repeat bounds for the six
"common" repeat forms. Both are indexed by the same repeat-form number. In
rep_max a value of 0 does not mean "zero times": it stands for "no upper
limit". Read together the (min,max) pairs are (0,inf) (0,inf) (1,inf) (1,inf)
(0,1) (0,1) -- presumably the greedy/lazy pairs for *, + and ?; confirm against
the opcode table before relying on that ordering. */

static const char rep_min[] = {
  0, 0,         /* entries 0-1: may match zero times */
  1, 1,         /* entries 2-3: must match at least once */
  0, 0          /* entries 4-5: may match zero times */
};

static const char rep_max[] = {
  0, 0, 0, 0,   /* entries 0-3: unlimited */
  1, 1          /* entries 4-5: at most once */
};
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
267 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268 below must be updated in sync. */
269
/* Return-point identifiers, one per RMATCH call site. Numbering starts at 1
and is consecutive, so RMn has the value n. The heap-based RMATCH stores the
identifier in the calling frame and pastes it into a label name (L_RMn). */

enum {
  RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61,  RM62, RM63, RM64, RM65, RM66 };
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
283 #define REGISTER register
284
#ifdef PCRE_DEBUG

/* Debugging flavour: a genuine recursive call to match(), one level deeper,
bracketed by trace output naming the source line. The result is left in the
caller's rrc variable. mstart and rdepth are picked up from the enclosing
scope; the rw argument is accepted but not used. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging flavour of the return: trace the value, then return it. Note that
the argument is expanded twice, so it should be free of side effects. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#else

/* Production flavour: a plain recursive call and a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra

#endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes. */
309
310 #define REGISTER
311
/* Heap flavour of the "recursive call". A new frame is obtained through the
stack_malloc hook; if that fails the match is abandoned with
PCRE_ERROR_NOMEMORY. The return point rw is recorded in the *current* frame,
the argument values are copied into the new frame (they are evaluated while
"frame" still refers to the caller, which is why the switch to the new frame
comes last), and control jumps to HEAP_RECURSE. The label L_<rw> generated at
the end is where execution resumes once the callee has returned. The rd
argument is accepted but not used. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
330
/* Heap flavour of "return". The current frame is popped and released through
the stack_free hook, except for frame_zero, which is not heap memory. If a
caller frame remains, the result is placed in rrc and control goes to
HEAP_RETURN; otherwise this really is the outermost level and the function
returns. NOTE: ra is expanded only after "frame" has been moved back to the
caller, so it must not name a variable that lives in the frame being left. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
343
344
345 /* Structure for remembering the local variables in a private frame */
346
347 typedef struct heapframe {
348 struct heapframe *Xprevframe;
349
350 /* Function arguments that may change */
351
352 PCRE_PUCHAR Xeptr;
353 const pcre_uchar *Xecode;
354 PCRE_PUCHAR Xmstart;
355 int Xoffset_top;
356 eptrblock *Xeptrb;
357 unsigned int Xrdepth;
358
359 /* Function local variables */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata;
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
373 BOOL Xcur_is_word;
374 BOOL Xcondition;
375 BOOL Xprev_is_word;
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink;
386 int Xctype;
387 unsigned int Xfc;
388 int Xfi;
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber;
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX];
398
399 eptrblock Xnewptrb;
400
401 /* Where to jump back to */
402
403 int Xwhere;
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
423 /* These macros pack up tests that are used for partial matching, and which
424 appear several times in the code. We set the "hit end" flag if the pointer is
425 at the end of the subject and also past the start of the subject (i.e.
426 something has been matched). For hard partial matching, we then return
427 immediately. The second one is used when we already know we are past the end of
428 the subject. */
429
/* Partial-match test for use where the subject pointer may or may not be at
the end of the subject. When partial matching is enabled, the end has been
reached, and the pointer is beyond md->start_used_ptr, the "hit end" flag is
set; if md->partial is greater than 1 the function then returns
PCRE_ERROR_PARTIAL at once. The expansion is a bare "if" statement, so do not
follow a use of this macro with "else". */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* As CHECK_PARTIAL, but without the end-of-subject comparison, for places
where the caller has already established that the end has been reached. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
489 frame on the stack here; subsequent instantiations are obtained from the heap
490 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
491 the top-level on the stack rather than malloc-ing them all gives a performance
492 boost in many cases where there is not much "recursion". */
493
494 #ifdef NO_RECURSE
495 heapframe frame_zero;
496 heapframe *frame = &frame_zero;
497 frame->Xprevframe = NULL; /* Marks the top level */
498
499 /* Copy in the original argument variables */
500
501 frame->Xeptr = eptr;
502 frame->Xecode = ecode;
503 frame->Xmstart = mstart;
504 frame->Xoffset_top = offset_top;
505 frame->Xeptrb = eptrb;
506 frame->Xrdepth = rdepth;
507
508 /* This is where control jumps back to to effect "recursion" */
509
510 HEAP_RECURSE:
511
512 /* Macros make the argument variables come from the current frame */
513
514 #define eptr frame->Xeptr
515 #define ecode frame->Xecode
516 #define mstart frame->Xmstart
517 #define offset_top frame->Xoffset_top
518 #define eptrb frame->Xeptrb
519 #define rdepth frame->Xrdepth
520
521 /* Ditto for the local variables */
522
523 #ifdef SUPPORT_UTF
524 #define charptr frame->Xcharptr
525 #endif
526 #define callpat frame->Xcallpat
527 #define codelink frame->Xcodelink
528 #define data frame->Xdata
529 #define next frame->Xnext
530 #define pp frame->Xpp
531 #define prev frame->Xprev
532 #define saved_eptr frame->Xsaved_eptr
533
534 #define new_recursive frame->Xnew_recursive
535
536 #define cur_is_word frame->Xcur_is_word
537 #define condition frame->Xcondition
538 #define prev_is_word frame->Xprev_is_word
539
540 #ifdef SUPPORT_UCP
541 #define prop_type frame->Xprop_type
542 #define prop_value frame->Xprop_value
543 #define prop_fail_result frame->Xprop_fail_result
544 #define oclength frame->Xoclength
545 #define occhars frame->Xocchars
546 #endif
547
548 #define ctype frame->Xctype
549 #define fc frame->Xfc
550 #define fi frame->Xfi
551 #define length frame->Xlength
552 #define max frame->Xmax
553 #define min frame->Xmin
554 #define number frame->Xnumber
555 #define offset frame->Xoffset
556 #define op frame->Xop
557 #define save_capture_last frame->Xsave_capture_last
558 #define save_offset1 frame->Xsave_offset1
559 #define save_offset2 frame->Xsave_offset2
560 #define save_offset3 frame->Xsave_offset3
561 #define stacksave frame->Xstacksave
562
563 #define newptrb frame->Xnewptrb
564
565 /* When recursion is being used, local variables are allocated on the stack and
566 get preserved during recursion in the normal way. In this environment, fi and
567 i, and fc and c, can be the same variables. */
568
569 #else /* NO_RECURSE not defined */
570 #define fi i
571 #define fc c
572
573 /* Many of the following variables are used only in small blocks of the code.
574 My normal style of coding would have declared them within each of those blocks.
575 However, in order to accommodate the version of this code that uses an external
576 "stack" implemented on the heap, it is easier to declare them all here, so the
577 declarations can be cut out in a block. The only declarations within blocks
578 below are for variables that do not have to be preserved over a recursive call
579 to RMATCH(). */
580
581 #ifdef SUPPORT_UTF
582 const pcre_uchar *charptr;
583 #endif
584 const pcre_uchar *callpat;
585 const pcre_uchar *data;
586 const pcre_uchar *next;
587 PCRE_PUCHAR pp;
588 const pcre_uchar *prev;
589 PCRE_PUCHAR saved_eptr;
590
591 recursion_info new_recursive;
592
593 BOOL cur_is_word;
594 BOOL condition;
595 BOOL prev_is_word;
596
597 #ifdef SUPPORT_UCP
598 int prop_type;
599 int prop_value;
600 int prop_fail_result;
601 int oclength;
602 pcre_uchar occhars[6];
603 #endif
604
605 int codelink;
606 int ctype;
607 int length;
608 int max;
609 int min;
610 int number;
611 int offset;
612 int op;
613 int save_capture_last;
614 int save_offset1, save_offset2, save_offset3;
615 int stacksave[REC_STACK_SAVE_MAX];
616
617 eptrblock newptrb;
618
619 /* There is a special fudge for calling match() in a way that causes it to
620 measure the size of its basic stack frame when the stack is being used for
621 recursion. The first argument (eptr) points to a pointer that is used
622 "statically" for doing the calculation. The second argument (ecode) being NULL
623 triggers this behaviour. It cannot normally every be NULL. The return is the
624 negated value of the frame size. */
625
626 if (ecode == NULL)
627 {
628 char **aptr = (char **)eptr;
629 if (rdepth == 0)
630 {
631 *aptr = (char *)&rdepth;
632 return match(eptr, NULL, NULL, 0, NULL, NULL, 1);
633 }
634 else
635 {
636 int len = (char *)&rdepth - *aptr;
637 return (len > 0)? -len : len;
638 }
639 }
640 #endif /* NO_RECURSE */
641
642 /* To save space on the stack and in the heap frame, I have doubled up on some
643 of the local variables that are used only in localised parts of the code, but
644 still need to be preserved over recursive calls of match(). These macros define
645 the alternative names that are used. */
646
647 #define allow_zero cur_is_word
648 #define cbegroup condition
649 #define code_offset codelink
650 #define condassert condition
651 #define matched_once prev_is_word
652 #define foc number
653 #define save_mark data
654
655 /* These statements are here to stop the compiler complaining about unitialized
656 variables. */
657
658 #ifdef SUPPORT_UCP
659 prop_value = 0;
660 prop_fail_result = 0;
661 #endif
662
663
664 /* This label is used for tail recursion, which is used in a few cases even
665 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
666 used. Thanks to Ian Taylor for noticing this possibility and sending the
667 original patch. */
668
669 TAIL_RECURSE:
670
671 /* OK, now we can get on with the real code of the function. Recursive calls
672 are specified by the macro RMATCH and RRETURN is used to return. When
673 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
674 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
675 defined). However, RMATCH isn't like a function call because it's quite a
676 complicated macro. It has to be used in one particular way. This shouldn't,
677 however, impact performance when true recursion is being used. */
678
679 #ifdef SUPPORT_UTF
680 utf = md->utf; /* Local copy of the flag */
681 #else
682 utf = FALSE;
683 #endif
684
685 /* First check that we haven't called match() too many times, or that we
686 haven't exceeded the recursive call limit. */
687
688 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
689 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
690
691 /* At the start of a group with an unlimited repeat that may match an empty
692 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
693 done this way to save having to use another function argument, which would take
694 up space on the stack. See also MATCH_CONDASSERT below.
695
696 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
697 such remembered pointers, to be checked when we hit the closing ket, in order
698 to break infinite loops that match no characters. When match() is called in
699 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
700 NOT be used with tail recursion, because the memory block that is used is on
701 the stack, so a new one may be required for each match(). */
702
703 if (md->match_function_type == MATCH_CBEGROUP)
704 {
705 newptrb.epb_saved_eptr = eptr;
706 newptrb.epb_prev = eptrb;
707 eptrb = &newptrb;
708 md->match_function_type = 0;
709 }
710
711 /* Now start processing the opcodes. */
712
713 for (;;)
714 {
715 minimize = possessive = FALSE;
716 op = *ecode;
717
718 switch(op)
719 {
720 case OP_MARK:
721 md->nomatch_mark = ecode + 2;
722 md->mark = NULL; /* In case previously set by assertion */
723 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
724 eptrb, RM55);
725 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
726 md->mark == NULL) md->mark = ecode + 2;
727
728 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
729 argument, and we must check whether that argument matches this MARK's
730 argument. It is passed back in md->start_match_ptr (an overloading of that
731 variable). If it does match, we reset that variable to the current subject
732 position and return MATCH_SKIP. Otherwise, pass back the return code
733 unaltered. */
734
735 else if (rrc == MATCH_SKIP_ARG &&
736 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
737 {
738 md->start_match_ptr = eptr;
739 RRETURN(MATCH_SKIP);
740 }
741 RRETURN(rrc);
742
743 case OP_FAIL:
744 RRETURN(MATCH_NOMATCH);
745
746 /* COMMIT overrides PRUNE, SKIP, and THEN */
747
748 case OP_COMMIT:
749 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
750 eptrb, RM52);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
752 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
753 rrc != MATCH_THEN)
754 RRETURN(rrc);
755 RRETURN(MATCH_COMMIT);
756
757 /* PRUNE overrides THEN */
758
759 case OP_PRUNE:
760 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
761 eptrb, RM51);
762 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
763 RRETURN(MATCH_PRUNE);
764
765 case OP_PRUNE_ARG:
766 md->nomatch_mark = ecode + 2;
767 md->mark = NULL; /* In case previously set by assertion */
768 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
769 eptrb, RM56);
770 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
771 md->mark == NULL) md->mark = ecode + 2;
772 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
773 RRETURN(MATCH_PRUNE);
774
775 /* SKIP overrides PRUNE and THEN */
776
777 case OP_SKIP:
778 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
779 eptrb, RM53);
780 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
781 RRETURN(rrc);
782 md->start_match_ptr = eptr; /* Pass back current position */
783 RRETURN(MATCH_SKIP);
784
785 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
786 nomatch_mark. There is a flag that disables this opcode when re-matching a
787 pattern that ended with a SKIP for which there was not a matching MARK. */
788
789 case OP_SKIP_ARG:
790 if (md->ignore_skip_arg)
791 {
792 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
793 break;
794 }
795 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
796 eptrb, RM57);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
798 RRETURN(rrc);
799
800 /* Pass back the current skip name by overloading md->start_match_ptr and
801 returning the special MATCH_SKIP_ARG return code. This will either be
802 caught by a matching MARK, or get to the top, where it causes a rematch
803 with the md->ignore_skip_arg flag set. */
804
805 md->start_match_ptr = ecode + 2;
806 RRETURN(MATCH_SKIP_ARG);
807
808 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
809 the branch in which it occurs can be determined. Overload the start of
810 match pointer to do this. */
811
812 case OP_THEN:
813 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
814 eptrb, RM54);
815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
816 md->start_match_ptr = ecode;
817 RRETURN(MATCH_THEN);
818
819 case OP_THEN_ARG:
820 md->nomatch_mark = ecode + 2;
821 md->mark = NULL; /* In case previously set by assertion */
822 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
823 md, eptrb, RM58);
824 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
825 md->mark == NULL) md->mark = ecode + 2;
826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
827 md->start_match_ptr = ecode;
828 RRETURN(MATCH_THEN);
829
830 /* Handle an atomic group that does not contain any capturing parentheses.
831 This can be handled like an assertion. Prior to 8.13, all atomic groups
832 were handled this way. In 8.13, the code was changed as below for ONCE, so
833 that backups pass through the group and thereby reset captured values.
834 However, this uses a lot more stack, so in 8.20, atomic groups that do not
835 contain any captures generate OP_ONCE_NC, which can be handled in the old,
836 less stack intensive way.
837
838 Check the alternative branches in turn - the matching won't pass the KET
839 for this kind of subpattern. If any one branch matches, we carry on as at
840 the end of a normal bracket, leaving the subject pointer, but resetting
841 the start-of-match value in case it was changed by \K. */
842
843 case OP_ONCE_NC:
844 prev = ecode;
845 saved_eptr = eptr;
846 save_mark = md->mark;
847 do
848 {
849 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
850 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
851 {
852 mstart = md->start_match_ptr;
853 break;
854 }
855 if (rrc == MATCH_THEN)
856 {
857 next = ecode + GET(ecode,1);
858 if (md->start_match_ptr < next &&
859 (*ecode == OP_ALT || *next == OP_ALT))
860 rrc = MATCH_NOMATCH;
861 }
862
863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
864 ecode += GET(ecode,1);
865 md->mark = save_mark;
866 }
867 while (*ecode == OP_ALT);
868
869 /* If hit the end of the group (which could be repeated), fail */
870
871 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
872
873 /* Continue as from after the group, updating the offsets high water
874 mark, since extracts may have been taken. */
875
876 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
877
878 offset_top = md->end_offset_top;
879 eptr = md->end_match_ptr;
880
881 /* For a non-repeating ket, just continue at this level. This also
882 happens for a repeating ket if no characters were matched in the group.
883 This is the forcible breaking of infinite loops as implemented in Perl
884 5.005. */
885
886 if (*ecode == OP_KET || eptr == saved_eptr)
887 {
888 ecode += 1+LINK_SIZE;
889 break;
890 }
891
892 /* The repeating kets try the rest of the pattern or restart from the
893 preceding bracket, in the appropriate order. The second "call" of match()
894 uses tail recursion, to avoid using another stack frame. */
895
896 if (*ecode == OP_KETRMIN)
897 {
898 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
899 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
900 ecode = prev;
901 goto TAIL_RECURSE;
902 }
903 else /* OP_KETRMAX */
904 {
905 md->match_function_type = MATCH_CBEGROUP;
906 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
908 ecode += 1 + LINK_SIZE;
909 goto TAIL_RECURSE;
910 }
911 /* Control never gets here */
912
913 /* Handle a capturing bracket, other than those that are possessive with an
914 unlimited repeat. If there is space in the offset vector, save the current
915 subject position in the working slot at the top of the vector. We mustn't
916 change the current values of the data slot, because they may be set from a
917 previous iteration of this group, and be referred to by a reference inside
918 the group. A failure to match might occur after the group has succeeded,
919 if something later on doesn't match. For this reason, we need to restore
920 the working value and also the values of the final offsets, in case they
921 were set by a previous iteration of the same bracket.
922
923 If there isn't enough space in the offset vector, treat this as if it were
924 a non-capturing bracket. Don't worry about setting the flag for the error
925 case here; that is handled in the code for KET. */
926
927 case OP_CBRA:
928 case OP_SCBRA:
929 number = GET2(ecode, 1+LINK_SIZE);
930 offset = number << 1;
931
932 #ifdef PCRE_DEBUG
933 printf("start bracket %d\n", number);
934 printf("subject=");
935 pchars(eptr, 16, TRUE, md);
936 printf("\n");
937 #endif
938
939 if (offset < md->offset_max)
940 {
941 save_offset1 = md->offset_vector[offset];
942 save_offset2 = md->offset_vector[offset+1];
943 save_offset3 = md->offset_vector[md->offset_end - number];
944 save_capture_last = md->capture_last;
945 save_mark = md->mark;
946
947 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
948 md->offset_vector[md->offset_end - number] =
949 (int)(eptr - md->start_subject);
950
951 for (;;)
952 {
953 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
954 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
955 eptrb, RM1);
956 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
957
958 /* If we backed up to a THEN, check whether it is within the current
959 branch by comparing the address of the THEN that is passed back with
960 the end of the branch. If it is within the current branch, and the
961 branch is one of two or more alternatives (it either starts or ends
962 with OP_ALT), we have reached the limit of THEN's action, so convert
963 the return code to NOMATCH, which will cause normal backtracking to
964 happen from now on. Otherwise, THEN is passed back to an outer
965 alternative. This implements Perl's treatment of parenthesized groups,
966 where a group not containing | does not affect the current alternative,
967 that is, (X) is NOT the same as (X|(*F)). */
968
969 if (rrc == MATCH_THEN)
970 {
971 next = ecode + GET(ecode,1);
972 if (md->start_match_ptr < next &&
973 (*ecode == OP_ALT || *next == OP_ALT))
974 rrc = MATCH_NOMATCH;
975 }
976
977 /* Anything other than NOMATCH is passed back. */
978
979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
980 md->capture_last = save_capture_last;
981 ecode += GET(ecode, 1);
982 md->mark = save_mark;
983 if (*ecode != OP_ALT) break;
984 }
985
986 DPRINTF(("bracket %d failed\n", number));
987 md->offset_vector[offset] = save_offset1;
988 md->offset_vector[offset+1] = save_offset2;
989 md->offset_vector[md->offset_end - number] = save_offset3;
990
991 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
992
993 RRETURN(rrc);
994 }
995
996 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
997 as a non-capturing bracket. */
998
999 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1000 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1001
1002 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1003
1004 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006
1007 /* Non-capturing or atomic group, except for possessive with unlimited
1008 repeat and ONCE group with no captures. Loop for all the alternatives.
1009
1010 When we get to the final alternative within the brackets, we used to return
1011 the result of a recursive call to match() whatever happened so it was
1012 possible to reduce stack usage by turning this into a tail recursion,
1013 except in the case of a possibly empty group. However, now that there is
1014 the possibility of (*THEN) occurring in the final alternative, this
1015 optimization is no longer always possible.
1016
1017 We can optimize if we know there are no (*THEN)s in the pattern; at present
1018 this is the best that can be done.
1019
1020 MATCH_ONCE is returned when the end of an atomic group is successfully
1021 reached, but subsequent matching fails. It passes back up the tree (causing
1022 captured values to be reset) until the original atomic group level is
1023 reached. This is tested by comparing md->once_target with the start of the
1024 group. At this point, the return is converted into MATCH_NOMATCH so that
1025 previous backup points can be taken. */
1026
1027 case OP_ONCE:
1028 case OP_BRA:
1029 case OP_SBRA:
1030 DPRINTF(("start non-capturing bracket\n"));
1031
1032 for (;;)
1033 {
1034 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1035
1036 /* If this is not a possibly empty group, and there are no (*THEN)s in
1037 the pattern, and this is the final alternative, optimize as described
1038 above. */
1039
1040 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1041 {
1042 ecode += PRIV(OP_lengths)[*ecode];
1043 goto TAIL_RECURSE;
1044 }
1045
1046 /* In all other cases, we have to make another call to match(). */
1047
1048 save_mark = md->mark;
1049 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1050 RM2);
1051
1052 /* See comment in the code for capturing groups above about handling
1053 THEN. */
1054
1055 if (rrc == MATCH_THEN)
1056 {
1057 next = ecode + GET(ecode,1);
1058 if (md->start_match_ptr < next &&
1059 (*ecode == OP_ALT || *next == OP_ALT))
1060 rrc = MATCH_NOMATCH;
1061 }
1062
1063 if (rrc != MATCH_NOMATCH)
1064 {
1065 if (rrc == MATCH_ONCE)
1066 {
1067 const pcre_uchar *scode = ecode;
1068 if (*scode != OP_ONCE) /* If not at start, find it */
1069 {
1070 while (*scode == OP_ALT) scode += GET(scode, 1);
1071 scode -= GET(scode, 1);
1072 }
1073 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1074 }
1075 RRETURN(rrc);
1076 }
1077 ecode += GET(ecode, 1);
1078 md->mark = save_mark;
1079 if (*ecode != OP_ALT) break;
1080 }
1081
1082 RRETURN(MATCH_NOMATCH);
1083
1084 /* Handle possessive capturing brackets with an unlimited repeat. We come
1085 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1086 handled similarly to the normal case above. However, the matching is
1087 different. The end of these brackets will always be OP_KETRPOS, which
1088 returns MATCH_KETRPOS without going further in the pattern. By this means
1089 we can handle the group by iteration rather than recursion, thereby
1090 reducing the amount of stack needed. */
1091
1092 case OP_CBRAPOS:
1093 case OP_SCBRAPOS:
1094 allow_zero = FALSE;
1095
1096 POSSESSIVE_CAPTURE:
1097 number = GET2(ecode, 1+LINK_SIZE);
1098 offset = number << 1;
1099
1100 #ifdef PCRE_DEBUG
1101 printf("start possessive bracket %d\n", number);
1102 printf("subject=");
1103 pchars(eptr, 16, TRUE, md);
1104 printf("\n");
1105 #endif
1106
1107 if (offset < md->offset_max)
1108 {
1109 matched_once = FALSE;
1110 code_offset = (int)(ecode - md->start_code);
1111
1112 save_offset1 = md->offset_vector[offset];
1113 save_offset2 = md->offset_vector[offset+1];
1114 save_offset3 = md->offset_vector[md->offset_end - number];
1115 save_capture_last = md->capture_last;
1116
1117 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1118
1119 /* Each time round the loop, save the current subject position for use
1120 when the group matches. For MATCH_MATCH, the group has matched, so we
1121 restart it with a new subject starting position, remembering that we had
1122 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1123 usual. If we haven't matched any alternatives in any iteration, check to
1124 see if a previous iteration matched. If so, the group has matched;
1125 continue from afterwards. Otherwise it has failed; restore the previous
1126 capture values before returning NOMATCH. */
1127
1128 for (;;)
1129 {
1130 md->offset_vector[md->offset_end - number] =
1131 (int)(eptr - md->start_subject);
1132 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1133 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1134 eptrb, RM63);
1135 if (rrc == MATCH_KETRPOS)
1136 {
1137 offset_top = md->end_offset_top;
1138 eptr = md->end_match_ptr;
1139 ecode = md->start_code + code_offset;
1140 save_capture_last = md->capture_last;
1141 matched_once = TRUE;
1142 continue;
1143 }
1144
1145 /* See comment in the code for capturing groups above about handling
1146 THEN. */
1147
1148 if (rrc == MATCH_THEN)
1149 {
1150 next = ecode + GET(ecode,1);
1151 if (md->start_match_ptr < next &&
1152 (*ecode == OP_ALT || *next == OP_ALT))
1153 rrc = MATCH_NOMATCH;
1154 }
1155
1156 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1157 md->capture_last = save_capture_last;
1158 ecode += GET(ecode, 1);
1159 if (*ecode != OP_ALT) break;
1160 }
1161
1162 if (!matched_once)
1163 {
1164 md->offset_vector[offset] = save_offset1;
1165 md->offset_vector[offset+1] = save_offset2;
1166 md->offset_vector[md->offset_end - number] = save_offset3;
1167 }
1168
1169 if (allow_zero || matched_once)
1170 {
1171 ecode += 1 + LINK_SIZE;
1172 break;
1173 }
1174
1175 RRETURN(MATCH_NOMATCH);
1176 }
1177
1178 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1179 as a non-capturing bracket. */
1180
1181 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1182 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1183
1184 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1185
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1188
1189 /* Non-capturing possessive bracket with unlimited repeat. We come here
1190 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1191 without the capturing complication. It is written out separately for speed
1192 and cleanliness. */
1193
1194 case OP_BRAPOS:
1195 case OP_SBRAPOS:
1196 allow_zero = FALSE;
1197
1198 POSSESSIVE_NON_CAPTURE:
1199 matched_once = FALSE;
1200 code_offset = (int)(ecode - md->start_code);
1201
1202 for (;;)
1203 {
1204 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1205 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1206 eptrb, RM48);
1207 if (rrc == MATCH_KETRPOS)
1208 {
1209 offset_top = md->end_offset_top;
1210 eptr = md->end_match_ptr;
1211 ecode = md->start_code + code_offset;
1212 matched_once = TRUE;
1213 continue;
1214 }
1215
1216 /* See comment in the code for capturing groups above about handling
1217 THEN. */
1218
1219 if (rrc == MATCH_THEN)
1220 {
1221 next = ecode + GET(ecode,1);
1222 if (md->start_match_ptr < next &&
1223 (*ecode == OP_ALT || *next == OP_ALT))
1224 rrc = MATCH_NOMATCH;
1225 }
1226
1227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228 ecode += GET(ecode, 1);
1229 if (*ecode != OP_ALT) break;
1230 }
1231
1232 if (matched_once || allow_zero)
1233 {
1234 ecode += 1 + LINK_SIZE;
1235 break;
1236 }
1237 RRETURN(MATCH_NOMATCH);
1238
1239 /* Control never reaches here. */
1240
1241 /* Conditional group: compilation checked that there are no more than
1242 two branches. If the condition is false, skipping the first branch takes us
1243 past the end if there is only one branch, but that's OK because that is
1244 exactly what going to the ket would do. */
1245
1246 case OP_COND:
1247 case OP_SCOND:
1248 codelink = GET(ecode, 1);
1249
1250 /* Because of the way auto-callout works during compile, a callout item is
1251 inserted between OP_COND and an assertion condition. */
1252
1253 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1254 {
1255 if (PUBL(callout) != NULL)
1256 {
1257 PUBL(callout_block) cb;
1258 cb.version = 2; /* Version 2 of the callout block */
1259 cb.callout_number = ecode[LINK_SIZE+2];
1260 cb.offset_vector = md->offset_vector;
1261 #ifdef COMPILE_PCRE8
1262 cb.subject = (PCRE_SPTR)md->start_subject;
1263 #else
1264 cb.subject = (PCRE_SPTR16)md->start_subject;
1265 #endif
1266 cb.subject_length = (int)(md->end_subject - md->start_subject);
1267 cb.start_match = (int)(mstart - md->start_subject);
1268 cb.current_position = (int)(eptr - md->start_subject);
1269 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1270 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1271 cb.capture_top = offset_top/2;
1272 cb.capture_last = md->capture_last;
1273 cb.callout_data = md->callout_data;
1274 cb.mark = md->nomatch_mark;
1275 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1276 if (rrc < 0) RRETURN(rrc);
1277 }
1278 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1279 }
1280
1281 condcode = ecode[LINK_SIZE+1];
1282
1283 /* Now see what the actual condition is */
1284
1285 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1286 {
1287 if (md->recursive == NULL) /* Not recursing => FALSE */
1288 {
1289 condition = FALSE;
1290 ecode += GET(ecode, 1);
1291 }
1292 else
1293 {
1294 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1295 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1296
1297 /* If the test is for recursion into a specific subpattern, and it is
1298 false, but the test was set up by name, scan the table to see if the
1299 name refers to any other numbers, and test them. The condition is true
1300 if any one is set. */
1301
1302 if (!condition && condcode == OP_NRREF)
1303 {
1304 pcre_uchar *slotA = md->name_table;
1305 for (i = 0; i < md->name_count; i++)
1306 {
1307 if (GET2(slotA, 0) == recno) break;
1308 slotA += md->name_entry_size;
1309 }
1310
1311 /* Found a name for the number - there can be only one; duplicate
1312 names for different numbers are allowed, but not vice versa. First
1313 scan down for duplicates. */
1314
1315 if (i < md->name_count)
1316 {
1317 pcre_uchar *slotB = slotA;
1318 while (slotB > md->name_table)
1319 {
1320 slotB -= md->name_entry_size;
1321 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1322 {
1323 condition = GET2(slotB, 0) == md->recursive->group_num;
1324 if (condition) break;
1325 }
1326 else break;
1327 }
1328
1329 /* Scan up for duplicates */
1330
1331 if (!condition)
1332 {
1333 slotB = slotA;
1334 for (i++; i < md->name_count; i++)
1335 {
1336 slotB += md->name_entry_size;
1337 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1338 {
1339 condition = GET2(slotB, 0) == md->recursive->group_num;
1340 if (condition) break;
1341 }
1342 else break;
1343 }
1344 }
1345 }
1346 }
1347
1348 /* Choose the branch according to the condition */
1349
1350 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1351 }
1352 }
1353
1354 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1355 {
1356 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1357 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1358
1359 /* If the numbered capture is unset, but the reference was by name,
1360 scan the table to see if the name refers to any other numbers, and test
1361 them. The condition is true if any one is set. This is tediously similar
1362 to the code above, but not close enough to try to amalgamate. */
1363
1364 if (!condition && condcode == OP_NCREF)
1365 {
1366 int refno = offset >> 1;
1367 pcre_uchar *slotA = md->name_table;
1368
1369 for (i = 0; i < md->name_count; i++)
1370 {
1371 if (GET2(slotA, 0) == refno) break;
1372 slotA += md->name_entry_size;
1373 }
1374
1375 /* Found a name for the number - there can be only one; duplicate names
1376 for different numbers are allowed, but not vice versa. First scan down
1377 for duplicates. */
1378
1379 if (i < md->name_count)
1380 {
1381 pcre_uchar *slotB = slotA;
1382 while (slotB > md->name_table)
1383 {
1384 slotB -= md->name_entry_size;
1385 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1386 {
1387 offset = GET2(slotB, 0) << 1;
1388 condition = offset < offset_top &&
1389 md->offset_vector[offset] >= 0;
1390 if (condition) break;
1391 }
1392 else break;
1393 }
1394
1395 /* Scan up for duplicates */
1396
1397 if (!condition)
1398 {
1399 slotB = slotA;
1400 for (i++; i < md->name_count; i++)
1401 {
1402 slotB += md->name_entry_size;
1403 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1404 {
1405 offset = GET2(slotB, 0) << 1;
1406 condition = offset < offset_top &&
1407 md->offset_vector[offset] >= 0;
1408 if (condition) break;
1409 }
1410 else break;
1411 }
1412 }
1413 }
1414 }
1415
1416 /* Choose the branch according to the condition */
1417
1418 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1419 }
1420
1421 else if (condcode == OP_DEF) /* DEFINE - always false */
1422 {
1423 condition = FALSE;
1424 ecode += GET(ecode, 1);
1425 }
1426
1427 /* The condition is an assertion. Call match() to evaluate it - setting
1428 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1429 an assertion. */
1430
1431 else
1432 {
1433 md->match_function_type = MATCH_CONDASSERT;
1434 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1435 if (rrc == MATCH_MATCH)
1436 {
1437 if (md->end_offset_top > offset_top)
1438 offset_top = md->end_offset_top; /* Captures may have happened */
1439 condition = TRUE;
1440 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1441 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1442 }
1443
1444 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1445 assertion; it is therefore treated as NOMATCH. */
1446
1447 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1448 {
1449 RRETURN(rrc); /* Need braces because of following else */
1450 }
1451 else
1452 {
1453 condition = FALSE;
1454 ecode += codelink;
1455 }
1456 }
1457
1458 /* We are now at the branch that is to be obeyed. As there is only one, can
1459 use tail recursion to avoid using another stack frame, except when there is
1460 unlimited repeat of a possibly empty group. In the latter case, a recursive
1461 call to match() is always required, unless the second alternative doesn't
1462 exist, in which case we can just plough on. Note that, for compatibility
1463 with Perl, the | in a conditional group is NOT treated as creating two
1464 alternatives. If a THEN is encountered in the branch, it propagates out to
1465 the enclosing alternative (unless nested in a deeper set of alternatives,
1466 of course). */
1467
1468 if (condition || *ecode == OP_ALT)
1469 {
1470 if (op != OP_SCOND)
1471 {
1472 ecode += 1 + LINK_SIZE;
1473 goto TAIL_RECURSE;
1474 }
1475
1476 md->match_function_type = MATCH_CBEGROUP;
1477 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1478 RRETURN(rrc);
1479 }
1480
1481 /* Condition false & no alternative; continue after the group. */
1482
1483 else
1484 {
1485 ecode += 1 + LINK_SIZE;
1486 }
1487 break;
1488
1489
1490 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1491 to close any currently open capturing brackets. */
1492
1493 case OP_CLOSE:
1494 number = GET2(ecode, 1);
1495 offset = number << 1;
1496
1497 #ifdef PCRE_DEBUG
1498 printf("end bracket %d at *ACCEPT", number);
1499 printf("\n");
1500 #endif
1501
1502 md->capture_last = number;
1503 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1504 {
1505 md->offset_vector[offset] =
1506 md->offset_vector[md->offset_end - number];
1507 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1508 if (offset_top <= offset) offset_top = offset + 2;
1509 }
1510 ecode += 1 + IMM2_SIZE;
1511 break;
1512
1513
1514 /* End of the pattern, either real or forced. */
1515
1516 case OP_END:
1517 case OP_ACCEPT:
1518 case OP_ASSERT_ACCEPT:
1519
1520 /* If we have matched an empty string, fail if not in an assertion and not
1521 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1522 is set and we have matched at the start of the subject. In both cases,
1523 backtracking will then try other alternatives, if any. */
1524
1525 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1526 md->recursive == NULL &&
1527 (md->notempty ||
1528 (md->notempty_atstart &&
1529 mstart == md->start_subject + md->start_offset)))
1530 RRETURN(MATCH_NOMATCH);
1531
1532 /* Otherwise, we have a match. */
1533
1534 md->end_match_ptr = eptr; /* Record where we ended */
1535 md->end_offset_top = offset_top; /* and how many extracts were taken */
1536 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1537
1538 /* For some reason, the macros don't work properly if an expression is
1539 given as the argument to RRETURN when the heap is in use. */
1540
1541 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1542 RRETURN(rrc);
1543
1544 /* Assertion brackets. Check the alternative branches in turn - the
1545 matching won't pass the KET for an assertion. If any one branch matches,
1546 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1547 start of each branch to move the current point backwards, so the code at
1548 this level is identical to the lookahead case. When the assertion is part
1549 of a condition, we want to return immediately afterwards. The caller of
1550 this incarnation of the match() function will have set MATCH_CONDASSERT in
1551 md->match_function_type, and one of these opcodes will be the first opcode
1552 that is processed. We use a local variable that is preserved over calls to
1553 match() to remember this case. */
1554
1555 case OP_ASSERT:
1556 case OP_ASSERTBACK:
1557 save_mark = md->mark;
1558 if (md->match_function_type == MATCH_CONDASSERT)
1559 {
1560 condassert = TRUE;
1561 md->match_function_type = 0;
1562 }
1563 else condassert = FALSE;
1564
1565 do
1566 {
1567 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1568 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1569 {
1570 mstart = md->start_match_ptr; /* In case \K reset it */
1571 break;
1572 }
1573
1574 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1575 as NOMATCH. */
1576
1577 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1578 ecode += GET(ecode, 1);
1579 md->mark = save_mark;
1580 }
1581 while (*ecode == OP_ALT);
1582
1583 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1584
1585 /* If checking an assertion for a condition, return MATCH_MATCH. */
1586
1587 if (condassert) RRETURN(MATCH_MATCH);
1588
1589 /* Continue from after the assertion, updating the offsets high water
1590 mark, since extracts may have been taken during the assertion. */
1591
1592 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1593 ecode += 1 + LINK_SIZE;
1594 offset_top = md->end_offset_top;
1595 continue;
1596
1597 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1598 PRUNE, or COMMIT means we must assume failure without checking subsequent
1599 branches. */
1600
1601 case OP_ASSERT_NOT:
1602 case OP_ASSERTBACK_NOT:
1603 save_mark = md->mark;
1604 if (md->match_function_type == MATCH_CONDASSERT)
1605 {
1606 condassert = TRUE;
1607 md->match_function_type = 0;
1608 }
1609 else condassert = FALSE;
1610
1611 do
1612 {
1613 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1614 md->mark = save_mark;
1615 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1616 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1617 {
1618 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1619 break;
1620 }
1621
1622 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1623 as NOMATCH. */
1624
1625 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1626 ecode += GET(ecode,1);
1627 }
1628 while (*ecode == OP_ALT);
1629
1630 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1631
1632 ecode += 1 + LINK_SIZE;
1633 continue;
1634
1635 /* Move the subject pointer back. This occurs only at the start of
1636 each branch of a lookbehind assertion. If we are too close to the start to
1637 move back, this match function fails. When working with UTF-8 we move
1638 back a number of characters, not bytes. */
1639
1640 case OP_REVERSE:
1641 #ifdef SUPPORT_UTF
1642 if (utf)
1643 {
1644 i = GET(ecode, 1);
1645 while (i-- > 0)
1646 {
1647 eptr--;
1648 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1649 BACKCHAR(eptr);
1650 }
1651 }
1652 else
1653 #endif
1654
1655 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1656
1657 {
1658 eptr -= GET(ecode, 1);
1659 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1660 }
1661
1662 /* Save the earliest consulted character, then skip to next op code */
1663
1664 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1665 ecode += 1 + LINK_SIZE;
1666 break;
1667
1668 /* The callout item calls an external function, if one is provided, passing
1669 details of the match so far. This is mainly for debugging, though the
1670 function is able to force a failure. */
1671
1672 case OP_CALLOUT:
1673 if (PUBL(callout) != NULL)
1674 {
1675 PUBL(callout_block) cb;
1676 cb.version = 2; /* Version 2 of the callout block */
1677 cb.callout_number = ecode[1];
1678 cb.offset_vector = md->offset_vector;
1679 #ifdef COMPILE_PCRE8
1680 cb.subject = (PCRE_SPTR)md->start_subject;
1681 #else
1682 cb.subject = (PCRE_SPTR16)md->start_subject;
1683 #endif
1684 cb.subject_length = (int)(md->end_subject - md->start_subject);
1685 cb.start_match = (int)(mstart - md->start_subject);
1686 cb.current_position = (int)(eptr - md->start_subject);
1687 cb.pattern_position = GET(ecode, 2);
1688 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1689 cb.capture_top = offset_top/2;
1690 cb.capture_last = md->capture_last;
1691 cb.callout_data = md->callout_data;
1692 cb.mark = md->nomatch_mark;
1693 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1694 if (rrc < 0) RRETURN(rrc);
1695 }
1696 ecode += 2 + 2*LINK_SIZE;
1697 break;
1698
1699 /* Recursion either matches the current regex, or some subexpression. The
1700 offset data is the offset to the starting bracket from the start of the
1701 whole pattern. (This is so that it works from duplicated subpatterns.)
1702
1703 The state of the capturing groups is preserved over recursion, and
1704 re-instated afterwards. We don't know how many are started and not yet
1705 finished (offset_top records the completed total) so we just have to save
1706 all the potential data. There may be up to 65535 such values, which is too
1707 large to put on the stack, but using malloc for small numbers seems
1708 expensive. As a compromise, the stack is used when there are no more than
1709 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1710
1711 There are also other values that have to be saved. We use a chained
1712 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1713 for the original version of this logic. It has, however, been hacked around
1714 a lot, so he is not to blame for the current way it works. */
1715
1716 case OP_RECURSE:
1717 {
1718 recursion_info *ri;
1719 int recno;
1720
1721 callpat = md->start_code + GET(ecode, 1);
1722 recno = (callpat == md->start_code)? 0 :
1723 GET2(callpat, 1 + LINK_SIZE);
1724
1725 /* Check for repeating a recursion without advancing the subject pointer.
1726 This should catch convoluted mutual recursions. (Some simple cases are
1727 caught at compile time.) */
1728
1729 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1730 if (recno == ri->group_num && eptr == ri->subject_position)
1731 RRETURN(PCRE_ERROR_RECURSELOOP);
1732
1733 /* Add to "recursing stack" */
1734
1735 new_recursive.group_num = recno;
1736 new_recursive.subject_position = eptr;
1737 new_recursive.prevrec = md->recursive;
1738 md->recursive = &new_recursive;
1739
1740 /* Where to continue from afterwards */
1741
1742 ecode += 1 + LINK_SIZE;
1743
1744 /* Now save the offset data */
1745
1746 new_recursive.saved_max = md->offset_end;
1747 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1748 new_recursive.offset_save = stacksave;
1749 else
1750 {
1751 new_recursive.offset_save =
1752 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1753 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1754 }
1755 memcpy(new_recursive.offset_save, md->offset_vector,
1756 new_recursive.saved_max * sizeof(int));
1757
1758 /* OK, now we can do the recursion. After processing each alternative,
1759 restore the offset data. If there were nested recursions, md->recursive
1760 might be changed, so reset it before looping. */
1761
1762 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1763 cbegroup = (*callpat >= OP_SBRA);
1764 do
1765 {
1766 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1767 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1768 md, eptrb, RM6);
1769 memcpy(md->offset_vector, new_recursive.offset_save,
1770 new_recursive.saved_max * sizeof(int));
1771 md->recursive = new_recursive.prevrec;
1772 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1773 {
1774 DPRINTF(("Recursion matched\n"));
1775 if (new_recursive.offset_save != stacksave)
1776 (PUBL(free))(new_recursive.offset_save);
1777
1778 /* Set where we got to in the subject, and reset the start in case
1779 it was changed by \K. This *is* propagated back out of a recursion,
1780 for Perl compatibility. */
1781
1782 eptr = md->end_match_ptr;
1783 mstart = md->start_match_ptr;
1784 goto RECURSION_MATCHED; /* Exit loop; end processing */
1785 }
1786
1787 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1788 as NOMATCH. */
1789
1790 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1791 {
1792 DPRINTF(("Recursion gave error %d\n", rrc));
1793 if (new_recursive.offset_save != stacksave)
1794 (PUBL(free))(new_recursive.offset_save);
1795 RRETURN(rrc);
1796 }
1797
1798 md->recursive = &new_recursive;
1799 callpat += GET(callpat, 1);
1800 }
1801 while (*callpat == OP_ALT);
1802
1803 DPRINTF(("Recursion didn't match\n"));
1804 md->recursive = new_recursive.prevrec;
1805 if (new_recursive.offset_save != stacksave)
1806 (PUBL(free))(new_recursive.offset_save);
1807 RRETURN(MATCH_NOMATCH);
1808 }
1809
1810 RECURSION_MATCHED:
1811 break;
1812
1813 /* An alternation is the end of a branch; scan along to find the end of the
1814 bracketed group and go to there. */
1815
1816 case OP_ALT:
1817 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1818 break;
1819
1820 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1821 indicating that it may occur zero times. It may repeat infinitely, or not
1822 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1823 with fixed upper repeat limits are compiled as a number of copies, with the
1824 optional ones preceded by BRAZERO or BRAMINZERO. */
1825
1826 case OP_BRAZERO:
1827 next = ecode + 1;
1828 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1830 do next += GET(next, 1); while (*next == OP_ALT);
1831 ecode = next + 1 + LINK_SIZE;
1832 break;
1833
1834 case OP_BRAMINZERO:
1835 next = ecode + 1;
1836 do next += GET(next, 1); while (*next == OP_ALT);
1837 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1838 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1839 ecode++;
1840 break;
1841
1842 case OP_SKIPZERO:
1843 next = ecode+1;
1844 do next += GET(next,1); while (*next == OP_ALT);
1845 ecode = next + 1 + LINK_SIZE;
1846 break;
1847
1848 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1849 here; just jump to the group, with allow_zero set TRUE. */
1850
1851 case OP_BRAPOSZERO:
1852 op = *(++ecode);
1853 allow_zero = TRUE;
1854 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1855 goto POSSESSIVE_NON_CAPTURE;
1856
1857 /* End of a group, repeated or non-repeating. */
1858
1859 case OP_KET:
1860 case OP_KETRMIN:
1861 case OP_KETRMAX:
1862 case OP_KETRPOS:
1863 prev = ecode - GET(ecode, 1);
1864
1865 /* If this was a group that remembered the subject start, in order to break
1866 infinite repeats of empty string matches, retrieve the subject start from
1867 the chain. Otherwise, set it NULL. */
1868
1869 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1870 {
1871 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1872 eptrb = eptrb->epb_prev; /* Backup to previous group */
1873 }
1874 else saved_eptr = NULL;
1875
1876 /* If we are at the end of an assertion group or a non-capturing atomic
1877 group, stop matching and return MATCH_MATCH, but record the current high
1878 water mark for use by positive assertions. We also need to record the match
1879 start in case it was changed by \K. */
1880
1881 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1882 *prev == OP_ONCE_NC)
1883 {
1884 md->end_match_ptr = eptr; /* For ONCE_NC */
1885 md->end_offset_top = offset_top;
1886 md->start_match_ptr = mstart;
1887 RRETURN(MATCH_MATCH); /* Sets md->mark */
1888 }
1889
1890 /* For capturing groups we have to check the group number back at the start
1891 and if necessary complete handling an extraction by setting the offsets and
1892 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1893 into group 0, so it won't be picked up here. Instead, we catch it when the
1894 OP_END is reached. Other recursion is handled here. We just have to record
1895 the current subject position and start match pointer and give a MATCH
1896 return. */
1897
1898 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1899 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1900 {
1901 number = GET2(prev, 1+LINK_SIZE);
1902 offset = number << 1;
1903
1904 #ifdef PCRE_DEBUG
1905 printf("end bracket %d", number);
1906 printf("\n");
1907 #endif
1908
1909 /* Handle a recursively called group. */
1910
1911 if (md->recursive != NULL && md->recursive->group_num == number)
1912 {
1913 md->end_match_ptr = eptr;
1914 md->start_match_ptr = mstart;
1915 RRETURN(MATCH_MATCH);
1916 }
1917
1918 /* Deal with capturing */
1919
1920 md->capture_last = number;
1921 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1922 {
1923 /* If offset is greater than offset_top, it means that we are
1924 "skipping" a capturing group, and that group's offsets must be marked
1925 unset. In earlier versions of PCRE, all the offsets were unset at the
1926 start of matching, but this doesn't work because atomic groups and
1927 assertions can cause a value to be set that should later be unset.
1928 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1929 part of the atomic group, but this is not on the final matching path,
1930 so must be unset when 2 is set. (If there is no group 2, there is no
1931 problem, because offset_top will then be 2, indicating no capture.) */
1932
1933 if (offset > offset_top)
1934 {
1935 register int *iptr = md->offset_vector + offset_top;
1936 register int *iend = md->offset_vector + offset;
1937 while (iptr < iend) *iptr++ = -1;
1938 }
1939
1940 /* Now make the extraction */
1941
1942 md->offset_vector[offset] =
1943 md->offset_vector[md->offset_end - number];
1944 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1945 if (offset_top <= offset) offset_top = offset + 2;
1946 }
1947 }
1948
1949 /* For an ordinary non-repeating ket, just continue at this level. This
1950 also happens for a repeating ket if no characters were matched in the
1951 group. This is the forcible breaking of infinite loops as implemented in
1952 Perl 5.005. For a non-repeating atomic group that includes captures,
1953 establish a backup point by processing the rest of the pattern at a lower
1954 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1955 original OP_ONCE level, thereby bypassing intermediate backup points, but
1956 resetting any captures that happened along the way. */
1957
1958 if (*ecode == OP_KET || eptr == saved_eptr)
1959 {
1960 if (*prev == OP_ONCE)
1961 {
1962 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1964 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1965 RRETURN(MATCH_ONCE);
1966 }
1967 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1968 break;
1969 }
1970
1971 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1972 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1973 at a time from the outer level, thus saving stack. */
1974
1975 if (*ecode == OP_KETRPOS)
1976 {
1977 md->end_match_ptr = eptr;
1978 md->end_offset_top = offset_top;
1979 RRETURN(MATCH_KETRPOS);
1980 }
1981
1982 /* The normal repeating kets try the rest of the pattern or restart from
1983 the preceding bracket, in the appropriate order. In the second case, we can
1984 use tail recursion to avoid using another stack frame, unless we have an
1985 atomic group or an unlimited repeat of a group that can match an empty
1986 string. */
1987
1988 if (*ecode == OP_KETRMIN)
1989 {
1990 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1992 if (*prev == OP_ONCE)
1993 {
1994 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1996 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1997 RRETURN(MATCH_ONCE);
1998 }
1999 if (*prev >= OP_SBRA) /* Could match an empty string */
2000 {
2001 md->match_function_type = MATCH_CBEGROUP;
2002 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2003 RRETURN(rrc);
2004 }
2005 ecode = prev;
2006 goto TAIL_RECURSE;
2007 }
2008 else /* OP_KETRMAX */
2009 {
2010 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2011 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2012 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2013 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2014 if (*prev == OP_ONCE)
2015 {
2016 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2017 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2018 md->once_target = prev;
2019 RRETURN(MATCH_ONCE);
2020 }
2021 ecode += 1 + LINK_SIZE;
2022 goto TAIL_RECURSE;
2023 }
2024 /* Control never gets here */
2025
2026 /* Not multiline mode: start of subject assertion, unless notbol. */
2027
2028 case OP_CIRC:
2029 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2030
2031 /* Start of subject assertion */
2032
2033 case OP_SOD:
2034 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2035 ecode++;
2036 break;
2037
2038 /* Multiline mode: start of subject unless notbol, or after any newline. */
2039
2040 case OP_CIRCM:
2041 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2042 if (eptr != md->start_subject &&
2043 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2044 RRETURN(MATCH_NOMATCH);
2045 ecode++;
2046 break;
2047
2048 /* Start of match assertion */
2049
2050 case OP_SOM:
2051 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2052 ecode++;
2053 break;
2054
2055 /* Reset the start of match point */
2056
2057 case OP_SET_SOM:
2058 mstart = eptr;
2059 ecode++;
2060 break;
2061
2062 /* Multiline mode: assert before any newline, or before end of subject
2063 unless noteol is set. */
2064
2065 case OP_DOLLM:
2066 if (eptr < md->end_subject)
2067 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2068 else
2069 {
2070 if (md->noteol) RRETURN(MATCH_NOMATCH);
2071 SCHECK_PARTIAL();
2072 }
2073 ecode++;
2074 break;
2075
2076 /* Not multiline mode: assert before a terminating newline or before end of
2077 subject unless noteol is set. */
2078
2079 case OP_DOLL:
2080 if (md->noteol) RRETURN(MATCH_NOMATCH);
2081 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2082
2083 /* ... else fall through for endonly */
2084
2085 /* End of subject assertion (\z) */
2086
2087 case OP_EOD:
2088 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2089 SCHECK_PARTIAL();
2090 ecode++;
2091 break;
2092
2093 /* End of subject or ending \n assertion (\Z) */
2094
2095 case OP_EODN:
2096 ASSERT_NL_OR_EOS:
2097 if (eptr < md->end_subject &&
2098 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2099 RRETURN(MATCH_NOMATCH);
2100
2101 /* Either at end of string or \n before end. */
2102
2103 SCHECK_PARTIAL();
2104 ecode++;
2105 break;
2106
2107 /* Word boundary assertions */
2108
2109 case OP_NOT_WORD_BOUNDARY:
2110 case OP_WORD_BOUNDARY:
2111 {
2112
2113 /* Find out if the previous and current characters are "word" characters.
2114 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2115 be "non-word" characters. Remember the earliest consulted character for
2116 partial matching. */
2117
2118 #ifdef SUPPORT_UTF
2119 if (utf)
2120 {
2121 /* Get status of previous character */
2122
2123 if (eptr == md->start_subject) prev_is_word = FALSE; else
2124 {
2125 PCRE_PUCHAR lastptr = eptr - 1;
2126 BACKCHAR(lastptr);
2127 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2128 GETCHAR(c, lastptr);
2129 #ifdef SUPPORT_UCP
2130 if (md->use_ucp)
2131 {
2132 if (c == '_') prev_is_word = TRUE; else
2133 {
2134 int cat = UCD_CATEGORY(c);
2135 prev_is_word = (cat == ucp_L || cat == ucp_N);
2136 }
2137 }
2138 else
2139 #endif
2140 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2141 }
2142
2143 /* Get status of next character */
2144
2145 if (eptr >= md->end_subject)
2146 {
2147 SCHECK_PARTIAL();
2148 cur_is_word = FALSE;
2149 }
2150 else
2151 {
2152 GETCHAR(c, eptr);
2153 #ifdef SUPPORT_UCP
2154 if (md->use_ucp)
2155 {
2156 if (c == '_') cur_is_word = TRUE; else
2157 {
2158 int cat = UCD_CATEGORY(c);
2159 cur_is_word = (cat == ucp_L || cat == ucp_N);
2160 }
2161 }
2162 else
2163 #endif
2164 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2165 }
2166 }
2167 else
2168 #endif
2169
2170 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2171 consistency with the behaviour of \w we do use it in this case. */
2172
2173 {
2174 /* Get status of previous character */
2175
2176 if (eptr == md->start_subject) prev_is_word = FALSE; else
2177 {
2178 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2179 #ifdef SUPPORT_UCP
2180 if (md->use_ucp)
2181 {
2182 c = eptr[-1];
2183 if (c == '_') prev_is_word = TRUE; else
2184 {
2185 int cat = UCD_CATEGORY(c);
2186 prev_is_word = (cat == ucp_L || cat == ucp_N);
2187 }
2188 }
2189 else
2190 #endif
2191 prev_is_word = MAX_255(eptr[-1])
2192 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2193 }
2194
2195 /* Get status of next character */
2196
2197 if (eptr >= md->end_subject)
2198 {
2199 SCHECK_PARTIAL();
2200 cur_is_word = FALSE;
2201 }
2202 else
2203 #ifdef SUPPORT_UCP
2204 if (md->use_ucp)
2205 {
2206 c = *eptr;
2207 if (c == '_') cur_is_word = TRUE; else
2208 {
2209 int cat = UCD_CATEGORY(c);
2210 cur_is_word = (cat == ucp_L || cat == ucp_N);
2211 }
2212 }
2213 else
2214 #endif
2215 cur_is_word = MAX_255(*eptr)
2216 && ((md->ctypes[*eptr] & ctype_word) != 0);
2217 }
2218
2219 /* Now see if the situation is what we want */
2220
2221 if ((*ecode++ == OP_WORD_BOUNDARY)?
2222 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2223 RRETURN(MATCH_NOMATCH);
2224 }
2225 break;
2226
2227 /* Match a single character type; inline for speed */
2228
2229 case OP_ANY:
2230 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2231 /* Fall through */
2232
2233 case OP_ALLANY:
2234 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2235 { /* not be updated before SCHECK_PARTIAL. */
2236 SCHECK_PARTIAL();
2237 RRETURN(MATCH_NOMATCH);
2238 }
2239 eptr++;
2240 #ifdef SUPPORT_UTF
2241 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2242 #endif
2243 ecode++;
2244 break;
2245
2246 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2247 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2248
2249 case OP_ANYBYTE:
2250 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2251 { /* not be updated before SCHECK_PARTIAL. */
2252 SCHECK_PARTIAL();
2253 RRETURN(MATCH_NOMATCH);
2254 }
2255 eptr++;
2256 ecode++;
2257 break;
2258
2259 case OP_NOT_DIGIT:
2260 if (eptr >= md->end_subject)
2261 {
2262 SCHECK_PARTIAL();
2263 RRETURN(MATCH_NOMATCH);
2264 }
2265 GETCHARINCTEST(c, eptr);
2266 if (
2267 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2268 c < 256 &&
2269 #endif
2270 (md->ctypes[c] & ctype_digit) != 0
2271 )
2272 RRETURN(MATCH_NOMATCH);
2273 ecode++;
2274 break;
2275
2276 case OP_DIGIT:
2277 if (eptr >= md->end_subject)
2278 {
2279 SCHECK_PARTIAL();
2280 RRETURN(MATCH_NOMATCH);
2281 }
2282 GETCHARINCTEST(c, eptr);
2283 if (
2284 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2285 c > 255 ||
2286 #endif
2287 (md->ctypes[c] & ctype_digit) == 0
2288 )
2289 RRETURN(MATCH_NOMATCH);
2290 ecode++;
2291 break;
2292
2293 case OP_NOT_WHITESPACE:
2294 if (eptr >= md->end_subject)
2295 {
2296 SCHECK_PARTIAL();
2297 RRETURN(MATCH_NOMATCH);
2298 }
2299 GETCHARINCTEST(c, eptr);
2300 if (
2301 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2302 c < 256 &&
2303 #endif
2304 (md->ctypes[c] & ctype_space) != 0
2305 )
2306 RRETURN(MATCH_NOMATCH);
2307 ecode++;
2308 break;
2309
2310 case OP_WHITESPACE:
2311 if (eptr >= md->end_subject)
2312 {
2313 SCHECK_PARTIAL();
2314 RRETURN(MATCH_NOMATCH);
2315 }
2316 GETCHARINCTEST(c, eptr);
2317 if (
2318 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2319 c > 255 ||
2320 #endif
2321 (md->ctypes[c] & ctype_space) == 0
2322 )
2323 RRETURN(MATCH_NOMATCH);
2324 ecode++;
2325 break;
2326
2327 case OP_NOT_WORDCHAR:
2328 if (eptr >= md->end_subject)
2329 {
2330 SCHECK_PARTIAL();
2331 RRETURN(MATCH_NOMATCH);
2332 }
2333 GETCHARINCTEST(c, eptr);
2334 if (
2335 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2336 c < 256 &&
2337 #endif
2338 (md->ctypes[c] & ctype_word) != 0
2339 )
2340 RRETURN(MATCH_NOMATCH);
2341 ecode++;
2342 break;
2343
2344 case OP_WORDCHAR:
2345 if (eptr >= md->end_subject)
2346 {
2347 SCHECK_PARTIAL();
2348 RRETURN(MATCH_NOMATCH);
2349 }
2350 GETCHARINCTEST(c, eptr);
2351 if (
2352 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2353 c > 255 ||
2354 #endif
2355 (md->ctypes[c] & ctype_word) == 0
2356 )
2357 RRETURN(MATCH_NOMATCH);
2358 ecode++;
2359 break;
2360
2361 case OP_ANYNL:
2362 if (eptr >= md->end_subject)
2363 {
2364 SCHECK_PARTIAL();
2365 RRETURN(MATCH_NOMATCH);
2366 }
2367 GETCHARINCTEST(c, eptr);
2368 switch(c)
2369 {
2370 default: RRETURN(MATCH_NOMATCH);
2371
2372 case 0x000d:
2373 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2374 break;
2375
2376 case 0x000a:
2377 break;
2378
2379 case 0x000b:
2380 case 0x000c:
2381 case 0x0085:
2382 case 0x2028:
2383 case 0x2029:
2384 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2385 break;
2386 }
2387 ecode++;
2388 break;
2389
2390 case OP_NOT_HSPACE:
2391 if (eptr >= md->end_subject)
2392 {
2393 SCHECK_PARTIAL();
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 GETCHARINCTEST(c, eptr);
2397 switch(c)
2398 {
2399 default: break;
2400 case 0x09: /* HT */
2401 case 0x20: /* SPACE */
2402 case 0xa0: /* NBSP */
2403 case 0x1680: /* OGHAM SPACE MARK */
2404 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2405 case 0x2000: /* EN QUAD */
2406 case 0x2001: /* EM QUAD */
2407 case 0x2002: /* EN SPACE */
2408 case 0x2003: /* EM SPACE */
2409 case 0x2004: /* THREE-PER-EM SPACE */
2410 case 0x2005: /* FOUR-PER-EM SPACE */
2411 case 0x2006: /* SIX-PER-EM SPACE */
2412 case 0x2007: /* FIGURE SPACE */
2413 case 0x2008: /* PUNCTUATION SPACE */
2414 case 0x2009: /* THIN SPACE */
2415 case 0x200A: /* HAIR SPACE */
2416 case 0x202f: /* NARROW NO-BREAK SPACE */
2417 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2418 case 0x3000: /* IDEOGRAPHIC SPACE */
2419 RRETURN(MATCH_NOMATCH);
2420 }
2421 ecode++;
2422 break;
2423
2424 case OP_HSPACE:
2425 if (eptr >= md->end_subject)
2426 {
2427 SCHECK_PARTIAL();
2428 RRETURN(MATCH_NOMATCH);
2429 }
2430 GETCHARINCTEST(c, eptr);
2431 switch(c)
2432 {
2433 default: RRETURN(MATCH_NOMATCH);
2434 case 0x09: /* HT */
2435 case 0x20: /* SPACE */
2436 case 0xa0: /* NBSP */
2437 case 0x1680: /* OGHAM SPACE MARK */
2438 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2439 case 0x2000: /* EN QUAD */
2440 case 0x2001: /* EM QUAD */
2441 case 0x2002: /* EN SPACE */
2442 case 0x2003: /* EM SPACE */
2443 case 0x2004: /* THREE-PER-EM SPACE */
2444 case 0x2005: /* FOUR-PER-EM SPACE */
2445 case 0x2006: /* SIX-PER-EM SPACE */
2446 case 0x2007: /* FIGURE SPACE */
2447 case 0x2008: /* PUNCTUATION SPACE */
2448 case 0x2009: /* THIN SPACE */
2449 case 0x200A: /* HAIR SPACE */
2450 case 0x202f: /* NARROW NO-BREAK SPACE */
2451 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2452 case 0x3000: /* IDEOGRAPHIC SPACE */
2453 break;
2454 }
2455 ecode++;
2456 break;
2457
2458 case OP_NOT_VSPACE:
2459 if (eptr >= md->end_subject)
2460 {
2461 SCHECK_PARTIAL();
2462 RRETURN(MATCH_NOMATCH);
2463 }
2464 GETCHARINCTEST(c, eptr);
2465 switch(c)
2466 {
2467 default: break;
2468 case 0x0a: /* LF */
2469 case 0x0b: /* VT */
2470 case 0x0c: /* FF */
2471 case 0x0d: /* CR */
2472 case 0x85: /* NEL */
2473 case 0x2028: /* LINE SEPARATOR */
2474 case 0x2029: /* PARAGRAPH SEPARATOR */
2475 RRETURN(MATCH_NOMATCH);
2476 }
2477 ecode++;
2478 break;
2479
2480 case OP_VSPACE:
2481 if (eptr >= md->end_subject)
2482 {
2483 SCHECK_PARTIAL();
2484 RRETURN(MATCH_NOMATCH);
2485 }
2486 GETCHARINCTEST(c, eptr);
2487 switch(c)
2488 {
2489 default: RRETURN(MATCH_NOMATCH);
2490 case 0x0a: /* LF */
2491 case 0x0b: /* VT */
2492 case 0x0c: /* FF */
2493 case 0x0d: /* CR */
2494 case 0x85: /* NEL */
2495 case 0x2028: /* LINE SEPARATOR */
2496 case 0x2029: /* PARAGRAPH SEPARATOR */
2497 break;
2498 }
2499 ecode++;
2500 break;
2501
2502 #ifdef SUPPORT_UCP
2503 /* Check the next character by Unicode property. We will get here only
2504 if the support is in the binary; otherwise a compile-time error occurs. */
2505
2506 case OP_PROP:
2507 case OP_NOTPROP:
2508 if (eptr >= md->end_subject)
2509 {
2510 SCHECK_PARTIAL();
2511 RRETURN(MATCH_NOMATCH);
2512 }
2513 GETCHARINCTEST(c, eptr);
2514 {
2515 const ucd_record *prop = GET_UCD(c);
2516
2517 switch(ecode[1])
2518 {
2519 case PT_ANY:
2520 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2521 break;
2522
2523 case PT_LAMP:
2524 if ((prop->chartype == ucp_Lu ||
2525 prop->chartype == ucp_Ll ||
2526 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2527 RRETURN(MATCH_NOMATCH);
2528 break;
2529
2530 case PT_GC:
2531 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2532 RRETURN(MATCH_NOMATCH);
2533 break;
2534
2535 case PT_PC:
2536 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2537 RRETURN(MATCH_NOMATCH);
2538 break;
2539
2540 case PT_SC:
2541 if ((ecode[2] != prop->script) == (op == OP_PROP))
2542 RRETURN(MATCH_NOMATCH);
2543 break;
2544
2545 /* These are specials */
2546
2547 case PT_ALNUM:
2548 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2549 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2550 RRETURN(MATCH_NOMATCH);
2551 break;
2552
2553 case PT_SPACE: /* Perl space */
2554 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2555 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2556 == (op == OP_NOTPROP))
2557 RRETURN(MATCH_NOMATCH);
2558 break;
2559
2560 case PT_PXSPACE: /* POSIX space */
2561 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2562 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2563 c == CHAR_FF || c == CHAR_CR)
2564 == (op == OP_NOTPROP))
2565 RRETURN(MATCH_NOMATCH);
2566 break;
2567
2568 case PT_WORD:
2569 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2570 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2571 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2572 RRETURN(MATCH_NOMATCH);
2573 break;
2574
2575 /* This should never occur */
2576
2577 default:
2578 RRETURN(PCRE_ERROR_INTERNAL);
2579 }
2580
2581 ecode += 3;
2582 }
2583 break;
2584
2585 /* Match an extended Unicode sequence. We will get here only if the support
2586 is in the binary; otherwise a compile-time error occurs. */
2587
2588 case OP_EXTUNI:
2589 if (eptr >= md->end_subject)
2590 {
2591 SCHECK_PARTIAL();
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 GETCHARINCTEST(c, eptr);
2595 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2596 while (eptr < md->end_subject)
2597 {
2598 int len = 1;
2599 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2600 if (UCD_CATEGORY(c) != ucp_M) break;
2601 eptr += len;
2602 }
2603 ecode++;
2604 break;
2605 #endif
2606
2607
2608 /* Match a back reference, possibly repeatedly. Look past the end of the
2609 item to see if there is repeat information following. The code is similar
2610 to that for character classes, but repeated for efficiency. Then obey
2611 similar code to character type repeats - written out again for speed.
2612 However, if the referenced string is the empty string, always treat
2613 it as matched, any number of times (otherwise there could be infinite
2614 loops). */
2615
2616 case OP_REF:
2617 case OP_REFI:
2618 caseless = op == OP_REFI;
2619 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2620 ecode += 1 + IMM2_SIZE;
2621
2622 /* If the reference is unset, there are two possibilities:
2623
2624 (a) In the default, Perl-compatible state, set the length negative;
2625 this ensures that every attempt at a match fails. We can't just fail
2626 here, because of the possibility of quantifiers with zero minima.
2627
2628 (b) If the JavaScript compatibility flag is set, set the length to zero
2629 so that the back reference matches an empty string.
2630
2631 Otherwise, set the length to the length of what was matched by the
2632 referenced subpattern. */
2633
2634 if (offset >= offset_top || md->offset_vector[offset] < 0)
2635 length = (md->jscript_compat)? 0 : -1;
2636 else
2637 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2638
2639 /* Set up for repetition, or handle the non-repeated case */
2640
2641 switch (*ecode)
2642 {
2643 case OP_CRSTAR:
2644 case OP_CRMINSTAR:
2645 case OP_CRPLUS:
2646 case OP_CRMINPLUS:
2647 case OP_CRQUERY:
2648 case OP_CRMINQUERY:
2649 c = *ecode++ - OP_CRSTAR;
2650 minimize = (c & 1) != 0;
2651 min = rep_min[c]; /* Pick up values from tables; */
2652 max = rep_max[c]; /* zero for max => infinity */
2653 if (max == 0) max = INT_MAX;
2654 break;
2655
2656 case OP_CRRANGE:
2657 case OP_CRMINRANGE:
2658 minimize = (*ecode == OP_CRMINRANGE);
2659 min = GET2(ecode, 1);
2660 max = GET2(ecode, 1 + IMM2_SIZE);
2661 if (max == 0) max = INT_MAX;
2662 ecode += 1 + 2 * IMM2_SIZE;
2663 break;
2664
2665 default: /* No repeat follows */
2666 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2667 {
2668 CHECK_PARTIAL();
2669 RRETURN(MATCH_NOMATCH);
2670 }
2671 eptr += length;
2672 continue; /* With the main loop */
2673 }
2674
2675 /* Handle repeated back references. If the length of the reference is
2676 zero, just continue with the main loop. If the length is negative, it
2677 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2678 zero, we can continue at the same level without recursion. For any other
2679 minimum, carrying on will result in NOMATCH. */
2680
2681 if (length == 0) continue;
2682 if (length < 0 && min == 0) continue;
2683
2684 /* First, ensure the minimum number of matches are present. We get back
2685 the length of the reference string explicitly rather than passing the
2686 address of eptr, so that eptr can be a register variable. */
2687
2688 for (i = 1; i <= min; i++)
2689 {
2690 int slength;
2691 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2692 {
2693 CHECK_PARTIAL();
2694 RRETURN(MATCH_NOMATCH);
2695 }
2696 eptr += slength;
2697 }
2698
2699 /* If min = max, continue at the same level without recursion.
2700 They are not both allowed to be zero. */
2701
2702 if (min == max) continue;
2703
2704 /* If minimizing, keep trying and advancing the pointer */
2705
2706 if (minimize)
2707 {
2708 for (fi = min;; fi++)
2709 {
2710 int slength;
2711 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2712 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2713 if (fi >= max) RRETURN(MATCH_NOMATCH);
2714 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2715 {
2716 CHECK_PARTIAL();
2717 RRETURN(MATCH_NOMATCH);
2718 }
2719 eptr += slength;
2720 }
2721 /* Control never gets here */
2722 }
2723
2724 /* If maximizing, find the longest string and work backwards */
2725
2726 else
2727 {
2728 pp = eptr;
2729 for (i = min; i < max; i++)
2730 {
2731 int slength;
2732 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2733 {
2734 CHECK_PARTIAL();
2735 break;
2736 }
2737 eptr += slength;
2738 }
2739 while (eptr >= pp)
2740 {
2741 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2742 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2743 eptr -= length;
2744 }
2745 RRETURN(MATCH_NOMATCH);
2746 }
2747 /* Control never gets here */
2748
2749 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2750 used when all the characters in the class have values in the range 0-255,
2751 and either the matching is caseful, or the characters are in the range
2752 0-127 when UTF-8 processing is enabled. The only difference between
2753 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2754 encountered.
2755
2756 First, look past the end of the item to see if there is repeat information
2757 following. Then obey similar code to character type repeats - written out
2758 again for speed. */
2759
2760 case OP_NCLASS:
2761 case OP_CLASS:
2762 {
2763 /* The data variable is saved across frames, so the byte map needs to
2764 be stored there. */
2765 #define BYTE_MAP ((pcre_uint8 *)data)
2766 data = ecode + 1; /* Save for matching */
2767 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2768
2769 switch (*ecode)
2770 {
2771 case OP_CRSTAR:
2772 case OP_CRMINSTAR:
2773 case OP_CRPLUS:
2774 case OP_CRMINPLUS:
2775 case OP_CRQUERY:
2776 case OP_CRMINQUERY:
2777 c = *ecode++ - OP_CRSTAR;
2778 minimize = (c & 1) != 0;
2779 min = rep_min[c]; /* Pick up values from tables; */
2780 max = rep_max[c]; /* zero for max => infinity */
2781 if (max == 0) max = INT_MAX;
2782 break;
2783
2784 case OP_CRRANGE:
2785 case OP_CRMINRANGE:
2786 minimize = (*ecode == OP_CRMINRANGE);
2787 min = GET2(ecode, 1);
2788 max = GET2(ecode, 1 + IMM2_SIZE);
2789 if (max == 0) max = INT_MAX;
2790 ecode += 1 + 2 * IMM2_SIZE;
2791 break;
2792
2793 default: /* No repeat follows */
2794 min = max = 1;
2795 break;
2796 }
2797
2798 /* First, ensure the minimum number of matches are present. */
2799
2800 #ifdef SUPPORT_UTF
2801 if (utf)
2802 {
2803 for (i = 1; i <= min; i++)
2804 {
2805 if (eptr >= md->end_subject)
2806 {
2807 SCHECK_PARTIAL();
2808 RRETURN(MATCH_NOMATCH);
2809 }
2810 GETCHARINC(c, eptr);
2811 if (c > 255)
2812 {
2813 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2814 }
2815 else
2816 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2817 }
2818 }
2819 else
2820 #endif
2821 /* Not UTF mode */
2822 {
2823 for (i = 1; i <= min; i++)
2824 {
2825 if (eptr >= md->end_subject)
2826 {
2827 SCHECK_PARTIAL();
2828 RRETURN(MATCH_NOMATCH);
2829 }
2830 c = *eptr++;
2831 #ifndef COMPILE_PCRE8
2832 if (c > 255)
2833 {
2834 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2835 }
2836 else
2837 #endif
2838 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2839 }
2840 }
2841
2842 /* If max == min we can continue with the main loop without the
2843 need to recurse. */
2844
2845 if (min == max) continue;
2846
2847 /* If minimizing, keep testing the rest of the expression and advancing
2848 the pointer while it matches the class. */
2849
2850 if (minimize)
2851 {
2852 #ifdef SUPPORT_UTF
2853 if (utf)
2854 {
2855 for (fi = min;; fi++)
2856 {
2857 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2859 if (fi >= max) RRETURN(MATCH_NOMATCH);
2860 if (eptr >= md->end_subject)
2861 {
2862 SCHECK_PARTIAL();
2863 RRETURN(MATCH_NOMATCH);
2864 }
2865 GETCHARINC(c, eptr);
2866 if (c > 255)
2867 {
2868 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2869 }
2870 else
2871 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2872 }
2873 }
2874 else
2875 #endif
2876 /* Not UTF mode */
2877 {
2878 for (fi = min;; fi++)
2879 {
2880 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2882 if (fi >= max) RRETURN(MATCH_NOMATCH);
2883 if (eptr >= md->end_subject)
2884 {
2885 SCHECK_PARTIAL();
2886 RRETURN(MATCH_NOMATCH);
2887 }
2888 c = *eptr++;
2889 #ifndef COMPILE_PCRE8
2890 if (c > 255)
2891 {
2892 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2893 }
2894 else
2895 #endif
2896 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2897 }
2898 }
2899 /* Control never gets here */
2900 }
2901
2902 /* If maximizing, find the longest possible run, then work backwards. */
2903
2904 else
2905 {
2906 pp = eptr;
2907
2908 #ifdef SUPPORT_UTF
2909 if (utf)
2910 {
2911 for (i = min; i < max; i++)
2912 {
2913 int len = 1;
2914 if (eptr >= md->end_subject)
2915 {
2916 SCHECK_PARTIAL();
2917 break;
2918 }
2919 GETCHARLEN(c, eptr, len);
2920 if (c > 255)
2921 {
2922 if (op == OP_CLASS) break;
2923 }
2924 else
2925 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2926 eptr += len;
2927 }
2928 for (;;)
2929 {
2930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2932 if (eptr-- == pp) break; /* Stop if tried at original pos */
2933 BACKCHAR(eptr);
2934 }
2935 }
2936 else
2937 #endif
2938 /* Not UTF mode */
2939 {
2940 for (i = min; i < max; i++)
2941 {
2942 if (eptr >= md->end_subject)
2943 {
2944 SCHECK_PARTIAL();
2945 break;
2946 }
2947 c = *eptr;
2948 #ifndef COMPILE_PCRE8
2949 if (c > 255)
2950 {
2951 if (op == OP_CLASS) break;
2952 }
2953 else
2954 #endif
2955 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2956 eptr++;
2957 }
2958 while (eptr >= pp)
2959 {
2960 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2961 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2962 eptr--;
2963 }
2964 }
2965
2966 RRETURN(MATCH_NOMATCH);
2967 }
2968 #undef BYTE_MAP
2969 }
2970 /* Control never gets here */
2971
2972
2973 /* Match an extended character class. This opcode is encountered only
2974 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2975 mode, because Unicode properties are supported in non-UTF-8 mode. */
2976
2977 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2978 case OP_XCLASS:
2979 {
2980 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2981 ecode += GET(ecode, 1); /* Advance past the item */
2982
2983 switch (*ecode)
2984 {
2985 case OP_CRSTAR:
2986 case OP_CRMINSTAR:
2987 case OP_CRPLUS:
2988 case OP_CRMINPLUS:
2989 case OP_CRQUERY:
2990 case OP_CRMINQUERY:
2991 c = *ecode++ - OP_CRSTAR;
2992 minimize = (c & 1) != 0;
2993 min = rep_min[c]; /* Pick up values from tables; */
2994 max = rep_max[c]; /* zero for max => infinity */
2995 if (max == 0) max = INT_MAX;
2996 break;
2997
2998 case OP_CRRANGE:
2999 case OP_CRMINRANGE:
3000 minimize = (*ecode == OP_CRMINRANGE);
3001 min = GET2(ecode, 1);
3002 max = GET2(ecode, 1 + IMM2_SIZE);
3003 if (max == 0) max = INT_MAX;
3004 ecode += 1 + 2 * IMM2_SIZE;
3005 break;
3006
3007 default: /* No repeat follows */
3008 min = max = 1;
3009 break;
3010 }
3011
3012 /* First, ensure the minimum number of matches are present. */
3013
3014 for (i = 1; i <= min; i++)
3015 {
3016 if (eptr >= md->end_subject)
3017 {
3018 SCHECK_PARTIAL();
3019 RRETURN(MATCH_NOMATCH);
3020 }
3021 GETCHARINCTEST(c, eptr);
3022 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3023 }
3024
3025 /* If max == min we can continue with the main loop without the
3026 need to recurse. */
3027
3028 if (min == max) continue;
3029
3030 /* If minimizing, keep testing the rest of the expression and advancing
3031 the pointer while it matches the class. */
3032
3033 if (minimize)
3034 {
3035 for (fi = min;; fi++)
3036 {
3037 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3038 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3039 if (fi >= max) RRETURN(MATCH_NOMATCH);
3040 if (eptr >= md->end_subject)
3041 {
3042 SCHECK_PARTIAL();
3043 RRETURN(MATCH_NOMATCH);
3044 }
3045 GETCHARINCTEST(c, eptr);
3046 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3047 }
3048 /* Control never gets here */
3049 }
3050
3051 /* If maximizing, find the longest possible run, then work backwards. */
3052
3053 else
3054 {
3055 pp = eptr;
3056 for (i = min; i < max; i++)
3057 {
3058 int len = 1;
3059 if (eptr >= md->end_subject)
3060 {
3061 SCHECK_PARTIAL();
3062 break;
3063 }
3064 #ifdef SUPPORT_UTF
3065 GETCHARLENTEST(c, eptr, len);
3066 #else
3067 c = *eptr;
3068 #endif
3069 if (!PRIV(xclass)(c, data, utf)) break;
3070 eptr += len;
3071 }
3072 for(;;)
3073 {
3074 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3075 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3076 if (eptr-- == pp) break; /* Stop if tried at original pos */
3077 #ifdef SUPPORT_UTF
3078 if (utf) BACKCHAR(eptr);
3079 #endif
3080 }
3081 RRETURN(MATCH_NOMATCH);
3082 }
3083
3084 /* Control never gets here */
3085 }
3086 #endif /* End of XCLASS */
3087
3088 /* Match a single character, casefully */
3089
3090 case OP_CHAR:
3091 #ifdef SUPPORT_UTF
3092 if (utf)
3093 {
3094 length = 1;
3095 ecode++;
3096 GETCHARLEN(fc, ecode, length);
3097 if (length > md->end_subject - eptr)
3098 {
3099 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3100 RRETURN(MATCH_NOMATCH);
3101 }
3102 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3103 }
3104 else
3105 #endif
3106 /* Not UTF mode */
3107 {
3108 if (md->end_subject - eptr < 1)
3109 {
3110 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3111 RRETURN(MATCH_NOMATCH);
3112 }
3113 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3114 ecode += 2;
3115 }
3116 break;
3117
3118 /* Match a single character, caselessly. If we are at the end of the
3119 subject, give up immediately. */
3120
3121 case OP_CHARI:
3122 if (eptr >= md->end_subject)
3123 {
3124 SCHECK_PARTIAL();
3125 RRETURN(MATCH_NOMATCH);
3126 }
3127
3128 #ifdef SUPPORT_UTF
3129 if (utf)
3130 {
3131 length = 1;
3132 ecode++;
3133 GETCHARLEN(fc, ecode, length);
3134
3135 /* If the pattern character's value is < 128, we have only one byte, and
3136 we know that its other case must also be one byte long, so we can use the
3137 fast lookup table. We know that there is at least one byte left in the
3138 subject. */
3139
3140 if (fc < 128)
3141 {
3142 if (md->lcc[fc]
3143 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3144 ecode++;
3145 eptr++;
3146 }
3147
3148 /* Otherwise we must pick up the subject character. Note that we cannot
3149 use the value of "length" to check for sufficient bytes left, because the
3150 other case of the character may have more or fewer bytes. */
3151
3152 else
3153 {
3154 unsigned int dc;
3155 GETCHARINC(dc, eptr);
3156 ecode += length;
3157
3158 /* If we have Unicode property support, we can use it to test the other
3159 case of the character, if there is one. */
3160
3161 if (fc != dc)
3162 {
3163 #ifdef SUPPORT_UCP
3164 if (dc != UCD_OTHERCASE(fc))
3165 #endif
3166 RRETURN(MATCH_NOMATCH);
3167 }
3168 }
3169 }
3170 else
3171 #endif /* SUPPORT_UTF */
3172
3173 /* Not UTF mode */
3174 {
3175 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3176 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3177 eptr++;
3178 ecode += 2;
3179 }
3180 break;
3181
3182 /* Match a single character repeatedly. */
3183
3184 case OP_EXACT:
3185 case OP_EXACTI:
3186 min = max = GET2(ecode, 1);
3187 ecode += 1 + IMM2_SIZE;
3188 goto REPEATCHAR;
3189
3190 case OP_POSUPTO:
3191 case OP_POSUPTOI:
3192 possessive = TRUE;
3193 /* Fall through */
3194
3195 case OP_UPTO:
3196 case OP_UPTOI:
3197 case OP_MINUPTO:
3198 case OP_MINUPTOI:
3199 min = 0;
3200 max = GET2(ecode, 1);
3201 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3202 ecode += 1 + IMM2_SIZE;
3203 goto REPEATCHAR;
3204
3205 case OP_POSSTAR:
3206 case OP_POSSTARI:
3207 possessive = TRUE;
3208 min = 0;
3209 max = INT_MAX;
3210 ecode++;
3211 goto REPEATCHAR;
3212
3213 case OP_POSPLUS:
3214 case OP_POSPLUSI:
3215 possessive = TRUE;
3216 min = 1;
3217 max = INT_MAX;
3218 ecode++;
3219 goto REPEATCHAR;
3220
3221 case OP_POSQUERY:
3222 case OP_POSQUERYI:
3223 possessive = TRUE;
3224 min = 0;
3225 max = 1;
3226 ecode++;
3227 goto REPEATCHAR;
3228
3229 case OP_STAR:
3230 case OP_STARI:
3231 case OP_MINSTAR:
3232 case OP_MINSTARI:
3233 case OP_PLUS:
3234 case OP_PLUSI:
3235 case OP_MINPLUS:
3236 case OP_MINPLUSI:
3237 case OP_QUERY:
3238 case OP_QUERYI:
3239 case OP_MINQUERY:
3240 case OP_MINQUERYI:
3241 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3242 minimize = (c & 1) != 0;
3243 min = rep_min[c]; /* Pick up values from tables; */
3244 max = rep_max[c]; /* zero for max => infinity */
3245 if (max == 0) max = INT_MAX;
3246
3247 /* Common code for all repeated single-character matches. */
3248
3249 REPEATCHAR:
3250 #ifdef SUPPORT_UTF
3251 if (utf)
3252 {
3253 length = 1;
3254 charptr = ecode;
3255 GETCHARLEN(fc, ecode, length);
3256 ecode += length;
3257
3258 /* Handle multibyte character matching specially here. There is
3259 support for caseless matching if UCP support is present. */
3260
3261 if (length > 1)
3262 {
3263 #ifdef SUPPORT_UCP
3264 unsigned int othercase;
3265 if (op >= OP_STARI && /* Caseless */
3266 (othercase = UCD_OTHERCASE(fc)) != fc)
3267 oclength = PRIV(ord2utf)(othercase, occhars);
3268 else oclength = 0;
3269 #endif /* SUPPORT_UCP */
3270
3271 for (i = 1; i <= min; i++)
3272 {
3273 if (eptr <= md->end_subject - length &&
3274 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3275 #ifdef SUPPORT_UCP
3276 else if (oclength > 0 &&
3277 eptr <= md->end_subject - oclength &&
3278 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3279 #endif /* SUPPORT_UCP */
3280 else
3281 {
3282 CHECK_PARTIAL();
3283 RRETURN(MATCH_NOMATCH);
3284 }
3285 }
3286
3287 if (min == max) continue;
3288
3289 if (minimize)
3290 {
3291 for (fi = min;; fi++)
3292 {
3293 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3294 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3295 if (fi >= max) RRETURN(MATCH_NOMATCH);
3296 if (eptr <= md->end_subject - length &&
3297 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3298 #ifdef SUPPORT_UCP
3299 else if (oclength > 0 &&
3300 eptr <= md->end_subject - oclength &&
3301 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3302 #endif /* SUPPORT_UCP */
3303 else
3304 {
3305 CHECK_PARTIAL();
3306 RRETURN(MATCH_NOMATCH);
3307 }
3308 }
3309 /* Control never gets here */
3310 }
3311
3312 else /* Maximize */
3313 {
3314 pp = eptr;
3315 for (i = min; i < max; i++)
3316 {
3317 if (eptr <= md->end_subject - length &&
3318 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3319 #ifdef SUPPORT_UCP
3320 else if (oclength > 0 &&
3321 eptr <= md->end_subject - oclength &&
3322 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3323 #endif /* SUPPORT_UCP */
3324 else
3325 {
3326 CHECK_PARTIAL();
3327 break;
3328 }
3329 }
3330
3331 if (possessive) continue;
3332
3333 for(;;)
3334 {
3335 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3337 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3338 #ifdef SUPPORT_UCP
3339 eptr--;
3340 BACKCHAR(eptr);
3341 #else /* without SUPPORT_UCP */
3342 eptr -= length;
3343 #endif /* SUPPORT_UCP */
3344 }
3345 }
3346 /* Control never gets here */
3347 }
3348
3349 /* If the length of a UTF-8 character is 1, we fall through here, and
3350 obey the code as for non-UTF-8 characters below, though in this case the
3351 value of fc will always be < 128. */
3352 }
3353 else
3354 #endif /* SUPPORT_UTF */
3355 /* When not in UTF-8 mode, load a single-byte character. */
3356 fc = *ecode++;
3357
3358 /* The value of fc at this point is always one character, though we may
3359 or may not be in UTF mode. The code is duplicated for the caseless and
3360 caseful cases, for speed, since matching characters is likely to be quite
3361 common. First, ensure the minimum number of matches are present. If min =
3362 max, continue at the same level without recursing. Otherwise, if
3363 minimizing, keep trying the rest of the expression and advancing one
3364 matching character if failing, up to the maximum. Alternatively, if
3365 maximizing, find the maximum number of characters and work backwards. */
3366
3367 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3368 max, eptr));
3369
3370 if (op >= OP_STARI) /* Caseless */
3371 {
3372 #ifdef COMPILE_PCRE8
3373 /* fc must be < 128 if UTF is enabled. */
3374 foc = md->fcc[fc];
3375 #else
3376 #ifdef SUPPORT_UTF
3377 #ifdef SUPPORT_UCP
3378 if (utf && fc > 127)
3379 foc = UCD_OTHERCASE(fc);
3380 #else
3381 if (utf && fc > 127)
3382 foc = fc;
3383 #endif /* SUPPORT_UCP */
3384 else
3385 #endif /* SUPPORT_UTF */
3386 foc = TABLE_GET(fc, md->fcc, fc);
3387 #endif /* COMPILE_PCRE8 */
3388
3389 for (i = 1; i <= min; i++)
3390 {
3391 if (eptr >= md->end_subject)
3392 {
3393 SCHECK_PARTIAL();
3394 RRETURN(MATCH_NOMATCH);
3395 }
3396 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3397 eptr++;
3398 }
3399 if (min == max) continue;
3400 if (minimize)
3401 {
3402 for (fi = min;; fi++)
3403 {
3404 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3405 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3406 if (fi >= max) RRETURN(MATCH_NOMATCH);
3407 if (eptr >= md->end_subject)
3408 {
3409 SCHECK_PARTIAL();
3410 RRETURN(MATCH_NOMATCH);
3411 }
3412 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3413 eptr++;
3414 }
3415 /* Control never gets here */
3416 }
3417 else /* Maximize */
3418 {
3419 pp = eptr;
3420 for (i = min; i < max; i++)
3421 {
3422 if (eptr >= md->end_subject)
3423 {
3424 SCHECK_PARTIAL();
3425 break;
3426 }
3427 if (fc != *eptr && foc != *eptr) break;
3428 eptr++;
3429 }
3430
3431 if (possessive) continue;
3432
3433 while (eptr >= pp)
3434 {
3435 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3436 eptr--;
3437 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438 }
3439 RRETURN(MATCH_NOMATCH);
3440 }
3441 /* Control never gets here */
3442 }
3443
3444 /* Caseful comparisons (includes all multi-byte characters) */
3445
3446 else
3447 {
3448 for (i = 1; i <= min; i++)
3449 {
3450 if (eptr >= md->end_subject)
3451 {
3452 SCHECK_PARTIAL();
3453 RRETURN(MATCH_NOMATCH);
3454 }
3455 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3456 }
3457
3458 if (min == max) continue;
3459
3460 if (minimize)
3461 {
3462 for (fi = min;; fi++)
3463 {
3464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3466 if (fi >= max) RRETURN(MATCH_NOMATCH);
3467 if (eptr >= md->end_subject)
3468 {
3469 SCHECK_PARTIAL();
3470 RRETURN(MATCH_NOMATCH);
3471 }
3472 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3473 }
3474 /* Control never gets here */
3475 }
3476 else /* Maximize */
3477 {
3478 pp = eptr;
3479 for (i = min; i < max; i++)
3480 {
3481 if (eptr >= md->end_subject)
3482 {
3483 SCHECK_PARTIAL();
3484 break;
3485 }
3486 if (fc != *eptr) break;
3487 eptr++;
3488 }
3489 if (possessive) continue;
3490
3491 while (eptr >= pp)
3492 {
3493 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3494 eptr--;
3495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3496 }
3497 RRETURN(MATCH_NOMATCH);
3498 }
3499 }
3500 /* Control never gets here */
3501
3502 /* Match a negated single one-byte character. The subject character we are
3503 checking can be multibyte. */
3504
3505 case OP_NOT:
3506 case OP_NOTI:
3507 if (eptr >= md->end_subject)
3508 {
3509 SCHECK_PARTIAL();
3510 RRETURN(MATCH_NOMATCH);
3511 }
3512 ecode++;
3513 GETCHARINCTEST(c, eptr);
3514 if (op == OP_NOTI) /* The caseless case */
3515 {
3516 register int ch, och;
3517 ch = *ecode++;
3518 #ifdef COMPILE_PCRE8
3519 /* ch must be < 128 if UTF is enabled. */
3520 och = md->fcc[ch];
3521 #else
3522 #ifdef SUPPORT_UTF
3523 #ifdef SUPPORT_UCP
3524 if (utf && ch > 127)
3525 och = UCD_OTHERCASE(ch);
3526 #else
3527 if (utf && ch > 127)
3528 och = ch;
3529 #endif /* SUPPORT_UCP */
3530 else
3531 #endif /* SUPPORT_UTF */
3532 och = TABLE_GET(ch, md->fcc, ch);
3533 #endif /* COMPILE_PCRE8 */
3534 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3535 }
3536 else /* Caseful */
3537 {
3538 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3539 }
3540 break;
3541
3542 /* Match a negated single one-byte character repeatedly. This is almost a
3543 repeat of the code for a repeated single character, but I haven't found a
3544 nice way of commoning these up that doesn't require a test of the
3545 positive/negative option for each character match. Maybe that wouldn't add
3546 very much to the time taken, but character matching *is* what this is all
3547 about... */
3548
3549 case OP_NOTEXACT:
3550 case OP_NOTEXACTI:
3551 min = max = GET2(ecode, 1);
3552 ecode += 1 + IMM2_SIZE;
3553 goto REPEATNOTCHAR;
3554
3555 case OP_NOTUPTO:
3556 case OP_NOTUPTOI:
3557 case OP_NOTMINUPTO:
3558 case OP_NOTMINUPTOI:
3559 min = 0;
3560 max = GET2(ecode, 1);
3561 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3562 ecode += 1 + IMM2_SIZE;
3563 goto REPEATNOTCHAR;
3564
3565 case OP_NOTPOSSTAR:
3566 case OP_NOTPOSSTARI:
3567 possessive = TRUE;
3568 min = 0;
3569 max = INT_MAX;
3570 ecode++;
3571 goto REPEATNOTCHAR;
3572
3573 case OP_NOTPOSPLUS:
3574 case OP_NOTPOSPLUSI:
3575 possessive = TRUE;
3576 min = 1;
3577 max = INT_MAX;
3578 ecode++;
3579 goto REPEATNOTCHAR;
3580
3581 case OP_NOTPOSQUERY:
3582 case OP_NOTPOSQUERYI:
3583 possessive = TRUE;
3584 min = 0;
3585 max = 1;
3586 ecode++;
3587 goto REPEATNOTCHAR;
3588
3589 case OP_NOTPOSUPTO:
3590 case OP_NOTPOSUPTOI:
3591 possessive = TRUE;
3592 min = 0;
3593 max = GET2(ecode, 1);
3594 ecode += 1 + IMM2_SIZE;
3595 goto REPEATNOTCHAR;
3596
3597 case OP_NOTSTAR:
3598 case OP_NOTSTARI:
3599 case OP_NOTMINSTAR:
3600 case OP_NOTMINSTARI:
3601 case OP_NOTPLUS:
3602 case OP_NOTPLUSI:
3603 case OP_NOTMINPLUS:
3604 case OP_NOTMINPLUSI:
3605 case OP_NOTQUERY:
3606 case OP_NOTQUERYI:
3607 case OP_NOTMINQUERY:
3608 case OP_NOTMINQUERYI:
3609 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3610 minimize = (c & 1) != 0;
3611 min = rep_min[c]; /* Pick up values from tables; */
3612 max = rep_max[c]; /* zero for max => infinity */
3613 if (max == 0) max = INT_MAX;
3614
3615 /* Common code for all repeated single-byte matches. */
3616
3617 REPEATNOTCHAR:
3618 fc = *ecode++;
3619
3620 /* The code is duplicated for the caseless and caseful cases, for speed,
3621 since matching characters is likely to be quite common. First, ensure the
3622 minimum number of matches are present. If min = max, continue at the same
3623 level without recursing. Otherwise, if minimizing, keep trying the rest of
3624 the expression and advancing one matching character if failing, up to the
3625 maximum. Alternatively, if maximizing, find the maximum number of
3626 characters and work backwards. */
3627
3628 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3629 max, eptr));
3630
3631 if (op >= OP_NOTSTARI) /* Caseless */
3632 {
3633 #ifdef COMPILE_PCRE8
3634 /* fc must be < 128 if UTF is enabled. */
3635 foc = md->fcc[fc];
3636 #else
3637 #ifdef SUPPORT_UTF
3638 #ifdef SUPPORT_UCP
3639 if (utf && fc > 127)
3640 foc = UCD_OTHERCASE(fc);
3641 #else
3642 if (utf && fc > 127)
3643 foc = fc;
3644 #endif /* SUPPORT_UCP */
3645 else
3646 #endif /* SUPPORT_UTF */
3647 foc = TABLE_GET(fc, md->fcc, fc);
3648 #endif /* COMPILE_PCRE8 */
3649
3650 #ifdef SUPPORT_UTF
3651 if (utf)
3652 {
3653 register unsigned int d;
3654 for (i = 1; i <= min; i++)
3655 {
3656 if (eptr >= md->end_subject)
3657 {
3658 SCHECK_PARTIAL();
3659 RRETURN(MATCH_NOMATCH);
3660 }
3661 GETCHARINC(d, eptr);
3662 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3663 }
3664 }
3665 else
3666 #endif
3667 /* Not UTF mode */
3668 {
3669 for (i = 1; i <= min; i++)
3670 {
3671 if (eptr >= md->end_subject)
3672 {
3673 SCHECK_PARTIAL();
3674 RRETURN(MATCH_NOMATCH);
3675 }
3676 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3677 eptr++;
3678 }
3679 }
3680
3681 if (min == max) continue;
3682
3683 if (minimize)
3684 {
3685 #ifdef SUPPORT_UTF
3686 if (utf)
3687 {
3688 register unsigned int d;
3689 for (fi = min;; fi++)
3690 {
3691 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3692 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3693 if (fi >= max) RRETURN(MATCH_NOMATCH);
3694 if (eptr >= md->end_subject)
3695 {
3696 SCHECK_PARTIAL();
3697 RRETURN(MATCH_NOMATCH);
3698 }
3699 GETCHARINC(d, eptr);
3700 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3701 }
3702 }
3703 else
3704 #endif
3705 /* Not UTF mode */
3706 {
3707 for (fi = min;; fi++)
3708 {
3709 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3711 if (fi >= max) RRETURN(MATCH_NOMATCH);
3712 if (eptr >= md->end_subject)
3713 {
3714 SCHECK_PARTIAL();
3715 RRETURN(MATCH_NOMATCH);
3716 }
3717 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3718 eptr++;
3719 }
3720 }
3721 /* Control never gets here */
3722 }
3723
3724 /* Maximize case */
3725
3726 else
3727 {
3728 pp = eptr;
3729
3730 #ifdef SUPPORT_UTF
3731 if (utf)
3732 {
3733 register unsigned int d;
3734 for (i = min; i < max; i++)
3735 {
3736 int len = 1;
3737 if (eptr >= md->end_subject)
3738 {
3739 SCHECK_PARTIAL();
3740 break;
3741 }
3742 GETCHARLEN(d, eptr, len);
3743 if (fc == d || foc == d) break;
3744 eptr += len;
3745 }
3746 if (possessive) continue;
3747 for(;;)
3748 {
3749 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3750 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3751 if (eptr-- == pp) break; /* Stop if tried at original pos */
3752 BACKCHAR(eptr);
3753 }
3754 }
3755 else
3756 #endif
3757 /* Not UTF mode */
3758 {
3759 for (i = min; i < max; i++)
3760 {
3761 if (eptr >= md->end_subject)
3762 {
3763 SCHECK_PARTIAL();
3764 break;
3765 }
3766 if (fc == *eptr || foc == *eptr) break;
3767 eptr++;
3768 }
3769 if (possessive) continue;
3770 while (eptr >= pp)
3771 {
3772 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3774 eptr--;
3775 }
3776 }
3777
3778 RRETURN(MATCH_NOMATCH);
3779 }
3780 /* Control never gets here */
3781 }
3782
3783 /* Caseful comparisons */
3784
3785 else
3786 {
3787 #ifdef SUPPORT_UTF
3788 if (utf)
3789 {
3790 register unsigned int d;
3791 for (i = 1; i <= min; i++)
3792 {
3793 if (eptr >= md->end_subject)
3794 {
3795 SCHECK_PARTIAL();
3796 RRETURN(MATCH_NOMATCH);
3797 }
3798 GETCHARINC(d, eptr);
3799 if (fc == d) RRETURN(MATCH_NOMATCH);
3800 }
3801 }
3802 else
3803 #endif
3804 /* Not UTF mode */
3805 {
3806 for (i = 1; i <= min; i++)
3807 {
3808 if (eptr >= md->end_subject)
3809 {
3810 SCHECK_PARTIAL();
3811 RRETURN(MATCH_NOMATCH);
3812 }
3813 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3814 }
3815 }
3816
3817 if (min == max) continue;
3818
3819 if (minimize)
3820 {
3821 #ifdef SUPPORT_UTF
3822 if (utf)
3823 {
3824 register unsigned int d;
3825 for (fi = min;; fi++)
3826 {
3827 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3829 if (fi >= max) RRETURN(MATCH_NOMATCH);
3830 if (eptr >= md->end_subject)
3831 {
3832 SCHECK_PARTIAL();
3833 RRETURN(MATCH_NOMATCH);
3834 }
3835 GETCHARINC(d, eptr);
3836 if (fc == d) RRETURN(MATCH_NOMATCH);
3837 }
3838 }
3839 else
3840 #endif
3841 /* Not UTF mode */
3842 {
3843 for (fi = min;; fi++)
3844 {
3845 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3847 if (fi >= max) RRETURN(MATCH_NOMATCH);
3848 if (eptr >= md->end_subject)
3849 {
3850 SCHECK_PARTIAL();
3851 RRETURN(MATCH_NOMATCH);
3852 }
3853 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3854 }
3855 }
3856 /* Control never gets here */
3857 }
3858
3859 /* Maximize case */
3860
3861 else
3862 {
3863 pp = eptr;
3864
3865 #ifdef SUPPORT_UTF
3866 if (utf)
3867 {
3868 register unsigned int d;
3869 for (i = min; i < max; i++)
3870 {
3871 int len = 1;
3872 if (eptr >= md->end_subject)
3873 {
3874 SCHECK_PARTIAL();
3875 break;
3876 }
3877 GETCHARLEN(d, eptr, len);
3878 if (fc == d) break;
3879 eptr += len;
3880 }
3881 if (possessive) continue;
3882 for(;;)
3883 {
3884 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3886 if (eptr-- == pp) break; /* Stop if tried at original pos */
3887 BACKCHAR(eptr);
3888 }
3889 }
3890 else
3891 #endif
3892 /* Not UTF mode */
3893 {
3894 for (i = min; i < max; i++)
3895 {
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 break;
3900 }
3901 if (fc == *eptr) break;
3902 eptr++;
3903 }
3904 if (possessive) continue;
3905 while (eptr >= pp)
3906 {
3907 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3909 eptr--;
3910 }
3911 }
3912
3913 RRETURN(MATCH_NOMATCH);
3914 }
3915 }
3916 /* Control never gets here */
3917
3918 /* Match a single character type repeatedly; several different opcodes
3919 share code. This is very similar to the code for single characters, but we
3920 repeat it in the interests of efficiency. */
3921
3922 case OP_TYPEEXACT:
3923 min = max = GET2(ecode, 1);
3924 minimize = TRUE;
3925 ecode += 1 + IMM2_SIZE;
3926 goto REPEATTYPE;
3927
3928 case OP_TYPEUPTO:
3929 case OP_TYPEMINUPTO:
3930 min = 0;
3931 max = GET2(ecode, 1);
3932 minimize = *ecode == OP_TYPEMINUPTO;
3933 ecode += 1 + IMM2_SIZE;
3934 goto REPEATTYPE;
3935
3936 case OP_TYPEPOSSTAR:
3937 possessive = TRUE;
3938 min = 0;
3939 max = INT_MAX;
3940 ecode++;
3941 goto REPEATTYPE;
3942
3943 case OP_TYPEPOSPLUS:
3944 possessive = TRUE;
3945 min = 1;
3946 max = INT_MAX;
3947 ecode++;
3948 goto REPEATTYPE;
3949
3950 case OP_TYPEPOSQUERY:
3951 possessive = TRUE;
3952 min = 0;
3953 max = 1;
3954 ecode++;
3955 goto REPEATTYPE;
3956
3957 case OP_TYPEPOSUPTO:
3958 possessive = TRUE;
3959 min = 0;
3960 max = GET2(ecode, 1);
3961 ecode += 1 + IMM2_SIZE;
3962 goto REPEATTYPE;
3963
3964 case OP_TYPESTAR:
3965 case OP_TYPEMINSTAR:
3966 case OP_TYPEPLUS:
3967 case OP_TYPEMINPLUS:
3968 case OP_TYPEQUERY:
3969 case OP_TYPEMINQUERY:
3970 c = *ecode++ - OP_TYPESTAR;
3971 minimize = (c & 1) != 0;
3972 min = rep_min[c]; /* Pick up values from tables; */
3973 max = rep_max[c]; /* zero for max => infinity */
3974 if (max == 0) max = INT_MAX;
3975
3976 /* Common code for all repeated single character type matches. Note that
3977 in UTF-8 mode, '.' matches a character of any length, but for the other
3978 character types, the valid characters are all one-byte long. */
3979
3980 REPEATTYPE:
3981 ctype = *ecode++; /* Code for the character type */
3982
3983 #ifdef SUPPORT_UCP
3984 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3985 {
3986 prop_fail_result = ctype == OP_NOTPROP;
3987 prop_type = *ecode++;
3988 prop_value = *ecode++;
3989 }
3990 else prop_type = -1;
3991 #endif
3992
3993 /* First, ensure the minimum number of matches are present. Use inline
3994 code for maximizing the speed, and do the type test once at the start
3995 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3996 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3997 and single-bytes. */
3998
3999 if (min > 0)
4000 {
4001 #ifdef SUPPORT_UCP
4002 if (prop_type >= 0)
4003 {
4004 switch(prop_type)
4005 {
4006 case PT_ANY:
4007 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4008 for (i = 1; i <= min; i++)
4009 {
4010 if (eptr >= md->end_subject)
4011 {
4012 SCHECK_PARTIAL();
4013 RRETURN(MATCH_NOMATCH);
4014 }
4015 GETCHARINCTEST(c, eptr);
4016 }
4017 break;
4018
4019 case PT_LAMP:
4020 for (i = 1; i <= min; i++)
4021 {
4022 int chartype;
4023 if (eptr >= md->end_subject)
4024 {
4025 SCHECK_PARTIAL();
4026 RRETURN(MATCH_NOMATCH);
4027 }
4028 GETCHARINCTEST(c, eptr);
4029 chartype = UCD_CHARTYPE(c);
4030 if ((chartype == ucp_Lu ||
4031 chartype == ucp_Ll ||
4032 chartype == ucp_Lt) == prop_fail_result)
4033 RRETURN(MATCH_NOMATCH);
4034 }
4035 break;
4036
4037 case PT_GC:
4038 for (i = 1; i <= min; i++)
4039 {
4040 if (eptr >= md->end_subject)
4041 {
4042 SCHECK_PARTIAL();
4043 RRETURN(MATCH_NOMATCH);
4044 }
4045 GETCHARINCTEST(c, eptr);
4046 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4047 RRETURN(MATCH_NOMATCH);
4048 }
4049 break;
4050
4051 case PT_PC:
4052 for (i = 1; i <= min; i++)
4053 {
4054 if (eptr >= md->end_subject)
4055 {
4056 SCHECK_PARTIAL();
4057 RRETURN(MATCH_NOMATCH);
4058 }
4059 GETCHARINCTEST(c, eptr);
4060 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4061 RRETURN(MATCH_NOMATCH);
4062 }
4063 break;
4064
4065 case PT_SC:
4066 for (i = 1; i <= min; i++)
4067 {
4068 if (eptr >= md->end_subject)
4069 {
4070 SCHECK_PARTIAL();
4071 RRETURN(MATCH_NOMATCH);
4072 }
4073 GETCHARINCTEST(c, eptr);
4074 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4075 RRETURN(MATCH_NOMATCH);
4076 }
4077 break;
4078
4079 case PT_ALNUM:
4080 for (i = 1; i <= min; i++)
4081 {
4082 int category;
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 RRETURN(MATCH_NOMATCH);
4087 }
4088 GETCHARINCTEST(c, eptr);
4089 category = UCD_CATEGORY(c);
4090 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4091 RRETURN(MATCH_NOMATCH);
4092 }
4093 break;
4094
4095 case PT_SPACE: /* Perl space */
4096 for (i = 1; i <= min; i++)
4097 {
4098 if (eptr >= md->end_subject)
4099 {
4100 SCHECK_PARTIAL();
4101 RRETURN(MATCH_NOMATCH);
4102 }
4103 GETCHARINCTEST(c, eptr);
4104 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4105 c == CHAR_FF || c == CHAR_CR)
4106 == prop_fail_result)
4107 RRETURN(MATCH_NOMATCH);
4108 }
4109 break;
4110
4111 case PT_PXSPACE: /* POSIX space */
4112 for (i = 1; i <= min; i++)
4113 {
4114 if (eptr >= md->end_subject)
4115 {
4116 SCHECK_PARTIAL();
4117 RRETURN(MATCH_NOMATCH);
4118 }
4119 GETCHARINCTEST(c, eptr);
4120 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4121 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4122 == prop_fail_result)
4123 RRETURN(MATCH_NOMATCH);
4124 }
4125 break;
4126
4127 case PT_WORD:
4128 for (i = 1; i <= min; i++)
4129 {
4130 int category;
4131 if (eptr >= md->end_subject)
4132 {
4133 SCHECK_PARTIAL();
4134 RRETURN(MATCH_NOMATCH);
4135 }
4136 GETCHARINCTEST(c, eptr);
4137 category = UCD_CATEGORY(c);
4138 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4139 == prop_fail_result)
4140 RRETURN(MATCH_NOMATCH);
4141 }
4142 break;
4143
4144 /* This should not occur */
4145
4146 default:
4147 RRETURN(PCRE_ERROR_INTERNAL);
4148 }
4149 }
4150
4151 /* Match extended Unicode sequences. We will get here only if the
4152 support is in the binary; otherwise a compile-time error occurs. */
4153
4154 else if (ctype == OP_EXTUNI)
4155 {
4156 for (i = 1; i <= min; i++)
4157 {
4158 if (eptr >= md->end_subject)
4159 {
4160 SCHECK_PARTIAL();
4161 RRETURN(MATCH_NOMATCH);
4162 }
4163 GETCHARINCTEST(c, eptr);
4164 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4165 while (eptr < md->end_subject)
4166 {
4167 int len = 1;
4168 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4169 if (UCD_CATEGORY(c) != ucp_M) break;
4170 eptr += len;
4171 }
4172 }
4173 }
4174
4175 else
4176 #endif /* SUPPORT_UCP */
4177
4178 /* Handle all other cases when the coding is UTF-8 */
4179
4180 #ifdef SUPPORT_UTF
4181 if (utf) switch(ctype)
4182 {
4183 case OP_ANY:
4184 for (i = 1; i <= min; i++)
4185 {
4186 if (eptr >= md->end_subject)
4187 {
4188 SCHECK_PARTIAL();
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4192 eptr++;
4193 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4194 }
4195 break;
4196
4197 case OP_ALLANY:
4198 for (i = 1; i <= min; i++)
4199 {
4200 if (eptr >= md->end_subject)
4201 {
4202 SCHECK_PARTIAL();
4203 RRETURN(MATCH_NOMATCH);
4204 }
4205 eptr++;
4206 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4207 }
4208 break;
4209
4210 case OP_ANYBYTE:
4211 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4212 eptr += min;
4213 break;
4214
4215 case OP_ANYNL:
4216 for (i = 1; i <= min; i++)
4217 {
4218 if (eptr >= md->end_subject)
4219 {
4220 SCHECK_PARTIAL();
4221 RRETURN(MATCH_NOMATCH);
4222 }
4223 GETCHARINC(c, eptr);
4224 switch(c)
4225 {
4226 default: RRETURN(MATCH_NOMATCH);
4227
4228 case 0x000d:
4229 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4230 break;
4231
4232 case 0x000a:
4233 break;
4234
4235 case 0x000b:
4236 case 0x000c:
4237 case 0x0085:
4238 case 0x2028:
4239 case 0x2029:
4240 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4241 break;
4242 }
4243 }
4244 break;
4245
4246 case OP_NOT_HSPACE:
4247 for (i = 1; i <= min; i++)
4248 {
4249 if (eptr >= md->end_subject)
4250 {
4251 SCHECK_PARTIAL();
4252 RRETURN(MATCH_NOMATCH);
4253 }
4254 GETCHARINC(c, eptr);
4255 switch(c)
4256 {
4257 default: break;
4258 case 0x09: /* HT */
4259 case 0x20: /* SPACE */
4260 case 0xa0: /* NBSP */
4261 case 0x1680: /* OGHAM SPACE MARK */
4262 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4263 case 0x2000: /* EN QUAD */
4264 case 0x2001: /* EM QUAD */
4265 case 0x2002: /* EN SPACE */
4266 case 0x2003: /* EM SPACE */
4267 case 0x2004: /* THREE-PER-EM SPACE */
4268 case 0x2005: /* FOUR-PER-EM SPACE */
4269 case 0x2006: /* SIX-PER-EM SPACE */
4270 case 0x2007: /* FIGURE SPACE */
4271 case 0x2008: /* PUNCTUATION SPACE */
4272 case 0x2009: /* THIN SPACE */
4273 case 0x200A: /* HAIR SPACE */
4274 case 0x202f: /* NARROW NO-BREAK SPACE */
4275 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4276 case 0x3000: /* IDEOGRAPHIC SPACE */
4277 RRETURN(MATCH_NOMATCH);
4278 }
4279 }
4280 break;
4281
4282 case OP_HSPACE:
4283 for (i = 1; i <= min; i++)
4284 {
4285 if (eptr >= md->end_subject)
4286 {
4287 SCHECK_PARTIAL();
4288 RRETURN(MATCH_NOMATCH);
4289 }
4290 GETCHARINC(c, eptr);
4291 switch(c)
4292 {
4293 default: RRETURN(MATCH_NOMATCH);
4294 case 0x09: /* HT */
4295 case 0x20: /* SPACE */
4296 case 0xa0: /* NBSP */
4297 case 0x1680: /* OGHAM SPACE MARK */
4298 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4299 case 0x2000: /* EN QUAD */
4300 case 0x2001: /* EM QUAD */
4301 case 0x2002: /* EN SPACE */
4302 case 0x2003: /* EM SPACE */
4303 case 0x2004: /* THREE-PER-EM SPACE */
4304 case 0x2005: /* FOUR-PER-EM SPACE */
4305 case 0x2006: /* SIX-PER-EM SPACE */
4306 case 0x2007: /* FIGURE SPACE */
4307 case 0x2008: /* PUNCTUATION SPACE */
4308 case 0x2009: /* THIN SPACE */
4309 case 0x200A: /* HAIR SPACE */
4310 case 0x202f: /* NARROW NO-BREAK SPACE */
4311 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4312 case 0x3000: /* IDEOGRAPHIC SPACE */
4313 break;
4314 }
4315 }
4316 break;
4317
4318 case OP_NOT_VSPACE:
4319 for (i = 1; i <= min; i++)
4320 {
4321 if (eptr >= md->end_subject)
4322 {
4323 SCHECK_PARTIAL();
4324 RRETURN(MATCH_NOMATCH);
4325 }
4326 GETCHARINC(c, eptr);
4327 switch(c)
4328 {
4329 default: break;
4330 case 0x0a: /* LF */
4331 case 0x0b: /* VT */
4332 case 0x0c: /* FF */
4333 case 0x0d: /* CR */
4334 case 0x85: /* NEL */
4335 case 0x2028: /* LINE SEPARATOR */
4336 case 0x2029: /* PARAGRAPH SEPARATOR */
4337 RRETURN(MATCH_NOMATCH);
4338 }
4339 }
4340 break;
4341
4342 case OP_VSPACE:
4343 for (i = 1; i <= min; i++)
4344 {
4345 if (eptr >= md->end_subject)
4346 {
4347 SCHECK_PARTIAL();
4348 RRETURN(MATCH_NOMATCH);
4349 }
4350 GETCHARINC(c, eptr);
4351 switch(c)
4352 {
4353 default: RRETURN(MATCH_NOMATCH);
4354 case 0x0a: /* LF */
4355 case 0x0b: /* VT */
4356 case 0x0c: /* FF */
4357 case 0x0d: /* CR */
4358 case 0x85: /* NEL */
4359 case 0x2028: /* LINE SEPARATOR */
4360 case 0x2029: /* PARAGRAPH SEPARATOR */
4361 break;
4362 }
4363 }
4364 break;
4365
4366 case OP_NOT_DIGIT:
4367 for (i = 1; i <= min; i++)
4368 {
4369 if (eptr >= md->end_subject)
4370 {
4371 SCHECK_PARTIAL();
4372 RRETURN(MATCH_NOMATCH);
4373 }
4374 GETCHARINC(c, eptr);
4375 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4376 RRETURN(MATCH_NOMATCH);
4377 }
4378 break;
4379
4380 case OP_DIGIT:
4381 for (i = 1; i <= min; i++)
4382 {
4383 if (eptr >= md->end_subject)
4384 {
4385 SCHECK_PARTIAL();
4386 RRETURN(MATCH_NOMATCH);
4387 }
4388 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4389 RRETURN(MATCH_NOMATCH);
4390 eptr++;
4391 /* No need to skip more bytes - we know it's a 1-byte character */
4392 }
4393 break;
4394
4395 case OP_NOT_WHITESPACE:
4396 for (i = 1; i <= min; i++)
4397 {
4398 if (eptr >= md->end_subject)
4399 {
4400 SCHECK_PARTIAL();
4401 RRETURN(MATCH_NOMATCH);
4402 }
4403 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4404 RRETURN(MATCH_NOMATCH);
4405 eptr++;
4406 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4407 }
4408 break;
4409
4410 case OP_WHITESPACE:
4411 for (i = 1; i <= min; i++)
4412 {
4413 if (eptr >= md->end_subject)
4414 {
4415 SCHECK_PARTIAL();
4416 RRETURN(MATCH_NOMATCH);
4417 }
4418 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4419 RRETURN(MATCH_NOMATCH);
4420 eptr++;
4421 /* No need to skip more bytes - we know it's a 1-byte character */
4422 }
4423 break;
4424
4425 case OP_NOT_WORDCHAR:
4426 for (i = 1; i <= min; i++)
4427 {
4428 if (eptr >= md->end_subject)
4429 {
4430 SCHECK_PARTIAL();
4431 RRETURN(MATCH_NOMATCH);
4432 }
4433 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4434 RRETURN(MATCH_NOMATCH);
4435 eptr++;
4436 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4437 }
4438 break;
4439
4440 case OP_WORDCHAR:
4441 for (i = 1; i <= min; i++)
4442 {
4443 if (eptr >= md->end_subject)
4444 {
4445 SCHECK_PARTIAL();
4446 RRETURN(MATCH_NOMATCH);
4447 }
4448 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4449 RRETURN(MATCH_NOMATCH);
4450 eptr++;
4451 /* No need to skip more bytes - we know it's a 1-byte character */
4452 }
4453 break;
4454
4455 default:
4456 RRETURN(PCRE_ERROR_INTERNAL);
4457 } /* End switch(ctype) */
4458
4459 else
4460 #endif /* SUPPORT_UTF */
4461
4462 /* Code for the non-UTF-8 case for minimum matching of operators other
4463 than OP_PROP and OP_NOTPROP. */
4464
4465 switch(ctype)
4466 {
4467 case OP_ANY:
4468 for (i = 1; i <= min; i++)
4469 {
4470 if (eptr >= md->end_subject)
4471 {
4472 SCHECK_PARTIAL();
4473 RRETURN(MATCH_NOMATCH);
4474 }
4475 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4476 eptr++;
4477 }
4478 break;
4479
4480 case OP_ALLANY:
4481 if (eptr > md->end_subject - min)
4482 {
4483 SCHECK_PARTIAL();
4484 RRETURN(MATCH_NOMATCH);
4485 }
4486 eptr += min;
4487 break;
4488
4489 case OP_ANYBYTE:
4490 if (eptr > md->end_subject - min)
4491 {
4492 SCHECK_PARTIAL();
4493 RRETURN(MATCH_NOMATCH);
4494 }
4495 eptr += min;
4496 break;
4497
4498 case OP_ANYNL:
4499 for (i = 1; i <= min; i++)
4500 {
4501 if (eptr >= md->end_subject)
4502 {
4503 SCHECK_PARTIAL();
4504 RRETURN(MATCH_NOMATCH);
4505 }
4506 switch(*eptr++)
4507 {
4508 default: RRETURN(MATCH_NOMATCH);
4509
4510 case 0x000d:
4511 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4512 break;
4513
4514 case 0x000a:
4515 break;
4516
4517 case 0x000b:
4518 case 0x000c:
4519 case 0x0085:
4520 #ifdef COMPILE_PCRE16
4521 case 0x2028:
4522 case 0x2029:
4523 #endif
4524 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4525 break;
4526 }
4527 }
4528 break;
4529
4530 case OP_NOT_HSPACE:
4531 for (i = 1; i <= min; i++)
4532 {
4533 if (eptr >= md->end_subject)
4534 {
4535 SCHECK_PARTIAL();
4536 RRETURN(MATCH_NOMATCH);
4537 }
4538 switch(*eptr++)
4539 {
4540 default: break;
4541 case 0x09: /* HT */
4542 case 0x20: /* SPACE */
4543 case 0xa0: /* NBSP */
4544 #ifdef COMPILE_PCRE16
4545 case 0x1680: /* OGHAM SPACE MARK */
4546 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4547 case 0x2000: /* EN QUAD */
4548 case 0x2001: /* EM QUAD */
4549 case 0x2002: /* EN SPACE */
4550 case 0x2003: /* EM SPACE */
4551 case 0x2004: /* THREE-PER-EM SPACE */
4552 case 0x2005: /* FOUR-PER-EM SPACE */
4553 case 0x2006: /* SIX-PER-EM SPACE */
4554 case 0x2007: /* FIGURE SPACE */
4555 case 0x2008: /* PUNCTUATION SPACE */
4556 case 0x2009: /* THIN SPACE */
4557 case 0x200A: /* HAIR SPACE */
4558 case 0x202f: /* NARROW NO-BREAK SPACE */
4559 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4560 case 0x3000: /* IDEOGRAPHIC SPACE */
4561 #endif
4562 RRETURN(MATCH_NOMATCH);
4563 }
4564 }
4565 break;
4566
4567 case OP_HSPACE:
4568 for (i = 1; i <= min; i++)
4569 {
4570 if (eptr >= md->end_subject)
4571 {
4572 SCHECK_PARTIAL();
4573 RRETURN(MATCH_NOMATCH);
4574 }
4575 switch(*eptr++)
4576 {
4577 default: RRETURN(MATCH_NOMATCH);
4578 case 0x09: /* HT */
4579 case 0x20: /* SPACE */
4580 case 0xa0: /* NBSP */
4581 #ifdef COMPILE_PCRE16
4582 case 0x1680: /* OGHAM SPACE MARK */
4583 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4584 case 0x2000: /* EN QUAD */
4585 case 0x2001: /* EM QUAD */
4586 case 0x2002: /* EN SPACE */
4587 case 0x2003: /* EM SPACE */
4588 case 0x2004: /* THREE-PER-EM SPACE */
4589 case 0x2005: /* FOUR-PER-EM SPACE */
4590 case 0x2006: /* SIX-PER-EM SPACE */
4591 case 0x2007: /* FIGURE SPACE */
4592 case 0x2008: /* PUNCTUATION SPACE */
4593 case 0x2009: /* THIN SPACE */
4594 case 0x200A: /* HAIR SPACE */
4595 case 0x202f: /* NARROW NO-BREAK SPACE */
4596 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4597 case 0x3000: /* IDEOGRAPHIC SPACE */
4598 #endif
4599 break;
4600 }
4601 }
4602 break;
4603
4604 case OP_NOT_VSPACE:
4605 for (i = 1; i <= min; i++)
4606 {
4607 if (eptr >= md->end_subject)
4608 {
4609 SCHECK_PARTIAL();
4610 RRETURN(MATCH_NOMATCH);
4611 }
4612 switch(*eptr++)
4613 {
4614 default: break;
4615 case 0x0a: /* LF */
4616 case 0x0b: /* VT */
4617 case 0x0c: /* FF */
4618 case 0x0d: /* CR */
4619 case 0x85: /* NEL */
4620 #ifdef COMPILE_PCRE16
4621 case 0x2028: /* LINE SEPARATOR */
4622 case 0x2029: /* PARAGRAPH SEPARATOR */
4623 #endif
4624 RRETURN(MATCH_NOMATCH);
4625 }
4626 }
4627 break;
4628
4629 case OP_VSPACE:
4630 for (i = 1; i <= min; i++)
4631 {
4632 if (eptr >= md->end_subject)
4633 {
4634 SCHECK_PARTIAL();
4635 RRETURN(MATCH_NOMATCH);
4636 }
4637 switch(*eptr++)
4638 {
4639 default: RRETURN(MATCH_NOMATCH);
4640 case 0x0a: /* LF */
4641 case 0x0b: /* VT */
4642 case 0x0c: /* FF */
4643 case 0x0d: /* CR */
4644 case 0x85: /* NEL */
4645 #ifdef COMPILE_PCRE16
4646 case 0x2028: /* LINE SEPARATOR */
4647 case 0x2029: /* PARAGRAPH SEPARATOR */
4648 #endif
4649 break;
4650 }
4651 }
4652 break;
4653
4654 case OP_NOT_DIGIT:
4655 for (i = 1; i <= min; i++)
4656 {
4657 if (eptr >= md->end_subject)
4658 {
4659 SCHECK_PARTIAL();
4660 RRETURN(MATCH_NOMATCH);
4661 }
4662 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4663 RRETURN(MATCH_NOMATCH);
4664 eptr++;
4665 }
4666 break;
4667
4668 case OP_DIGIT:
4669 for (i = 1; i <= min; i++)
4670 {
4671 if (eptr >= md->end_subject)
4672 {
4673 SCHECK_PARTIAL();
4674 RRETURN(MATCH_NOMATCH);
4675 }
4676 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4677 RRETURN(MATCH_NOMATCH);
4678 eptr++;
4679 }
4680 break;
4681
4682 case OP_NOT_WHITESPACE:
4683 for (i = 1; i <= min; i++)
4684 {
4685 if (eptr >= md->end_subject)
4686 {
4687 SCHECK_PARTIAL();
4688 RRETURN(MATCH_NOMATCH);
4689 }
4690 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4691 RRETURN(MATCH_NOMATCH);
4692 eptr++;
4693 }
4694 break;
4695
4696 case OP_WHITESPACE:
4697 for (i = 1; i <= min; i++)
4698 {
4699 if (eptr >= md->end_subject)
4700 {
4701 SCHECK_PARTIAL();
4702 RRETURN(MATCH_NOMATCH);
4703 }
4704 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4705 RRETURN(MATCH_NOMATCH);
4706 eptr++;
4707 }
4708 break;
4709
4710 case OP_NOT_WORDCHAR:
4711 for (i = 1; i <= min; i++)
4712 {
4713 if (eptr >= md->end_subject)
4714 {
4715 SCHECK_PARTIAL();
4716 RRETURN(MATCH_NOMATCH);
4717 }
4718 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4719 RRETURN(MATCH_NOMATCH);
4720 eptr++;
4721 }
4722 break;
4723
4724 case OP_WORDCHAR:
4725 for (i = 1; i <= min; i++)
4726 {
4727 if (eptr >= md->end_subject)
4728 {
4729 SCHECK_PARTIAL();
4730 RRETURN(MATCH_NOMATCH);
4731 }
4732 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4733 RRETURN(MATCH_NOMATCH);
4734 eptr++;
4735 }
4736 break;
4737
4738 default:
4739 RRETURN(PCRE_ERROR_INTERNAL);
4740 }
4741 }
4742
4743 /* If min = max, continue at the same level without recursing */
4744
4745 if (min == max) continue;
4746
4747 /* If minimizing, we have to test the rest of the pattern before each
4748 subsequent match. Again, separate the UTF-8 case for speed, and also
4749 separate the UCP cases. */
4750
4751 if (minimize)
4752 {
4753 #ifdef SUPPORT_UCP
4754 if (prop_type >= 0)
4755 {
4756 switch(prop_type)
4757 {
4758 case PT_ANY:
4759 for (fi = min;; fi++)
4760 {
4761 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4762 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4763 if (fi >= max) RRETURN(MATCH_NOMATCH);
4764 if (eptr >= md->end_subject)
4765 {
4766 SCHECK_PARTIAL();
4767 RRETURN(MATCH_NOMATCH);
4768 }
4769 GETCHARINCTEST(c, eptr);
4770 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4771 }
4772 /* Control never gets here */
4773
4774 case PT_LAMP:
4775 for (fi = min;; fi++)
4776 {
4777 int chartype;
4778 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4780 if (fi >= max) RRETURN(MATCH_NOMATCH);
4781 if (eptr >= md->end_subject)
4782 {
4783 SCHECK_PARTIAL();
4784 RRETURN(MATCH_NOMATCH);
4785 }
4786 GETCHARINCTEST(c, eptr);
4787 chartype = UCD_CHARTYPE(c);
4788 if ((chartype == ucp_Lu ||
4789 chartype == ucp_Ll ||
4790 chartype == ucp_Lt) == prop_fail_result)
4791 RRETURN(MATCH_NOMATCH);
4792 }
4793 /* Control never gets here */
4794
4795 case PT_GC:
4796 for (fi = min;; fi++)
4797 {
4798 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4800 if (fi >= max) RRETURN(MATCH_NOMATCH);
4801 if (eptr >= md->end_subject)
4802 {
4803 SCHECK_PARTIAL();
4804 RRETURN(MATCH_NOMATCH);
4805 }
4806 GETCHARINCTEST(c, eptr);
4807 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4808 RRETURN(MATCH_NOMATCH);
4809 }
4810 /* Control never gets here */
4811
4812 case PT_PC:
4813 for (fi = min;; fi++)
4814 {
4815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4817 if (fi >= max) RRETURN(MATCH_NOMATCH);
4818 if (eptr >= md->end_subject)
4819 {
4820 SCHECK_PARTIAL();
4821 RRETURN(MATCH_NOMATCH);
4822 }
4823 GETCHARINCTEST(c, eptr);
4824 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4825 RRETURN(MATCH_NOMATCH);
4826 }
4827 /* Control never gets here */
4828
4829 case PT_SC:
4830 for (fi = min;; fi++)
4831 {
4832 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4834 if (fi >= max) RRETURN(MATCH_NOMATCH);
4835 if (eptr >= md->end_subject)
4836 {
4837 SCHECK_PARTIAL();
4838 RRETURN(MATCH_NOMATCH);
4839 }
4840 GETCHARINCTEST(c, eptr);
4841 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4842 RRETURN(MATCH_NOMATCH);
4843 }
4844 /* Control never gets here */
4845
4846 case PT_ALNUM:
4847 for (fi = min;; fi++)
4848 {
4849 int category;
4850 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4852 if (fi >= max) RRETURN(MATCH_NOMATCH);
4853 if (eptr >= md->end_subject)
4854 {
4855 SCHECK_PARTIAL();
4856 RRETURN(MATCH_NOMATCH);
4857 }
4858 GETCHARINCTEST(c, eptr);
4859 category = UCD_CATEGORY(c);
4860 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4861 RRETURN(MATCH_NOMATCH);
4862 }
4863 /* Control never gets here */
4864
4865 case PT_SPACE: /* Perl space */
4866 for (fi = min;; fi++)
4867 {
4868 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4869 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4870 if (fi >= max) RRETURN(MATCH_NOMATCH);
4871 if (eptr >= md->end_subject)
4872 {
4873 SCHECK_PARTIAL();
4874 RRETURN(MATCH_NOMATCH);
4875 }
4876 GETCHARINCTEST(c, eptr);
4877 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4878 c == CHAR_FF || c == CHAR_CR)
4879 == prop_fail_result)
4880 RRETURN(MATCH_NOMATCH);
4881 }
4882 /* Control never gets here */
4883
4884 case PT_PXSPACE: /* POSIX space */
4885 for (fi = min;; fi++)
4886 {
4887 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4889 if (fi >= max) RRETURN(MATCH_NOMATCH);
4890 if (eptr >= md->end_subject)
4891 {
4892 SCHECK_PARTIAL();
4893 RRETURN(MATCH_NOMATCH);
4894 }
4895 GETCHARINCTEST(c, eptr);
4896 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4897 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4898 == prop_fail_result)
4899 RRETURN(MATCH_NOMATCH);
4900 }
4901 /* Control never gets here */
4902
4903 case PT_WORD:
4904 for (fi = min;; fi++)
4905 {
4906 int category;
4907 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4909 if (fi >= max) RRETURN(MATCH_NOMATCH);
4910 if (eptr >= md->end_subject)
4911 {
4912 SCHECK_PARTIAL();
4913 RRETURN(MATCH_NOMATCH);
4914 }
4915 GETCHARINCTEST(c, eptr);
4916 category = UCD_CATEGORY(c);
4917 if ((category == ucp_L ||
4918 category == ucp_N ||
4919 c == CHAR_UNDERSCORE)
4920 == prop_fail_result)
4921 RRETURN(MATCH_NOMATCH);
4922 }
4923 /* Control never gets here */
4924
4925 /* This should never occur */
4926
4927 default:
4928 RRETURN(PCRE_ERROR_INTERNAL);
4929 }
4930 }
4931
4932 /* Match extended Unicode sequences. We will get here only if the
4933 support is in the binary; otherwise a compile-time error occurs. */
4934
4935 else if (ctype == OP_EXTUNI)
4936 {
4937 for (fi = min;; fi++)
4938 {
4939 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4941 if (fi >= max) RRETURN(MATCH_NOMATCH);
4942 if (eptr >= md->end_subject)
4943 {
4944 SCHECK_PARTIAL();
4945 RRETURN(MATCH_NOMATCH);
4946 }
4947 GETCHARINCTEST(c, eptr);
4948 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4949 while (eptr < md->end_subject)
4950 {
4951 int len = 1;
4952 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4953 if (UCD_CATEGORY(c) != ucp_M) break;
4954 eptr += len;
4955 }
4956 }
4957 }
4958 else
4959 #endif /* SUPPORT_UCP */
4960
4961 #ifdef SUPPORT_UTF
4962 if (utf)
4963 {
4964 for (fi = min;; fi++)
4965 {
4966 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4968 if (fi >= max) RRETURN(MATCH_NOMATCH);
4969 if (eptr >= md->end_subject)
4970 {
4971 SCHECK_PARTIAL();
4972 RRETURN(MATCH_NOMATCH);
4973 }
4974 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4975 RRETURN(MATCH_NOMATCH);
4976 GETCHARINC(c, eptr);
4977 switch(ctype)
4978 {
4979 case OP_ANY: /* This is the non-NL case */
4980 case OP_ALLANY:
4981 case OP_ANYBYTE:
4982 break;
4983
4984 case OP_ANYNL:
4985 switch(c)
4986 {
4987 default: RRETURN(MATCH_NOMATCH);
4988 case 0x000d:
4989 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4990 break;
4991 case 0x000a:
4992 break;
4993
4994 case 0x000b:
4995 case 0x000c:
4996 case 0x0085:
4997 case 0x2028:
4998 case 0x2029:
4999 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5000 break;
5001 }
5002 break;
5003
5004 case OP_NOT_HSPACE:
5005 switch(c)
5006 {
5007 default: break;
5008 case 0x09: /* HT */
5009 case 0x20: /* SPACE */
5010 case 0xa0: /* NBSP */
5011 case 0x1680: /* OGHAM SPACE MARK */
5012 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5013 case 0x2000: /* EN QUAD */
5014 case 0x2001: /* EM QUAD */
5015 case 0x2002: /* EN SPACE */
5016 case 0x2003: /* EM SPACE */
5017 case 0x2004: /* THREE-PER-EM SPACE */
5018 case 0x2005: /* FOUR-PER-EM SPACE */
5019 case 0x2006: /* SIX-PER-EM SPACE */
5020 case 0x2007: /* FIGURE SPACE */
5021 case 0x2008: /* PUNCTUATION SPACE */
5022 case 0x2009: /* THIN SPACE */
5023 case 0x200A: /* HAIR SPACE */
5024 case 0x202f: /* NARROW NO-BREAK SPACE */
5025 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5026 case 0x3000: /* IDEOGRAPHIC SPACE */
5027 RRETURN(MATCH_NOMATCH);
5028 }
5029 break;
5030
5031 case OP_HSPACE:
5032 switch(c)
5033 {
5034 default: RRETURN(MATCH_NOMATCH);
5035 case 0x09: /* HT */
5036 case 0x20: /* SPACE */
5037 case 0xa0: /* NBSP */
5038 case 0x1680: /* OGHAM SPACE MARK */
5039 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5040 case 0x2000: /* EN QUAD */
5041 case 0x2001: /* EM QUAD */
5042 case 0x2002: /* EN SPACE */
5043 case 0x2003: /* EM SPACE */
5044 case 0x2004: /* THREE-PER-EM SPACE */
5045 case 0x2005: /* FOUR-PER-EM SPACE */
5046 case 0x2006: /* SIX-PER-EM SPACE */
5047 case 0x2007: /* FIGURE SPACE */
5048 case 0x2008: /* PUNCTUATION SPACE */
5049 case 0x2009: /* THIN SPACE */
5050 case 0x200A: /* HAIR SPACE */
5051 case 0x202f: /* NARROW NO-BREAK SPACE */
5052 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5053 case 0x3000: /* IDEOGRAPHIC SPACE */
5054 break;
5055 }
5056 break;
5057
5058 case OP_NOT_VSPACE:
5059 switch(c)
5060 {
5061 default: break;
5062 case 0x0a: /* LF */
5063 case 0x0b: /* VT */
5064 case 0x0c: /* FF */
5065 case 0x0d: /* CR */
5066 case 0x85: /* NEL */
5067 case 0x2028: /* LINE SEPARATOR */
5068 case 0x2029: /* PARAGRAPH SEPARATOR */
5069 RRETURN(MATCH_NOMATCH);
5070 }
5071 break;
5072
5073 case OP_VSPACE:
5074 switch(c)
5075 {
5076 default: RRETURN(MATCH_NOMATCH);
5077 case 0x0a: /* LF */
5078 case 0x0b: /* VT */
5079 case 0x0c: /* FF */
5080 case 0x0d: /* CR */
5081 case 0x85: /* NEL */
5082 case 0x2028: /* LINE SEPARATOR */
5083 case 0x2029: /* PARAGRAPH SEPARATOR */
5084 break;
5085 }
5086 break;
5087
5088 case OP_NOT_DIGIT:
5089 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5090 RRETURN(MATCH_NOMATCH);
5091 break;
5092
5093 case OP_DIGIT:
5094 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5095 RRETURN(MATCH_NOMATCH);
5096 break;
5097
5098 case OP_NOT_WHITESPACE:
5099 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5100 RRETURN(MATCH_NOMATCH);
5101 break;
5102
5103 case OP_WHITESPACE:
5104 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5105 RRETURN(MATCH_NOMATCH);
5106 break;
5107
5108 case OP_NOT_WORDCHAR:
5109 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5110 RRETURN(MATCH_NOMATCH);
5111 break;
5112
5113 case OP_WORDCHAR:
5114 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5115 RRETURN(MATCH_NOMATCH);
5116 break;
5117
5118 default:
5119 RRETURN(PCRE_ERROR_INTERNAL);
5120 }
5121 }
5122 }
5123 else
5124 #endif
5125 /* Not UTF mode */
5126 {
5127 for (fi = min;; fi++)
5128 {
5129 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5130 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5131 if (fi >= max) RRETURN(MATCH_NOMATCH);
5132 if (eptr >= md->end_subject)
5133 {
5134 SCHECK_PARTIAL();
5135 RRETURN(MATCH_NOMATCH);
5136 }
5137 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5138 RRETURN(MATCH_NOMATCH);
5139 c = *eptr++;
5140 switch(ctype)
5141 {
5142 case OP_ANY: /* This is the non-NL case */
5143 case OP_ALLANY:
5144 case OP_ANYBYTE:
5145 break;
5146
5147 case OP_ANYNL:
5148 switch(c)
5149 {
5150 default: RRETURN(MATCH_NOMATCH);
5151 case 0x000d:
5152 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5153 break;
5154
5155 case 0x000a:
5156 break;
5157
5158 case 0x000b:
5159 case 0x000c:
5160 case 0x0085:
5161 #ifdef COMPILE_PCRE16
5162 case 0x2028:
5163 case 0x2029:
5164 #endif
5165 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5166 break;
5167 }
5168 break;
5169
5170 case OP_NOT_HSPACE:
5171 switch(c)
5172 {
5173 default: break;
5174 case 0x09: /* HT */
5175 case 0x20: /* SPACE */
5176 case 0xa0: /* NBSP */
5177 #ifdef COMPILE_PCRE16
5178 case 0x1680: /* OGHAM SPACE MARK */
5179 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5180 case 0x2000: /* EN QUAD */
5181 case 0x2001: /* EM QUAD */
5182 case 0x2002: /* EN SPACE */
5183 case 0x2003: /* EM SPACE */
5184 case 0x2004: /* THREE-PER-EM SPACE */
5185 case 0x2005: /* FOUR-PER-EM SPACE */
5186 case 0x2006: /* SIX-PER-EM SPACE */
5187 case 0x2007: /* FIGURE SPACE */
5188 case 0x2008: /* PUNCTUATION SPACE */
5189 case 0x2009: /* THIN SPACE */
5190 case 0x200A: /* HAIR SPACE */
5191 case 0x202f: /* NARROW NO-BREAK SPACE */
5192 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5193 case 0x3000: /* IDEOGRAPHIC SPACE */
5194 #endif
5195 RRETURN(MATCH_NOMATCH);
5196 }
5197 break;
5198
5199 case OP_HSPACE:
5200 switch(c)
5201 {
5202 default: RRETURN(MATCH_NOMATCH);
5203 case 0x09: /* HT */
5204 case 0x20: /* SPACE */
5205 case 0xa0: /* NBSP */
5206 #ifdef COMPILE_PCRE16
5207 case 0x1680: /* OGHAM SPACE MARK */
5208 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5209 case 0x2000: /* EN QUAD */
5210 case 0x2001: /* EM QUAD */
5211 case 0x2002: /* EN SPACE */
5212 case 0x2003: /* EM SPACE */
5213 case 0x2004: /* THREE-PER-EM SPACE */
5214 case 0x2005: /* FOUR-PER-EM SPACE */
5215 case 0x2006: /* SIX-PER-EM SPACE */
5216 case 0x2007: /* FIGURE SPACE */
5217 case 0x2008: /* PUNCTUATION SPACE */
5218 case 0x2009: /* THIN SPACE */
5219 case 0x200A: /* HAIR SPACE */
5220 case 0x202f: /* NARROW NO-BREAK SPACE */
5221 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5222 case 0x3000: /* IDEOGRAPHIC SPACE */
5223 #endif
5224 break;
5225 }
5226 break;
5227
5228 case OP_NOT_VSPACE:
5229 switch(c)
5230 {
5231 default: break;
5232 case 0x0a: /* LF */
5233 case 0x0b: /* VT */
5234 case 0x0c: /* FF */
5235 case 0x0d: /* CR */
5236 case 0x85: /* NEL */
5237 #ifdef COMPILE_PCRE16
5238 case 0x2028: /* LINE SEPARATOR */
5239 case 0x2029: /* PARAGRAPH SEPARATOR */
5240 #endif
5241 RRETURN(MATCH_NOMATCH);
5242 }
5243 break;
5244
5245 case OP_VSPACE:
5246 switch(c)
5247 {
5248 default: RRETURN(MATCH_NOMATCH);
5249 case 0x0a: /* LF */
5250 case 0x0b: /* VT */
5251 case 0x0c: /* FF */
5252 case 0x0d: /* CR */
5253 case 0x85: /* NEL */
5254 #ifdef COMPILE_PCRE16
5255 case 0x2028: /* LINE SEPARATOR */
5256 case 0x2029: /* PARAGRAPH SEPARATOR */
5257 #endif
5258 break;
5259 }
5260 break;
5261
5262 case OP_NOT_DIGIT:
5263 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5264 break;
5265
5266 case OP_DIGIT:
5267 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5268 break;
5269
5270 case OP_NOT_WHITESPACE:
5271 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5272 break;
5273
5274 case OP_WHITESPACE:
5275 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5276 break;
5277
5278 case OP_NOT_WORDCHAR:
5279 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5280 break;
5281
5282 case OP_WORDCHAR:
5283 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5284 break;
5285
5286 default:
5287 RRETURN(PCRE_ERROR_INTERNAL);
5288 }
5289 }
5290 }
5291 /* Control never gets here */
5292 }
5293
5294 /* If maximizing, it is worth using inline code for speed, doing the type
5295 test once at the start (i.e. keep it out of the loop). Again, keep the
5296 UTF-8 and UCP stuff separate. */
5297
5298 else
5299 {
5300 pp = eptr; /* Remember where we started */
5301
5302 #ifdef SUPPORT_UCP
5303 if (prop_type >= 0)
5304 {
5305 switch(prop_type)
5306 {
5307 case PT_ANY:
5308 for (i = min; i < max; i++)
5309 {
5310 int len = 1;
5311 if (eptr >= md->end_subject)
5312 {
5313 SCHECK_PARTIAL();
5314 break;
5315 }
5316 GETCHARLENTEST(c, eptr, len);
5317 if (prop_fail_result) break;
5318 eptr+= len;
5319 }
5320 break;
5321
5322 case PT_LAMP:
5323 for (i = min; i < max; i++)
5324 {
5325 int chartype;
5326 int len = 1;
5327 if (eptr >= md->end_subject)
5328 {
5329 SCHECK_PARTIAL();
5330 break;
5331 }
5332 GETCHARLENTEST(c, eptr, len);
5333 chartype = UCD_CHARTYPE(c);
5334 if ((chartype == ucp_Lu ||
5335 chartype == ucp_Ll ||
5336 chartype == ucp_Lt) == prop_fail_result)
5337 break;
5338 eptr+= len;
5339 }
5340 break;
5341
5342 case PT_GC:
5343 for (i = min; i < max; i++)
5344 {
5345 int len = 1;
5346 if (eptr >= md->end_subject)
5347 {
5348 SCHECK_PARTIAL();
5349 break;
5350 }
5351 GETCHARLENTEST(c, eptr, len);
5352 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5353 eptr+= len;
5354 }
5355 break;
5356
5357 case PT_PC:
5358 for (i = min; i < max; i++)
5359 {
5360 int len = 1;
5361 if (eptr >= md->end_subject)
5362 {
5363 SCHECK_PARTIAL();
5364 break;
5365 }
5366 GETCHARLENTEST(c, eptr, len);
5367 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5368 eptr+= len;
5369 }
5370 break;
5371
5372 case PT_SC:
5373 for (i = min; i < max; i++)
5374 {
5375 int len = 1;
5376 if (eptr >= md->end_subject)
5377 {
5378 SCHECK_PARTIAL();
5379 break;
5380 }
5381 GETCHARLENTEST(c, eptr, len);
5382 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5383 eptr+= len;
5384 }
5385 break;
5386
5387 case PT_ALNUM:
5388 for (i = min; i < max; i++)
5389 {
5390 int category;
5391 int len = 1;
5392 if (eptr >= md->end_subject)
5393 {
5394 SCHECK_PARTIAL();
5395 break;
5396 }
5397 GETCHARLENTEST(c, eptr, len);
5398 category = UCD_CATEGORY(c);
5399 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5400 break;
5401 eptr+= len;
5402 }
5403 break;
5404
5405 case PT_SPACE: /* Perl space */
5406 for (i = min; i < max; i++)
5407 {
5408 int len = 1;
5409 if (eptr >= md->end_subject)
5410 {
5411 SCHECK_PARTIAL();
5412 break;
5413 }
5414 GETCHARLENTEST(c, eptr, len);
5415 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5416 c == CHAR_FF || c == CHAR_CR)
5417 == prop_fail_result)
5418 break;
5419 eptr+= len;
5420 }
5421 break;
5422
5423 case PT_PXSPACE: /* POSIX space */
5424 for (i = min; i < max; i++)
5425 {
5426 int len = 1;
5427 if (eptr >= md->end_subject)
5428 {
5429 SCHECK_PARTIAL();
5430 break;
5431 }
5432 GETCHARLENTEST(c, eptr, len);
5433 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5434 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5435 == prop_fail_result)
5436 break;
5437 eptr+= len;
5438 }
5439 break;
5440
5441 case PT_WORD:
5442 for (i = min; i < max; i++)
5443 {
5444 int category;
5445 int len = 1;
5446 if (eptr >= md->end_subject)
5447 {
5448 SCHECK_PARTIAL();
5449 break;
5450 }
5451 GETCHARLENTEST(c, eptr, len);
5452 category = UCD_CATEGORY(c);
5453 if ((category == ucp_L || category == ucp_N ||
5454 c == CHAR_UNDERSCORE) == prop_fail_result)
5455 break;
5456 eptr+= len;
5457 }
5458 break;
5459
5460 default:
5461 RRETURN(PCRE_ERROR_INTERNAL);
5462 }
5463
5464 /* eptr is now past the end of the maximum run */
5465
5466 if (possessive) continue;
5467 for(;;)
5468 {
5469 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5470 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5471 if (eptr-- == pp) break; /* Stop if tried at original pos */
5472 if (utf) BACKCHAR(eptr);
5473 }
5474 }
5475
5476 /* Match extended Unicode sequences. We will get here only if the
5477 support is in the binary; otherwise a compile-time error occurs. */
5478
5479 else if (ctype == OP_EXTUNI)
5480 {
5481 for (i = min; i < max; i++)
5482 {
5483 int len = 1;
5484 if (eptr >= md->end_subject)
5485 {
5486 SCHECK_PARTIAL();
5487 break;
5488 }
5489 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5490 if (UCD_CATEGORY(c) == ucp_M) break;
5491 eptr += len;
5492 while (eptr < md->end_subject)
5493 {
5494 len = 1;
5495 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5496 if (UCD_CATEGORY(c) != ucp_M) break;
5497 eptr += len;
5498 }
5499 }
5500
5501 /* eptr is now past the end of the maximum run */
5502
5503 if (possessive) continue;
5504
5505 for(;;)
5506 {
5507 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5508 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5509 if (eptr-- == pp) break; /* Stop if tried at original pos */
5510 for (;;) /* Move back over one extended */
5511 {
5512 if (!utf) c = *eptr; else
5513 {
5514 BACKCHAR(eptr);
5515 GETCHAR(c, eptr);
5516 }
5517 if (UCD_CATEGORY(c) != ucp_M) break;
5518 eptr--;
5519 }
5520 }
5521 }
5522
5523 else
5524 #endif /* SUPPORT_UCP */
5525
5526 #ifdef SUPPORT_UTF
5527 if (utf)
5528 {
5529 switch(ctype)
5530 {
5531 case OP_ANY:
5532 if (max < INT_MAX)
5533 {
5534 for (i = min; i < max; i++)
5535 {
5536 if (eptr >= md->end_subject)
5537 {
5538 SCHECK_PARTIAL();
5539 break;
5540 }
5541 if (IS_NEWLINE(eptr)) break;
5542 eptr++;
5543 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5544 }
5545 }
5546
5547 /* Handle unlimited UTF-8 repeat */
5548
5549 else
5550 {
5551 for (i = min; i < max; i++)
5552 {
5553 if (eptr >= md->end_subject)
5554 {
5555 SCHECK_PARTIAL();
5556 break;
5557 }
5558 if (IS_NEWLINE(eptr)) break;
5559 eptr++;
5560 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5561 }
5562 }
5563 break;
5564
5565 case OP_ALLANY:
5566 if (max < INT_MAX)
5567 {
5568 for (i = min; i < max; i++)
5569 {
5570 if (eptr >= md->end_subject)
5571 {
5572 SCHECK_PARTIAL();
5573 break;
5574 }
5575 eptr++;
5576 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5577 }
5578 }
5579 else
5580 {
5581 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5582 SCHECK_PARTIAL();
5583 }
5584 break;
5585
5586 /* The byte case is the same as non-UTF8 */
5587
5588 case OP_ANYBYTE:
5589 c = max - min;
5590 if (c > (unsigned int)(md->end_subject - eptr))
5591 {
5592 eptr = md->end_subject;
5593 SCHECK_PARTIAL();
5594 }
5595 else eptr += c;
5596 break;
5597
5598 case OP_ANYNL:
5599 for (i = min; i < max; i++)
5600 {
5601 int len = 1;
5602 if (eptr >= md->end_subject)
5603 {
5604 SCHECK_PARTIAL();
5605 break;
5606 }
5607 GETCHARLEN(c, eptr, len);
5608 if (c == 0x000d)
5609 {
5610 if (++eptr >= md->end_subject) break;
5611 if (*eptr == 0x000a) eptr++;
5612 }
5613 else
5614 {
5615 if (c != 0x000a &&
5616 (md->bsr_anycrlf ||
5617 (c != 0x000b && c != 0x000c &&
5618 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5619 break;
5620 eptr += len;
5621 }
5622 }
5623 break;
5624
5625 case OP_NOT_HSPACE:
5626 case OP_HSPACE:
5627 for (i = min; i < max; i++)
5628 {
5629 BOOL gotspace;
5630 int len = 1;
5631 if (eptr >= md->end_subject)
5632 {
5633 SCHECK_PARTIAL();
5634 break;
5635 }
5636 GETCHARLEN(c, eptr, len);
5637 switch(c)
5638 {
5639 default: gotspace = FALSE; break;
5640 case 0x09: /* HT */
5641 case 0x20: /* SPACE */
5642 case 0xa0: /* NBSP */
5643 case 0x1680: /* OGHAM SPACE MARK */
5644 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5645 case 0x2000: /* EN QUAD */
5646 case 0x2001: /* EM QUAD */
5647 case 0x2002: /* EN SPACE */
5648 case 0x2003: /* EM SPACE */
5649 case 0x2004: /* THREE-PER-EM SPACE */
5650 case 0x2005: /* FOUR-PER-EM SPACE */
5651 case 0x2006: /* SIX-PER-EM SPACE */
5652 case 0x2007: /* FIGURE SPACE */
5653 case 0x2008: /* PUNCTUATION SPACE */
5654 case 0x2009: /* THIN SPACE */
5655 case 0x200A: /* HAIR SPACE */
5656 case 0x202f: /* NARROW NO-BREAK SPACE */
5657 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5658 case 0x3000: /* IDEOGRAPHIC SPACE */
5659 gotspace = TRUE;
5660 break;
5661 }
5662 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5663 eptr += len;
5664 }
5665 break;
5666
5667 case OP_NOT_VSPACE:
5668 case OP_VSPACE:
5669 for (i = min; i < max; i++)
5670 {
5671 BOOL gotspace;
5672 int len = 1;
5673 if (eptr >= md->end_subject)
5674 {
5675 SCHECK_PARTIAL();
5676 break;
5677 }
5678 GETCHARLEN(c, eptr, len);
5679 switch(c)
5680 {
5681 default: gotspace = FALSE; break;
5682 case 0x0a: /* LF */
5683 case 0x0b: /* VT */
5684 case 0x0c: /* FF */
5685 case 0x0d: /* CR */
5686 case 0x85: /* NEL */
5687 case 0x2028: /* LINE SEPARATOR */
5688 case 0x2029: /* PARAGRAPH SEPARATOR */
5689 gotspace = TRUE;
5690 break;
5691 }
5692 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5693 eptr += len;
5694 }
5695 break;
5696
5697 case OP_NOT_DIGIT:
5698 for (i = min; i < max; i++)
5699 {
5700 int len = 1;
5701 if (eptr >= md->end_subject)
5702 {
5703 SCHECK_PARTIAL();
5704 break;
5705 }
5706 GETCHARLEN(c, eptr, len);
5707 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5708 eptr+= len;
5709 }
5710 break;
5711
5712 case OP_DIGIT:
5713 for (i = min; i < max; i++)
5714 {
5715 int len = 1;
5716 if (eptr >= md->end_subject)
5717 {
5718 SCHECK_PARTIAL();
5719 break;
5720 }
5721 GETCHARLEN(c, eptr, len);
5722 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5723 eptr+= len;
5724 }
5725 break;
5726
5727 case OP_NOT_WHITESPACE:
5728 for (i = min; i < max; i++)
5729 {
5730 int len = 1;
5731 if (eptr >= md->end_subject)
5732 {
5733 SCHECK_PARTIAL();
5734 break;
5735 }
5736 GETCHARLEN(c, eptr, len);
5737 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5738 eptr+= len;
5739 }
5740 break;
5741
5742 case OP_WHITESPACE:
5743 for (i = min; i < max; i++)
5744 {
5745 int len = 1;
5746 if (eptr >= md->end_subject)
5747 {
5748 SCHECK_PARTIAL();
5749 break;
5750 }
5751 GETCHARLEN(c, eptr, len);
5752 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5753 eptr+= len;
5754 }
5755 break;
5756
5757 case OP_NOT_WORDCHAR:
5758 for (i = min; i < max; i++)
5759 {
5760 int len = 1;
5761 if (eptr >= md->end_subject)
5762 {
5763 SCHECK_PARTIAL();
5764 break;
5765 }
5766 GETCHARLEN(c, eptr, len);
5767 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5768 eptr+= len;
5769 }
5770 break;
5771
5772 case OP_WORDCHAR:
5773 for (i = min; i < max; i++)
5774 {
5775 int len = 1;
5776 if (eptr >= md->end_subject)
5777 {
5778 SCHECK_PARTIAL();
5779 break;
5780 }
5781 GETCHARLEN(c, eptr, len);
5782 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5783 eptr+= len;
5784 }
5785 break;
5786
5787 default:
5788 RRETURN(PCRE_ERROR_INTERNAL);
5789 }
5790
5791 /* eptr is now past the end of the maximum run. If possessive, we are
5792 done (no backing up). Otherwise, match at this position; anything other
5793 than no match is immediately returned. For nomatch, back up one
5794 character, unless we are matching \R and the last thing matched was
5795 \r\n, in which case, back up two bytes. */
5796
5797 if (possessive) continue;
5798 for(;;)
5799 {
5800 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5802 if (eptr-- == pp) break; /* Stop if tried at original pos */
5803 BACKCHAR(eptr);
5804 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5805 eptr[-1] == '\r') eptr--;
5806 }
5807 }
5808 else
5809 #endif /* SUPPORT_UTF */
5810 /* Not UTF mode */
5811 {
5812 switch(ctype)
5813 {
5814 case OP_ANY:
5815 for (i = min; i < max; i++)
5816 {
5817 if (eptr >= md->end_subject)
5818 {
5819 SCHECK_PARTIAL();
5820 break;
5821 }
5822 if (IS_NEWLINE(eptr)) break;
5823 eptr++;
5824 }
5825 break;
5826
5827 case OP_ALLANY:
5828 case OP_ANYBYTE:
5829 c = max - min;
5830 if (c > (unsigned int)(md->end_subject - eptr))
5831 {
5832 eptr = md->end_subject;
5833 SCHECK_PARTIAL();
5834 }
5835 else eptr += c;
5836 break;
5837
5838 case OP_ANYNL:
5839 for (i = min; i < max; i++)
5840 {
5841 if (eptr >= md->end_subject)
5842 {
5843 SCHECK_PARTIAL();
5844 break;
5845 }
5846 c = *eptr;
5847 if (c == 0x000d)
5848 {
5849 if (++eptr >= md->end_subject) break;
5850 if (*eptr == 0x000a) eptr++;
5851 }
5852 else
5853 {
5854 if (c != 0x000a && (md->bsr_anycrlf ||
5855 (c != 0x000b && c != 0x000c && c != 0x0085
5856 #ifdef COMPILE_PCRE16
5857 && c != 0x2028 && c != 0x2029
5858 #endif
5859 ))) break;
5860 eptr++;
5861 }
5862 }
5863 break;
5864
5865 case OP_NOT_HSPACE:
5866 for (i = min; i < max; i++)
5867 {
5868 if (eptr >= md->end_subject)
5869 {
5870 SCHECK_PARTIAL();
5871 break;
5872 }
5873 c = *eptr;
5874 if (c == 0x09 || c == 0x20 || c == 0xa0
5875 #ifdef COMPILE_PCRE16
5876 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5877 || c == 0x202f || c == 0x205f || c == 0x3000
5878 #endif
5879 ) break;
5880 eptr++;
5881 }
5882 break;
5883
5884 case OP_HSPACE:
5885 for (i = min; i < max; i++)
5886 {
5887 if (eptr >= md->end_subject)
5888 {
5889 SCHECK_PARTIAL();
5890 break;
5891 }
5892 c = *eptr;
5893 if (c != 0x09 && c != 0x20 && c != 0xa0
5894 #ifdef COMPILE_PCRE16
5895 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5896 && c != 0x202f && c != 0x205f && c != 0x3000
5897 #endif
5898 ) break;
5899 eptr++;
5900 }
5901 break;
5902
5903 case OP_NOT_VSPACE:
5904 for (i = min; i < max; i++)
5905 {
5906 if (eptr >= md->end_subject)
5907 {
5908 SCHECK_PARTIAL();
5909 break;
5910 }
5911 c = *eptr;
5912 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5913 #ifdef COMPILE_PCRE16
5914 || c == 0x2028 || c == 0x2029
5915 #endif
5916 ) break;
5917 eptr++;
5918 }
5919 break;
5920
5921 case OP_VSPACE:
5922 for (i = min; i < max; i++)
5923 {
5924 if (eptr >= md->end_subject)
5925 {
5926 SCHECK_PARTIAL();
5927 break;
5928 }
5929 c = *eptr;
5930 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5931 #ifdef COMPILE_PCRE16
5932 && c != 0x2028 && c != 0x2029
5933 #endif
5934 ) break;
5935 eptr++;
5936 }
5937 break;
5938
5939 case OP_NOT_DIGIT:
5940 for (i = min; i < max; i++)
5941 {
5942 if (eptr >= md->end_subject)
5943 {
5944 SCHECK_PARTIAL();
5945 break;
5946 }
5947 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5948 eptr++;
5949 }
5950 break;
5951
5952 case OP_DIGIT:
5953 for (i = min; i < max; i++)
5954 {
5955 if (eptr >= md->end_subject)
5956 {
5957 SCHECK_PARTIAL();
5958 break;
5959 }
5960 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5961 eptr++;
5962 }
5963 break;
5964
5965 case OP_NOT_WHITESPACE:
5966 for (i = min; i < max; i++)
5967 {
5968 if (eptr >= md->end_subject)
5969 {
5970 SCHECK_PARTIAL();
5971 break;
5972 }
5973 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5974 eptr++;
5975 }
5976 break;
5977
5978 case OP_WHITESPACE:
5979 for (i = min; i < max; i++)
5980 {
5981 if (eptr >= md->end_subject)
5982 {
5983 SCHECK_PARTIAL();
5984 break;
5985 }
5986 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5987 eptr++;
5988 }
5989 break;
5990
5991 case OP_NOT_WORDCHAR:
5992 for (i = min; i < max; i++)
5993 {
5994 if (eptr >= md->end_subject)
5995 {
5996 SCHECK_PARTIAL();
5997 break;
5998 }
5999 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6000 eptr++;
6001 }
6002 break;
6003
6004 case OP_WORDCHAR:
6005 for (i = min; i < max; i++)
6006 {
6007 if (eptr >= md->end_subject)
6008 {
6009 SCHECK_PARTIAL();
6010 break;
6011 }
6012 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6013 eptr++;
6014 }
6015 break;
6016
6017 default:
6018 RRETURN(PCRE_ERROR_INTERNAL);
6019 }
6020
6021 /* eptr is now past the end of the maximum run. If possessive, we are
6022 done (no backing up). Otherwise, match at this position; anything other
6023 than no match is immediately returned. For nomatch, back up one
6024 character (byte), unless we are matching \R and the last thing matched
6025 was \r\n, in which case, back up two bytes. */
6026
6027 if (possessive) continue;
6028 while (eptr >= pp)
6029 {
6030 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6031 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6032 eptr--;
6033 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6034 eptr[-1] == '\r') eptr--;
6035 }
6036 }
6037
6038 /* Get here if we can't make it match with any permitted repetitions */
6039
6040 RRETURN(MATCH_NOMATCH);
6041 }
6042 /* Control never gets here */
6043
6044 /* There's been some horrible disaster. Arrival here can only mean there is
6045 something seriously wrong in the code above or the OP_xxx definitions. */
6046
6047 default:
6048 DPRINTF(("Unknown opcode %d\n", *ecode));
6049 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6050 }
6051
6052 /* Do not stick any code in here without much thought; it is assumed
6053 that "continue" in the code above comes out to here to repeat the main
6054 loop. */
6055
6056 } /* End of main loop */
6057 /* Control never reaches here */
6058
6059
6060 /* When compiling to use the heap rather than the stack for recursive calls to
6061 match(), the RRETURN() macro jumps here. The number that is saved in
6062 frame->Xwhere indicates which label we actually want to return to. */
6063
6064 #ifdef NO_RECURSE
6065 #define LBL(val) case val: goto L_RM##val;
6066 HEAP_RETURN:
6067 switch (frame->Xwhere)
6068 {
6069 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6070 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6071 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6072 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6073 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6074 LBL(65) LBL(66)
6075 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6076 LBL(21)
6077 #endif
6078 #ifdef SUPPORT_UTF
6079 LBL(16) LBL(18) LBL(20)
6080 LBL(22) LBL(23) LBL(28) LBL(30)
6081 LBL(32) LBL(34) LBL(42) LBL(46)
6082 #ifdef SUPPORT_UCP
6083 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6084 LBL(59) LBL(60) LBL(61) LBL(62)
6085 #endif /* SUPPORT_UCP */
6086 #endif /* SUPPORT_UTF */
6087 default:
6088 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6089
6090 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6091
6092 return PCRE_ERROR_INTERNAL;
6093 }
6094 #undef LBL
6095 #endif /* NO_RECURSE */
6096 }
6097
6098
6099 /***************************************************************************
6100 ****************************************************************************
6101 RECURSION IN THE match() FUNCTION
6102
6103 Undefine all the macros that were defined above to handle this. */
6104
6105 #ifdef NO_RECURSE /* heap-based recursion build: release the names that were macros inside match() */
6106 #undef eptr
6107 #undef ecode
6108 #undef mstart
6109 #undef offset_top
6110 #undef eptrb
6111 #undef flags
6112 /* Some of these names are reused as ordinary identifiers in pcre_exec() below: parameter "length", locals "flags" and "pp" */
6113 #undef callpat
6114 #undef charptr
6115 #undef data
6116 #undef next
6117 #undef pp
6118 #undef prev
6119 #undef saved_eptr
6120 /* NOTE(review): the blank-line grouping presumably mirrors the matching #define list, which is outside this view - confirm */
6121 #undef new_recursive
6122
6123 #undef cur_is_word
6124 #undef condition
6125 #undef prev_is_word
6126
6127 #undef ctype
6128 #undef length
6129 #undef max
6130 #undef min
6131 #undef number
6132 #undef offset
6133 #undef op
6134 #undef save_capture_last
6135 #undef save_offset1
6136 #undef save_offset2
6137 #undef save_offset3
6138 #undef stacksave
6139
6140 #undef newptrb
6141
6142 #endif /* NO_RECURSE */
6143
6144 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined outside the #ifdef above */
6145
6146 #undef fc
6147 #undef fi
6148
6149 /***************************************************************************
6150 ***************************************************************************/
6151
6152
6153
6154 /*************************************************
6155 * Execute a Regular Expression *
6156 *************************************************/
6157
6158 /* This function applies a compiled re to a subject string and picks out
6159 portions of the string if it matches. Two elements in the vector are set for
6160 each substring: the offsets to the start and end of the substring.
6161
6162 Arguments:
6163 argument_re points to the compiled expression
6164 extra_data points to extra data or is NULL
6165 subject points to the subject string
6166 length length of subject string (may contain binary zeros)
6167 start_offset where to start in the subject string
6168 options option bits
6169 offsets points to a vector of ints to be filled in with offsets
6170 offsetcount the number of elements in the vector
6171
6172 Returns: > 0 => success; value is the number of elements filled in
6173 = 0 => success, but offsets is not big enough
6174 -1 => failed to match
6175 < -1 => some kind of unexpected problem
6176 */
6177
6178 #ifdef COMPILE_PCRE8
6179 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6180 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6181 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6182 int offsetcount)
6183 #else
6184 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6185 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6186 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6187 int offsetcount)
6188 #endif
6189 {
6190 int rc, ocount, arg_offset_max;
6191 int newline;
6192 BOOL using_temporary_offsets = FALSE;
6193 BOOL anchored;
6194 BOOL startline;
6195 BOOL firstline;
6196 BOOL utf;
6197 BOOL has_first_char = FALSE;
6198 BOOL has_req_char = FALSE;
6199 pcre_uchar first_char = 0;
6200 pcre_uchar first_char2 = 0;
6201 pcre_uchar req_char = 0;
6202 pcre_uchar req_char2 = 0;
6203 match_data match_block;
6204 match_data *md = &match_block;
6205 const pcre_uint8 *tables;
6206 const pcre_uint8 *start_bits = NULL;
6207 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6208 PCRE_PUCHAR end_subject;
6209 PCRE_PUCHAR start_partial = NULL;
6210 PCRE_PUCHAR req_char_ptr = start_match - 1;
6211
6212 const pcre_study_data *study;
6213 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6214
6215 /* Check for the special magic call that measures the size of the stack used
6216 per recursive call of match(). */
6217
6218 if (re == NULL && extra_data == NULL && subject == NULL && length == -1)
6219 #ifdef NO_RECURSE
6220 return -sizeof(heapframe);
6221 #else
6222 return match((PCRE_PUCHAR)&start_partial, NULL, NULL, 0, NULL, NULL, 0);
6223 #endif
6224
6225 /* Plausibility checks */
6226
6227 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6228 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6229 return PCRE_ERROR_NULL;
6230 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6231 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6232
6233 /* Check that the first field in the block is the magic number. If it is not,
6234 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6235 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6236 means that the pattern is likely compiled with different endianness. */
6237
6238 if (re->magic_number != MAGIC_NUMBER)
6239 return re->magic_number == REVERSED_MAGIC_NUMBER?
6240 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6241 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6242
6243 /* These two settings are used in the code for checking a UTF-8 string that
6244 follows immediately afterwards. Other values in the md block are used only
6245 during "normal" pcre_exec() processing, not when the JIT support is in use,
6246 so they are set up later. */
6247
6248 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6249 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6250 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6251 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6252
6253 /* Check a UTF-8 string if required. Pass back the character offset and error
6254 code for an invalid string if a results vector is available. */
6255
6256 #ifdef SUPPORT_UTF
6257 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6258 {
6259 int erroroffset;
6260 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6261 if (errorcode != 0)
6262 {
6263 if (offsetcount >= 2)
6264 {
6265 offsets[0] = erroroffset;
6266 offsets[1] = errorcode;
6267 }
6268 #ifdef COMPILE_PCRE16
6269 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6270 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6271 #else
6272 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6273 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6274 #endif
6275 }
6276
6277 /* Check that a start_offset points to the start of a UTF character. */
6278 if (start_offset > 0 && start_offset < length &&
6279 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6280 return PCRE_ERROR_BADUTF8_OFFSET;
6281 }
6282 #endif
6283
6284 /* If the pattern was successfully studied with JIT support, run the JIT
6285 executable instead of the rest of this function. Most options must be set at
6286 compile time for the JIT code to be usable. Fall back to the normal code path if
6287 an unsupported flag is set. In particular, JIT does not support partial
6288 matching. */
6289
6290 #ifdef SUPPORT_JIT
6291 if (extra_data != NULL
6292 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6293 && extra_data->executable_jit != NULL
6294 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6295 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6296 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6297 return PRIV(jit_exec)(re, extra_data->executable_jit,
6298 (const pcre_uchar *)subject, length, start_offset, options,
6299 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6300 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6301 #endif
6302
6303 /* Carry on with non-JIT matching. This information is for finding all the
6304 numbers associated with a given name, for condition testing. */
6305
6306 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6307 md->name_count = re->name_count;
6308 md->name_entry_size = re->name_entry_size;
6309
6310 /* Fish out the optional data from the extra_data structure, first setting
6311 the default values. */
6312
6313 study = NULL;
6314 md->match_limit = MATCH_LIMIT;
6315 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6316 md->callout_data = NULL;
6317
6318 /* The table pointer is always in native byte order. */
6319
6320 tables = re->tables;
6321
6322 if (extra_data != NULL)
6323 {
6324 register unsigned int flags = extra_data->flags;
6325 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6326 study = (const pcre_study_data *)extra_data->study_data;
6327 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6328 md->match_limit = extra_data->match_limit;
6329 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6330 md->match_limit_recursion = extra_data->match_limit_recursion;
6331 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6332 md->callout_data = extra_data->callout_data;
6333 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6334 }
6335
6336 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6337 is a feature that makes it possible to save compiled regex and re-use them
6338 in other programs later. */
6339
6340 if (tables == NULL) tables = PRIV(default_tables);
6341
6342 /* Set up other data */
6343
6344 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6345 startline = (re->flags & PCRE_STARTLINE) != 0;
6346 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6347
6348 /* The code starts after the real_pcre block and the capture name table. */
6349
6350 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6351 re->name_count * re->name_entry_size;
6352
6353 md->start_subject = (PCRE_PUCHAR)subject;
6354 md->start_offset = start_offset;
6355 md->end_subject = md->start_subject + length;
6356 end_subject = md->end_subject;
6357
6358 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6359 md->use_ucp = (re->options & PCRE_UCP) != 0;
6360 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6361 md->ignore_skip_arg = FALSE;
6362
6363 /* Some options are unpacked into BOOL variables in the hope that testing
6364 them will be faster than individual option bits. */
6365
6366 md->notbol = (options & PCRE_NOTBOL) != 0;
6367 md->noteol = (options & PCRE_NOTEOL) != 0;
6368 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6369 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6370
6371 md->hitend = FALSE;
6372 md->mark = md->nomatch_mark = NULL; /* In case never set */
6373
6374 md->recursive = NULL; /* No recursion at top level */
6375 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6376
6377 md->lcc = tables + lcc_offset;
6378 md->fcc = tables + fcc_offset;
6379 md->ctypes = tables + ctypes_offset;
6380
6381 /* Handle different \R options. */
6382
6383 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6384 {
6385 case 0:
6386 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6387 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6388 else
6389 #ifdef BSR_ANYCRLF
6390 md->bsr_anycrlf = TRUE;
6391 #else
6392 md->bsr_anycrlf = FALSE;
6393 #endif
6394 break;
6395
6396 case PCRE_BSR_ANYCRLF:
6397 md->bsr_anycrlf = TRUE;
6398 break;
6399
6400 case PCRE_BSR_UNICODE:
6401 md->bsr_anycrlf = FALSE;
6402 break;
6403
6404 default: return PCRE_ERROR_BADNEWLINE;
6405 }
6406
6407 /* Handle different types of newline. The three bits give eight cases. If
6408 nothing is set at run time, whatever was used at compile time applies. */
6409
6410 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6411 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6412 {
6413 case 0: newline = NEWLINE; break; /* Compile-time default */
6414 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6415 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6416 case PCRE_NEWLINE_CR+
6417 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6418 case PCRE_NEWLINE_ANY: newline = -1; break;
6419 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6420 default: return PCRE_ERROR_BADNEWLINE;
6421 }
6422
6423 if (newline == -2)
6424 {
6425 md->nltype = NLTYPE_ANYCRLF;
6426 }
6427 else if (newline < 0)
6428 {
6429 md->nltype = NLTYPE_ANY;
6430 }
6431 else
6432 {
6433 md->nltype = NLTYPE_FIXED;
6434 if (newline > 255)
6435 {
6436 md->nllen = 2;
6437 md->nl[0] = (newline >> 8) & 255;
6438 md->nl[1] = newline & 255;
6439 }
6440 else
6441 {
6442 md->nllen = 1;
6443 md->nl[0] = newline;
6444 }
6445 }
6446
6447 /* Partial matching was originally supported only for a restricted set of
6448 regexes; from release 8.00 there are no restrictions, but the bits are still
6449 defined (though never set). So there's no harm in leaving this code. */
6450
6451 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6452 return PCRE_ERROR_BADPARTIAL;
6453
6454 /* If the expression has got more back references than the offsets supplied can
6455 hold, we get a temporary chunk of working store to use during the matching.
6456 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6457 of 3. */
6458
6459 ocount = offsetcount - (offsetcount % 3);
6460 arg_offset_max = (2*ocount)/3;
6461
6462 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6463 {
6464 ocount = re->top_backref * 3 + 3;
6465 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6466 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6467 using_temporary_offsets = TRUE;
6468 DPRINTF(("Got memory to hold back references\n"));
6469 }
6470 else md->offset_vector = offsets;
6471
6472 md->offset_end = ocount;
6473 md->offset_max = (2*ocount)/3;
6474 md->offset_overflow = FALSE;
6475 md->capture_last = -1;
6476
6477 /* Reset the working variable associated with each extraction. These should
6478 never be used unless previously set, but they get saved and restored, and so we
6479 initialize them to avoid reading uninitialized locations. Also, unset the
6480 offsets for the matched string. This is really just for tidiness with callouts,
6481 in case they inspect these fields. */
6482
6483 if (md->offset_vector != NULL)
6484 {
6485 register int *iptr = md->offset_vector + ocount;
6486 register int *iend = iptr - re->top_bracket;
6487 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6488 while (--iptr >= iend) *iptr = -1;
6489 md->offset_vector[0] = md->offset_vector[1] = -1;
6490 }
6491
6492 /* Set up the first character to match, if available. The first_char value is
6493 never set for an anchored regular expression, but the anchoring may be forced
6494 at run time, so we have to test for anchoring. The first char may be unset for
6495 an unanchored pattern, of course. If there's no first char and the pattern was
6496 studied, there may be a bitmap of possible first characters. */
6497
6498 if (!anchored)
6499 {
6500 if ((re->flags & PCRE_FIRSTSET) != 0)
6501 {
6502 has_first_char = TRUE;
6503 first_char = first_char2 = re->first_char;
6504 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6505 {
6506 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6507 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6508 if (utf && first_char > 127)
6509 first_char2 = UCD_OTHERCASE(first_char);
6510 #endif
6511 }
6512 }
6513 else
6514 if (!startline && study != NULL &&
6515 (study->flags & PCRE_STUDY_MAPPED) != 0)
6516 start_bits = study->start_bits;
6517 }
6518
6519 /* For anchored or unanchored matches, there may be a "last known required
6520 character" set. */
6521
6522 if ((re->flags & PCRE_REQCHSET) != 0)
6523 {
6524 has_req_char = TRUE;
6525 req_char = req_char2 = re->req_char;
6526 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6527 {
6528 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6529 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6530 if (utf && req_char > 127)
6531 req_char2 = UCD_OTHERCASE(req_char);
6532 #endif
6533 }
6534 }
6535
6536
6537 /* ==========================================================================*/
6538
6539 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6540 the loop runs just once. */
6541
6542 for(;;)
6543 {
6544 PCRE_PUCHAR save_end_subject = end_subject;
6545 PCRE_PUCHAR new_start_match;
6546
6547 /* If firstline is TRUE, the start of the match is constrained to the first
6548 line of a multiline string. That is, the match must be before or at the first
6549 newline. Implement this by temporarily adjusting end_subject so that we stop
6550 scanning at a newline. If the match fails at the newline, later code breaks
6551 this loop. */
6552
6553 if (firstline)
6554 {
6555 PCRE_PUCHAR t = start_match;
6556 #ifdef SUPPORT_UTF
6557 if (utf)
6558 {
6559 while (t < md->end_subject && !IS_NEWLINE(t))
6560 {
6561 t++;
6562 ACROSSCHAR(t < end_subject, *t, t++);
6563 }
6564 }
6565 else
6566 #endif
6567 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6568 end_subject = t;
6569 }
6570
6571 /* There are some optimizations that avoid running the match if a known
6572 starting point is not found, or if a known later character is not present.
6573 However, there is an option that disables these, for testing and for ensuring
6574 that all callouts do actually occur. The option can be set in the regex by
6575 (*NO_START_OPT) or passed in match-time options. */
6576
6577 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6578 {
6579 /* Advance to a unique first char if there is one. */
6580
6581 if (has_first_char)
6582 {
6583 if (first_char != first_char2)
6584 while (start_match < end_subject &&
6585 *start_match != first_char && *start_match != first_char2)
6586 start_match++;
6587 else
6588 while (start_match < end_subject && *start_match != first_char)
6589 start_match++;
6590 }
6591
6592 /* Or to just after a linebreak for a multiline match */
6593
6594 else if (startline)
6595 {
6596 if (start_match > md->start_subject + start_offset)
6597 {
6598 #ifdef SUPPORT_UTF
6599 if (utf)
6600 {
6601 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6602 {
6603 start_match++;
6604 ACROSSCHAR(start_match < end_subject, *start_match,
6605 start_match++);
6606 }
6607 }
6608 else
6609 #endif
6610 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6611 start_match++;
6612
6613 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6614 and we are now at a LF, advance the match position by one more character.
6615 */
6616
6617 if (start_match[-1] == CHAR_CR &&
6618 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6619 start_match < end_subject &&
6620 *start_match == CHAR_NL)
6621 start_match++;
6622 }
6623 }
6624
6625 /* Or to a non-unique first byte after study */
6626
6627 else if (start_bits != NULL)
6628 {
6629 while (start_match < end_subject)
6630 {
6631 register unsigned int c = *start_match;
6632 #ifndef COMPILE_PCRE8
6633 if (c > 255) c = 255;
6634 #endif
6635 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6636 {
6637 start_match++;
6638 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6639 /* In non 8-bit mode, the iteration will stop for
6640 characters > 255 at the beginning or not stop at all. */
6641 if (utf)
6642 ACROSSCHAR(start_match < end_subject, *start_match,
6643 start_match++);
6644 #endif
6645 }
6646 else break;
6647 }
6648 }
6649 } /* Starting optimizations */
6650
6651 /* Restore fudged end_subject */
6652
6653 end_subject = save_end_subject;
6654
6655 /* The following two optimizations are disabled for partial matching or if
6656 disabling is explicitly requested. */
6657
6658 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6659 {
6660 /* If the pattern was studied, a minimum subject length may be set. This is
6661 a lower bound; there may be no actual string of that length that matches the
6662 pattern. Although the value is, strictly, in characters, we treat it as
6663 bytes to avoid spending too much time in this optimization. */
6664
6665 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6666 (pcre_uint32)(end_subject - start_match) < study->minlength)
6667 {
6668 rc = MATCH_NOMATCH;
6669 break;
6670 }
6671
6672 /* If req_char is set, we know that that character must appear in the
6673 subject for the match to succeed. If the first character is set, req_char
6674 must be later in the subject; otherwise the test starts at the match point.
6675 This optimization can save a huge amount of backtracking in patterns with
6676 nested unlimited repeats that aren't going to match. Writing separate code
6677 for cased/caseless versions makes it go faster, as does using an
6678 autoincrement and backing off on a match.
6679
6680 HOWEVER: when the subject string is very, very long, searching to its end
6681 can take a long time, and give bad performance on quite ordinary patterns.
6682 This showed up when somebody was matching something like /^\d+C/ on a
6683 32-megabyte string... so we don't do this when the string is sufficiently
6684 long. */
6685
6686 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6687 {
6688 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6689
6690 /* We don't need to repeat the search if we haven't yet reached the
6691 place we found it at last time. */
6692
6693 if (p > req_char_ptr)
6694 {
6695 if (req_char != req_char2)
6696 {
6697 while (p < end_subject)
6698 {
6699 register int pp = *p++;
6700 if (pp == req_char || pp == req_char2) { p--; break; }
6701 }
6702 }
6703 else
6704 {
6705 while (p < end_subject)
6706 {
6707 if (*p++ == req_char) { p--; break; }
6708 }
6709 }
6710
6711 /* If we can't find the required character, break the matching loop,
6712 forcing a match failure. */
6713
6714 if (p >= end_subject)
6715 {
6716 rc = MATCH_NOMATCH;
6717 break;
6718 }
6719
6720 /* If we have found the required character, save the point where we
6721 found it, so that we don't search again next time round the loop if
6722 the start hasn't passed this character yet. */
6723
6724 req_char_ptr = p;
6725 }
6726 }
6727 }
6728
6729 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6730 printf(">>>> Match against: ");
6731 pchars(start_match, end_subject - start_match, TRUE, md);
6732 printf("\n");
6733 #endif
6734
6735 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6736 first starting point for which a partial match was found. */
6737
6738 md->start_match_ptr = start_match;
6739 md->start_used_ptr = start_match;
6740 md->match_call_count = 0;
6741 md->match_function_type = 0;
6742 md->end_offset_top = 0;
6743 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6744 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6745<