/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 913 - (show annotations)
Sun Feb 12 17:06:59 2012 UTC (3 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 213000 byte(s)
Error occurred while calculating annotation data.
Add a cast to fix a compiler warning.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* minimum count per common repeat; NOTE(review): entry order looks like *, *?, +, +?, ?, ?? - confirm at the use sites */
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* maximum count, same entry order as rep_min; 0 here stands for "no upper limit" */
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
267 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268 below must be updated in sync. */
269
270 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, /* one distinct tag per RMATCH call site, counting up from 1 */
271 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
275 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
276 RM61, RM62, RM63, RM64, RM65, RM66 }; /* the heap-frame RMATCH pastes the tag into a label name (L_##rw) and stores it in Xwhere */
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
283 #define REGISTER register
284
285 #ifdef PCRE_DEBUG
286 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 { \
288 printf("match() called in line %d\n", __LINE__); \
289 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 printf("to line %d\n", __LINE__); \
291 }
292 #define RRETURN(ra) \
293 { \
294 printf("match() returned %d from line %d ", ra, __LINE__); \
295 return ra; \
296 }
297 #else
298 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 #define RRETURN(ra) return ra
301 #endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes. */
309
310 #define REGISTER
311
312 #define RMATCH(ra,rb,rc,rd,re,rw)\
313 {\
314 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe)); /* frame for the simulated callee */\
315 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 frame->Xwhere = rw; /* the calling frame records which call site this is */ \
317 newframe->Xeptr = ra; /* the remaining assignments play the role of argument passing */\
318 newframe->Xecode = rb;\
319 newframe->Xmstart = mstart;\
320 newframe->Xoffset_top = rc;\
321 newframe->Xeptrb = re;\
322 newframe->Xrdepth = frame->Xrdepth + 1; /* one level deeper than the calling frame */\
323 newframe->Xprevframe = frame; /* link back to the calling frame */\
324 frame = newframe; /* the new frame becomes the current one */\
325 DPRINTF(("restarting from line %d\n", __LINE__));\
326 goto HEAP_RECURSE; /* re-enter the function body using the new frame */\
327 L_##rw: /* per-call-site label; NOTE(review): presumably reached from the HEAP_RETURN dispatch on Xwhere, which is not visible here */\
328 DPRINTF(("jumped back to line %d\n", __LINE__));\
329 }
330
331 #define RRETURN(ra)\
332 {\
333 heapframe *oldframe = frame;\
334 frame = oldframe->Xprevframe; /* step back to the calling frame (NULL at the top level) */\
335 if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe); /* frame_zero is not freed; every other frame came from stack_malloc */\
336 if (frame != NULL) /* there is a calling frame: hand the result back in rrc and resume it */\
337 {\
338 rrc = ra;\
339 goto HEAP_RETURN;\
340 }\
341 return ra; /* no calling frame left: this is the real return from the function */\
342 }
343
344
345 /* Structure for remembering the local variables in a private frame */
346
347 typedef struct heapframe {
348 struct heapframe *Xprevframe; /* calling frame; NULL marks the top level */
349
350 /* Function arguments that may change */
351
352 PCRE_PUCHAR Xeptr; /* current character in the subject */
353 const pcre_uchar *Xecode; /* current position in the compiled code */
354 PCRE_PUCHAR Xmstart; /* current match start position */
355 int Xoffset_top; /* current top pointer */
356 eptrblock *Xeptrb; /* chain of blocks holding eptr at the start of brackets */
357 unsigned int Xrdepth; /* recursion depth; RMATCH sets it to the caller's depth + 1 */
358
359 /* Function local variables */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata; /* also used under the alias save_mark */
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
373 BOOL Xcur_is_word; /* also used under the alias allow_zero */
374 BOOL Xcondition; /* also used under the aliases cbegroup and condassert */
375 BOOL Xprev_is_word; /* also used under the alias matched_once */
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink; /* also used under the alias code_offset */
386 int Xctype;
387 unsigned int Xfc; /* frame copy of fc; in the recursive build fc is just the local c */
388 int Xfi; /* frame copy of fi; in the recursive build fi is just the local i */
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber; /* also used under the alias foc */
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX]; /* room for up to REC_STACK_SAVE_MAX saved offsets */
398
399 eptrblock Xnewptrb; /* block linked onto the eptrb chain when match_function_type is MATCH_CBEGROUP */
400
401 /* Where to jump back to */
402
403 int Xwhere; /* RMATCH call-site tag (an RMn value) stored by the calling frame */
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
423 /* These macros pack up tests that are used for partial matching, and which
424 appear several times in the code. We set the "hit end" flag if the pointer is
425 at the end of the subject and also past the start of the subject (i.e.
426 something has been matched). For hard partial matching, we then return
427 immediately. The second one is used when we already know we are past the end of
428 the subject. */
429
430 #define CHECK_PARTIAL()\
431 if (md->partial != 0 && eptr >= md->end_subject && \
432 eptr > md->start_used_ptr) \
433 { \
434 md->hitend = TRUE; \
435 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 }
437
438 #define SCHECK_PARTIAL()\
439 if (md->partial != 0 && eptr > md->start_used_ptr) \
440 { \
441 md->hitend = TRUE; \
442 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
489 frame on the stack here; subsequent instantiations are obtained from the heap
490 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
491 the top-level on the stack rather than malloc-ing them all gives a performance
492 boost in many cases where there is not much "recursion". */
493
494 #ifdef NO_RECURSE
495 heapframe frame_zero;
496 heapframe *frame = &frame_zero;
497 frame->Xprevframe = NULL; /* Marks the top level */
498
499 /* Copy in the original argument variables */
500
501 frame->Xeptr = eptr;
502 frame->Xecode = ecode;
503 frame->Xmstart = mstart;
504 frame->Xoffset_top = offset_top;
505 frame->Xeptrb = eptrb;
506 frame->Xrdepth = rdepth;
507
508 /* This is where control jumps back to to effect "recursion" */
509
510 HEAP_RECURSE:
511
512 /* Macros make the argument variables come from the current frame */
513
514 #define eptr frame->Xeptr
515 #define ecode frame->Xecode
516 #define mstart frame->Xmstart
517 #define offset_top frame->Xoffset_top
518 #define eptrb frame->Xeptrb
519 #define rdepth frame->Xrdepth
520
521 /* Ditto for the local variables */
522
523 #ifdef SUPPORT_UTF
524 #define charptr frame->Xcharptr
525 #endif
526 #define callpat frame->Xcallpat
527 #define codelink frame->Xcodelink
528 #define data frame->Xdata
529 #define next frame->Xnext
530 #define pp frame->Xpp
531 #define prev frame->Xprev
532 #define saved_eptr frame->Xsaved_eptr
533
534 #define new_recursive frame->Xnew_recursive
535
536 #define cur_is_word frame->Xcur_is_word
537 #define condition frame->Xcondition
538 #define prev_is_word frame->Xprev_is_word
539
540 #ifdef SUPPORT_UCP
541 #define prop_type frame->Xprop_type
542 #define prop_value frame->Xprop_value
543 #define prop_fail_result frame->Xprop_fail_result
544 #define oclength frame->Xoclength
545 #define occhars frame->Xocchars
546 #endif
547
548 #define ctype frame->Xctype
549 #define fc frame->Xfc
550 #define fi frame->Xfi
551 #define length frame->Xlength
552 #define max frame->Xmax
553 #define min frame->Xmin
554 #define number frame->Xnumber
555 #define offset frame->Xoffset
556 #define op frame->Xop
557 #define save_capture_last frame->Xsave_capture_last
558 #define save_offset1 frame->Xsave_offset1
559 #define save_offset2 frame->Xsave_offset2
560 #define save_offset3 frame->Xsave_offset3
561 #define stacksave frame->Xstacksave
562
563 #define newptrb frame->Xnewptrb
564
565 /* When recursion is being used, local variables are allocated on the stack and
566 get preserved during recursion in the normal way. In this environment, fi and
567 i, and fc and c, can be the same variables. */
568
569 #else /* NO_RECURSE not defined */
570 #define fi i
571 #define fc c
572
573 /* Many of the following variables are used only in small blocks of the code.
574 My normal style of coding would have declared them within each of those blocks.
575 However, in order to accommodate the version of this code that uses an external
576 "stack" implemented on the heap, it is easier to declare them all here, so the
577 declarations can be cut out in a block. The only declarations within blocks
578 below are for variables that do not have to be preserved over a recursive call
579 to RMATCH(). */
580
581 #ifdef SUPPORT_UTF
582 const pcre_uchar *charptr;
583 #endif
584 const pcre_uchar *callpat;
585 const pcre_uchar *data;
586 const pcre_uchar *next;
587 PCRE_PUCHAR pp;
588 const pcre_uchar *prev;
589 PCRE_PUCHAR saved_eptr;
590
591 recursion_info new_recursive;
592
593 BOOL cur_is_word;
594 BOOL condition;
595 BOOL prev_is_word;
596
597 #ifdef SUPPORT_UCP
598 int prop_type;
599 int prop_value;
600 int prop_fail_result;
601 int oclength;
602 pcre_uchar occhars[6];
603 #endif
604
605 int codelink;
606 int ctype;
607 int length;
608 int max;
609 int min;
610 int number;
611 int offset;
612 int op;
613 int save_capture_last;
614 int save_offset1, save_offset2, save_offset3;
615 int stacksave[REC_STACK_SAVE_MAX];
616
617 eptrblock newptrb;
618
619 /* There is a special fudge for calling match() in a way that causes it to
620 measure the size of its basic stack frame when the stack is being used for
621 recursion. The second argument (ecode) being NULL triggers this behaviour. It
622 cannot normally ever be NULL. The return is the negated value of the frame
623 size. */
624
625 if (ecode == NULL)
626 {
627 if (rdepth == 0)
628 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
629 else
630 {
631 int len = (char *)&rdepth - (char *)eptr;
632 return (len > 0)? -len : len;
633 }
634 }
635 #endif /* NO_RECURSE */
636
637 /* To save space on the stack and in the heap frame, I have doubled up on some
638 of the local variables that are used only in localised parts of the code, but
639 still need to be preserved over recursive calls of match(). These macros define
640 the alternative names that are used. */
641
642 #define allow_zero cur_is_word
643 #define cbegroup condition
644 #define code_offset codelink
645 #define condassert condition
646 #define matched_once prev_is_word
647 #define foc number
648 #define save_mark data
649
650 /* These statements are here to stop the compiler complaining about uninitialized
651 variables. */
652
653 #ifdef SUPPORT_UCP
654 prop_value = 0;
655 prop_fail_result = 0;
656 #endif
657
658
659 /* This label is used for tail recursion, which is used in a few cases even
660 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
661 used. Thanks to Ian Taylor for noticing this possibility and sending the
662 original patch. */
663
664 TAIL_RECURSE:
665
666 /* OK, now we can get on with the real code of the function. Recursive calls
667 are specified by the macro RMATCH and RRETURN is used to return. When
668 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
669 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
670 defined). However, RMATCH isn't like a function call because it's quite a
671 complicated macro. It has to be used in one particular way. This shouldn't,
672 however, impact performance when true recursion is being used. */
673
674 #ifdef SUPPORT_UTF
675 utf = md->utf; /* Local copy of the flag */
676 #else
677 utf = FALSE;
678 #endif
679
680 /* First check that we haven't called match() too many times, or that we
681 haven't exceeded the recursive call limit. */
682
683 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
684 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
685
686 /* At the start of a group with an unlimited repeat that may match an empty
687 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
688 done this way to save having to use another function argument, which would take
689 up space on the stack. See also MATCH_CONDASSERT below.
690
691 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
692 such remembered pointers, to be checked when we hit the closing ket, in order
693 to break infinite loops that match no characters. When match() is called in
694 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
695 NOT be used with tail recursion, because the memory block that is used is on
696 the stack, so a new one may be required for each match(). */
697
698 if (md->match_function_type == MATCH_CBEGROUP)
699 {
700 newptrb.epb_saved_eptr = eptr;
701 newptrb.epb_prev = eptrb;
702 eptrb = &newptrb;
703 md->match_function_type = 0;
704 }
705
706 /* Now start processing the opcodes. */
707
708 for (;;)
709 {
710 minimize = possessive = FALSE;
711 op = *ecode;
712
713 switch(op)
714 {
715 case OP_MARK:
716 md->nomatch_mark = ecode + 2;
717 md->mark = NULL; /* In case previously set by assertion */
718 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
719 eptrb, RM55);
720 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
721 md->mark == NULL) md->mark = ecode + 2;
722
723 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
724 argument, and we must check whether that argument matches this MARK's
725 argument. It is passed back in md->start_match_ptr (an overloading of that
726 variable). If it does match, we reset that variable to the current subject
727 position and return MATCH_SKIP. Otherwise, pass back the return code
728 unaltered. */
729
730 else if (rrc == MATCH_SKIP_ARG &&
731 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
732 {
733 md->start_match_ptr = eptr;
734 RRETURN(MATCH_SKIP);
735 }
736 RRETURN(rrc);
737
738 case OP_FAIL:
739 RRETURN(MATCH_NOMATCH);
740
741 /* COMMIT overrides PRUNE, SKIP, and THEN */
742
743 case OP_COMMIT:
744 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
745 eptrb, RM52);
746 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
747 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
748 rrc != MATCH_THEN)
749 RRETURN(rrc);
750 RRETURN(MATCH_COMMIT);
751
752 /* PRUNE overrides THEN */
753
754 case OP_PRUNE:
755 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
756 eptrb, RM51);
757 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
758 RRETURN(MATCH_PRUNE);
759
760 case OP_PRUNE_ARG:
761 md->nomatch_mark = ecode + 2;
762 md->mark = NULL; /* In case previously set by assertion */
763 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
764 eptrb, RM56);
765 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
766 md->mark == NULL) md->mark = ecode + 2;
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
768 RRETURN(MATCH_PRUNE);
769
770 /* SKIP overrides PRUNE and THEN */
771
772 case OP_SKIP:
773 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
774 eptrb, RM53);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777 md->start_match_ptr = eptr; /* Pass back current position */
778 RRETURN(MATCH_SKIP);
779
780 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
781 nomatch_mark. There is a flag that disables this opcode when re-matching a
782 pattern that ended with a SKIP for which there was not a matching MARK. */
783
784 case OP_SKIP_ARG:
785 if (md->ignore_skip_arg)
786 {
787 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
788 break;
789 }
790 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
791 eptrb, RM57);
792 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
793 RRETURN(rrc);
794
795 /* Pass back the current skip name by overloading md->start_match_ptr and
796 returning the special MATCH_SKIP_ARG return code. This will either be
797 caught by a matching MARK, or get to the top, where it causes a rematch
798 with the md->ignore_skip_arg flag set. */
799
800 md->start_match_ptr = ecode + 2;
801 RRETURN(MATCH_SKIP_ARG);
802
803 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
804 the branch in which it occurs can be determined. Overload the start of
805 match pointer to do this. */
806
807 case OP_THEN:
808 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
809 eptrb, RM54);
810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
811 md->start_match_ptr = ecode;
812 RRETURN(MATCH_THEN);
813
814 case OP_THEN_ARG:
815 md->nomatch_mark = ecode + 2;
816 md->mark = NULL; /* In case previously set by assertion */
817 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
818 md, eptrb, RM58);
819 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
820 md->mark == NULL) md->mark = ecode + 2;
821 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
822 md->start_match_ptr = ecode;
823 RRETURN(MATCH_THEN);
824
825 /* Handle an atomic group that does not contain any capturing parentheses.
826 This can be handled like an assertion. Prior to 8.13, all atomic groups
827 were handled this way. In 8.13, the code was changed as below for ONCE, so
828 that backups pass through the group and thereby reset captured values.
829 However, this uses a lot more stack, so in 8.20, atomic groups that do not
830 contain any captures generate OP_ONCE_NC, which can be handled in the old,
831 less stack intensive way.
832
833 Check the alternative branches in turn - the matching won't pass the KET
834 for this kind of subpattern. If any one branch matches, we carry on as at
835 the end of a normal bracket, leaving the subject pointer, but resetting
836 the start-of-match value in case it was changed by \K. */
837
838 case OP_ONCE_NC:
839 prev = ecode;
840 saved_eptr = eptr;
841 save_mark = md->mark;
842 do
843 {
844 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
845 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
846 {
847 mstart = md->start_match_ptr;
848 break;
849 }
850 if (rrc == MATCH_THEN)
851 {
852 next = ecode + GET(ecode,1);
853 if (md->start_match_ptr < next &&
854 (*ecode == OP_ALT || *next == OP_ALT))
855 rrc = MATCH_NOMATCH;
856 }
857
858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
859 ecode += GET(ecode,1);
860 md->mark = save_mark;
861 }
862 while (*ecode == OP_ALT);
863
864 /* If hit the end of the group (which could be repeated), fail */
865
866 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
867
868 /* Continue as from after the group, updating the offsets high water
869 mark, since extracts may have been taken. */
870
871 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
872
873 offset_top = md->end_offset_top;
874 eptr = md->end_match_ptr;
875
876 /* For a non-repeating ket, just continue at this level. This also
877 happens for a repeating ket if no characters were matched in the group.
878 This is the forcible breaking of infinite loops as implemented in Perl
879 5.005. */
880
881 if (*ecode == OP_KET || eptr == saved_eptr)
882 {
883 ecode += 1+LINK_SIZE;
884 break;
885 }
886
887 /* The repeating kets try the rest of the pattern or restart from the
888 preceding bracket, in the appropriate order. The second "call" of match()
889 uses tail recursion, to avoid using another stack frame. */
890
891 if (*ecode == OP_KETRMIN)
892 {
893 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
895 ecode = prev;
896 goto TAIL_RECURSE;
897 }
898 else /* OP_KETRMAX */
899 {
900 md->match_function_type = MATCH_CBEGROUP;
901 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
902 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
903 ecode += 1 + LINK_SIZE;
904 goto TAIL_RECURSE;
905 }
906 /* Control never gets here */
907
908 /* Handle a capturing bracket, other than those that are possessive with an
909 unlimited repeat. If there is space in the offset vector, save the current
910 subject position in the working slot at the top of the vector. We mustn't
911 change the current values of the data slot, because they may be set from a
912 previous iteration of this group, and be referred to by a reference inside
913 the group. A failure to match might occur after the group has succeeded,
914 if something later on doesn't match. For this reason, we need to restore
915 the working value and also the values of the final offsets, in case they
916 were set by a previous iteration of the same bracket.
917
918 If there isn't enough space in the offset vector, treat this as if it were
919 a non-capturing bracket. Don't worry about setting the flag for the error
920 case here; that is handled in the code for KET. */
921
922 case OP_CBRA:
923 case OP_SCBRA:
924 number = GET2(ecode, 1+LINK_SIZE);
925 offset = number << 1;
926
927 #ifdef PCRE_DEBUG
928 printf("start bracket %d\n", number);
929 printf("subject=");
930 pchars(eptr, 16, TRUE, md);
931 printf("\n");
932 #endif
933
934 if (offset < md->offset_max)
935 {
936 save_offset1 = md->offset_vector[offset];
937 save_offset2 = md->offset_vector[offset+1];
938 save_offset3 = md->offset_vector[md->offset_end - number];
939 save_capture_last = md->capture_last;
940 save_mark = md->mark;
941
942 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
943 md->offset_vector[md->offset_end - number] =
944 (int)(eptr - md->start_subject);
945
946 for (;;)
947 {
948 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
949 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
950 eptrb, RM1);
951 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
952
953 /* If we backed up to a THEN, check whether it is within the current
954 branch by comparing the address of the THEN that is passed back with
955 the end of the branch. If it is within the current branch, and the
956 branch is one of two or more alternatives (it either starts or ends
957 with OP_ALT), we have reached the limit of THEN's action, so convert
958 the return code to NOMATCH, which will cause normal backtracking to
959 happen from now on. Otherwise, THEN is passed back to an outer
960 alternative. This implements Perl's treatment of parenthesized groups,
961 where a group not containing | does not affect the current alternative,
962 that is, (X) is NOT the same as (X|(*F)). */
963
964 if (rrc == MATCH_THEN)
965 {
966 next = ecode + GET(ecode,1);
967 if (md->start_match_ptr < next &&
968 (*ecode == OP_ALT || *next == OP_ALT))
969 rrc = MATCH_NOMATCH;
970 }
971
972 /* Anything other than NOMATCH is passed back. */
973
974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
975 md->capture_last = save_capture_last;
976 ecode += GET(ecode, 1);
977 md->mark = save_mark;
978 if (*ecode != OP_ALT) break;
979 }
980
981 DPRINTF(("bracket %d failed\n", number));
982 md->offset_vector[offset] = save_offset1;
983 md->offset_vector[offset+1] = save_offset2;
984 md->offset_vector[md->offset_end - number] = save_offset3;
985
986 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
987
988 RRETURN(rrc);
989 }
990
991 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
992 as a non-capturing bracket. */
993
994 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
995 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996
997 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
998
999 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1000 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1001
1002 /* Non-capturing or atomic group, except for possessive with unlimited
1003 repeat and ONCE group with no captures. Loop for all the alternatives.
1004
1005 When we get to the final alternative within the brackets, we used to return
1006 the result of a recursive call to match() whatever happened so it was
1007 possible to reduce stack usage by turning this into a tail recursion,
1008 except in the case of a possibly empty group. However, now that there is
1009 the possibility of (*THEN) occurring in the final alternative, this
1010 optimization is no longer always possible.
1011
1012 We can optimize if we know there are no (*THEN)s in the pattern; at present
1013 this is the best that can be done.
1014
1015 MATCH_ONCE is returned when the end of an atomic group is successfully
1016 reached, but subsequent matching fails. It passes back up the tree (causing
1017 captured values to be reset) until the original atomic group level is
1018 reached. This is tested by comparing md->once_target with the start of the
1019 group. At this point, the return is converted into MATCH_NOMATCH so that
1020 previous backup points can be taken. */
1021
1022 case OP_ONCE:
1023 case OP_BRA:
1024 case OP_SBRA:
1025 DPRINTF(("start non-capturing bracket\n"));
1026
1027 for (;;)
1028 {
1029 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1030
1031 /* If this is not a possibly empty group, and there are no (*THEN)s in
1032 the pattern, and this is the final alternative, optimize as described
1033 above. */
1034
1035 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1036 {
1037 ecode += PRIV(OP_lengths)[*ecode];
1038 goto TAIL_RECURSE;
1039 }
1040
1041 /* In all other cases, we have to make another call to match(). */
1042
1043 save_mark = md->mark;
1044 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1045 RM2);
1046
1047 /* See comment in the code for capturing groups above about handling
1048 THEN. */
1049
1050 if (rrc == MATCH_THEN)
1051 {
1052 next = ecode + GET(ecode,1);
1053 if (md->start_match_ptr < next &&
1054 (*ecode == OP_ALT || *next == OP_ALT))
1055 rrc = MATCH_NOMATCH;
1056 }
1057
1058 if (rrc != MATCH_NOMATCH)
1059 {
1060 if (rrc == MATCH_ONCE)
1061 {
1062 const pcre_uchar *scode = ecode;
1063 if (*scode != OP_ONCE) /* If not at start, find it */
1064 {
1065 while (*scode == OP_ALT) scode += GET(scode, 1);
1066 scode -= GET(scode, 1);
1067 }
1068 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1069 }
1070 RRETURN(rrc);
1071 }
1072 ecode += GET(ecode, 1);
1073 md->mark = save_mark;
1074 if (*ecode != OP_ALT) break;
1075 }
1076
1077 RRETURN(MATCH_NOMATCH);
1078
1079 /* Handle possessive capturing brackets with an unlimited repeat. We come
1080 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1081 handled similarly to the normal case above. However, the matching is
1082 different. The end of these brackets will always be OP_KETRPOS, which
1083 returns MATCH_KETRPOS without going further in the pattern. By this means
1084 we can handle the group by iteration rather than recursion, thereby
1085 reducing the amount of stack needed. */
1086
1087 case OP_CBRAPOS:
1088 case OP_SCBRAPOS:
1089 allow_zero = FALSE;
1090
1091 POSSESSIVE_CAPTURE:
1092 number = GET2(ecode, 1+LINK_SIZE);
1093 offset = number << 1;
1094
1095 #ifdef PCRE_DEBUG
1096 printf("start possessive bracket %d\n", number);
1097 printf("subject=");
1098 pchars(eptr, 16, TRUE, md);
1099 printf("\n");
1100 #endif
1101
1102 if (offset < md->offset_max)
1103 {
1104 matched_once = FALSE;
1105 code_offset = (int)(ecode - md->start_code);
1106
1107 save_offset1 = md->offset_vector[offset];
1108 save_offset2 = md->offset_vector[offset+1];
1109 save_offset3 = md->offset_vector[md->offset_end - number];
1110 save_capture_last = md->capture_last;
1111
1112 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1113
1114 /* Each time round the loop, save the current subject position for use
1115 when the group matches. For MATCH_MATCH, the group has matched, so we
1116 restart it with a new subject starting position, remembering that we had
1117 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1118 usual. If we haven't matched any alternatives in any iteration, check to
1119 see if a previous iteration matched. If so, the group has matched;
1120 continue from afterwards. Otherwise it has failed; restore the previous
1121 capture values before returning NOMATCH. */
1122
1123 for (;;)
1124 {
1125 md->offset_vector[md->offset_end - number] =
1126 (int)(eptr - md->start_subject);
1127 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1128 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1129 eptrb, RM63);
1130 if (rrc == MATCH_KETRPOS)
1131 {
1132 offset_top = md->end_offset_top;
1133 eptr = md->end_match_ptr;
1134 ecode = md->start_code + code_offset;
1135 save_capture_last = md->capture_last;
1136 matched_once = TRUE;
1137 continue;
1138 }
1139
1140 /* See comment in the code for capturing groups above about handling
1141 THEN. */
1142
1143 if (rrc == MATCH_THEN)
1144 {
1145 next = ecode + GET(ecode,1);
1146 if (md->start_match_ptr < next &&
1147 (*ecode == OP_ALT || *next == OP_ALT))
1148 rrc = MATCH_NOMATCH;
1149 }
1150
1151 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1152 md->capture_last = save_capture_last;
1153 ecode += GET(ecode, 1);
1154 if (*ecode != OP_ALT) break;
1155 }
1156
1157 if (!matched_once)
1158 {
1159 md->offset_vector[offset] = save_offset1;
1160 md->offset_vector[offset+1] = save_offset2;
1161 md->offset_vector[md->offset_end - number] = save_offset3;
1162 }
1163
1164 if (allow_zero || matched_once)
1165 {
1166 ecode += 1 + LINK_SIZE;
1167 break;
1168 }
1169
1170 RRETURN(MATCH_NOMATCH);
1171 }
1172
1173 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1174 as a non-capturing bracket. */
1175
1176 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1177 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1178
1179 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1180
1181 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1182 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1183
1184 /* Non-capturing possessive bracket with unlimited repeat. We come here
1185 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1186 without the capturing complication. It is written out separately for speed
1187 and cleanliness. */
1188
1189 case OP_BRAPOS:
1190 case OP_SBRAPOS:
1191 allow_zero = FALSE;
1192
1193 POSSESSIVE_NON_CAPTURE:
1194 matched_once = FALSE;
1195 code_offset = (int)(ecode - md->start_code);
1196
1197 for (;;)
1198 {
1199 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1200 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1201 eptrb, RM48);
1202 if (rrc == MATCH_KETRPOS)
1203 {
1204 offset_top = md->end_offset_top;
1205 eptr = md->end_match_ptr;
1206 ecode = md->start_code + code_offset;
1207 matched_once = TRUE;
1208 continue;
1209 }
1210
1211 /* See comment in the code for capturing groups above about handling
1212 THEN. */
1213
1214 if (rrc == MATCH_THEN)
1215 {
1216 next = ecode + GET(ecode,1);
1217 if (md->start_match_ptr < next &&
1218 (*ecode == OP_ALT || *next == OP_ALT))
1219 rrc = MATCH_NOMATCH;
1220 }
1221
1222 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1223 ecode += GET(ecode, 1);
1224 if (*ecode != OP_ALT) break;
1225 }
1226
1227 if (matched_once || allow_zero)
1228 {
1229 ecode += 1 + LINK_SIZE;
1230 break;
1231 }
1232 RRETURN(MATCH_NOMATCH);
1233
1234 /* Control never reaches here. */
1235
1236 /* Conditional group: compilation checked that there are no more than
1237 two branches. If the condition is false, skipping the first branch takes us
1238 past the end if there is only one branch, but that's OK because that is
1239 exactly what going to the ket would do. */
1240
1241 case OP_COND:
1242 case OP_SCOND:
1243 codelink = GET(ecode, 1);
1244
1245 /* Because of the way auto-callout works during compile, a callout item is
1246 inserted between OP_COND and an assertion condition. */
1247
1248 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1249 {
1250 if (PUBL(callout) != NULL)
1251 {
1252 PUBL(callout_block) cb;
1253 cb.version = 2; /* Version 2 of the callout block */
1254 cb.callout_number = ecode[LINK_SIZE+2];
1255 cb.offset_vector = md->offset_vector;
1256 #ifdef COMPILE_PCRE8
1257 cb.subject = (PCRE_SPTR)md->start_subject;
1258 #else
1259 cb.subject = (PCRE_SPTR16)md->start_subject;
1260 #endif
1261 cb.subject_length = (int)(md->end_subject - md->start_subject);
1262 cb.start_match = (int)(mstart - md->start_subject);
1263 cb.current_position = (int)(eptr - md->start_subject);
1264 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1265 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1266 cb.capture_top = offset_top/2;
1267 cb.capture_last = md->capture_last;
1268 cb.callout_data = md->callout_data;
1269 cb.mark = md->nomatch_mark;
1270 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1271 if (rrc < 0) RRETURN(rrc);
1272 }
1273 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1274 }
1275
1276 condcode = ecode[LINK_SIZE+1];
1277
1278 /* Now see what the actual condition is */
1279
1280 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1281 {
1282 if (md->recursive == NULL) /* Not recursing => FALSE */
1283 {
1284 condition = FALSE;
1285 ecode += GET(ecode, 1);
1286 }
1287 else
1288 {
1289 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1290 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1291
1292 /* If the test is for recursion into a specific subpattern, and it is
1293 false, but the test was set up by name, scan the table to see if the
1294 name refers to any other numbers, and test them. The condition is true
1295 if any one is set. */
1296
1297 if (!condition && condcode == OP_NRREF)
1298 {
1299 pcre_uchar *slotA = md->name_table;
1300 for (i = 0; i < md->name_count; i++)
1301 {
1302 if (GET2(slotA, 0) == recno) break;
1303 slotA += md->name_entry_size;
1304 }
1305
1306 /* Found a name for the number - there can be only one; duplicate
1307 names for different numbers are allowed, but not vice versa. First
1308 scan down for duplicates. */
1309
1310 if (i < md->name_count)
1311 {
1312 pcre_uchar *slotB = slotA;
1313 while (slotB > md->name_table)
1314 {
1315 slotB -= md->name_entry_size;
1316 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1317 {
1318 condition = GET2(slotB, 0) == md->recursive->group_num;
1319 if (condition) break;
1320 }
1321 else break;
1322 }
1323
1324 /* Scan up for duplicates */
1325
1326 if (!condition)
1327 {
1328 slotB = slotA;
1329 for (i++; i < md->name_count; i++)
1330 {
1331 slotB += md->name_entry_size;
1332 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1333 {
1334 condition = GET2(slotB, 0) == md->recursive->group_num;
1335 if (condition) break;
1336 }
1337 else break;
1338 }
1339 }
1340 }
1341 }
1342
1343 /* Choose branch according to the condition */
1344
1345 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1346 }
1347 }
1348
1349 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1350 {
1351 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1352 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1353
1354 /* If the numbered capture is unset, but the reference was by name,
1355 scan the table to see if the name refers to any other numbers, and test
1356 them. The condition is true if any one is set. This is tediously similar
1357 to the code above, but not close enough to try to amalgamate. */
1358
1359 if (!condition && condcode == OP_NCREF)
1360 {
1361 int refno = offset >> 1;
1362 pcre_uchar *slotA = md->name_table;
1363
1364 for (i = 0; i < md->name_count; i++)
1365 {
1366 if (GET2(slotA, 0) == refno) break;
1367 slotA += md->name_entry_size;
1368 }
1369
1370 /* Found a name for the number - there can be only one; duplicate names
1371 for different numbers are allowed, but not vice versa. First scan down
1372 for duplicates. */
1373
1374 if (i < md->name_count)
1375 {
1376 pcre_uchar *slotB = slotA;
1377 while (slotB > md->name_table)
1378 {
1379 slotB -= md->name_entry_size;
1380 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1381 {
1382 offset = GET2(slotB, 0) << 1;
1383 condition = offset < offset_top &&
1384 md->offset_vector[offset] >= 0;
1385 if (condition) break;
1386 }
1387 else break;
1388 }
1389
1390 /* Scan up for duplicates */
1391
1392 if (!condition)
1393 {
1394 slotB = slotA;
1395 for (i++; i < md->name_count; i++)
1396 {
1397 slotB += md->name_entry_size;
1398 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1399 {
1400 offset = GET2(slotB, 0) << 1;
1401 condition = offset < offset_top &&
1402 md->offset_vector[offset] >= 0;
1403 if (condition) break;
1404 }
1405 else break;
1406 }
1407 }
1408 }
1409 }
1410
1411 /* Choose branch according to the condition */
1412
1413 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1414 }
1415
1416 else if (condcode == OP_DEF) /* DEFINE - always false */
1417 {
1418 condition = FALSE;
1419 ecode += GET(ecode, 1);
1420 }
1421
1422 /* The condition is an assertion. Call match() to evaluate it - setting
1423 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1424 an assertion. */
1425
1426 else
1427 {
1428 md->match_function_type = MATCH_CONDASSERT;
1429 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1430 if (rrc == MATCH_MATCH)
1431 {
1432 if (md->end_offset_top > offset_top)
1433 offset_top = md->end_offset_top; /* Captures may have happened */
1434 condition = TRUE;
1435 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1436 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1437 }
1438
1439 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1440 assertion; it is therefore treated as NOMATCH. */
1441
1442 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1443 {
1444 RRETURN(rrc); /* Need braces because of following else */
1445 }
1446 else
1447 {
1448 condition = FALSE;
1449 ecode += codelink;
1450 }
1451 }
1452
1453 /* We are now at the branch that is to be obeyed. As there is only one, can
1454 use tail recursion to avoid using another stack frame, except when there is
1455 unlimited repeat of a possibly empty group. In the latter case, a recursive
1456 call to match() is always required, unless the second alternative doesn't
1457 exist, in which case we can just plough on. Note that, for compatibility
1458 with Perl, the | in a conditional group is NOT treated as creating two
1459 alternatives. If a THEN is encountered in the branch, it propagates out to
1460 the enclosing alternative (unless nested in a deeper set of alternatives,
1461 of course). */
1462
1463 if (condition || *ecode == OP_ALT)
1464 {
1465 if (op != OP_SCOND)
1466 {
1467 ecode += 1 + LINK_SIZE;
1468 goto TAIL_RECURSE;
1469 }
1470
1471 md->match_function_type = MATCH_CBEGROUP;
1472 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1473 RRETURN(rrc);
1474 }
1475
1476 /* Condition false & no alternative; continue after the group. */
1477
1478 else
1479 {
1480 ecode += 1 + LINK_SIZE;
1481 }
1482 break;
1483
1484
1485 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1486 to close any currently open capturing brackets. */
1487
1488 case OP_CLOSE:
1489 number = GET2(ecode, 1);
1490 offset = number << 1;
1491
1492 #ifdef PCRE_DEBUG
1493 printf("end bracket %d at *ACCEPT", number);
1494 printf("\n");
1495 #endif
1496
1497 md->capture_last = number;
1498 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1499 {
1500 md->offset_vector[offset] =
1501 md->offset_vector[md->offset_end - number];
1502 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1503 if (offset_top <= offset) offset_top = offset + 2;
1504 }
1505 ecode += 1 + IMM2_SIZE;
1506 break;
1507
1508
1509 /* End of the pattern, either real or forced. */
1510
1511 case OP_END:
1512 case OP_ACCEPT:
1513 case OP_ASSERT_ACCEPT:
1514
1515 /* If we have matched an empty string, fail if not in an assertion and not
1516 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1517 is set and we have matched at the start of the subject. In both cases,
1518 backtracking will then try other alternatives, if any. */
1519
1520 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1521 md->recursive == NULL &&
1522 (md->notempty ||
1523 (md->notempty_atstart &&
1524 mstart == md->start_subject + md->start_offset)))
1525 RRETURN(MATCH_NOMATCH);
1526
1527 /* Otherwise, we have a match. */
1528
1529 md->end_match_ptr = eptr; /* Record where we ended */
1530 md->end_offset_top = offset_top; /* and how many extracts were taken */
1531 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1532
1533 /* For some reason, the macros don't work properly if an expression is
1534 given as the argument to RRETURN when the heap is in use. */
1535
1536 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1537 RRETURN(rrc);
1538
1539 /* Assertion brackets. Check the alternative branches in turn - the
1540 matching won't pass the KET for an assertion. If any one branch matches,
1541 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1542 start of each branch to move the current point backwards, so the code at
1543 this level is identical to the lookahead case. When the assertion is part
1544 of a condition, we want to return immediately afterwards. The caller of
1545 this incarnation of the match() function will have set MATCH_CONDASSERT in
1546 md->match_function_type, and one of these opcodes will be the first opcode
1547 that is processed. We use a local variable that is preserved over calls to
1548 match() to remember this case. */
1549
1550 case OP_ASSERT:
1551 case OP_ASSERTBACK:
1552 save_mark = md->mark;
1553 if (md->match_function_type == MATCH_CONDASSERT)
1554 {
1555 condassert = TRUE;
1556 md->match_function_type = 0;
1557 }
1558 else condassert = FALSE;
1559
1560 do
1561 {
1562 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1563 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1564 {
1565 mstart = md->start_match_ptr; /* In case \K reset it */
1566 break;
1567 }
1568
1569 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1570 as NOMATCH. */
1571
1572 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1573 ecode += GET(ecode, 1);
1574 md->mark = save_mark;
1575 }
1576 while (*ecode == OP_ALT);
1577
1578 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1579
1580 /* If checking an assertion for a condition, return MATCH_MATCH. */
1581
1582 if (condassert) RRETURN(MATCH_MATCH);
1583
1584 /* Continue from after the assertion, updating the offsets high water
1585 mark, since extracts may have been taken during the assertion. */
1586
1587 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1588 ecode += 1 + LINK_SIZE;
1589 offset_top = md->end_offset_top;
1590 continue;
1591
1592 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1593 PRUNE, or COMMIT means we must assume failure without checking subsequent
1594 branches. */
1595
1596 case OP_ASSERT_NOT:
1597 case OP_ASSERTBACK_NOT:
1598 save_mark = md->mark;
1599 if (md->match_function_type == MATCH_CONDASSERT)
1600 {
1601 condassert = TRUE;
1602 md->match_function_type = 0;
1603 }
1604 else condassert = FALSE;
1605
1606 do
1607 {
1608 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1609 md->mark = save_mark;
1610 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1611 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1612 {
1613 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1614 break;
1615 }
1616
1617 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1618 as NOMATCH. */
1619
1620 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1621 ecode += GET(ecode,1);
1622 }
1623 while (*ecode == OP_ALT);
1624
1625 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1626
1627 ecode += 1 + LINK_SIZE;
1628 continue;
1629
1630 /* Move the subject pointer back. This occurs only at the start of
1631 each branch of a lookbehind assertion. If we are too close to the start to
1632 move back, this match function fails. When working with UTF-8 we move
1633 back a number of characters, not bytes. */
1634
1635 case OP_REVERSE:
1636 #ifdef SUPPORT_UTF
1637 if (utf)
1638 {
1639 i = GET(ecode, 1);
1640 while (i-- > 0)
1641 {
1642 eptr--;
1643 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1644 BACKCHAR(eptr);
1645 }
1646 }
1647 else
1648 #endif
1649
1650 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1651
1652 {
1653 eptr -= GET(ecode, 1);
1654 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1655 }
1656
1657 /* Save the earliest consulted character, then skip to next op code */
1658
1659 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1660 ecode += 1 + LINK_SIZE;
1661 break;
1662
1663 /* The callout item calls an external function, if one is provided, passing
1664 details of the match so far. This is mainly for debugging, though the
1665 function is able to force a failure. */
1666
1667 case OP_CALLOUT:
1668 if (PUBL(callout) != NULL)
1669 {
1670 PUBL(callout_block) cb;
1671 cb.version = 2; /* Version 2 of the callout block */
1672 cb.callout_number = ecode[1];
1673 cb.offset_vector = md->offset_vector;
1674 #ifdef COMPILE_PCRE8
1675 cb.subject = (PCRE_SPTR)md->start_subject;
1676 #else
1677 cb.subject = (PCRE_SPTR16)md->start_subject;
1678 #endif
1679 cb.subject_length = (int)(md->end_subject - md->start_subject);
1680 cb.start_match = (int)(mstart - md->start_subject);
1681 cb.current_position = (int)(eptr - md->start_subject);
1682 cb.pattern_position = GET(ecode, 2);
1683 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1684 cb.capture_top = offset_top/2;
1685 cb.capture_last = md->capture_last;
1686 cb.callout_data = md->callout_data;
1687 cb.mark = md->nomatch_mark;
1688 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1689 if (rrc < 0) RRETURN(rrc);
1690 }
1691 ecode += 2 + 2*LINK_SIZE;
1692 break;
1693
1694 /* Recursion either matches the current regex, or some subexpression. The
1695 offset data is the offset to the starting bracket from the start of the
1696 whole pattern. (This is so that it works from duplicated subpatterns.)
1697
1698 The state of the capturing groups is preserved over recursion, and
1699 re-instated afterwards. We don't know how many are started and not yet
1700 finished (offset_top records the completed total) so we just have to save
1701 all the potential data. There may be up to 65535 such values, which is too
1702 large to put on the stack, but using malloc for small numbers seems
1703 expensive. As a compromise, the stack is used when there are no more than
1704 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1705
1706 There are also other values that have to be saved. We use a chained
1707 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1708 for the original version of this logic. It has, however, been hacked around
1709 a lot, so he is not to blame for the current way it works. */
1710
1711 case OP_RECURSE:
1712 {
1713 recursion_info *ri;
1714 int recno;
1715
1716 callpat = md->start_code + GET(ecode, 1);
1717 recno = (callpat == md->start_code)? 0 :
1718 GET2(callpat, 1 + LINK_SIZE);
1719
1720 /* Check for repeating a recursion without advancing the subject pointer.
1721 This should catch convoluted mutual recursions. (Some simple cases are
1722 caught at compile time.) */
1723
1724 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1725 if (recno == ri->group_num && eptr == ri->subject_position)
1726 RRETURN(PCRE_ERROR_RECURSELOOP);
1727
1728 /* Add to "recursing stack" */
1729
1730 new_recursive.group_num = recno;
1731 new_recursive.subject_position = eptr;
1732 new_recursive.prevrec = md->recursive;
1733 md->recursive = &new_recursive;
1734
1735 /* Where to continue from afterwards */
1736
1737 ecode += 1 + LINK_SIZE;
1738
1739 /* Now save the offset data */
1740
1741 new_recursive.saved_max = md->offset_end;
1742 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1743 new_recursive.offset_save = stacksave;
1744 else
1745 {
1746 new_recursive.offset_save =
1747 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1748 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1749 }
1750 memcpy(new_recursive.offset_save, md->offset_vector,
1751 new_recursive.saved_max * sizeof(int));
1752
1753 /* OK, now we can do the recursion. After processing each alternative,
1754 restore the offset data. If there were nested recursions, md->recursive
1755 might be changed, so reset it before looping. */
1756
1757 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1758 cbegroup = (*callpat >= OP_SBRA);
1759 do
1760 {
1761 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1762 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1763 md, eptrb, RM6);
1764 memcpy(md->offset_vector, new_recursive.offset_save,
1765 new_recursive.saved_max * sizeof(int));
1766 md->recursive = new_recursive.prevrec;
1767 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1768 {
1769 DPRINTF(("Recursion matched\n"));
1770 if (new_recursive.offset_save != stacksave)
1771 (PUBL(free))(new_recursive.offset_save);
1772
1773 /* Set where we got to in the subject, and reset the start in case
1774 it was changed by \K. This *is* propagated back out of a recursion,
1775 for Perl compatibility. */
1776
1777 eptr = md->end_match_ptr;
1778 mstart = md->start_match_ptr;
1779 goto RECURSION_MATCHED; /* Exit loop; end processing */
1780 }
1781
1782 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1783 as NOMATCH. */
1784
1785 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1786 {
1787 DPRINTF(("Recursion gave error %d\n", rrc));
1788 if (new_recursive.offset_save != stacksave)
1789 (PUBL(free))(new_recursive.offset_save);
1790 RRETURN(rrc);
1791 }
1792
1793 md->recursive = &new_recursive;
1794 callpat += GET(callpat, 1);
1795 }
1796 while (*callpat == OP_ALT);
1797
1798 DPRINTF(("Recursion didn't match\n"));
1799 md->recursive = new_recursive.prevrec;
1800 if (new_recursive.offset_save != stacksave)
1801 (PUBL(free))(new_recursive.offset_save);
1802 RRETURN(MATCH_NOMATCH);
1803 }
1804
1805 RECURSION_MATCHED:
1806 break;
1807
1808 /* An alternation is the end of a branch; scan along to find the end of the
1809 bracketed group and go to there. */
1810
1811 case OP_ALT:
1812 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1813 break;
1814
1815 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1816 indicating that it may occur zero times. It may repeat infinitely, or not
1817 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1818 with fixed upper repeat limits are compiled as a number of copies, with the
1819 optional ones preceded by BRAZERO or BRAMINZERO. */
1820
1821 case OP_BRAZERO:
1822 next = ecode + 1;
1823 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1825 do next += GET(next, 1); while (*next == OP_ALT);
1826 ecode = next + 1 + LINK_SIZE;
1827 break;
1828
1829 case OP_BRAMINZERO:
1830 next = ecode + 1;
1831 do next += GET(next, 1); while (*next == OP_ALT);
1832 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1834 ecode++;
1835 break;
1836
1837 case OP_SKIPZERO:
1838 next = ecode+1;
1839 do next += GET(next,1); while (*next == OP_ALT);
1840 ecode = next + 1 + LINK_SIZE;
1841 break;
1842
1843 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1844 here; just jump to the group, with allow_zero set TRUE. */
1845
1846 case OP_BRAPOSZERO:
1847 op = *(++ecode);
1848 allow_zero = TRUE;
1849 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1850 goto POSSESSIVE_NON_CAPTURE;
1851
1852 /* End of a group, repeated or non-repeating. */
1853
1854 case OP_KET:
1855 case OP_KETRMIN:
1856 case OP_KETRMAX:
1857 case OP_KETRPOS:
1858 prev = ecode - GET(ecode, 1);
1859
1860 /* If this was a group that remembered the subject start, in order to break
1861 infinite repeats of empty string matches, retrieve the subject start from
1862 the chain. Otherwise, set it NULL. */
1863
1864 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1865 {
1866 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1867 eptrb = eptrb->epb_prev; /* Backup to previous group */
1868 }
1869 else saved_eptr = NULL;
1870
1871 /* If we are at the end of an assertion group or a non-capturing atomic
1872 group, stop matching and return MATCH_MATCH, but record the current high
1873 water mark for use by positive assertions. We also need to record the match
1874 start in case it was changed by \K. */
1875
1876 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1877 *prev == OP_ONCE_NC)
1878 {
1879 md->end_match_ptr = eptr; /* For ONCE_NC */
1880 md->end_offset_top = offset_top;
1881 md->start_match_ptr = mstart;
1882 RRETURN(MATCH_MATCH); /* Sets md->mark */
1883 }
1884
1885 /* For capturing groups we have to check the group number back at the start
1886 and if necessary complete handling an extraction by setting the offsets and
1887 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1888 into group 0, so it won't be picked up here. Instead, we catch it when the
1889 OP_END is reached. Other recursion is handled here. We just have to record
1890 the current subject position and start match pointer and give a MATCH
1891 return. */
1892
1893 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1894 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1895 {
1896 number = GET2(prev, 1+LINK_SIZE);
1897 offset = number << 1;
1898
1899 #ifdef PCRE_DEBUG
1900 printf("end bracket %d", number);
1901 printf("\n");
1902 #endif
1903
1904 /* Handle a recursively called group. */
1905
1906 if (md->recursive != NULL && md->recursive->group_num == number)
1907 {
1908 md->end_match_ptr = eptr;
1909 md->start_match_ptr = mstart;
1910 RRETURN(MATCH_MATCH);
1911 }
1912
1913 /* Deal with capturing */
1914
1915 md->capture_last = number;
1916 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1917 {
1918 /* If offset is greater than offset_top, it means that we are
1919 "skipping" a capturing group, and that group's offsets must be marked
1920 unset. In earlier versions of PCRE, all the offsets were unset at the
1921 start of matching, but this doesn't work because atomic groups and
1922 assertions can cause a value to be set that should later be unset.
1923 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1924 part of the atomic group, but this is not on the final matching path,
1925 so must be unset when 2 is set. (If there is no group 2, there is no
1926 problem, because offset_top will then be 2, indicating no capture.) */
1927
1928 if (offset > offset_top)
1929 {
1930 register int *iptr = md->offset_vector + offset_top;
1931 register int *iend = md->offset_vector + offset;
1932 while (iptr < iend) *iptr++ = -1;
1933 }
1934
1935 /* Now make the extraction */
1936
1937 md->offset_vector[offset] =
1938 md->offset_vector[md->offset_end - number];
1939 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1940 if (offset_top <= offset) offset_top = offset + 2;
1941 }
1942 }
1943
1944 /* For an ordinary non-repeating ket, just continue at this level. This
1945 also happens for a repeating ket if no characters were matched in the
1946 group. This is the forcible breaking of infinite loops as implemented in
1947 Perl 5.005. For a non-repeating atomic group that includes captures,
1948 establish a backup point by processing the rest of the pattern at a lower
1949 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1950 original OP_ONCE level, thereby bypassing intermediate backup points, but
1951 resetting any captures that happened along the way. */
1952
1953 if (*ecode == OP_KET || eptr == saved_eptr)
1954 {
1955 if (*prev == OP_ONCE)
1956 {
1957 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1958 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1959 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1960 RRETURN(MATCH_ONCE);
1961 }
1962 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1963 break;
1964 }
1965
1966 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1967 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1968 at a time from the outer level, thus saving stack. */
1969
1970 if (*ecode == OP_KETRPOS)
1971 {
1972 md->end_match_ptr = eptr;
1973 md->end_offset_top = offset_top;
1974 RRETURN(MATCH_KETRPOS);
1975 }
1976
1977 /* The normal repeating kets try the rest of the pattern or restart from
1978 the preceding bracket, in the appropriate order. In the second case, we can
1979 use tail recursion to avoid using another stack frame, unless we have an
1980 atomic group or an unlimited repeat of a group that can match an empty
1981 string. */
1982
1983 if (*ecode == OP_KETRMIN)
1984 {
1985 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1987 if (*prev == OP_ONCE)
1988 {
1989 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1991 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1992 RRETURN(MATCH_ONCE);
1993 }
1994 if (*prev >= OP_SBRA) /* Could match an empty string */
1995 {
1996 md->match_function_type = MATCH_CBEGROUP;
1997 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1998 RRETURN(rrc);
1999 }
2000 ecode = prev;
2001 goto TAIL_RECURSE;
2002 }
2003 else /* OP_KETRMAX */
2004 {
2005 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
2006 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2007 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2009 if (*prev == OP_ONCE)
2010 {
2011 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2013 md->once_target = prev;
2014 RRETURN(MATCH_ONCE);
2015 }
2016 ecode += 1 + LINK_SIZE;
2017 goto TAIL_RECURSE;
2018 }
2019 /* Control never gets here */
2020
2021 /* Not multiline mode: start of subject assertion, unless notbol. */
2022
2023 case OP_CIRC:
2024 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2025
2026 /* Start of subject assertion */
2027
2028 case OP_SOD:
2029 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2030 ecode++;
2031 break;
2032
2033 /* Multiline mode: start of subject unless notbol, or after any newline. */
2034
2035 case OP_CIRCM:
2036 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2037 if (eptr != md->start_subject &&
2038 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2039 RRETURN(MATCH_NOMATCH);
2040 ecode++;
2041 break;
2042
2043 /* Start of match assertion */
2044
2045 case OP_SOM:
2046 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2047 ecode++;
2048 break;
2049
2050 /* Reset the start of match point */
2051
2052 case OP_SET_SOM:
2053 mstart = eptr;
2054 ecode++;
2055 break;
2056
2057 /* Multiline mode: assert before any newline, or before end of subject
2058 unless noteol is set. */
2059
2060 case OP_DOLLM:
2061 if (eptr < md->end_subject)
2062 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2063 else
2064 {
2065 if (md->noteol) RRETURN(MATCH_NOMATCH);
2066 SCHECK_PARTIAL();
2067 }
2068 ecode++;
2069 break;
2070
2071 /* Not multiline mode: assert before a terminating newline or before end of
2072 subject unless noteol is set. */
2073
2074 case OP_DOLL:
2075 if (md->noteol) RRETURN(MATCH_NOMATCH);
2076 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2077
2078 /* ... else fall through for endonly */
2079
2080 /* End of subject assertion (\z) */
2081
2082 case OP_EOD:
2083 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2084 SCHECK_PARTIAL();
2085 ecode++;
2086 break;
2087
2088 /* End of subject or ending \n assertion (\Z) */
2089
2090 case OP_EODN:
2091 ASSERT_NL_OR_EOS:
2092 if (eptr < md->end_subject &&
2093 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2094 RRETURN(MATCH_NOMATCH);
2095
2096 /* Either at end of string or \n before end. */
2097
2098 SCHECK_PARTIAL();
2099 ecode++;
2100 break;
2101
2102 /* Word boundary assertions */
2103
2104 case OP_NOT_WORD_BOUNDARY:
2105 case OP_WORD_BOUNDARY:
2106 {
2107
2108 /* Find out if the previous and current characters are "word" characters.
2109 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2110 be "non-word" characters. Remember the earliest consulted character for
2111 partial matching. */
2112
2113 #ifdef SUPPORT_UTF
2114 if (utf)
2115 {
2116 /* Get status of previous character */
2117
2118 if (eptr == md->start_subject) prev_is_word = FALSE; else
2119 {
2120 PCRE_PUCHAR lastptr = eptr - 1;
2121 BACKCHAR(lastptr);
2122 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2123 GETCHAR(c, lastptr);
2124 #ifdef SUPPORT_UCP
2125 if (md->use_ucp)
2126 {
2127 if (c == '_') prev_is_word = TRUE; else
2128 {
2129 int cat = UCD_CATEGORY(c);
2130 prev_is_word = (cat == ucp_L || cat == ucp_N);
2131 }
2132 }
2133 else
2134 #endif
2135 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2136 }
2137
2138 /* Get status of next character */
2139
2140 if (eptr >= md->end_subject)
2141 {
2142 SCHECK_PARTIAL();
2143 cur_is_word = FALSE;
2144 }
2145 else
2146 {
2147 GETCHAR(c, eptr);
2148 #ifdef SUPPORT_UCP
2149 if (md->use_ucp)
2150 {
2151 if (c == '_') cur_is_word = TRUE; else
2152 {
2153 int cat = UCD_CATEGORY(c);
2154 cur_is_word = (cat == ucp_L || cat == ucp_N);
2155 }
2156 }
2157 else
2158 #endif
2159 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2160 }
2161 }
2162 else
2163 #endif
2164
2165 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2166 consistency with the behaviour of \w we do use it in this case. */
2167
2168 {
2169 /* Get status of previous character */
2170
2171 if (eptr == md->start_subject) prev_is_word = FALSE; else
2172 {
2173 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2174 #ifdef SUPPORT_UCP
2175 if (md->use_ucp)
2176 {
2177 c = eptr[-1];
2178 if (c == '_') prev_is_word = TRUE; else
2179 {
2180 int cat = UCD_CATEGORY(c);
2181 prev_is_word = (cat == ucp_L || cat == ucp_N);
2182 }
2183 }
2184 else
2185 #endif
2186 prev_is_word = MAX_255(eptr[-1])
2187 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2188 }
2189
2190 /* Get status of next character */
2191
2192 if (eptr >= md->end_subject)
2193 {
2194 SCHECK_PARTIAL();
2195 cur_is_word = FALSE;
2196 }
2197 else
2198 #ifdef SUPPORT_UCP
2199 if (md->use_ucp)
2200 {
2201 c = *eptr;
2202 if (c == '_') cur_is_word = TRUE; else
2203 {
2204 int cat = UCD_CATEGORY(c);
2205 cur_is_word = (cat == ucp_L || cat == ucp_N);
2206 }
2207 }
2208 else
2209 #endif
2210 cur_is_word = MAX_255(*eptr)
2211 && ((md->ctypes[*eptr] & ctype_word) != 0);
2212 }
2213
2214 /* Now see if the situation is what we want */
2215
2216 if ((*ecode++ == OP_WORD_BOUNDARY)?
2217 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2218 RRETURN(MATCH_NOMATCH);
2219 }
2220 break;
2221
2222 /* Match a single character type; inline for speed */
2223
2224 case OP_ANY:
2225 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2226 /* Fall through */
2227
2228 case OP_ALLANY:
2229 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2230 { /* not be updated before SCHECK_PARTIAL. */
2231 SCHECK_PARTIAL();
2232 RRETURN(MATCH_NOMATCH);
2233 }
2234 eptr++;
2235 #ifdef SUPPORT_UTF
2236 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2237 #endif
2238 ecode++;
2239 break;
2240
2241 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2242 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2243
2244 case OP_ANYBYTE:
2245 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2246 { /* not be updated before SCHECK_PARTIAL. */
2247 SCHECK_PARTIAL();
2248 RRETURN(MATCH_NOMATCH);
2249 }
2250 eptr++;
2251 ecode++;
2252 break;
2253
2254 case OP_NOT_DIGIT:
2255 if (eptr >= md->end_subject)
2256 {
2257 SCHECK_PARTIAL();
2258 RRETURN(MATCH_NOMATCH);
2259 }
2260 GETCHARINCTEST(c, eptr);
2261 if (
2262 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2263 c < 256 &&
2264 #endif
2265 (md->ctypes[c] & ctype_digit) != 0
2266 )
2267 RRETURN(MATCH_NOMATCH);
2268 ecode++;
2269 break;
2270
2271 case OP_DIGIT:
2272 if (eptr >= md->end_subject)
2273 {
2274 SCHECK_PARTIAL();
2275 RRETURN(MATCH_NOMATCH);
2276 }
2277 GETCHARINCTEST(c, eptr);
2278 if (
2279 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2280 c > 255 ||
2281 #endif
2282 (md->ctypes[c] & ctype_digit) == 0
2283 )
2284 RRETURN(MATCH_NOMATCH);
2285 ecode++;
2286 break;
2287
2288 case OP_NOT_WHITESPACE:
2289 if (eptr >= md->end_subject)
2290 {
2291 SCHECK_PARTIAL();
2292 RRETURN(MATCH_NOMATCH);
2293 }
2294 GETCHARINCTEST(c, eptr);
2295 if (
2296 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2297 c < 256 &&
2298 #endif
2299 (md->ctypes[c] & ctype_space) != 0
2300 )
2301 RRETURN(MATCH_NOMATCH);
2302 ecode++;
2303 break;
2304
2305 case OP_WHITESPACE:
2306 if (eptr >= md->end_subject)
2307 {
2308 SCHECK_PARTIAL();
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 GETCHARINCTEST(c, eptr);
2312 if (
2313 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2314 c > 255 ||
2315 #endif
2316 (md->ctypes[c] & ctype_space) == 0
2317 )
2318 RRETURN(MATCH_NOMATCH);
2319 ecode++;
2320 break;
2321
2322 case OP_NOT_WORDCHAR:
2323 if (eptr >= md->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 RRETURN(MATCH_NOMATCH);
2327 }
2328 GETCHARINCTEST(c, eptr);
2329 if (
2330 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2331 c < 256 &&
2332 #endif
2333 (md->ctypes[c] & ctype_word) != 0
2334 )
2335 RRETURN(MATCH_NOMATCH);
2336 ecode++;
2337 break;
2338
2339 case OP_WORDCHAR:
2340 if (eptr >= md->end_subject)
2341 {
2342 SCHECK_PARTIAL();
2343 RRETURN(MATCH_NOMATCH);
2344 }
2345 GETCHARINCTEST(c, eptr);
2346 if (
2347 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2348 c > 255 ||
2349 #endif
2350 (md->ctypes[c] & ctype_word) == 0
2351 )
2352 RRETURN(MATCH_NOMATCH);
2353 ecode++;
2354 break;
2355
2356 case OP_ANYNL:
2357 if (eptr >= md->end_subject)
2358 {
2359 SCHECK_PARTIAL();
2360 RRETURN(MATCH_NOMATCH);
2361 }
2362 GETCHARINCTEST(c, eptr);
2363 switch(c)
2364 {
2365 default: RRETURN(MATCH_NOMATCH);
2366
2367 case 0x000d:
2368 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2369 break;
2370
2371 case 0x000a:
2372 break;
2373
2374 case 0x000b:
2375 case 0x000c:
2376 case 0x0085:
2377 case 0x2028:
2378 case 0x2029:
2379 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2380 break;
2381 }
2382 ecode++;
2383 break;
2384
2385 case OP_NOT_HSPACE:
2386 if (eptr >= md->end_subject)
2387 {
2388 SCHECK_PARTIAL();
2389 RRETURN(MATCH_NOMATCH);
2390 }
2391 GETCHARINCTEST(c, eptr);
2392 switch(c)
2393 {
2394 default: break;
2395 case 0x09: /* HT */
2396 case 0x20: /* SPACE */
2397 case 0xa0: /* NBSP */
2398 case 0x1680: /* OGHAM SPACE MARK */
2399 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2400 case 0x2000: /* EN QUAD */
2401 case 0x2001: /* EM QUAD */
2402 case 0x2002: /* EN SPACE */
2403 case 0x2003: /* EM SPACE */
2404 case 0x2004: /* THREE-PER-EM SPACE */
2405 case 0x2005: /* FOUR-PER-EM SPACE */
2406 case 0x2006: /* SIX-PER-EM SPACE */
2407 case 0x2007: /* FIGURE SPACE */
2408 case 0x2008: /* PUNCTUATION SPACE */
2409 case 0x2009: /* THIN SPACE */
2410 case 0x200A: /* HAIR SPACE */
2411 case 0x202f: /* NARROW NO-BREAK SPACE */
2412 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2413 case 0x3000: /* IDEOGRAPHIC SPACE */
2414 RRETURN(MATCH_NOMATCH);
2415 }
2416 ecode++;
2417 break;
2418
2419 case OP_HSPACE:
2420 if (eptr >= md->end_subject)
2421 {
2422 SCHECK_PARTIAL();
2423 RRETURN(MATCH_NOMATCH);
2424 }
2425 GETCHARINCTEST(c, eptr);
2426 switch(c)
2427 {
2428 default: RRETURN(MATCH_NOMATCH);
2429 case 0x09: /* HT */
2430 case 0x20: /* SPACE */
2431 case 0xa0: /* NBSP */
2432 case 0x1680: /* OGHAM SPACE MARK */
2433 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2434 case 0x2000: /* EN QUAD */
2435 case 0x2001: /* EM QUAD */
2436 case 0x2002: /* EN SPACE */
2437 case 0x2003: /* EM SPACE */
2438 case 0x2004: /* THREE-PER-EM SPACE */
2439 case 0x2005: /* FOUR-PER-EM SPACE */
2440 case 0x2006: /* SIX-PER-EM SPACE */
2441 case 0x2007: /* FIGURE SPACE */
2442 case 0x2008: /* PUNCTUATION SPACE */
2443 case 0x2009: /* THIN SPACE */
2444 case 0x200A: /* HAIR SPACE */
2445 case 0x202f: /* NARROW NO-BREAK SPACE */
2446 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2447 case 0x3000: /* IDEOGRAPHIC SPACE */
2448 break;
2449 }
2450 ecode++;
2451 break;
2452
2453 case OP_NOT_VSPACE:
2454 if (eptr >= md->end_subject)
2455 {
2456 SCHECK_PARTIAL();
2457 RRETURN(MATCH_NOMATCH);
2458 }
2459 GETCHARINCTEST(c, eptr);
2460 switch(c)
2461 {
2462 default: break;
2463 case 0x0a: /* LF */
2464 case 0x0b: /* VT */
2465 case 0x0c: /* FF */
2466 case 0x0d: /* CR */
2467 case 0x85: /* NEL */
2468 case 0x2028: /* LINE SEPARATOR */
2469 case 0x2029: /* PARAGRAPH SEPARATOR */
2470 RRETURN(MATCH_NOMATCH);
2471 }
2472 ecode++;
2473 break;
2474
2475 case OP_VSPACE:
2476 if (eptr >= md->end_subject)
2477 {
2478 SCHECK_PARTIAL();
2479 RRETURN(MATCH_NOMATCH);
2480 }
2481 GETCHARINCTEST(c, eptr);
2482 switch(c)
2483 {
2484 default: RRETURN(MATCH_NOMATCH);
2485 case 0x0a: /* LF */
2486 case 0x0b: /* VT */
2487 case 0x0c: /* FF */
2488 case 0x0d: /* CR */
2489 case 0x85: /* NEL */
2490 case 0x2028: /* LINE SEPARATOR */
2491 case 0x2029: /* PARAGRAPH SEPARATOR */
2492 break;
2493 }
2494 ecode++;
2495 break;
2496
2497 #ifdef SUPPORT_UCP
2498 /* Check the next character by Unicode property. We will get here only
2499 if the support is in the binary; otherwise a compile-time error occurs. */
2500
2501 case OP_PROP:
2502 case OP_NOTPROP:
2503 if (eptr >= md->end_subject)
2504 {
2505 SCHECK_PARTIAL();
2506 RRETURN(MATCH_NOMATCH);
2507 }
2508 GETCHARINCTEST(c, eptr);
2509 {
2510 const ucd_record *prop = GET_UCD(c);
2511
2512 switch(ecode[1])
2513 {
2514 case PT_ANY:
2515 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2516 break;
2517
2518 case PT_LAMP:
2519 if ((prop->chartype == ucp_Lu ||
2520 prop->chartype == ucp_Ll ||
2521 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2522 RRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 case PT_GC:
2526 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2527 RRETURN(MATCH_NOMATCH);
2528 break;
2529
2530 case PT_PC:
2531 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2532 RRETURN(MATCH_NOMATCH);
2533 break;
2534
2535 case PT_SC:
2536 if ((ecode[2] != prop->script) == (op == OP_PROP))
2537 RRETURN(MATCH_NOMATCH);
2538 break;
2539
2540 /* These are specials */
2541
2542 case PT_ALNUM:
2543 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2544 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2545 RRETURN(MATCH_NOMATCH);
2546 break;
2547
2548 case PT_SPACE: /* Perl space */
2549 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2550 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2551 == (op == OP_NOTPROP))
2552 RRETURN(MATCH_NOMATCH);
2553 break;
2554
2555 case PT_PXSPACE: /* POSIX space */
2556 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2557 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2558 c == CHAR_FF || c == CHAR_CR)
2559 == (op == OP_NOTPROP))
2560 RRETURN(MATCH_NOMATCH);
2561 break;
2562
2563 case PT_WORD:
2564 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2565 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2566 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2567 RRETURN(MATCH_NOMATCH);
2568 break;
2569
2570 /* This should never occur */
2571
2572 default:
2573 RRETURN(PCRE_ERROR_INTERNAL);
2574 }
2575
2576 ecode += 3;
2577 }
2578 break;
2579
2580 /* Match an extended Unicode sequence. We will get here only if the support
2581 is in the binary; otherwise a compile-time error occurs. */
2582
2583 case OP_EXTUNI:
2584 if (eptr >= md->end_subject)
2585 {
2586 SCHECK_PARTIAL();
2587 RRETURN(MATCH_NOMATCH);
2588 }
2589 GETCHARINCTEST(c, eptr);
2590 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2591 while (eptr < md->end_subject)
2592 {
2593 int len = 1;
2594 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2595 if (UCD_CATEGORY(c) != ucp_M) break;
2596 eptr += len;
2597 }
2598 ecode++;
2599 break;
2600 #endif
2601
2602
2603 /* Match a back reference, possibly repeatedly. Look past the end of the
2604 item to see if there is repeat information following. The code is similar
2605 to that for character classes, but repeated for efficiency. Then obey
2606 similar code to character type repeats - written out again for speed.
2607 However, if the referenced string is the empty string, always treat
2608 it as matched, any number of times (otherwise there could be infinite
2609 loops). */
2610
2611 case OP_REF:
2612 case OP_REFI:
2613 caseless = op == OP_REFI;
2614 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2615 ecode += 1 + IMM2_SIZE;
2616
2617 /* If the reference is unset, there are two possibilities:
2618
2619 (a) In the default, Perl-compatible state, set the length negative;
2620 this ensures that every attempt at a match fails. We can't just fail
2621 here, because of the possibility of quantifiers with zero minima.
2622
2623 (b) If the JavaScript compatibility flag is set, set the length to zero
2624 so that the back reference matches an empty string.
2625
2626 Otherwise, set the length to the length of what was matched by the
2627 referenced subpattern. */
2628
2629 if (offset >= offset_top || md->offset_vector[offset] < 0)
2630 length = (md->jscript_compat)? 0 : -1;
2631 else
2632 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2633
2634 /* Set up for repetition, or handle the non-repeated case */
2635
2636 switch (*ecode)
2637 {
2638 case OP_CRSTAR:
2639 case OP_CRMINSTAR:
2640 case OP_CRPLUS:
2641 case OP_CRMINPLUS:
2642 case OP_CRQUERY:
2643 case OP_CRMINQUERY:
2644 c = *ecode++ - OP_CRSTAR;
2645 minimize = (c & 1) != 0;
2646 min = rep_min[c]; /* Pick up values from tables; */
2647 max = rep_max[c]; /* zero for max => infinity */
2648 if (max == 0) max = INT_MAX;
2649 break;
2650
2651 case OP_CRRANGE:
2652 case OP_CRMINRANGE:
2653 minimize = (*ecode == OP_CRMINRANGE);
2654 min = GET2(ecode, 1);
2655 max = GET2(ecode, 1 + IMM2_SIZE);
2656 if (max == 0) max = INT_MAX;
2657 ecode += 1 + 2 * IMM2_SIZE;
2658 break;
2659
2660 default: /* No repeat follows */
2661 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2662 {
2663 CHECK_PARTIAL();
2664 RRETURN(MATCH_NOMATCH);
2665 }
2666 eptr += length;
2667 continue; /* With the main loop */
2668 }
2669
2670 /* Handle repeated back references. If the length of the reference is
2671 zero, just continue with the main loop. If the length is negative, it
2672 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2673 zero, we can continue at the same level without recursion. For any other
2674 minimum, carrying on will result in NOMATCH. */
2675
2676 if (length == 0) continue;
2677 if (length < 0 && min == 0) continue;
2678
2679 /* First, ensure the minimum number of matches are present. We get back
2680 the length of the reference string explicitly rather than passing the
2681 address of eptr, so that eptr can be a register variable. */
2682
2683 for (i = 1; i <= min; i++)
2684 {
2685 int slength;
2686 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2687 {
2688 CHECK_PARTIAL();
2689 RRETURN(MATCH_NOMATCH);
2690 }
2691 eptr += slength;
2692 }
2693
2694 /* If min = max, continue at the same level without recursion.
2695 They are not both allowed to be zero. */
2696
2697 if (min == max) continue;
2698
2699 /* If minimizing, keep trying and advancing the pointer */
2700
2701 if (minimize)
2702 {
2703 for (fi = min;; fi++)
2704 {
2705 int slength;
2706 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2708 if (fi >= max) RRETURN(MATCH_NOMATCH);
2709 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2710 {
2711 CHECK_PARTIAL();
2712 RRETURN(MATCH_NOMATCH);
2713 }
2714 eptr += slength;
2715 }
2716 /* Control never gets here */
2717 }
2718
2719 /* If maximizing, find the longest string and work backwards */
2720
2721 else
2722 {
2723 pp = eptr;
2724 for (i = min; i < max; i++)
2725 {
2726 int slength;
2727 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2728 {
2729 CHECK_PARTIAL();
2730 break;
2731 }
2732 eptr += slength;
2733 }
2734 while (eptr >= pp)
2735 {
2736 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2737 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2738 eptr -= length;
2739 }
2740 RRETURN(MATCH_NOMATCH);
2741 }
2742 /* Control never gets here */
2743
2744 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2745 used when all the characters in the class have values in the range 0-255,
2746 and either the matching is caseful, or the characters are in the range
2747 0-127 when UTF-8 processing is enabled. The only difference between
2748 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2749 encountered.
2750
2751 First, look past the end of the item to see if there is repeat information
2752 following. Then obey similar code to character type repeats - written out
2753 again for speed. */
2754
2755 case OP_NCLASS:
2756 case OP_CLASS:
2757 {
2758 /* The data variable is saved across frames, so the byte map needs to
2759 be stored there. */
2760 #define BYTE_MAP ((pcre_uint8 *)data)
2761 data = ecode + 1; /* Save for matching */
2762 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2763
2764 switch (*ecode)
2765 {
2766 case OP_CRSTAR:
2767 case OP_CRMINSTAR:
2768 case OP_CRPLUS:
2769 case OP_CRMINPLUS:
2770 case OP_CRQUERY:
2771 case OP_CRMINQUERY:
2772 c = *ecode++ - OP_CRSTAR;
2773 minimize = (c & 1) != 0;
2774 min = rep_min[c]; /* Pick up values from tables; */
2775 max = rep_max[c]; /* zero for max => infinity */
2776 if (max == 0) max = INT_MAX;
2777 break;
2778
2779 case OP_CRRANGE:
2780 case OP_CRMINRANGE:
2781 minimize = (*ecode == OP_CRMINRANGE);
2782 min = GET2(ecode, 1);
2783 max = GET2(ecode, 1 + IMM2_SIZE);
2784 if (max == 0) max = INT_MAX;
2785 ecode += 1 + 2 * IMM2_SIZE;
2786 break;
2787
2788 default: /* No repeat follows */
2789 min = max = 1;
2790 break;
2791 }
2792
2793 /* First, ensure the minimum number of matches are present. */
2794
2795 #ifdef SUPPORT_UTF
2796 if (utf)
2797 {
2798 for (i = 1; i <= min; i++)
2799 {
2800 if (eptr >= md->end_subject)
2801 {
2802 SCHECK_PARTIAL();
2803 RRETURN(MATCH_NOMATCH);
2804 }
2805 GETCHARINC(c, eptr);
2806 if (c > 255)
2807 {
2808 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2809 }
2810 else
2811 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2812 }
2813 }
2814 else
2815 #endif
2816 /* Not UTF mode */
2817 {
2818 for (i = 1; i <= min; i++)
2819 {
2820 if (eptr >= md->end_subject)
2821 {
2822 SCHECK_PARTIAL();
2823 RRETURN(MATCH_NOMATCH);
2824 }
2825 c = *eptr++;
2826 #ifndef COMPILE_PCRE8
2827 if (c > 255)
2828 {
2829 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2830 }
2831 else
2832 #endif
2833 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2834 }
2835 }
2836
2837 /* If max == min we can continue with the main loop without the
2838 need to recurse. */
2839
2840 if (min == max) continue;
2841
2842 /* If minimizing, keep testing the rest of the expression and advancing
2843 the pointer while it matches the class. */
2844
2845 if (minimize)
2846 {
2847 #ifdef SUPPORT_UTF
2848 if (utf)
2849 {
2850 for (fi = min;; fi++)
2851 {
2852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2854 if (fi >= max) RRETURN(MATCH_NOMATCH);
2855 if (eptr >= md->end_subject)
2856 {
2857 SCHECK_PARTIAL();
2858 RRETURN(MATCH_NOMATCH);
2859 }
2860 GETCHARINC(c, eptr);
2861 if (c > 255)
2862 {
2863 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2864 }
2865 else
2866 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2867 }
2868 }
2869 else
2870 #endif
2871 /* Not UTF mode */
2872 {
2873 for (fi = min;; fi++)
2874 {
2875 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2877 if (fi >= max) RRETURN(MATCH_NOMATCH);
2878 if (eptr >= md->end_subject)
2879 {
2880 SCHECK_PARTIAL();
2881 RRETURN(MATCH_NOMATCH);
2882 }
2883 c = *eptr++;
2884 #ifndef COMPILE_PCRE8
2885 if (c > 255)
2886 {
2887 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2888 }
2889 else
2890 #endif
2891 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2892 }
2893 }
2894 /* Control never gets here */
2895 }
2896
2897 /* If maximizing, find the longest possible run, then work backwards. */
2898
2899 else
2900 {
2901 pp = eptr;
2902
2903 #ifdef SUPPORT_UTF
2904 if (utf)
2905 {
2906 for (i = min; i < max; i++)
2907 {
2908 int len = 1;
2909 if (eptr >= md->end_subject)
2910 {
2911 SCHECK_PARTIAL();
2912 break;
2913 }
2914 GETCHARLEN(c, eptr, len);
2915 if (c > 255)
2916 {
2917 if (op == OP_CLASS) break;
2918 }
2919 else
2920 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2921 eptr += len;
2922 }
2923 for (;;)
2924 {
2925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2927 if (eptr-- == pp) break; /* Stop if tried at original pos */
2928 BACKCHAR(eptr);
2929 }
2930 }
2931 else
2932 #endif
2933 /* Not UTF mode */
2934 {
2935 for (i = min; i < max; i++)
2936 {
2937 if (eptr >= md->end_subject)
2938 {
2939 SCHECK_PARTIAL();
2940 break;
2941 }
2942 c = *eptr;
2943 #ifndef COMPILE_PCRE8
2944 if (c > 255)
2945 {
2946 if (op == OP_CLASS) break;
2947 }
2948 else
2949 #endif
2950 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2951 eptr++;
2952 }
2953 while (eptr >= pp)
2954 {
2955 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2956 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2957 eptr--;
2958 }
2959 }
2960
2961 RRETURN(MATCH_NOMATCH);
2962 }
2963 #undef BYTE_MAP
2964 }
2965 /* Control never gets here */
2966
2967
2968 /* Match an extended character class. This opcode is encountered only
2969 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2970 mode, because Unicode properties are supported in non-UTF-8 mode. */
2971
2972 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2973 case OP_XCLASS:
2974 {
2975 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2976 ecode += GET(ecode, 1); /* Advance past the item */
2977
2978 switch (*ecode)
2979 {
2980 case OP_CRSTAR:
2981 case OP_CRMINSTAR:
2982 case OP_CRPLUS:
2983 case OP_CRMINPLUS:
2984 case OP_CRQUERY:
2985 case OP_CRMINQUERY:
2986 c = *ecode++ - OP_CRSTAR;
2987 minimize = (c & 1) != 0;
2988 min = rep_min[c]; /* Pick up values from tables; */
2989 max = rep_max[c]; /* zero for max => infinity */
2990 if (max == 0) max = INT_MAX;
2991 break;
2992
2993 case OP_CRRANGE:
2994 case OP_CRMINRANGE:
2995 minimize = (*ecode == OP_CRMINRANGE);
2996 min = GET2(ecode, 1);
2997 max = GET2(ecode, 1 + IMM2_SIZE);
2998 if (max == 0) max = INT_MAX;
2999 ecode += 1 + 2 * IMM2_SIZE;
3000 break;
3001
3002 default: /* No repeat follows */
3003 min = max = 1;
3004 break;
3005 }
3006
3007 /* First, ensure the minimum number of matches are present. */
3008
3009 for (i = 1; i <= min; i++)
3010 {
3011 if (eptr >= md->end_subject)
3012 {
3013 SCHECK_PARTIAL();
3014 RRETURN(MATCH_NOMATCH);
3015 }
3016 GETCHARINCTEST(c, eptr);
3017 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3018 }
3019
3020 /* If max == min we can continue with the main loop without the
3021 need to recurse. */
3022
3023 if (min == max) continue;
3024
3025 /* If minimizing, keep testing the rest of the expression and advancing
3026 the pointer while it matches the class. */
3027
3028 if (minimize)
3029 {
3030 for (fi = min;; fi++)
3031 {
3032 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3034 if (fi >= max) RRETURN(MATCH_NOMATCH);
3035 if (eptr >= md->end_subject)
3036 {
3037 SCHECK_PARTIAL();
3038 RRETURN(MATCH_NOMATCH);
3039 }
3040 GETCHARINCTEST(c, eptr);
3041 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3042 }
3043 /* Control never gets here */
3044 }
3045
3046 /* If maximizing, find the longest possible run, then work backwards. */
3047
3048 else
3049 {
3050 pp = eptr;
3051 for (i = min; i < max; i++)
3052 {
3053 int len = 1;
3054 if (eptr >= md->end_subject)
3055 {
3056 SCHECK_PARTIAL();
3057 break;
3058 }
3059 #ifdef SUPPORT_UTF
3060 GETCHARLENTEST(c, eptr, len);
3061 #else
3062 c = *eptr;
3063 #endif
3064 if (!PRIV(xclass)(c, data, utf)) break;
3065 eptr += len;
3066 }
3067 for(;;)
3068 {
3069 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3071 if (eptr-- == pp) break; /* Stop if tried at original pos */
3072 #ifdef SUPPORT_UTF
3073 if (utf) BACKCHAR(eptr);
3074 #endif
3075 }
3076 RRETURN(MATCH_NOMATCH);
3077 }
3078
3079 /* Control never gets here */
3080 }
3081 #endif /* End of XCLASS */
3082
3083 /* Match a single character, casefully */
3084
3085 case OP_CHAR:
3086 #ifdef SUPPORT_UTF
3087 if (utf)
3088 {
3089 length = 1;
3090 ecode++;
3091 GETCHARLEN(fc, ecode, length);
3092 if (length > md->end_subject - eptr)
3093 {
3094 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3095 RRETURN(MATCH_NOMATCH);
3096 }
3097 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3098 }
3099 else
3100 #endif
3101 /* Not UTF mode */
3102 {
3103 if (md->end_subject - eptr < 1)
3104 {
3105 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3106 RRETURN(MATCH_NOMATCH);
3107 }
3108 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3109 ecode += 2;
3110 }
3111 break;
3112
3113 /* Match a single character, caselessly. If we are at the end of the
3114 subject, give up immediately. */
3115
3116 case OP_CHARI:
3117 if (eptr >= md->end_subject)
3118 {
3119 SCHECK_PARTIAL();
3120 RRETURN(MATCH_NOMATCH);
3121 }
3122
3123 #ifdef SUPPORT_UTF
3124 if (utf)
3125 {
3126 length = 1;
3127 ecode++;
3128 GETCHARLEN(fc, ecode, length);
3129
3130 /* If the pattern character's value is < 128, we have only one byte, and
3131 we know that its other case must also be one byte long, so we can use the
3132 fast lookup table. We know that there is at least one byte left in the
3133 subject. */
3134
3135 if (fc < 128)
3136 {
3137 if (md->lcc[fc]
3138 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3139 ecode++;
3140 eptr++;
3141 }
3142
3143 /* Otherwise we must pick up the subject character. Note that we cannot
3144 use the value of "length" to check for sufficient bytes left, because the
3145 other case of the character may have more or fewer bytes. */
3146
3147 else
3148 {
3149 unsigned int dc;
3150 GETCHARINC(dc, eptr);
3151 ecode += length;
3152
3153 /* If we have Unicode property support, we can use it to test the other
3154 case of the character, if there is one. */
3155
3156 if (fc != dc)
3157 {
3158 #ifdef SUPPORT_UCP
3159 if (dc != UCD_OTHERCASE(fc))
3160 #endif
3161 RRETURN(MATCH_NOMATCH);
3162 }
3163 }
3164 }
3165 else
3166 #endif /* SUPPORT_UTF */
3167
3168 /* Not UTF mode */
3169 {
3170 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3171 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3172 eptr++;
3173 ecode += 2;
3174 }
3175 break;
3176
3177 /* Match a single character repeatedly. */
3178
3179 case OP_EXACT:
3180 case OP_EXACTI:
3181 min = max = GET2(ecode, 1);
3182 ecode += 1 + IMM2_SIZE;
3183 goto REPEATCHAR;
3184
3185 case OP_POSUPTO:
3186 case OP_POSUPTOI:
3187 possessive = TRUE;
3188 /* Fall through */
3189
3190 case OP_UPTO:
3191 case OP_UPTOI:
3192 case OP_MINUPTO:
3193 case OP_MINUPTOI:
3194 min = 0;
3195 max = GET2(ecode, 1);
3196 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3197 ecode += 1 + IMM2_SIZE;
3198 goto REPEATCHAR;
3199
3200 case OP_POSSTAR:
3201 case OP_POSSTARI:
3202 possessive = TRUE;
3203 min = 0;
3204 max = INT_MAX;
3205 ecode++;
3206 goto REPEATCHAR;
3207
3208 case OP_POSPLUS:
3209 case OP_POSPLUSI:
3210 possessive = TRUE;
3211 min = 1;
3212 max = INT_MAX;
3213 ecode++;
3214 goto REPEATCHAR;
3215
3216 case OP_POSQUERY:
3217 case OP_POSQUERYI:
3218 possessive = TRUE;
3219 min = 0;
3220 max = 1;
3221 ecode++;
3222 goto REPEATCHAR;
3223
3224 case OP_STAR:
3225 case OP_STARI:
3226 case OP_MINSTAR:
3227 case OP_MINSTARI:
3228 case OP_PLUS:
3229 case OP_PLUSI:
3230 case OP_MINPLUS:
3231 case OP_MINPLUSI:
3232 case OP_QUERY:
3233 case OP_QUERYI:
3234 case OP_MINQUERY:
3235 case OP_MINQUERYI:
3236 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3237 minimize = (c & 1) != 0;
3238 min = rep_min[c]; /* Pick up values from tables; */
3239 max = rep_max[c]; /* zero for max => infinity */
3240 if (max == 0) max = INT_MAX;
3241
3242 /* Common code for all repeated single-character matches. */
3243
3244 REPEATCHAR:
3245 #ifdef SUPPORT_UTF
3246 if (utf)
3247 {
3248 length = 1;
3249 charptr = ecode;
3250 GETCHARLEN(fc, ecode, length);
3251 ecode += length;
3252
3253 /* Handle multibyte character matching specially here. There is
3254 support for caseless matching if UCP support is present. */
3255
3256 if (length > 1)
3257 {
3258 #ifdef SUPPORT_UCP
3259 unsigned int othercase;
3260 if (op >= OP_STARI && /* Caseless */
3261 (othercase = UCD_OTHERCASE(fc)) != fc)
3262 oclength = PRIV(ord2utf)(othercase, occhars);
3263 else oclength = 0;
3264 #endif /* SUPPORT_UCP */
3265
3266 for (i = 1; i <= min; i++)
3267 {
3268 if (eptr <= md->end_subject - length &&
3269 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3270 #ifdef SUPPORT_UCP
3271 else if (oclength > 0 &&
3272 eptr <= md->end_subject - oclength &&
3273 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3274 #endif /* SUPPORT_UCP */
3275 else
3276 {
3277 CHECK_PARTIAL();
3278 RRETURN(MATCH_NOMATCH);
3279 }
3280 }
3281
3282 if (min == max) continue;
3283
3284 if (minimize)
3285 {
3286 for (fi = min;; fi++)
3287 {
3288 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3289 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3290 if (fi >= max) RRETURN(MATCH_NOMATCH);
3291 if (eptr <= md->end_subject - length &&
3292 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3293 #ifdef SUPPORT_UCP
3294 else if (oclength > 0 &&
3295 eptr <= md->end_subject - oclength &&
3296 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3297 #endif /* SUPPORT_UCP */
3298 else
3299 {
3300 CHECK_PARTIAL();
3301 RRETURN(MATCH_NOMATCH);
3302 }
3303 }
3304 /* Control never gets here */
3305 }
3306
3307 else /* Maximize */
3308 {
3309 pp = eptr;
3310 for (i = min; i < max; i++)
3311 {
3312 if (eptr <= md->end_subject - length &&
3313 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3314 #ifdef SUPPORT_UCP
3315 else if (oclength > 0 &&
3316 eptr <= md->end_subject - oclength &&
3317 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3318 #endif /* SUPPORT_UCP */
3319 else
3320 {
3321 CHECK_PARTIAL();
3322 break;
3323 }
3324 }
3325
3326 if (possessive) continue;
3327
3328 for(;;)
3329 {
3330 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3331 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3332 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3333 #ifdef SUPPORT_UCP
3334 eptr--;
3335 BACKCHAR(eptr);
3336 #else /* without SUPPORT_UCP */
3337 eptr -= length;
3338 #endif /* SUPPORT_UCP */
3339 }
3340 }
3341 /* Control never gets here */
3342 }
3343
3344 /* If the length of a UTF-8 character is 1, we fall through here, and
3345 obey the code as for non-UTF-8 characters below, though in this case the
3346 value of fc will always be < 128. */
3347 }
3348 else
3349 #endif /* SUPPORT_UTF */
3350 /* When not in UTF-8 mode, load a single-byte character. */
3351 fc = *ecode++;
3352
3353 /* The value of fc at this point is always one character, though we may
3354 or may not be in UTF mode. The code is duplicated for the caseless and
3355 caseful cases, for speed, since matching characters is likely to be quite
3356 common. First, ensure the minimum number of matches are present. If min =
3357 max, continue at the same level without recursing. Otherwise, if
3358 minimizing, keep trying the rest of the expression and advancing one
3359 matching character if failing, up to the maximum. Alternatively, if
3360 maximizing, find the maximum number of characters and work backwards. */
3361
3362 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3363 max, eptr));
3364
3365 if (op >= OP_STARI) /* Caseless */
3366 {
3367 #ifdef COMPILE_PCRE8
3368 /* fc must be < 128 if UTF is enabled. */
3369 foc = md->fcc[fc];
3370 #else
3371 #ifdef SUPPORT_UTF
3372 #ifdef SUPPORT_UCP
3373 if (utf && fc > 127)
3374 foc = UCD_OTHERCASE(fc);
3375 #else
3376 if (utf && fc > 127)
3377 foc = fc;
3378 #endif /* SUPPORT_UCP */
3379 else
3380 #endif /* SUPPORT_UTF */
3381 foc = TABLE_GET(fc, md->fcc, fc);
3382 #endif /* COMPILE_PCRE8 */
3383
3384 for (i = 1; i <= min; i++)
3385 {
3386 if (eptr >= md->end_subject)
3387 {
3388 SCHECK_PARTIAL();
3389 RRETURN(MATCH_NOMATCH);
3390 }
3391 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3392 eptr++;
3393 }
3394 if (min == max) continue;
3395 if (minimize)
3396 {
3397 for (fi = min;; fi++)
3398 {
3399 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3400 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3401 if (fi >= max) RRETURN(MATCH_NOMATCH);
3402 if (eptr >= md->end_subject)
3403 {
3404 SCHECK_PARTIAL();
3405 RRETURN(MATCH_NOMATCH);
3406 }
3407 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3408 eptr++;
3409 }
3410 /* Control never gets here */
3411 }
3412 else /* Maximize */
3413 {
3414 pp = eptr;
3415 for (i = min; i < max; i++)
3416 {
3417 if (eptr >= md->end_subject)
3418 {
3419 SCHECK_PARTIAL();
3420 break;
3421 }
3422 if (fc != *eptr && foc != *eptr) break;
3423 eptr++;
3424 }
3425
3426 if (possessive) continue;
3427
3428 while (eptr >= pp)
3429 {
3430 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3431 eptr--;
3432 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3433 }
3434 RRETURN(MATCH_NOMATCH);
3435 }
3436 /* Control never gets here */
3437 }
3438
3439 /* Caseful comparisons (includes all multi-byte characters) */
3440
3441 else
3442 {
3443 for (i = 1; i <= min; i++)
3444 {
3445 if (eptr >= md->end_subject)
3446 {
3447 SCHECK_PARTIAL();
3448 RRETURN(MATCH_NOMATCH);
3449 }
3450 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3451 }
3452
3453 if (min == max) continue;
3454
3455 if (minimize)
3456 {
3457 for (fi = min;; fi++)
3458 {
3459 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3460 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3461 if (fi >= max) RRETURN(MATCH_NOMATCH);
3462 if (eptr >= md->end_subject)
3463 {
3464 SCHECK_PARTIAL();
3465 RRETURN(MATCH_NOMATCH);
3466 }
3467 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3468 }
3469 /* Control never gets here */
3470 }
3471 else /* Maximize */
3472 {
3473 pp = eptr;
3474 for (i = min; i < max; i++)
3475 {
3476 if (eptr >= md->end_subject)
3477 {
3478 SCHECK_PARTIAL();
3479 break;
3480 }
3481 if (fc != *eptr) break;
3482 eptr++;
3483 }
3484 if (possessive) continue;
3485
3486 while (eptr >= pp)
3487 {
3488 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3489 eptr--;
3490 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3491 }
3492 RRETURN(MATCH_NOMATCH);
3493 }
3494 }
3495 /* Control never gets here */
3496
3497 /* Match a negated single one-byte character. The character we are
3498 checking can be multibyte. */
3499
3500 case OP_NOT:
3501 case OP_NOTI:
3502 if (eptr >= md->end_subject)
3503 {
3504 SCHECK_PARTIAL();
3505 RRETURN(MATCH_NOMATCH);
3506 }
3507 ecode++;
3508 GETCHARINCTEST(c, eptr);
3509 if (op == OP_NOTI) /* The caseless case */
3510 {
3511 register unsigned int ch, och;
3512 ch = *ecode++;
3513 #ifdef COMPILE_PCRE8
3514 /* ch must be < 128 if UTF is enabled. */
3515 och = md->fcc[ch];
3516 #else
3517 #ifdef SUPPORT_UTF
3518 #ifdef SUPPORT_UCP
3519 if (utf && ch > 127)
3520 och = UCD_OTHERCASE(ch);
3521 #else
3522 if (utf && ch > 127)
3523 och = ch;
3524 #endif /* SUPPORT_UCP */
3525 else
3526 #endif /* SUPPORT_UTF */
3527 och = TABLE_GET(ch, md->fcc, ch);
3528 #endif /* COMPILE_PCRE8 */
3529 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3530 }
3531 else /* Caseful */
3532 {
3533 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3534 }
3535 break;
3536
3537 /* Match a negated single one-byte character repeatedly. This is almost a
3538 repeat of the code for a repeated single character, but I haven't found a
3539 nice way of commoning these up that doesn't require a test of the
3540 positive/negative option for each character match. Maybe that wouldn't add
3541 very much to the time taken, but character matching *is* what this is all
3542 about... */
3543
3544 case OP_NOTEXACT:
3545 case OP_NOTEXACTI:
3546 min = max = GET2(ecode, 1);
3547 ecode += 1 + IMM2_SIZE;
3548 goto REPEATNOTCHAR;
3549
3550 case OP_NOTUPTO:
3551 case OP_NOTUPTOI:
3552 case OP_NOTMINUPTO:
3553 case OP_NOTMINUPTOI:
3554 min = 0;
3555 max = GET2(ecode, 1);
3556 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3557 ecode += 1 + IMM2_SIZE;
3558 goto REPEATNOTCHAR;
3559
3560 case OP_NOTPOSSTAR:
3561 case OP_NOTPOSSTARI:
3562 possessive = TRUE;
3563 min = 0;
3564 max = INT_MAX;
3565 ecode++;
3566 goto REPEATNOTCHAR;
3567
3568 case OP_NOTPOSPLUS:
3569 case OP_NOTPOSPLUSI:
3570 possessive = TRUE;
3571 min = 1;
3572 max = INT_MAX;
3573 ecode++;
3574 goto REPEATNOTCHAR;
3575
3576 case OP_NOTPOSQUERY:
3577 case OP_NOTPOSQUERYI:
3578 possessive = TRUE;
3579 min = 0;
3580 max = 1;
3581 ecode++;
3582 goto REPEATNOTCHAR;
3583
3584 case OP_NOTPOSUPTO:
3585 case OP_NOTPOSUPTOI:
3586 possessive = TRUE;
3587 min = 0;
3588 max = GET2(ecode, 1);
3589 ecode += 1 + IMM2_SIZE;
3590 goto REPEATNOTCHAR;
3591
3592 case OP_NOTSTAR:
3593 case OP_NOTSTARI:
3594 case OP_NOTMINSTAR:
3595 case OP_NOTMINSTARI:
3596 case OP_NOTPLUS:
3597 case OP_NOTPLUSI:
3598 case OP_NOTMINPLUS:
3599 case OP_NOTMINPLUSI:
3600 case OP_NOTQUERY:
3601 case OP_NOTQUERYI:
3602 case OP_NOTMINQUERY:
3603 case OP_NOTMINQUERYI:
3604 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3605 minimize = (c & 1) != 0;
3606 min = rep_min[c]; /* Pick up values from tables; */
3607 max = rep_max[c]; /* zero for max => infinity */
3608 if (max == 0) max = INT_MAX;
3609
3610 /* Common code for all repeated single-byte matches. */
3611
3612 REPEATNOTCHAR:
3613 fc = *ecode++;
3614
3615 /* The code is duplicated for the caseless and caseful cases, for speed,
3616 since matching characters is likely to be quite common. First, ensure the
3617 minimum number of matches are present. If min = max, continue at the same
3618 level without recursing. Otherwise, if minimizing, keep trying the rest of
3619 the expression and advancing one matching character if failing, up to the
3620 maximum. Alternatively, if maximizing, find the maximum number of
3621 characters and work backwards. */
3622
3623 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3624 max, eptr));
3625
3626 if (op >= OP_NOTSTARI) /* Caseless */
3627 {
3628 #ifdef COMPILE_PCRE8
3629 /* fc must be < 128 if UTF is enabled. */
3630 foc = md->fcc[fc];
3631 #else
3632 #ifdef SUPPORT_UTF
3633 #ifdef SUPPORT_UCP
3634 if (utf && fc > 127)
3635 foc = UCD_OTHERCASE(fc);
3636 #else
3637 if (utf && fc > 127)
3638 foc = fc;
3639 #endif /* SUPPORT_UCP */
3640 else
3641 #endif /* SUPPORT_UTF */
3642 foc = TABLE_GET(fc, md->fcc, fc);
3643 #endif /* COMPILE_PCRE8 */
3644
3645 #ifdef SUPPORT_UTF
3646 if (utf)
3647 {
3648 register unsigned int d;
3649 for (i = 1; i <= min; i++)
3650 {
3651 if (eptr >= md->end_subject)
3652 {
3653 SCHECK_PARTIAL();
3654 RRETURN(MATCH_NOMATCH);
3655 }
3656 GETCHARINC(d, eptr);
3657 if (fc == d || (unsigned int) foc == d) RRETURN(MATCH_NOMATCH);
3658 }
3659 }
3660 else
3661 #endif
3662 /* Not UTF mode */
3663 {
3664 for (i = 1; i <= min; i++)
3665 {
3666 if (eptr >= md->end_subject)
3667 {
3668 SCHECK_PARTIAL();
3669 RRETURN(MATCH_NOMATCH);
3670 }
3671 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3672 eptr++;
3673 }
3674 }
3675
3676 if (min == max) continue;
3677
3678 if (minimize)
3679 {
3680 #ifdef SUPPORT_UTF
3681 if (utf)
3682 {
3683 register unsigned int d;
3684 for (fi = min;; fi++)
3685 {
3686 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3688 if (fi >= max) RRETURN(MATCH_NOMATCH);
3689 if (eptr >= md->end_subject)
3690 {
3691 SCHECK_PARTIAL();
3692 RRETURN(MATCH_NOMATCH);
3693 }
3694 GETCHARINC(d, eptr);
3695 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3696 }
3697 }
3698 else
3699 #endif
3700 /* Not UTF mode */
3701 {
3702 for (fi = min;; fi++)
3703 {
3704 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3705 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3706 if (fi >= max) RRETURN(MATCH_NOMATCH);
3707 if (eptr >= md->end_subject)
3708 {
3709 SCHECK_PARTIAL();
3710 RRETURN(MATCH_NOMATCH);
3711 }
3712 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3713 eptr++;
3714 }
3715 }
3716 /* Control never gets here */
3717 }
3718
3719 /* Maximize case */
3720
3721 else
3722 {
3723 pp = eptr;
3724
3725 #ifdef SUPPORT_UTF
3726 if (utf)
3727 {
3728 register unsigned int d;
3729 for (i = min; i < max; i++)
3730 {
3731 int len = 1;
3732 if (eptr >= md->end_subject)
3733 {
3734 SCHECK_PARTIAL();
3735 break;
3736 }
3737 GETCHARLEN(d, eptr, len);
3738 if (fc == d || (unsigned int)foc == d) break;
3739 eptr += len;
3740 }
3741 if (possessive) continue;
3742 for(;;)
3743 {
3744 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3746 if (eptr-- == pp) break; /* Stop if tried at original pos */
3747 BACKCHAR(eptr);
3748 }
3749 }
3750 else
3751 #endif
3752 /* Not UTF mode */
3753 {
3754 for (i = min; i < max; i++)
3755 {
3756 if (eptr >= md->end_subject)
3757 {
3758 SCHECK_PARTIAL();
3759 break;
3760 }
3761 if (fc == *eptr || foc == *eptr) break;
3762 eptr++;
3763 }
3764 if (possessive) continue;
3765 while (eptr >= pp)
3766 {
3767 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3768 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3769 eptr--;
3770 }
3771 }
3772
3773 RRETURN(MATCH_NOMATCH);
3774 }
3775 /* Control never gets here */
3776 }
3777
3778 /* Caseful comparisons */
3779
3780 else
3781 {
3782 #ifdef SUPPORT_UTF
3783 if (utf)
3784 {
3785 register unsigned int d;
3786 for (i = 1; i <= min; i++)
3787 {
3788 if (eptr >= md->end_subject)
3789 {
3790 SCHECK_PARTIAL();
3791 RRETURN(MATCH_NOMATCH);
3792 }
3793 GETCHARINC(d, eptr);
3794 if (fc == d) RRETURN(MATCH_NOMATCH);
3795 }
3796 }
3797 else
3798 #endif
3799 /* Not UTF mode */
3800 {
3801 for (i = 1; i <= min; i++)
3802 {
3803 if (eptr >= md->end_subject)
3804 {
3805 SCHECK_PARTIAL();
3806 RRETURN(MATCH_NOMATCH);
3807 }
3808 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3809 }
3810 }
3811
3812 if (min == max) continue;
3813
3814 if (minimize)
3815 {
3816 #ifdef SUPPORT_UTF
3817 if (utf)
3818 {
3819 register unsigned int d;
3820 for (fi = min;; fi++)
3821 {
3822 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3824 if (fi >= max) RRETURN(MATCH_NOMATCH);
3825 if (eptr >= md->end_subject)
3826 {
3827 SCHECK_PARTIAL();
3828 RRETURN(MATCH_NOMATCH);
3829 }
3830 GETCHARINC(d, eptr);
3831 if (fc == d) RRETURN(MATCH_NOMATCH);
3832 }
3833 }
3834 else
3835 #endif
3836 /* Not UTF mode */
3837 {
3838 for (fi = min;; fi++)
3839 {
3840 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3842 if (fi >= max) RRETURN(MATCH_NOMATCH);
3843 if (eptr >= md->end_subject)
3844 {
3845 SCHECK_PARTIAL();
3846 RRETURN(MATCH_NOMATCH);
3847 }
3848 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3849 }
3850 }
3851 /* Control never gets here */
3852 }
3853
3854 /* Maximize case */
3855
3856 else
3857 {
3858 pp = eptr;
3859
3860 #ifdef SUPPORT_UTF
3861 if (utf)
3862 {
3863 register unsigned int d;
3864 for (i = min; i < max; i++)
3865 {
3866 int len = 1;
3867 if (eptr >= md->end_subject)
3868 {
3869 SCHECK_PARTIAL();
3870 break;
3871 }
3872 GETCHARLEN(d, eptr, len);
3873 if (fc == d) break;
3874 eptr += len;
3875 }
3876 if (possessive) continue;
3877 for(;;)
3878 {
3879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3881 if (eptr-- == pp) break; /* Stop if tried at original pos */
3882 BACKCHAR(eptr);
3883 }
3884 }
3885 else
3886 #endif
3887 /* Not UTF mode */
3888 {
3889 for (i = min; i < max; i++)
3890 {
3891 if (eptr >= md->end_subject)
3892 {
3893 SCHECK_PARTIAL();
3894 break;
3895 }
3896 if (fc == *eptr) break;
3897 eptr++;
3898 }
3899 if (possessive) continue;
3900 while (eptr >= pp)
3901 {
3902 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3904 eptr--;
3905 }
3906 }
3907
3908 RRETURN(MATCH_NOMATCH);
3909 }
3910 }
3911 /* Control never gets here */
3912
3913 /* Match a single character type repeatedly; several different opcodes
3914 share code. This is very similar to the code for single characters, but we
3915 repeat it in the interests of efficiency. */
3916
3917 case OP_TYPEEXACT:
3918 min = max = GET2(ecode, 1);
3919 minimize = TRUE;
3920 ecode += 1 + IMM2_SIZE;
3921 goto REPEATTYPE;
3922
3923 case OP_TYPEUPTO:
3924 case OP_TYPEMINUPTO:
3925 min = 0;
3926 max = GET2(ecode, 1);
3927 minimize = *ecode == OP_TYPEMINUPTO;
3928 ecode += 1 + IMM2_SIZE;
3929 goto REPEATTYPE;
3930
3931 case OP_TYPEPOSSTAR:
3932 possessive = TRUE;
3933 min = 0;
3934 max = INT_MAX;
3935 ecode++;
3936 goto REPEATTYPE;
3937
3938 case OP_TYPEPOSPLUS:
3939 possessive = TRUE;
3940 min = 1;
3941 max = INT_MAX;
3942 ecode++;
3943 goto REPEATTYPE;
3944
3945 case OP_TYPEPOSQUERY:
3946 possessive = TRUE;
3947 min = 0;
3948 max = 1;
3949 ecode++;
3950 goto REPEATTYPE;
3951
3952 case OP_TYPEPOSUPTO:
3953 possessive = TRUE;
3954 min = 0;
3955 max = GET2(ecode, 1);
3956 ecode += 1 + IMM2_SIZE;
3957 goto REPEATTYPE;
3958
3959 case OP_TYPESTAR:
3960 case OP_TYPEMINSTAR:
3961 case OP_TYPEPLUS:
3962 case OP_TYPEMINPLUS:
3963 case OP_TYPEQUERY:
3964 case OP_TYPEMINQUERY:
3965 c = *ecode++ - OP_TYPESTAR;
3966 minimize = (c & 1) != 0;
3967 min = rep_min[c]; /* Pick up values from tables; */
3968 max = rep_max[c]; /* zero for max => infinity */
3969 if (max == 0) max = INT_MAX;
3970
3971 /* Common code for all repeated single character type matches. Note that
3972 in UTF-8 mode, '.' matches a character of any length, but for the other
3973 character types, the valid characters are all one-byte long. */
3974
3975 REPEATTYPE:
3976 ctype = *ecode++; /* Code for the character type */
3977
3978 #ifdef SUPPORT_UCP
3979 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3980 {
3981 prop_fail_result = ctype == OP_NOTPROP;
3982 prop_type = *ecode++;
3983 prop_value = *ecode++;
3984 }
3985 else prop_type = -1;
3986 #endif
3987
3988 /* First, ensure the minimum number of matches are present. Use inline
3989 code for maximizing the speed, and do the type test once at the start
3990 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3991 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3992 and single-bytes. */
3993
3994 if (min > 0)
3995 {
3996 #ifdef SUPPORT_UCP
3997 if (prop_type >= 0)
3998 {
3999 switch(prop_type)
4000 {
4001 case PT_ANY:
4002 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4003 for (i = 1; i <= min; i++)
4004 {
4005 if (eptr >= md->end_subject)
4006 {
4007 SCHECK_PARTIAL();
4008 RRETURN(MATCH_NOMATCH);
4009 }
4010 GETCHARINCTEST(c, eptr);
4011 }
4012 break;
4013
4014 case PT_LAMP:
4015 for (i = 1; i <= min; i++)
4016 {
4017 int chartype;
4018 if (eptr >= md->end_subject)
4019 {
4020 SCHECK_PARTIAL();
4021 RRETURN(MATCH_NOMATCH);
4022 }
4023 GETCHARINCTEST(c, eptr);
4024 chartype = UCD_CHARTYPE(c);
4025 if ((chartype == ucp_Lu ||
4026 chartype == ucp_Ll ||
4027 chartype == ucp_Lt) == prop_fail_result)
4028 RRETURN(MATCH_NOMATCH);
4029 }
4030 break;
4031
4032 case PT_GC:
4033 for (i = 1; i <= min; i++)
4034 {
4035 if (eptr >= md->end_subject)
4036 {
4037 SCHECK_PARTIAL();
4038 RRETURN(MATCH_NOMATCH);
4039 }
4040 GETCHARINCTEST(c, eptr);
4041 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4042 RRETURN(MATCH_NOMATCH);
4043 }
4044 break;
4045
4046 case PT_PC:
4047 for (i = 1; i <= min; i++)
4048 {
4049 if (eptr >= md->end_subject)
4050 {
4051 SCHECK_PARTIAL();
4052 RRETURN(MATCH_NOMATCH);
4053 }
4054 GETCHARINCTEST(c, eptr);
4055 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4056 RRETURN(MATCH_NOMATCH);
4057 }
4058 break;
4059
4060 case PT_SC:
4061 for (i = 1; i <= min; i++)
4062 {
4063 if (eptr >= md->end_subject)
4064 {
4065 SCHECK_PARTIAL();
4066 RRETURN(MATCH_NOMATCH);
4067 }
4068 GETCHARINCTEST(c, eptr);
4069 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4070 RRETURN(MATCH_NOMATCH);
4071 }
4072 break;
4073
4074 case PT_ALNUM:
4075 for (i = 1; i <= min; i++)
4076 {
4077 int category;
4078 if (eptr >= md->end_subject)
4079 {
4080 SCHECK_PARTIAL();
4081 RRETURN(MATCH_NOMATCH);
4082 }
4083 GETCHARINCTEST(c, eptr);
4084 category = UCD_CATEGORY(c);
4085 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4086 RRETURN(MATCH_NOMATCH);
4087 }
4088 break;
4089
4090 case PT_SPACE: /* Perl space */
4091 for (i = 1; i <= min; i++)
4092 {
4093 if (eptr >= md->end_subject)
4094 {
4095 SCHECK_PARTIAL();
4096 RRETURN(MATCH_NOMATCH);
4097 }
4098 GETCHARINCTEST(c, eptr);
4099 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4100 c == CHAR_FF || c == CHAR_CR)
4101 == prop_fail_result)
4102 RRETURN(MATCH_NOMATCH);
4103 }
4104 break;
4105
4106 case PT_PXSPACE: /* POSIX space */
4107 for (i = 1; i <= min; i++)
4108 {
4109 if (eptr >= md->end_subject)
4110 {
4111 SCHECK_PARTIAL();
4112 RRETURN(MATCH_NOMATCH);
4113 }
4114 GETCHARINCTEST(c, eptr);
4115 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4116 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4117 == prop_fail_result)
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 break;
4121
4122 case PT_WORD:
4123 for (i = 1; i <= min; i++)
4124 {
4125 int category;
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 RRETURN(MATCH_NOMATCH);
4130 }
4131 GETCHARINCTEST(c, eptr);
4132 category = UCD_CATEGORY(c);
4133 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4134 == prop_fail_result)
4135 RRETURN(MATCH_NOMATCH);
4136 }
4137 break;
4138
4139 /* This should not occur */
4140
4141 default:
4142 RRETURN(PCRE_ERROR_INTERNAL);
4143 }
4144 }
4145
4146 /* Match extended Unicode sequences. We will get here only if the
4147 support is in the binary; otherwise a compile-time error occurs. */
4148
4149 else if (ctype == OP_EXTUNI)
4150 {
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 RRETURN(MATCH_NOMATCH);
4157 }
4158 GETCHARINCTEST(c, eptr);
4159 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4160 while (eptr < md->end_subject)
4161 {
4162 int len = 1;
4163 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4164 if (UCD_CATEGORY(c) != ucp_M) break;
4165 eptr += len;
4166 }
4167 }
4168 }
4169
4170 else
4171 #endif /* SUPPORT_UCP */
4172
4173 /* Handle all other cases when the coding is UTF-8 */
4174
4175 #ifdef SUPPORT_UTF
4176 if (utf) switch(ctype)
4177 {
4178 case OP_ANY:
4179 for (i = 1; i <= min; i++)
4180 {
4181 if (eptr >= md->end_subject)
4182 {
4183 SCHECK_PARTIAL();
4184 RRETURN(MATCH_NOMATCH);
4185 }
4186 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4187 eptr++;
4188 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4189 }
4190 break;
4191
4192 case OP_ALLANY:
4193 for (i = 1; i <= min; i++)
4194 {
4195 if (eptr >= md->end_subject)
4196 {
4197 SCHECK_PARTIAL();
4198 RRETURN(MATCH_NOMATCH);
4199 }
4200 eptr++;
4201 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4202 }
4203 break;
4204
4205 case OP_ANYBYTE:
4206 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4207 eptr += min;
4208 break;
4209
4210 case OP_ANYNL:
4211 for (i = 1; i <= min; i++)
4212 {
4213 if (eptr >= md->end_subject)
4214 {
4215 SCHECK_PARTIAL();
4216 RRETURN(MATCH_NOMATCH);
4217 }
4218 GETCHARINC(c, eptr);
4219 switch(c)
4220 {
4221 default: RRETURN(MATCH_NOMATCH);
4222
4223 case 0x000d:
4224 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4225 break;
4226
4227 case 0x000a:
4228 break;
4229
4230 case 0x000b:
4231 case 0x000c:
4232 case 0x0085:
4233 case 0x2028:
4234 case 0x2029:
4235 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4236 break;
4237 }
4238 }
4239 break;
4240
4241 case OP_NOT_HSPACE:
4242 for (i = 1; i <= min; i++)
4243 {
4244 if (eptr >= md->end_subject)
4245 {
4246 SCHECK_PARTIAL();
4247 RRETURN(MATCH_NOMATCH);
4248 }
4249 GETCHARINC(c, eptr);
4250 switch(c)
4251 {
4252 default: break;
4253 case 0x09: /* HT */
4254 case 0x20: /* SPACE */
4255 case 0xa0: /* NBSP */
4256 case 0x1680: /* OGHAM SPACE MARK */
4257 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4258 case 0x2000: /* EN QUAD */
4259 case 0x2001: /* EM QUAD */
4260 case 0x2002: /* EN SPACE */
4261 case 0x2003: /* EM SPACE */
4262 case 0x2004: /* THREE-PER-EM SPACE */
4263 case 0x2005: /* FOUR-PER-EM SPACE */
4264 case 0x2006: /* SIX-PER-EM SPACE */
4265 case 0x2007: /* FIGURE SPACE */
4266 case 0x2008: /* PUNCTUATION SPACE */
4267 case 0x2009: /* THIN SPACE */
4268 case 0x200A: /* HAIR SPACE */
4269 case 0x202f: /* NARROW NO-BREAK SPACE */
4270 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4271 case 0x3000: /* IDEOGRAPHIC SPACE */
4272 RRETURN(MATCH_NOMATCH);
4273 }
4274 }
4275 break;
4276
4277 case OP_HSPACE:
4278 for (i = 1; i <= min; i++)
4279 {
4280 if (eptr >= md->end_subject)
4281 {
4282 SCHECK_PARTIAL();
4283 RRETURN(MATCH_NOMATCH);
4284 }
4285 GETCHARINC(c, eptr);
4286 switch(c)
4287 {
4288 default: RRETURN(MATCH_NOMATCH);
4289 case 0x09: /* HT */
4290 case 0x20: /* SPACE */
4291 case 0xa0: /* NBSP */
4292 case 0x1680: /* OGHAM SPACE MARK */
4293 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4294 case 0x2000: /* EN QUAD */
4295 case 0x2001: /* EM QUAD */
4296 case 0x2002: /* EN SPACE */
4297 case 0x2003: /* EM SPACE */
4298 case 0x2004: /* THREE-PER-EM SPACE */
4299 case 0x2005: /* FOUR-PER-EM SPACE */
4300 case 0x2006: /* SIX-PER-EM SPACE */
4301 case 0x2007: /* FIGURE SPACE */
4302 case 0x2008: /* PUNCTUATION SPACE */
4303 case 0x2009: /* THIN SPACE */
4304 case 0x200A: /* HAIR SPACE */
4305 case 0x202f: /* NARROW NO-BREAK SPACE */
4306 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4307 case 0x3000: /* IDEOGRAPHIC SPACE */
4308 break;
4309 }
4310 }
4311 break;
4312
4313 case OP_NOT_VSPACE:
4314 for (i = 1; i <= min; i++)
4315 {
4316 if (eptr >= md->end_subject)
4317 {
4318 SCHECK_PARTIAL();
4319 RRETURN(MATCH_NOMATCH);
4320 }
4321 GETCHARINC(c, eptr);
4322 switch(c)
4323 {
4324 default: break;
4325 case 0x0a: /* LF */
4326 case 0x0b: /* VT */
4327 case 0x0c: /* FF */
4328 case 0x0d: /* CR */
4329 case 0x85: /* NEL */
4330 case 0x2028: /* LINE SEPARATOR */
4331 case 0x2029: /* PARAGRAPH SEPARATOR */
4332 RRETURN(MATCH_NOMATCH);
4333 }
4334 }
4335 break;
4336
4337 case OP_VSPACE:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 RRETURN(MATCH_NOMATCH);
4344 }
4345 GETCHARINC(c, eptr);
4346 switch(c)
4347 {
4348 default: RRETURN(MATCH_NOMATCH);
4349 case 0x0a: /* LF */
4350 case 0x0b: /* VT */
4351 case 0x0c: /* FF */
4352 case 0x0d: /* CR */
4353 case 0x85: /* NEL */
4354 case 0x2028: /* LINE SEPARATOR */
4355 case 0x2029: /* PARAGRAPH SEPARATOR */
4356 break;
4357 }
4358 }
4359 break;
4360
4361 case OP_NOT_DIGIT:
4362 for (i = 1; i <= min; i++)
4363 {
4364 if (eptr >= md->end_subject)
4365 {
4366 SCHECK_PARTIAL();
4367 RRETURN(MATCH_NOMATCH);
4368 }
4369 GETCHARINC(c, eptr);
4370 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4371 RRETURN(MATCH_NOMATCH);
4372 }
4373 break;
4374
4375 case OP_DIGIT:
4376 for (i = 1; i <= min; i++)
4377 {
4378 if (eptr >= md->end_subject)
4379 {
4380 SCHECK_PARTIAL();
4381 RRETURN(MATCH_NOMATCH);
4382 }
4383 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4384 RRETURN(MATCH_NOMATCH);
4385 eptr++;
4386 /* No need to skip more bytes - we know it's a 1-byte character */
4387 }
4388 break;
4389
4390 case OP_NOT_WHITESPACE:
4391 for (i = 1; i <= min; i++)
4392 {
4393 if (eptr >= md->end_subject)
4394 {
4395 SCHECK_PARTIAL();
4396 RRETURN(MATCH_NOMATCH);
4397 }
4398 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4399 RRETURN(MATCH_NOMATCH);
4400 eptr++;
4401 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4402 }
4403 break;
4404
4405 case OP_WHITESPACE:
4406 for (i = 1; i <= min; i++)
4407 {
4408 if (eptr >= md->end_subject)
4409 {
4410 SCHECK_PARTIAL();
4411 RRETURN(MATCH_NOMATCH);
4412 }
4413 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4414 RRETURN(MATCH_NOMATCH);
4415 eptr++;
4416 /* No need to skip more bytes - we know it's a 1-byte character */
4417 }
4418 break;
4419
4420 case OP_NOT_WORDCHAR:
4421 for (i = 1; i <= min; i++)
4422 {
4423 if (eptr >= md->end_subject)
4424 {
4425 SCHECK_PARTIAL();
4426 RRETURN(MATCH_NOMATCH);
4427 }
4428 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4429 RRETURN(MATCH_NOMATCH);
4430 eptr++;
4431 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4432 }
4433 break;
4434
4435 case OP_WORDCHAR:
4436 for (i = 1; i <= min; i++)
4437 {
4438 if (eptr >= md->end_subject)
4439 {
4440 SCHECK_PARTIAL();
4441 RRETURN(MATCH_NOMATCH);
4442 }
4443 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4444 RRETURN(MATCH_NOMATCH);
4445 eptr++;
4446 /* No need to skip more bytes - we know it's a 1-byte character */
4447 }
4448 break;
4449
4450 default:
4451 RRETURN(PCRE_ERROR_INTERNAL);
4452 } /* End switch(ctype) */
4453
4454 else
4455 #endif /* SUPPORT_UTF */
4456
4457 /* Code for the non-UTF-8 case for minimum matching of operators other
4458 than OP_PROP and OP_NOTPROP. */
4459
4460 switch(ctype)
4461 {
4462 case OP_ANY:
4463 for (i = 1; i <= min; i++)
4464 {
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 RRETURN(MATCH_NOMATCH);
4469 }
4470 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4471 eptr++;
4472 }
4473 break;
4474
4475 case OP_ALLANY:
4476 if (eptr > md->end_subject - min)
4477 {
4478 SCHECK_PARTIAL();
4479 RRETURN(MATCH_NOMATCH);
4480 }
4481 eptr += min;
4482 break;
4483
4484 case OP_ANYBYTE:
4485 if (eptr > md->end_subject - min)
4486 {
4487 SCHECK_PARTIAL();
4488 RRETURN(MATCH_NOMATCH);
4489 }
4490 eptr += min;
4491 break;
4492
4493 case OP_ANYNL:
4494 for (i = 1; i <= min; i++)
4495 {
4496 if (eptr >= md->end_subject)
4497 {
4498 SCHECK_PARTIAL();
4499 RRETURN(MATCH_NOMATCH);
4500 }
4501 switch(*eptr++)
4502 {
4503 default: RRETURN(MATCH_NOMATCH);
4504
4505 case 0x000d:
4506 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4507 break;
4508
4509 case 0x000a:
4510 break;
4511
4512 case 0x000b:
4513 case 0x000c:
4514 case 0x0085:
4515 #ifdef COMPILE_PCRE16
4516 case 0x2028:
4517 case 0x2029:
4518 #endif
4519 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4520 break;
4521 }
4522 }
4523 break;
4524
4525 case OP_NOT_HSPACE:
4526 for (i = 1; i <= min; i++)
4527 {
4528 if (eptr >= md->end_subject)
4529 {
4530 SCHECK_PARTIAL();
4531 RRETURN(MATCH_NOMATCH);
4532 }
4533 switch(*eptr++)
4534 {
4535 default: break;
4536 case 0x09: /* HT */
4537 case 0x20: /* SPACE */
4538 case 0xa0: /* NBSP */
4539 #ifdef COMPILE_PCRE16
4540 case 0x1680: /* OGHAM SPACE MARK */
4541 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4542 case 0x2000: /* EN QUAD */
4543 case 0x2001: /* EM QUAD */
4544 case 0x2002: /* EN SPACE */
4545 case 0x2003: /* EM SPACE */
4546 case 0x2004: /* THREE-PER-EM SPACE */
4547 case 0x2005: /* FOUR-PER-EM SPACE */
4548 case 0x2006: /* SIX-PER-EM SPACE */
4549 case 0x2007: /* FIGURE SPACE */
4550 case 0x2008: /* PUNCTUATION SPACE */
4551 case 0x2009: /* THIN SPACE */
4552 case 0x200A: /* HAIR SPACE */
4553 case 0x202f: /* NARROW NO-BREAK SPACE */
4554 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4555 case 0x3000: /* IDEOGRAPHIC SPACE */
4556 #endif
4557 RRETURN(MATCH_NOMATCH);
4558 }
4559 }
4560 break;
4561
4562 case OP_HSPACE:
4563 for (i = 1; i <= min; i++)
4564 {
4565 if (eptr >= md->end_subject)
4566 {
4567 SCHECK_PARTIAL();
4568 RRETURN(MATCH_NOMATCH);
4569 }
4570 switch(*eptr++)
4571 {
4572 default: RRETURN(MATCH_NOMATCH);
4573 case 0x09: /* HT */
4574 case 0x20: /* SPACE */
4575 case 0xa0: /* NBSP */
4576 #ifdef COMPILE_PCRE16
4577 case 0x1680: /* OGHAM SPACE MARK */
4578 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4579 case 0x2000: /* EN QUAD */
4580 case 0x2001: /* EM QUAD */
4581 case 0x2002: /* EN SPACE */
4582 case 0x2003: /* EM SPACE */
4583 case 0x2004: /* THREE-PER-EM SPACE */
4584 case 0x2005: /* FOUR-PER-EM SPACE */
4585 case 0x2006: /* SIX-PER-EM SPACE */
4586 case 0x2007: /* FIGURE SPACE */
4587 case 0x2008: /* PUNCTUATION SPACE */
4588 case 0x2009: /* THIN SPACE */
4589 case 0x200A: /* HAIR SPACE */
4590 case 0x202f: /* NARROW NO-BREAK SPACE */
4591 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4592 case 0x3000: /* IDEOGRAPHIC SPACE */
4593 #endif
4594 break;
4595 }
4596 }
4597 break;
4598
4599 case OP_NOT_VSPACE:
4600 for (i = 1; i <= min; i++)
4601 {
4602 if (eptr >= md->end_subject)
4603 {
4604 SCHECK_PARTIAL();
4605 RRETURN(MATCH_NOMATCH);
4606 }
4607 switch(*eptr++)
4608 {
4609 default: break;
4610 case 0x0a: /* LF */
4611 case 0x0b: /* VT */
4612 case 0x0c: /* FF */
4613 case 0x0d: /* CR */
4614 case 0x85: /* NEL */
4615 #ifdef COMPILE_PCRE16
4616 case 0x2028: /* LINE SEPARATOR */
4617 case 0x2029: /* PARAGRAPH SEPARATOR */
4618 #endif
4619 RRETURN(MATCH_NOMATCH);
4620 }
4621 }
4622 break;
4623
4624 case OP_VSPACE:
4625 for (i = 1; i <= min; i++)
4626 {
4627 if (eptr >= md->end_subject)
4628 {
4629 SCHECK_PARTIAL();
4630 RRETURN(MATCH_NOMATCH);
4631 }
4632 switch(*eptr++)
4633 {
4634 default: RRETURN(MATCH_NOMATCH);
4635 case 0x0a: /* LF */
4636 case 0x0b: /* VT */
4637 case 0x0c: /* FF */
4638 case 0x0d: /* CR */
4639 case 0x85: /* NEL */
4640 #ifdef COMPILE_PCRE16
4641 case 0x2028: /* LINE SEPARATOR */
4642 case 0x2029: /* PARAGRAPH SEPARATOR */
4643 #endif
4644 break;
4645 }
4646 }
4647 break;
4648
4649 case OP_NOT_DIGIT:
4650 for (i = 1; i <= min; i++)
4651 {
4652 if (eptr >= md->end_subject)
4653 {
4654 SCHECK_PARTIAL();
4655 RRETURN(MATCH_NOMATCH);
4656 }
4657 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4658 RRETURN(MATCH_NOMATCH);
4659 eptr++;
4660 }
4661 break;
4662
4663 case OP_DIGIT:
4664 for (i = 1; i <= min; i++)
4665 {
4666 if (eptr >= md->end_subject)
4667 {
4668 SCHECK_PARTIAL();
4669 RRETURN(MATCH_NOMATCH);
4670 }
4671 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4672 RRETURN(MATCH_NOMATCH);
4673 eptr++;
4674 }
4675 break;
4676
4677 case OP_NOT_WHITESPACE:
4678 for (i = 1; i <= min; i++)
4679 {
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 RRETURN(MATCH_NOMATCH);
4684 }
4685 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4686 RRETURN(MATCH_NOMATCH);
4687 eptr++;
4688 }
4689 break;
4690
4691 case OP_WHITESPACE:
4692 for (i = 1; i <= min; i++)
4693 {
4694 if (eptr >= md->end_subject)
4695 {
4696 SCHECK_PARTIAL();
4697 RRETURN(MATCH_NOMATCH);
4698 }
4699 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4700 RRETURN(MATCH_NOMATCH);
4701 eptr++;
4702 }
4703 break;
4704
4705 case OP_NOT_WORDCHAR:
4706 for (i = 1; i <= min; i++)
4707 {
4708 if (eptr >= md->end_subject)
4709 {
4710 SCHECK_PARTIAL();
4711 RRETURN(MATCH_NOMATCH);
4712 }
4713 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4714 RRETURN(MATCH_NOMATCH);
4715 eptr++;
4716 }
4717 break;
4718
4719 case OP_WORDCHAR:
4720 for (i = 1; i <= min; i++)
4721 {
4722 if (eptr >= md->end_subject)
4723 {
4724 SCHECK_PARTIAL();
4725 RRETURN(MATCH_NOMATCH);
4726 }
4727 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4728 RRETURN(MATCH_NOMATCH);
4729 eptr++;
4730 }
4731 break;
4732
4733 default:
4734 RRETURN(PCRE_ERROR_INTERNAL);
4735 }
4736 }
4737
4738 /* If min = max, continue at the same level without recursing */
4739
4740 if (min == max) continue;
4741
4742 /* If minimizing, we have to test the rest of the pattern before each
4743 subsequent match. Again, separate the UTF-8 case for speed, and also
4744 separate the UCP cases. */
4745
4746 if (minimize)
4747 {
4748 #ifdef SUPPORT_UCP
4749 if (prop_type >= 0)
4750 {
4751 switch(prop_type)
4752 {
4753 case PT_ANY:
4754 for (fi = min;; fi++)
4755 {
4756 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4757 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4758 if (fi >= max) RRETURN(MATCH_NOMATCH);
4759 if (eptr >= md->end_subject)
4760 {
4761 SCHECK_PARTIAL();
4762 RRETURN(MATCH_NOMATCH);
4763 }
4764 GETCHARINCTEST(c, eptr);
4765 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4766 }
4767 /* Control never gets here */
4768
4769 case PT_LAMP:
4770 for (fi = min;; fi++)
4771 {
4772 int chartype;
4773 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4775 if (fi >= max) RRETURN(MATCH_NOMATCH);
4776 if (eptr >= md->end_subject)
4777 {
4778 SCHECK_PARTIAL();
4779 RRETURN(MATCH_NOMATCH);
4780 }
4781 GETCHARINCTEST(c, eptr);
4782 chartype = UCD_CHARTYPE(c);
4783 if ((chartype == ucp_Lu ||
4784 chartype == ucp_Ll ||
4785 chartype == ucp_Lt) == prop_fail_result)
4786 RRETURN(MATCH_NOMATCH);
4787 }
4788 /* Control never gets here */
4789
4790 case PT_GC:
4791 for (fi = min;; fi++)
4792 {
4793 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4795 if (fi >= max) RRETURN(MATCH_NOMATCH);
4796 if (eptr >= md->end_subject)
4797 {
4798 SCHECK_PARTIAL();
4799 RRETURN(MATCH_NOMATCH);
4800 }
4801 GETCHARINCTEST(c, eptr);
4802 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4803 RRETURN(MATCH_NOMATCH);
4804 }
4805 /* Control never gets here */
4806
4807 case PT_PC:
4808 for (fi = min;; fi++)
4809 {
4810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4812 if (fi >= max) RRETURN(MATCH_NOMATCH);
4813 if (eptr >= md->end_subject)
4814 {
4815 SCHECK_PARTIAL();
4816 RRETURN(MATCH_NOMATCH);
4817 }
4818 GETCHARINCTEST(c, eptr);
4819 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4820 RRETURN(MATCH_NOMATCH);
4821 }
4822 /* Control never gets here */
4823
4824 case PT_SC:
4825 for (fi = min;; fi++)
4826 {
4827 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4829 if (fi >= max) RRETURN(MATCH_NOMATCH);
4830 if (eptr >= md->end_subject)
4831 {
4832 SCHECK_PARTIAL();
4833 RRETURN(MATCH_NOMATCH);
4834 }
4835 GETCHARINCTEST(c, eptr);
4836 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4837 RRETURN(MATCH_NOMATCH);
4838 }
4839 /* Control never gets here */
4840
4841 case PT_ALNUM:
4842 for (fi = min;; fi++)
4843 {
4844 int category;
4845 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4847 if (fi >= max) RRETURN(MATCH_NOMATCH);
4848 if (eptr >= md->end_subject)
4849 {
4850 SCHECK_PARTIAL();
4851 RRETURN(MATCH_NOMATCH);
4852 }
4853 GETCHARINCTEST(c, eptr);
4854 category = UCD_CATEGORY(c);
4855 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4856 RRETURN(MATCH_NOMATCH);
4857 }
4858 /* Control never gets here */
4859
4860 case PT_SPACE: /* Perl space */
4861 for (fi = min;; fi++)
4862 {
4863 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4865 if (fi >= max) RRETURN(MATCH_NOMATCH);
4866 if (eptr >= md->end_subject)
4867 {
4868 SCHECK_PARTIAL();
4869 RRETURN(MATCH_NOMATCH);
4870 }
4871 GETCHARINCTEST(c, eptr);
4872 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4873 c == CHAR_FF || c == CHAR_CR)
4874 == prop_fail_result)
4875 RRETURN(MATCH_NOMATCH);
4876 }
4877 /* Control never gets here */
4878
4879 case PT_PXSPACE: /* POSIX space */
4880 for (fi = min;; fi++)
4881 {
4882 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4884 if (fi >= max) RRETURN(MATCH_NOMATCH);
4885 if (eptr >= md->end_subject)
4886 {
4887 SCHECK_PARTIAL();
4888 RRETURN(MATCH_NOMATCH);
4889 }
4890 GETCHARINCTEST(c, eptr);
4891 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4892 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4893 == prop_fail_result)
4894 RRETURN(MATCH_NOMATCH);
4895 }
4896 /* Control never gets here */
4897
4898 case PT_WORD:
4899 for (fi = min;; fi++)
4900 {
4901 int category;
4902 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4904 if (fi >= max) RRETURN(MATCH_NOMATCH);
4905 if (eptr >= md->end_subject)
4906 {
4907 SCHECK_PARTIAL();
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 GETCHARINCTEST(c, eptr);
4911 category = UCD_CATEGORY(c);
4912 if ((category == ucp_L ||
4913 category == ucp_N ||
4914 c == CHAR_UNDERSCORE)
4915 == prop_fail_result)
4916 RRETURN(MATCH_NOMATCH);
4917 }
4918 /* Control never gets here */
4919
4920 /* This should never occur */
4921
4922 default:
4923 RRETURN(PCRE_ERROR_INTERNAL);
4924 }
4925 }
4926
4927 /* Match extended Unicode sequences. We will get here only if the
4928 support is in the binary; otherwise a compile-time error occurs. */
4929
4930 else if (ctype == OP_EXTUNI)
4931 {
4932 for (fi = min;; fi++)
4933 {
4934 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4936 if (fi >= max) RRETURN(MATCH_NOMATCH);
4937 if (eptr >= md->end_subject)
4938 {
4939 SCHECK_PARTIAL();
4940 RRETURN(MATCH_NOMATCH);
4941 }
4942 GETCHARINCTEST(c, eptr);
4943 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4944 while (eptr < md->end_subject)
4945 {
4946 int len = 1;
4947 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4948 if (UCD_CATEGORY(c) != ucp_M) break;
4949 eptr += len;
4950 }
4951 }
4952 }
4953 else
4954 #endif /* SUPPORT_UCP */
4955
4956 #ifdef SUPPORT_UTF
4957 if (utf)
4958 {
4959 for (fi = min;; fi++)
4960 {
4961 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4962 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4963 if (fi >= max) RRETURN(MATCH_NOMATCH);
4964 if (eptr >= md->end_subject)
4965 {
4966 SCHECK_PARTIAL();
4967 RRETURN(MATCH_NOMATCH);
4968 }
4969 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4970 RRETURN(MATCH_NOMATCH);
4971 GETCHARINC(c, eptr);
4972 switch(ctype)
4973 {
4974 case OP_ANY: /* This is the non-NL case */
4975 case OP_ALLANY:
4976 case OP_ANYBYTE:
4977 break;
4978
4979 case OP_ANYNL:
4980 switch(c)
4981 {
4982 default: RRETURN(MATCH_NOMATCH);
4983 case 0x000d:
4984 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4985 break;
4986 case 0x000a:
4987 break;
4988
4989 case 0x000b:
4990 case 0x000c:
4991 case 0x0085:
4992 case 0x2028:
4993 case 0x2029:
4994 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4995 break;
4996 }
4997 break;
4998
4999 case OP_NOT_HSPACE:
5000 switch(c)
5001 {
5002 default: break;
5003 case 0x09: /* HT */
5004 case 0x20: /* SPACE */
5005 case 0xa0: /* NBSP */
5006 case 0x1680: /* OGHAM SPACE MARK */
5007 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5008 case 0x2000: /* EN QUAD */
5009 case 0x2001: /* EM QUAD */
5010 case 0x2002: /* EN SPACE */
5011 case 0x2003: /* EM SPACE */
5012 case 0x2004: /* THREE-PER-EM SPACE */
5013 case 0x2005: /* FOUR-PER-EM SPACE */
5014 case 0x2006: /* SIX-PER-EM SPACE */
5015 case 0x2007: /* FIGURE SPACE */
5016 case 0x2008: /* PUNCTUATION SPACE */
5017 case 0x2009: /* THIN SPACE */
5018 case 0x200A: /* HAIR SPACE */
5019 case 0x202f: /* NARROW NO-BREAK SPACE */
5020 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5021 case 0x3000: /* IDEOGRAPHIC SPACE */
5022 RRETURN(MATCH_NOMATCH);
5023 }
5024 break;
5025
5026 case OP_HSPACE:
5027 switch(c)
5028 {
5029 default: RRETURN(MATCH_NOMATCH);
5030 case 0x09: /* HT */
5031 case 0x20: /* SPACE */
5032 case 0xa0: /* NBSP */
5033 case 0x1680: /* OGHAM SPACE MARK */
5034 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5035 case 0x2000: /* EN QUAD */
5036 case 0x2001: /* EM QUAD */
5037 case 0x2002: /* EN SPACE */
5038 case 0x2003: /* EM SPACE */
5039 case 0x2004: /* THREE-PER-EM SPACE */
5040 case 0x2005: /* FOUR-PER-EM SPACE */
5041 case 0x2006: /* SIX-PER-EM SPACE */
5042 case 0x2007: /* FIGURE SPACE */
5043 case 0x2008: /* PUNCTUATION SPACE */
5044 case 0x2009: /* THIN SPACE */
5045 case 0x200A: /* HAIR SPACE */
5046 case 0x202f: /* NARROW NO-BREAK SPACE */
5047 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5048 case 0x3000: /* IDEOGRAPHIC SPACE */
5049 break;
5050 }
5051 break;
5052
5053 case OP_NOT_VSPACE:
5054 switch(c)
5055 {
5056 default: break;
5057 case 0x0a: /* LF */
5058 case 0x0b: /* VT */
5059 case 0x0c: /* FF */
5060 case 0x0d: /* CR */
5061 case 0x85: /* NEL */
5062 case 0x2028: /* LINE SEPARATOR */
5063 case 0x2029: /* PARAGRAPH SEPARATOR */
5064 RRETURN(MATCH_NOMATCH);
5065 }
5066 break;
5067
5068 case OP_VSPACE:
5069 switch(c)
5070 {
5071 default: RRETURN(MATCH_NOMATCH);
5072 case 0x0a: /* LF */
5073 case 0x0b: /* VT */
5074 case 0x0c: /* FF */
5075 case 0x0d: /* CR */
5076 case 0x85: /* NEL */
5077 case 0x2028: /* LINE SEPARATOR */
5078 case 0x2029: /* PARAGRAPH SEPARATOR */
5079 break;
5080 }
5081 break;
5082
5083 case OP_NOT_DIGIT:
5084 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5085 RRETURN(MATCH_NOMATCH);
5086 break;
5087
5088 case OP_DIGIT:
5089 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5090 RRETURN(MATCH_NOMATCH);
5091 break;
5092
5093 case OP_NOT_WHITESPACE:
5094 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5095 RRETURN(MATCH_NOMATCH);
5096 break;
5097
5098 case OP_WHITESPACE:
5099 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5100 RRETURN(MATCH_NOMATCH);
5101 break;
5102
5103 case OP_NOT_WORDCHAR:
5104 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5105 RRETURN(MATCH_NOMATCH);
5106 break;
5107
5108 case OP_WORDCHAR:
5109 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5110 RRETURN(MATCH_NOMATCH);
5111 break;
5112
5113 default:
5114 RRETURN(PCRE_ERROR_INTERNAL);
5115 }
5116 }
5117 }
5118 else
5119 #endif
5120 /* Not UTF mode */
5121 {
5122 for (fi = min;; fi++)
5123 {
5124 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5126 if (fi >= max) RRETURN(MATCH_NOMATCH);
5127 if (eptr >= md->end_subject)
5128 {
5129 SCHECK_PARTIAL();
5130 RRETURN(MATCH_NOMATCH);
5131 }
5132 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5133 RRETURN(MATCH_NOMATCH);
5134 c = *eptr++;
5135 switch(ctype)
5136 {
5137 case OP_ANY: /* This is the non-NL case */
5138 case OP_ALLANY:
5139 case OP_ANYBYTE:
5140 break;
5141
5142 case OP_ANYNL:
5143 switch(c)
5144 {
5145 default: RRETURN(MATCH_NOMATCH);
5146 case 0x000d:
5147 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5148 break;
5149
5150 case 0x000a:
5151 break;
5152
5153 case 0x000b:
5154 case 0x000c:
5155 case 0x0085:
5156 #ifdef COMPILE_PCRE16
5157 case 0x2028:
5158 case 0x2029:
5159 #endif
5160 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5161 break;
5162 }
5163 break;
5164
5165 case OP_NOT_HSPACE:
5166 switch(c)
5167 {
5168 default: break;
5169 case 0x09: /* HT */
5170 case 0x20: /* SPACE */
5171 case 0xa0: /* NBSP */
5172 #ifdef COMPILE_PCRE16
5173 case 0x1680: /* OGHAM SPACE MARK */
5174 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5175 case 0x2000: /* EN QUAD */
5176 case 0x2001: /* EM QUAD */
5177 case 0x2002: /* EN SPACE */
5178 case 0x2003: /* EM SPACE */
5179 case 0x2004: /* THREE-PER-EM SPACE */
5180 case 0x2005: /* FOUR-PER-EM SPACE */
5181 case 0x2006: /* SIX-PER-EM SPACE */
5182 case 0x2007: /* FIGURE SPACE */
5183 case 0x2008: /* PUNCTUATION SPACE */
5184 case 0x2009: /* THIN SPACE */
5185 case 0x200A: /* HAIR SPACE */
5186 case 0x202f: /* NARROW NO-BREAK SPACE */
5187 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5188 case 0x3000: /* IDEOGRAPHIC SPACE */
5189 #endif
5190 RRETURN(MATCH_NOMATCH);
5191 }
5192 break;
5193
5194 case OP_HSPACE:
5195 switch(c)
5196 {
5197 default: RRETURN(MATCH_NOMATCH);
5198 case 0x09: /* HT */
5199 case 0x20: /* SPACE */
5200 case 0xa0: /* NBSP */
5201 #ifdef COMPILE_PCRE16
5202 case 0x1680: /* OGHAM SPACE MARK */
5203 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5204 case 0x2000: /* EN QUAD */
5205 case 0x2001: /* EM QUAD */
5206 case 0x2002: /* EN SPACE */
5207 case 0x2003: /* EM SPACE */
5208 case 0x2004: /* THREE-PER-EM SPACE */
5209 case 0x2005: /* FOUR-PER-EM SPACE */
5210 case 0x2006: /* SIX-PER-EM SPACE */
5211 case 0x2007: /* FIGURE SPACE */
5212 case 0x2008: /* PUNCTUATION SPACE */
5213 case 0x2009: /* THIN SPACE */
5214 case 0x200A: /* HAIR SPACE */
5215 case 0x202f: /* NARROW NO-BREAK SPACE */
5216 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5217 case 0x3000: /* IDEOGRAPHIC SPACE */
5218 #endif
5219 break;
5220 }
5221 break;
5222
5223 case OP_NOT_VSPACE:
5224 switch(c)
5225 {
5226 default: break;
5227 case 0x0a: /* LF */
5228 case 0x0b: /* VT */
5229 case 0x0c: /* FF */
5230 case 0x0d: /* CR */
5231 case 0x85: /* NEL */
5232 #ifdef COMPILE_PCRE16
5233 case 0x2028: /* LINE SEPARATOR */
5234 case 0x2029: /* PARAGRAPH SEPARATOR */
5235 #endif
5236 RRETURN(MATCH_NOMATCH);
5237 }
5238 break;
5239
5240 case OP_VSPACE:
5241 switch(c)
5242 {
5243 default: RRETURN(MATCH_NOMATCH);
5244 case 0x0a: /* LF */
5245 case 0x0b: /* VT */
5246 case 0x0c: /* FF */
5247 case 0x0d: /* CR */
5248 case 0x85: /* NEL */
5249 #ifdef COMPILE_PCRE16
5250 case 0x2028: /* LINE SEPARATOR */
5251 case 0x2029: /* PARAGRAPH SEPARATOR */
5252 #endif
5253 break;
5254 }
5255 break;
5256
5257 case OP_NOT_DIGIT:
5258 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5259 break;
5260
5261 case OP_DIGIT:
5262 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5263 break;
5264
5265 case OP_NOT_WHITESPACE:
5266 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5267 break;
5268
5269 case OP_WHITESPACE:
5270 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5271 break;
5272
5273 case OP_NOT_WORDCHAR:
5274 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5275 break;
5276
5277 case OP_WORDCHAR:
5278 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5279 break;
5280
5281 default:
5282 RRETURN(PCRE_ERROR_INTERNAL);
5283 }
5284 }
5285 }
5286 /* Control never gets here */
5287 }
5288
5289 /* If maximizing, it is worth using inline code for speed, doing the type
5290 test once at the start (i.e. keep it out of the loop). Again, keep the
5291 UTF-8 and UCP stuff separate. */
5292
5293 else
5294 {
5295 pp = eptr; /* Remember where we started */
5296
5297 #ifdef SUPPORT_UCP
5298 if (prop_type >= 0)
5299 {
5300 switch(prop_type)
5301 {
5302 case PT_ANY:
5303 for (i = min; i < max; i++)
5304 {
5305 int len = 1;
5306 if (eptr >= md->end_subject)
5307 {
5308 SCHECK_PARTIAL();
5309 break;
5310 }
5311 GETCHARLENTEST(c, eptr, len);
5312 if (prop_fail_result) break;
5313 eptr+= len;
5314 }
5315 break;
5316
5317 case PT_LAMP:
5318 for (i = min; i < max; i++)
5319 {
5320 int chartype;
5321 int len = 1;
5322 if (eptr >= md->end_subject)
5323 {
5324 SCHECK_PARTIAL();
5325 break;
5326 }
5327 GETCHARLENTEST(c, eptr, len);
5328 chartype = UCD_CHARTYPE(c);
5329 if ((chartype == ucp_Lu ||
5330 chartype == ucp_Ll ||
5331 chartype == ucp_Lt) == prop_fail_result)
5332 break;
5333 eptr+= len;
5334 }
5335 break;
5336
5337 case PT_GC:
5338 for (i = min; i < max; i++)
5339 {
5340 int len = 1;
5341 if (eptr >= md->end_subject)
5342 {
5343 SCHECK_PARTIAL();
5344 break;
5345 }
5346 GETCHARLENTEST(c, eptr, len);
5347 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5348 eptr+= len;
5349 }
5350 break;
5351
5352 case PT_PC:
5353 for (i = min; i < max; i++)
5354 {
5355 int len = 1;
5356 if (eptr >= md->end_subject)
5357 {
5358 SCHECK_PARTIAL();
5359 break;
5360 }
5361 GETCHARLENTEST(c, eptr, len);
5362 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5363 eptr+= len;
5364 }
5365 break;
5366
5367 case PT_SC:
5368 for (i = min; i < max; i++)
5369 {
5370 int len = 1;
5371 if (eptr >= md->end_subject)
5372 {
5373 SCHECK_PARTIAL();
5374 break;
5375 }
5376 GETCHARLENTEST(c, eptr, len);
5377 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5378 eptr+= len;
5379 }
5380 break;
5381
5382 case PT_ALNUM:
5383 for (i = min; i < max; i++)
5384 {
5385 int category;
5386 int len = 1;
5387 if (eptr >= md->end_subject)
5388 {
5389 SCHECK_PARTIAL();
5390 break;
5391 }
5392 GETCHARLENTEST(c, eptr, len);
5393 category = UCD_CATEGORY(c);
5394 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5395 break;
5396 eptr+= len;
5397 }
5398 break;
5399
5400 case PT_SPACE: /* Perl space */
5401 for (i = min; i < max; i++)
5402 {
5403 int len = 1;
5404 if (eptr >= md->end_subject)
5405 {
5406 SCHECK_PARTIAL();
5407 break;
5408 }
5409 GETCHARLENTEST(c, eptr, len);
5410 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5411 c == CHAR_FF || c == CHAR_CR)
5412 == prop_fail_result)
5413 break;
5414 eptr+= len;
5415 }
5416 break;
5417
5418 case PT_PXSPACE: /* POSIX space */
5419 for (i = min; i < max; i++)
5420 {
5421 int len = 1;
5422 if (eptr >= md->end_subject)
5423 {
5424 SCHECK_PARTIAL();
5425 break;
5426 }
5427 GETCHARLENTEST(c, eptr, len);
5428 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5429 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5430 == prop_fail_result)
5431 break;
5432 eptr+= len;
5433 }
5434 break;
5435
5436 case PT_WORD:
5437 for (i = min; i < max; i++)
5438 {
5439 int category;
5440 int len = 1;
5441 if (eptr >= md->end_subject)
5442 {
5443 SCHECK_PARTIAL();
5444 break;
5445 }
5446 GETCHARLENTEST(c, eptr, len);
5447 category = UCD_CATEGORY(c);
5448 if ((category == ucp_L || category == ucp_N ||
5449 c == CHAR_UNDERSCORE) == prop_fail_result)
5450 break;
5451 eptr+= len;
5452 }
5453 break;
5454
5455 default:
5456 RRETURN(PCRE_ERROR_INTERNAL);
5457 }
5458
5459 /* eptr is now past the end of the maximum run */
5460
5461 if (possessive) continue;
5462 for(;;)
5463 {
5464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5466 if (eptr-- == pp) break; /* Stop if tried at original pos */
5467 if (utf) BACKCHAR(eptr);
5468 }
5469 }
5470
5471 /* Match extended Unicode sequences. We will get here only if the
5472 support is in the binary; otherwise a compile-time error occurs. */
5473
5474 else if (ctype == OP_EXTUNI)
5475 {
5476 for (i = min; i < max; i++)
5477 {
5478 int len = 1;
5479 if (eptr >= md->end_subject)
5480 {
5481 SCHECK_PARTIAL();
5482 break;
5483 }
5484 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5485 if (UCD_CATEGORY(c) == ucp_M) break;
5486 eptr += len;
5487 while (eptr < md->end_subject)
5488 {
5489 len = 1;
5490 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5491 if (UCD_CATEGORY(c) != ucp_M) break;
5492 eptr += len;
5493 }
5494 }
5495
5496 /* eptr is now past the end of the maximum run */
5497
5498 if (possessive) continue;
5499
5500 for(;;)
5501 {
5502 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5503 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5504 if (eptr-- == pp) break; /* Stop if tried at original pos */
5505 for (;;) /* Move back over one extended */
5506 {
5507 if (!utf) c = *eptr; else
5508 {
5509 BACKCHAR(eptr);
5510 GETCHAR(c, eptr);
5511 }
5512 if (UCD_CATEGORY(c) != ucp_M) break;
5513 eptr--;
5514 }
5515 }
5516 }
5517
5518 else
5519 #endif /* SUPPORT_UCP */
5520
5521 #ifdef SUPPORT_UTF
5522 if (utf)
5523 {
5524 switch(ctype)
5525 {
5526 case OP_ANY:
5527 if (max < INT_MAX)
5528 {
5529 for (i = min; i < max; i++)
5530 {
5531 if (eptr >= md->end_subject)
5532 {
5533 SCHECK_PARTIAL();
5534 break;
5535 }
5536 if (IS_NEWLINE(eptr)) break;
5537 eptr++;
5538 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5539 }
5540 }
5541
5542 /* Handle unlimited UTF-8 repeat */
5543
5544 else
5545 {
5546 for (i = min; i < max; i++)
5547 {
5548 if (eptr >= md->end_subject)
5549 {
5550 SCHECK_PARTIAL();
5551 break;
5552 }
5553 if (IS_NEWLINE(eptr)) break;
5554 eptr++;
5555 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5556 }
5557 }
5558 break;
5559
5560 case OP_ALLANY:
5561 if (max < INT_MAX)
5562 {
5563 for (i = min; i < max; i++)
5564 {
5565 if (eptr >= md->end_subject)
5566 {
5567 SCHECK_PARTIAL();
5568 break;
5569 }
5570 eptr++;
5571 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5572 }
5573 }
5574 else
5575 {
5576 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5577 SCHECK_PARTIAL();
5578 }
5579 break;
5580
5581 /* The byte case is the same as non-UTF8 */
5582
5583 case OP_ANYBYTE:
5584 c = max - min;
5585 if (c > (unsigned int)(md->end_subject - eptr))
5586 {
5587 eptr = md->end_subject;
5588 SCHECK_PARTIAL();
5589 }
5590 else eptr += c;
5591 break;
5592
5593 case OP_ANYNL:
5594 for (i = min; i < max; i++)
5595 {
5596 int len = 1;
5597 if (eptr >= md->end_subject)
5598 {
5599 SCHECK_PARTIAL();
5600 break;
5601 }
5602 GETCHARLEN(c, eptr, len);
5603 if (c == 0x000d)
5604 {
5605 if (++eptr >= md->end_subject) break;
5606 if (*eptr == 0x000a) eptr++;
5607 }
5608 else
5609 {
5610 if (c != 0x000a &&
5611 (md->bsr_anycrlf ||
5612 (c != 0x000b && c != 0x000c &&
5613 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5614 break;
5615 eptr += len;
5616 }
5617 }
5618 break;
5619
5620 case OP_NOT_HSPACE:
5621 case OP_HSPACE:
5622 for (i = min; i < max; i++)
5623 {
5624 BOOL gotspace;
5625 int len = 1;
5626 if (eptr >= md->end_subject)
5627 {
5628 SCHECK_PARTIAL();
5629 break;
5630 }
5631 GETCHARLEN(c, eptr, len);
5632 switch(c)
5633 {
5634 default: gotspace = FALSE; break;
5635 case 0x09: /* HT */
5636 case 0x20: /* SPACE */
5637 case 0xa0: /* NBSP */
5638 case 0x1680: /* OGHAM SPACE MARK */
5639 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5640 case 0x2000: /* EN QUAD */
5641 case 0x2001: /* EM QUAD */
5642 case 0x2002: /* EN SPACE */
5643 case 0x2003: /* EM SPACE */
5644 case 0x2004: /* THREE-PER-EM SPACE */
5645 case 0x2005: /* FOUR-PER-EM SPACE */
5646 case 0x2006: /* SIX-PER-EM SPACE */
5647 case 0x2007: /* FIGURE SPACE */
5648 case 0x2008: /* PUNCTUATION SPACE */
5649 case 0x2009: /* THIN SPACE */
5650 case 0x200A: /* HAIR SPACE */
5651 case 0x202f: /* NARROW NO-BREAK SPACE */
5652 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5653 case 0x3000: /* IDEOGRAPHIC SPACE */
5654 gotspace = TRUE;
5655 break;
5656 }
5657 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5658 eptr += len;
5659 }
5660 break;
5661
5662 case OP_NOT_VSPACE:
5663 case OP_VSPACE:
5664 for (i = min; i < max; i++)
5665 {
5666 BOOL gotspace;
5667 int len = 1;
5668 if (eptr >= md->end_subject)
5669 {
5670 SCHECK_PARTIAL();
5671 break;
5672 }
5673 GETCHARLEN(c, eptr, len);
5674 switch(c)
5675 {
5676 default: gotspace = FALSE; break;
5677 case 0x0a: /* LF */
5678 case 0x0b: /* VT */
5679 case 0x0c: /* FF */
5680 case 0x0d: /* CR */
5681 case 0x85: /* NEL */
5682 case 0x2028: /* LINE SEPARATOR */
5683 case 0x2029: /* PARAGRAPH SEPARATOR */
5684 gotspace = TRUE;
5685 break;
5686 }
5687 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5688 eptr += len;
5689 }
5690 break;
5691
5692 case OP_NOT_DIGIT:
5693 for (i = min; i < max; i++)
5694 {
5695 int len = 1;
5696 if (eptr >= md->end_subject)
5697 {
5698 SCHECK_PARTIAL();
5699 break;
5700 }
5701 GETCHARLEN(c, eptr, len);
5702 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5703 eptr+= len;
5704 }
5705 break;
5706
5707 case OP_DIGIT:
5708 for (i = min; i < max; i++)
5709 {
5710 int len = 1;
5711 if (eptr >= md->end_subject)
5712 {
5713 SCHECK_PARTIAL();
5714 break;
5715 }
5716 GETCHARLEN(c, eptr, len);
5717 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5718 eptr+= len;
5719 }
5720 break;
5721
5722 case OP_NOT_WHITESPACE:
5723 for (i = min; i < max; i++)
5724 {
5725 int len = 1;
5726 if (eptr >= md->end_subject)
5727 {
5728 SCHECK_PARTIAL();
5729 break;
5730 }
5731 GETCHARLEN(c, eptr, len);
5732 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5733 eptr+= len;
5734 }
5735 break;
5736
5737 case OP_WHITESPACE:
5738 for (i = min; i < max; i++)
5739 {
5740 int len = 1;
5741 if (eptr >= md->end_subject)
5742 {
5743 SCHECK_PARTIAL();
5744 break;
5745 }
5746 GETCHARLEN(c, eptr, len);
5747 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5748 eptr+= len;
5749 }
5750 break;
5751
5752 case OP_NOT_WORDCHAR:
5753 for (i = min; i < max; i++)
5754 {
5755 int len = 1;
5756 if (eptr >= md->end_subject)
5757 {
5758 SCHECK_PARTIAL();
5759 break;
5760 }
5761 GETCHARLEN(c, eptr, len);
5762 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5763 eptr+= len;
5764 }
5765 break;
5766
5767 case OP_WORDCHAR:
5768 for (i = min; i < max; i++)
5769 {
5770 int len = 1;
5771 if (eptr >= md->end_subject)
5772 {
5773 SCHECK_PARTIAL();
5774 break;
5775 }
5776 GETCHARLEN(c, eptr, len);
5777 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5778 eptr+= len;
5779 }
5780 break;
5781
5782 default:
5783 RRETURN(PCRE_ERROR_INTERNAL);
5784 }
5785
5786 /* eptr is now past the end of the maximum run. If possessive, we are
5787 done (no backing up). Otherwise, match at this position; anything other
5788 than no match is immediately returned. For nomatch, back up one
5789 character, unless we are matching \R and the last thing matched was
5790 \r\n, in which case, back up two bytes. */
5791
5792 if (possessive) continue;
5793 for(;;)
5794 {
5795 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5797 if (eptr-- == pp) break; /* Stop if tried at original pos */
5798 BACKCHAR(eptr);
5799 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5800 eptr[-1] == '\r') eptr--;
5801 }
5802 }
5803 else
5804 #endif /* SUPPORT_UTF */
5805 /* Not UTF mode */
5806 {
5807 switch(ctype)
5808 {
5809 case OP_ANY:
5810 for (i = min; i < max; i++)
5811 {
5812 if (eptr >= md->end_subject)
5813 {
5814 SCHECK_PARTIAL();
5815 break;
5816 }
5817 if (IS_NEWLINE(eptr)) break;
5818 eptr++;
5819 }
5820 break;
5821
5822 case OP_ALLANY:
5823 case OP_ANYBYTE:
5824 c = max - min;
5825 if (c > (unsigned int)(md->end_subject - eptr))
5826 {
5827 eptr = md->end_subject;
5828 SCHECK_PARTIAL();
5829 }
5830 else eptr += c;
5831 break;
5832
5833 case OP_ANYNL:
5834 for (i = min; i < max; i++)
5835 {
5836 if (eptr >= md->end_subject)
5837 {
5838 SCHECK_PARTIAL();
5839 break;
5840 }
5841 c = *eptr;
5842 if (c == 0x000d)
5843 {
5844 if (++eptr >= md->end_subject) break;
5845 if (*eptr == 0x000a) eptr++;
5846 }
5847 else
5848 {
5849 if (c != 0x000a && (md->bsr_anycrlf ||
5850 (c != 0x000b && c != 0x000c && c != 0x0085
5851 #ifdef COMPILE_PCRE16
5852 && c != 0x2028 && c != 0x2029
5853 #endif
5854 ))) break;
5855 eptr++;
5856 }
5857 }
5858 break;
5859
5860 case OP_NOT_HSPACE:
5861 for (i = min; i < max; i++)
5862 {
5863 if (eptr >= md->end_subject)
5864 {
5865 SCHECK_PARTIAL();
5866 break;
5867 }
5868 c = *eptr;
5869 if (c == 0x09 || c == 0x20 || c == 0xa0
5870 #ifdef COMPILE_PCRE16
5871 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5872 || c == 0x202f || c == 0x205f || c == 0x3000
5873 #endif
5874 ) break;
5875 eptr++;
5876 }
5877 break;
5878
5879 case OP_HSPACE:
5880 for (i = min; i < max; i++)
5881 {
5882 if (eptr >= md->end_subject)
5883 {
5884 SCHECK_PARTIAL();
5885 break;
5886 }
5887 c = *eptr;
5888 if (c != 0x09 && c != 0x20 && c != 0xa0
5889 #ifdef COMPILE_PCRE16
5890 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5891 && c != 0x202f && c != 0x205f && c != 0x3000
5892 #endif
5893 ) break;
5894 eptr++;
5895 }
5896 break;
5897
5898 case OP_NOT_VSPACE:
5899 for (i = min; i < max; i++)
5900 {
5901 if (eptr >= md->end_subject)
5902 {
5903 SCHECK_PARTIAL();
5904 break;
5905 }
5906 c = *eptr;
5907 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5908 #ifdef COMPILE_PCRE16
5909 || c == 0x2028 || c == 0x2029
5910 #endif
5911 ) break;
5912 eptr++;
5913 }
5914 break;
5915
5916 case OP_VSPACE:
5917 for (i = min; i < max; i++)
5918 {
5919 if (eptr >= md->end_subject)
5920 {
5921 SCHECK_PARTIAL();
5922 break;
5923 }
5924 c = *eptr;
5925 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5926 #ifdef COMPILE_PCRE16
5927 && c != 0x2028 && c != 0x2029
5928 #endif
5929 ) break;
5930 eptr++;
5931 }
5932 break;
5933
5934 case OP_NOT_DIGIT:
5935 for (i = min; i < max; i++)
5936 {
5937 if (eptr >= md->end_subject)
5938 {
5939 SCHECK_PARTIAL();
5940 break;
5941 }
5942 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5943 eptr++;
5944 }
5945 break;
5946
5947 case OP_DIGIT:
5948 for (i = min; i < max; i++)
5949 {
5950 if (eptr >= md->end_subject)
5951 {
5952 SCHECK_PARTIAL();
5953 break;
5954 }
5955 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5956 eptr++;
5957 }
5958 break;
5959
5960 case OP_NOT_WHITESPACE:
5961 for (i = min; i < max; i++)
5962 {
5963 if (eptr >= md->end_subject)
5964 {
5965 SCHECK_PARTIAL();
5966 break;
5967 }
5968 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5969 eptr++;
5970 }
5971 break;
5972
5973 case OP_WHITESPACE:
5974 for (i = min; i < max; i++)
5975 {
5976 if (eptr >= md->end_subject)
5977 {
5978 SCHECK_PARTIAL();
5979 break;
5980 }
5981 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5982 eptr++;
5983 }
5984 break;
5985
5986 case OP_NOT_WORDCHAR:
5987 for (i = min; i < max; i++)
5988 {
5989 if (eptr >= md->end_subject)
5990 {
5991 SCHECK_PARTIAL();
5992 break;
5993 }
5994 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5995 eptr++;
5996 }
5997 break;
5998
5999 case OP_WORDCHAR:
6000 for (i = min; i < max; i++)
6001 {
6002 if (eptr >= md->end_subject)
6003 {
6004 SCHECK_PARTIAL();
6005 break;
6006 }
6007 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6008 eptr++;
6009 }
6010 break;
6011
6012 default:
6013 RRETURN(PCRE_ERROR_INTERNAL);
6014 }
6015
6016 /* eptr is now past the end of the maximum run. If possessive, we are
6017 done (no backing up). Otherwise, match at this position; anything other
6018 than no match is immediately returned. For nomatch, back up one
6019 character (byte), unless we are matching \R and the last thing matched
6020 was \r\n, in which case, back up two bytes. */
6021
6022 if (possessive) continue;
6023 while (eptr >= pp)
6024 {
6025 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6026 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6027 eptr--;
6028 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6029 eptr[-1] == '\r') eptr--;
6030 }
6031 }
6032
6033 /* Get here if we can't make it match with any permitted repetitions */
6034
6035 RRETURN(MATCH_NOMATCH);
6036 }
6037 /* Control never gets here */
6038
6039 /* There's been some horrible disaster. Arrival here can only mean there is
6040 something seriously wrong in the code above or the OP_xxx definitions. */
6041
6042 default:
6043 DPRINTF(("Unknown opcode %d\n", *ecode));
6044 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6045 }
6046
6047 /* Do not stick any code in here without much thought; it is assumed
6048 that "continue" in the code above comes out to here to repeat the main
6049 loop. */
6050
6051 } /* End of main loop */
6052 /* Control never reaches here */
6053
6054
6055 /* When compiling to use the heap rather than the stack for recursive calls to
6056 match(), the RRETURN() macro jumps here. The number that is saved in
6057 frame->Xwhere indicates which label we actually want to return to. */
6058
6059 #ifdef NO_RECURSE
6060 #define LBL(val) case val: goto L_RM##val;
6061 HEAP_RETURN:
6062 switch (frame->Xwhere)
6063 {
6064 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6065 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6066 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6067 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6068 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6069 LBL(65) LBL(66)
6070 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6071 LBL(21)
6072 #endif
6073 #ifdef SUPPORT_UTF
6074 LBL(16) LBL(18) LBL(20)
6075 LBL(22) LBL(23) LBL(28) LBL(30)
6076 LBL(32) LBL(34) LBL(42) LBL(46)
6077 #ifdef SUPPORT_UCP
6078 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6079 LBL(59) LBL(60) LBL(61) LBL(62)
6080 #endif /* SUPPORT_UCP */
6081 #endif /* SUPPORT_UTF */
6082 default:
6083 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6084
6085 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6086
6087 return PCRE_ERROR_INTERNAL;
6088 }
6089 #undef LBL
6090 #endif /* NO_RECURSE */
6091 }
6092
6093
6094 /***************************************************************************
6095 ****************************************************************************
6096 RECURSION IN THE match() FUNCTION
6097
6098 Undefine all the macros that were defined above to handle this. */
6099
6100 #ifdef NO_RECURSE /* Heap-frame build: the names below were #defined as macros */
6101 #undef eptr
6102 #undef ecode
6103 #undef mstart
6104 #undef offset_top
6105 #undef eptrb
6106 #undef flags
6107
6108 #undef callpat
6109 #undef charptr
6110 #undef data
6111 #undef next
6112 #undef pp
6113 #undef prev
6114 #undef saved_eptr
6115
6116 #undef new_recursive
6117
6118 #undef cur_is_word
6119 #undef condition
6120 #undef prev_is_word
6121
6122 #undef ctype
6123 #undef length
6124 #undef max
6125 #undef min
6126 #undef number
6127 #undef offset
6128 #undef op
6129 #undef save_capture_last
6130 #undef save_offset1
6131 #undef save_offset2
6132 #undef save_offset3
6133 #undef stacksave
6134
6135 #undef newptrb
6136
6137 #endif /* NO_RECURSE */
6138
6139 /* fc and fi are macros whether or not NO_RECURSE is defined, so always undefine them */
6140
6141 #undef fc
6142 #undef fi
6143
6144 /***************************************************************************
6145 ***************************************************************************/
6146
6147
6148
6149 /*************************************************
6150 * Execute a Regular Expression *
6151 *************************************************/
6152
6153 /* This function applies a compiled re to a subject string and picks out
6154 portions of the string if it matches. Two elements in the vector are set for
6155 each substring: the offsets to the start and end of the substring.
6156
6157 Arguments:
6158 argument_re points to the compiled expression
6159 extra_data points to extra data or is NULL
6160 subject points to the subject string
6161 length length of subject string (may contain binary zeros)
6162 start_offset where to start in the subject string
6163 options option bits
6164 offsets points to a vector of ints to be filled in with offsets
6165 offsetcount the number of elements in the vector
6166
6167 Returns: > 0 => success; value is the number of elements filled in
6168 = 0 => success, but offsets is not big enough
6169 -1 => failed to match
6170 < -1 => some kind of unexpected problem
6171 */
6172
6173 #ifdef COMPILE_PCRE8
6174 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6175 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6176 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6177 int offsetcount)
6178 #else
6179 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6180 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6181 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6182 int offsetcount)
6183 #endif
6184 {
6185 int rc, ocount, arg_offset_max;
6186 int newline;
6187 BOOL using_temporary_offsets = FALSE;
6188 BOOL anchored;
6189 BOOL startline;
6190 BOOL firstline;
6191 BOOL utf;
6192 BOOL has_first_char = FALSE;
6193 BOOL has_req_char = FALSE;
6194 pcre_uchar first_char = 0;
6195 pcre_uchar first_char2 = 0;
6196 pcre_uchar req_char = 0;
6197 pcre_uchar req_char2 = 0;
6198 match_data match_block;
6199 match_data *md = &match_block;
6200 const pcre_uint8 *tables;
6201 const pcre_uint8 *start_bits = NULL;
6202 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6203 PCRE_PUCHAR end_subject;
6204 PCRE_PUCHAR start_partial = NULL;
6205 PCRE_PUCHAR req_char_ptr = start_match - 1;
6206
6207 const pcre_study_data *study;
6208 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6209
6210 /* Check for the special magic call that measures the size of the stack used
6211 per recursive call of match(). Without the funny casting for sizeof, a Windows
6212 compiler gave this error: "unary minus operator applied to unsigned type,
6213 result still unsigned". Hopefully the cast fixes that. */
6214
6215 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6216 start_offset == -999)
6217 #ifdef NO_RECURSE
6218 return -((int)sizeof(heapframe));
6219 #else
6220 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6221 #endif
6222
6223 /* Plausibility checks */
6224
6225 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6226 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6227 return PCRE_ERROR_NULL;
6228 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6229 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6230
6231 /* Check that the first field in the block is the magic number. If it is not,
6232 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6233 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6234 means that the pattern is likely compiled with different endianness. */
6235
6236 if (re->magic_number != MAGIC_NUMBER)
6237 return re->magic_number == REVERSED_MAGIC_NUMBER?
6238 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6239 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6240
6241 /* These two settings are used in the code for checking a UTF-8 string that
6242 follows immediately afterwards. Other values in the md block are used only
6243 during "normal" pcre_exec() processing, not when the JIT support is in use,
6244 so they are set up later. */
6245
6246 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6247 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6248 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6249 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6250
6251 /* Check a UTF-8 string if required. Pass back the character offset and error
6252 code for an invalid string if a results vector is available. */
6253
6254 #ifdef SUPPORT_UTF
6255 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6256 {
6257 int erroroffset;
6258 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6259 if (errorcode != 0)
6260 {
6261 if (offsetcount >= 2)
6262 {
6263 offsets[0] = erroroffset;
6264 offsets[1] = errorcode;
6265 }
6266 #ifdef COMPILE_PCRE16
6267 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6268 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6269 #else
6270 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6271 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6272 #endif
6273 }
6274
6275 /* Check that a start_offset points to the start of a UTF character. */
6276 if (start_offset > 0 && start_offset < length &&
6277 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6278 return PCRE_ERROR_BADUTF8_OFFSET;
6279 }
6280 #endif
6281
6282 /* If the pattern was successfully studied with JIT support, run the JIT
6283 executable instead of the rest of this function. Most options must be set at
6284 compile time for the JIT code to be usable. Fall back to the normal code path if
6285 an unsupported flag is set. In particular, JIT does not support partial
6286 matching. */
6287
6288 #ifdef SUPPORT_JIT
6289 if (extra_data != NULL
6290 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6291 && extra_data->executable_jit != NULL
6292 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6293 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6294 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6295 return PRIV(jit_exec)(re, extra_data->executable_jit,
6296 (const pcre_uchar *)subject, length, start_offset, options,
6297 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6298 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6299 #endif
6300
6301 /* Carry on with non-JIT matching. This information is for finding all the
6302 numbers associated with a given name, for condition testing. */
6303
6304 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6305 md->name_count = re->name_count;
6306 md->name_entry_size = re->name_entry_size;
6307
6308 /* Fish out the optional data from the extra_data structure, first setting
6309 the default values. */
6310
6311 study = NULL;
6312 md->match_limit = MATCH_LIMIT;
6313 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6314 md->callout_data = NULL;
6315
6316 /* The table pointer is always in native byte order. */
6317
6318 tables = re->tables;
6319
6320 if (extra_data != NULL)
6321 {
6322 register unsigned int flags = extra_data->flags;
6323 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6324 study = (const pcre_study_data *)extra_data->study_data;
6325 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6326 md->match_limit = extra_data->match_limit;
6327 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6328 md->match_limit_recursion = extra_data->match_limit_recursion;
6329 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6330 md->callout_data = extra_data->callout_data;
6331 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6332 }
6333
6334 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6335 is a feature that makes it possible to save compiled regex and re-use them
6336 in other programs later. */
6337
6338 if (tables == NULL) tables = PRIV(default_tables);
6339
6340 /* Set up other data */
6341
6342 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6343 startline = (re->flags & PCRE_STARTLINE) != 0;
6344 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6345
6346 /* The code starts after the real_pcre block and the capture name table. */
6347
6348 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6349 re->name_count * re->name_entry_size;
6350
6351 md->start_subject = (PCRE_PUCHAR)subject;
6352 md->start_offset = start_offset;
6353 md->end_subject = md->start_subject + length;
6354 end_subject = md->end_subject;
6355
6356 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6357 md->use_ucp = (re->options & PCRE_UCP) != 0;
6358 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6359 md->ignore_skip_arg = FALSE;
6360
6361 /* Some options are unpacked into BOOL variables in the hope that testing
6362 them will be faster than individual option bits. */
6363
6364 md->notbol = (options & PCRE_NOTBOL) != 0;
6365 md->noteol = (options & PCRE_NOTEOL) != 0;
6366 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6367 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6368
6369 md->hitend = FALSE;
6370 md->mark = md->nomatch_mark = NULL; /* In case never set */
6371
6372 md->recursive = NULL; /* No recursion at top level */
6373 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6374
6375 md->lcc = tables + lcc_offset;
6376 md->fcc = tables + fcc_offset;
6377 md->ctypes = tables + ctypes_offset;
6378
6379 /* Handle different \R options. */
6380
6381 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6382 {
6383 case 0:
6384 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6385 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6386 else
6387 #ifdef BSR_ANYCRLF
6388 md->bsr_anycrlf = TRUE;
6389 #else
6390 md->bsr_anycrlf = FALSE;
6391 #endif
6392 break;
6393
6394 case PCRE_BSR_ANYCRLF:
6395 md->bsr_anycrlf = TRUE;
6396 break;
6397
6398 case PCRE_BSR_UNICODE:
6399 md->bsr_anycrlf = FALSE;
6400 break;
6401
6402 default: return PCRE_ERROR_BADNEWLINE;
6403 }
6404
6405 /* Handle different types of newline. The three bits give eight cases. If
6406 nothing is set at run time, whatever was used at compile time applies. */
6407
6408 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6409 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6410 {
6411 case 0: newline = NEWLINE; break; /* Compile-time default */
6412 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6413 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6414 case PCRE_NEWLINE_CR+
6415 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6416 case PCRE_NEWLINE_ANY: newline = -1; break;
6417 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6418 default: return PCRE_ERROR_BADNEWLINE;
6419 }
6420
6421 if (newline == -2)
6422 {
6423 md->nltype = NLTYPE_ANYCRLF;
6424 }
6425 else if (newline < 0)
6426 {
6427 md->nltype = NLTYPE_ANY;
6428 }
6429 else
6430 {
6431 md->nltype = NLTYPE_FIXED;
6432 if (newline > 255)
6433 {
6434 md->nllen = 2;
6435 md->nl[0] = (newline >> 8) & 255;
6436 md->nl[1] = newline & 255;
6437 }
6438 else
6439 {
6440 md->nllen = 1;
6441 md->nl[0] = newline;
6442 }
6443 }
6444
6445 /* Partial matching was originally supported only for a restricted set of
6446 regexes; from release 8.00 there are no restrictions, but the bits are still
6447 defined (though never set). So there's no harm in leaving this code. */
6448
6449 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6450 return PCRE_ERROR_BADPARTIAL;
6451
6452 /* If the expression has got more back references than the offsets supplied can
6453 hold, we get a temporary chunk of working store to use during the matching.
6454 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6455 of 3. */
6456
6457 ocount = offsetcount - (offsetcount % 3);
6458 arg_offset_max = (2*ocount)/3;
6459
6460 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6461 {
6462 ocount = re->top_backref * 3 + 3;
6463 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6464 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6465 using_temporary_offsets = TRUE;
6466 DPRINTF(("Got memory to hold back references\n"));
6467 }
6468 else md->offset_vector = offsets;
6469
6470 md->offset_end = ocount;
6471 md->offset_max = (2*ocount)/3;
6472 md->offset_overflow = FALSE;
6473 md->capture_last = -1;
6474
6475 /* Reset the working variable associated with each extraction. These should
6476 never be used unless previously set, but they get saved and restored, and so we
6477 initialize them to avoid reading uninitialized locations. Also, unset the
6478 offsets for the matched string. This is really just for tidiness with callouts,
6479 in case they inspect these fields. */
6480
6481 if (md->offset_vector != NULL)
6482 {
6483 register int *iptr = md->offset_vector + ocount;
6484 register int *iend = iptr - re->top_bracket;
6485 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6486 while (--iptr >= iend) *iptr = -1;
6487 md->offset_vector[0] = md->offset_vector[1] = -1;
6488 }
6489
6490 /* Set up the first character to match, if available. The first_char value is
6491 never set for an anchored regular expression, but the anchoring may be forced
6492 at run time, so we have to test for anchoring. The first char may be unset for
6493 an unanchored pattern, of course. If there's no first char and the pattern was
6494 studied, there may be a bitmap of possible first characters. */
6495
6496 if (!anchored)
6497 {
6498 if ((re->flags & PCRE_FIRSTSET) != 0)
6499 {
6500 has_first_char = TRUE;
6501 first_char = first_char2 = (pcre_uchar)(re->first_char);
6502 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6503 {
6504 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6505 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6506 if (utf && first_char > 127)
6507 first_char2 = UCD_OTHERCASE(first_char);
6508 #endif
6509 }
6510 }
6511 else
6512 if (!startline && study != NULL &&
6513 (study->flags & PCRE_STUDY_MAPPED) != 0)
6514 start_bits = study->start_bits;
6515 }
6516
6517 /* For anchored or unanchored matches, there may be a "last known required
6518 character" set. */
6519
6520 if ((re->flags & PCRE_REQCHSET) != 0)
6521 {
6522 has_req_char = TRUE;
6523 req_char = req_char2 = (pcre_uchar)(re->req_char);
6524 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6525 {
6526 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6527 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6528 if (utf && req_char > 127)
6529 req_char2 = UCD_OTHERCASE(req_char);
6530 #endif
6531 }
6532 }
6533
6534
6535 /* ==========================================================================*/
6536
6537 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6538 the loop runs just once. */
6539
6540 for(;;)
6541 {
6542 PCRE_PUCHAR save_end_subject = end_subject;
6543 PCRE_PUCHAR new_start_match;
6544
6545 /* If firstline is TRUE, the start of the match is constrained to the first
6546 line of a multiline string. That is, the match must be before or at the first
6547 newline. Implement this by temporarily adjusting end_subject so that we stop
6548 scanning at a newline. If the match fails at the newline, later code breaks
6549 this loop. */
6550
6551 if (firstline)
6552 {
6553 PCRE_PUCHAR t = start_match;
6554 #ifdef SUPPORT_UTF
6555 if (utf)
6556 {
6557 while (t < md->end_subject && !IS_NEWLINE(t))
6558 {
6559 t++;
6560 ACROSSCHAR(t < end_subject, *t, t++);
6561 }
6562 }
6563 else
6564 #endif
6565 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6566 end_subject = t;
6567 }
6568
6569 /* There are some optimizations that avoid running the match if a known
6570 starting point is not found, or if a known later character is not present.
6571 However, there is an option that disables these, for testing and for ensuring
6572 that all callouts do actually occur. The option can be set in the regex by
6573 (*NO_START_OPT) or passed in match-time options. */
6574
6575 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6576 {
6577 /* Advance to a unique first char if there is one. */
6578
6579 if (has_first_char)
6580 {
6581 if (first_char != first_char2)
6582 while (start_match < end_subject &&
6583 *start_match != first_char && *start_match != first_char2)
6584 start_match++;
6585 else
6586 while (start_match < end_subject && *start_match != first_char)
6587 start_match++;
6588 }
6589
6590 /* Or to just after a linebreak for a multiline match */
6591
6592 else if (startline)
6593 {
6594 if (start_match > md->start_subject + start_offset)
6595 {
6596 #ifdef SUPPORT_UTF
6597 if (utf)
6598 {
6599 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6600 {
6601 start_match++;
6602 ACROSSCHAR(start_match < end_subject, *start_match,
6603 start_match++);
6604 }
6605 }
6606 else
6607 #endif
6608 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6609 start_match++;
6610
6611 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6612 and we are now at a LF, advance the match position by one more character.
6613 */
6614
6615 if (start_match[-1] == CHAR_CR &&
6616 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6617 start_match < end_subject &&
6618 *start_match == CHAR_NL)
6619 start_match++;
6620 }
6621 }
6622
6623 /* Or to a non-unique first byte after study */
6624
6625 else if (start_bits != NULL)
6626 {
6627 while (start_match < end_subject)
6628 {
6629 register unsigned int c = *start_match;
6630 #ifndef COMPILE_PCRE8
6631 if (c > 255) c = 255;
6632 #endif
6633 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6634 {
6635 start_match++;
6636 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6637 /* In non 8-bit mode, the iteration will stop for
6638 characters > 255 at the beginning or not stop at all. */
6639 if (utf)
6640 ACROSSCHAR(start_match < end_subject, *start_match,
6641 start_match++);
6642 #endif
6643 }
6644 else break;
6645 }
6646 }
6647 } /* Starting optimizations */
6648
6649 /* Restore fudged end_subject */
6650
6651 end_subject = save_end_subject;
6652
6653 /* The following two optimizations are disabled for partial matching or if
6654 disabling is explicitly requested. */
6655
6656 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6657 {
6658 /* If the pattern was studied, a minimum subject length may be set. This is
6659 a lower bound; no actual string of that length may actually match the
6660 pattern. Although the value is, strictly, in characters, we treat it as
6661 bytes to avoid spending too much time in this optimization. */
6662
6663 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6664 (pcre_uint32)(end_subject - start_match) < study->minlength)
6665 {
6666 rc = MATCH_NOMATCH;
6667 break;
6668 }
6669
6670 /* If req_char is set, we know that that character must appear in the
6671 subject for the match to succeed. If the first character is set, req_char
6672 must be later in the subject; otherwise the test starts at the match point.
6673 This optimization can save a huge amount of backtracking in patterns with
6674 nested unlimited repeats that aren't going to match. Writing separate code
6675 for cased/caseless versions makes it go faster, as does using an
6676 autoincrement and backing off on a match.
6677
6678 HOWEVER: when the subject string is very, very long, searching to its end
6679 can take a long time, and give bad performance on quite ordinary patterns.
6680 This showed up when somebody was matching something like /^\d+C/ on a
6681 32-megabyte string... so we don't do this when the string is sufficiently
6682 long. */
6683
6684 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6685 {
6686 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6687
6688 /* We don't need to repeat the search if we haven't yet reached the
6689 place we found it at last time. */
6690
6691 if (p > req_char_ptr)
6692 {
6693 if (req_char != req_char2)
6694 {
6695 while (p < end_subject)
6696 {
6697 register int pp = *p++;
6698 if (pp == req_char || pp == req_char2) { p--; break; }
6699 }
6700 }
6701 else
6702 {
6703 while (p < end_subject)
6704 {
6705 if (*p++ == req_char) { p--; break; }
6706 }
6707 }
6708
6709 /* If we can't find the required character, break the matching loop,
6710 forcing a match failure. */
6711
6712 if (p >= end_subject)
6713 {
6714 rc = MATCH_NOMATCH;
6715 break;
6716 }
6717
6718 /* If we have found the required character, save the point where we
6719 found it, so that we don't search again next time round the loop if
6720 the start hasn't passed this character yet. */
6721
6722 req_char_ptr = p;
6723 }
6724 }
6725 }
6726
6727 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6728 printf(">>>> Match against: ");
6729 pchars(start_match, end_subject - start_match, TRUE, md);
6730 printf("\n");
6731 #endif
6732
6733 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6734 first starting point for which a partial match was found. */
6735
6736 md->start_match_ptr = start_match;
6737 md->start_used_ptr = start_match;
6738 md->match_call_count = 0;
6739