/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 613 - (show annotations)
Sat Jul 2 16:59:52 2011 UTC (8 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 194605 byte(s)
Error occurred while calculating annotation data.
Fix problem with the interaction of (*ACCEPT) in an assertion with 
PCRE_NOTEMPTY.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Special kinds of call to match(). The kind is placed in
md->match_function_type just before the call instead of being passed as an
argument, so that no additional stack variable is needed. */

#define MATCH_CONDASSERT  1    /* Called to check a condition assertion */
#define MATCH_CBEGROUP    2    /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Errors are reported using the
externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH     0
#define MATCH_MATCH       1

/* Internal-only results of match(). The values are far enough below zero that
they cannot be confused with the external error codes. */

#define MATCH_ACCEPT      (-999)
#define MATCH_COMMIT      (-998)
#define MATCH_KETRPOS     (-997)
#define MATCH_PRUNE       (-996)
#define MATCH_SKIP        (-995)
#define MATCH_SKIP_ARG    (-994)
#define MATCH_THEN        (-993)
83
/* Shorthand for a sequence that is needed in many places: copy the current
mark pointer into the match data block, then leave via RRETURN with the given
value. It expands to a brace block and uses md, markptr and RRETURN from the
point of use. */

#define MRRETURN(ra) \
  { md->mark = markptr; RRETURN(ra); }
91
/* The largest number of offset-vector ints that is saved on the stack for a
recursive call; when the offset vector is bigger than this, malloc is used
instead. The value should be a multiple of 3, because the length of the offset
vector is always a multiple of 3. */

#define REC_STACK_SAVE_MAX (3 * 10)
97
/* Lower and upper repeat counts for the common repeats. In rep_max, a value
of 0 stands for "no upper limit". The two tables are parallel: entry n of each
describes the same repeat. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
102
103
104
105 #ifdef PCRE_DEBUG
106 /*************************************************
107 * Debugging function to print chars *
108 *************************************************/
109
110 /* Print a sequence of chars in printable format, stopping at the end of the
111 subject if the requested.
112
113 Arguments:
114 p points to characters
115 length number to print
116 is_subject TRUE if printing from within md->start_subject
117 md pointer to matching data block, if is_subject is TRUE
118
119 Returns: nothing
120 */
121
122 static void
123 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124 {
125 unsigned int c;
126 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127 while (length-- > 0)
128 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129 }
130 #endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
/* Identifying numbers, one for each RMATCH call. They start at 1 and run
consecutively. Whenever this list is altered, the code at HEAP_RETURN below
has to be altered to match. */

enum {
  RM1 = 1,
         RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,   RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,  RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,  RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,  RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,  RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,  RM58, RM59, RM60, RM61, RM62, RM63
};
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG

/* Debugging forms: announce each call to match() and each return from it,
giving the source line number. */

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_eptrb,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,markptr,r_offtop,r_md,r_eptrb,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }

#else

/* Production forms: a plain recursive call whose result is left in rrc, and a
plain return. */

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_eptrb,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,markptr,r_offtop,r_md,r_eptrb,rdepth+1)

#define RRETURN(r_value) return r_value

#endif
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. Note that
310 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311 argument of match(), which never changes. */
312
#define REGISTER

/* Simulate a recursive call of match(): get a new frame from
pcre_stack_malloc, note in the current frame which call this is (r_where),
fill in the new frame's argument fields, chain it to the current frame, make it
current, and jump to HEAP_RECURSE. The label L_<r_where> is where execution
resumes afterwards. If no memory is available, PCRE_ERROR_NOMEMORY is returned
by means of RRETURN. */

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_eptrb,r_where)\
  {\
  heapframe *rm_newframe =\
    (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (rm_newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = r_where;\
  rm_newframe->Xeptr = r_eptr;\
  rm_newframe->Xecode = r_ecode;\
  rm_newframe->Xmstart = mstart;\
  rm_newframe->Xmarkptr = markptr;\
  rm_newframe->Xoffset_top = r_offtop;\
  rm_newframe->Xeptrb = r_eptrb;\
  rm_newframe->Xrdepth = frame->Xrdepth + 1;\
  rm_newframe->Xprevframe = frame;\
  frame = rm_newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##r_where:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
334
/* Return from one level of simulated recursion. The current frame is unlinked
and released with pcre_stack_free. If a previous frame exists, the result is
left in rrc and control goes to HEAP_RETURN; otherwise the value is returned
from match() itself.

match() also uses this macro when its very first frame could not be obtained,
at which point frame is NULL. The unlink-and-free step is therefore skipped for
a NULL frame, so that this case returns the error code instead of dereferencing
a null pointer. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
351 typedef struct heapframe {
352 struct heapframe *Xprevframe;
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr;
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth;
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word;
379 BOOL Xcondition;
380 BOOL Xprev_is_word;
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink;
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb;
408
409 /* Where to jump back to */
410
411 int Xwhere;
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past the start of the subject (i.e.
434 something has been matched). For hard partial matching, we then return
435 immediately. The second one is used when we already know we are past the end of
436 the subject. */
437
/* Partial-match test for use where eptr may or may not be at the end of the
subject. When partial matching is enabled and eptr is at or beyond the end of
the subject and also beyond md->start_used_ptr, note that the end was hit; if
md->partial is greater than 1, return PCRE_ERROR_PARTIAL straight away. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr && \
      eptr >= md->end_subject) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }

/* The same, for use where eptr is already known to be past the end of the
subject, so that test is left out. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. If we fail to match, we need to restore this value and also the
812 values of the final offsets, in case they were set by a previous iteration
813 of the same bracket.
814
815 If there isn't enough space in the offset vector, treat this as if it were
816 a non-capturing bracket. Don't worry about setting the flag for the error
817 case here; that is handled in the code for KET. */
818
819 case OP_CBRA:
820 case OP_SCBRA:
821 number = GET2(ecode, 1+LINK_SIZE);
822 offset = number << 1;
823
824 #ifdef PCRE_DEBUG
825 printf("start bracket %d\n", number);
826 printf("subject=");
827 pchars(eptr, 16, TRUE, md);
828 printf("\n");
829 #endif
830
831 if (offset < md->offset_max)
832 {
833 save_offset1 = md->offset_vector[offset];
834 save_offset2 = md->offset_vector[offset+1];
835 save_offset3 = md->offset_vector[md->offset_end - number];
836 save_capture_last = md->capture_last;
837
838 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 md->offset_vector[md->offset_end - number] =
840 (int)(eptr - md->start_subject);
841
842 for (;;)
843 {
844 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846 eptrb, RM1);
847 if (rrc != MATCH_NOMATCH &&
848 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849 RRETURN(rrc);
850 md->capture_last = save_capture_last;
851 ecode += GET(ecode, 1);
852 if (*ecode != OP_ALT) break;
853 }
854
855 DPRINTF(("bracket %d failed\n", number));
856
857 md->offset_vector[offset] = save_offset1;
858 md->offset_vector[offset+1] = save_offset2;
859 md->offset_vector[md->offset_end - number] = save_offset3;
860
861 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
862 RRETURN(MATCH_NOMATCH);
863 }
864
865 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866 as a non-capturing bracket. */
867
868 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870
871 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877 for all the alternatives. When we get to the final alternative within the
878 brackets, we used to return the result of a recursive call to match()
879 whatever happened so it was possible to reduce stack usage by turning this
880 into a tail recursion, except in the case of a possibly empty group.
881 However, now that there is the possibility of (*THEN) occurring in the final
882 alternative, this optimization is no longer possible. */
883
884 case OP_BRA:
885 case OP_SBRA:
886 DPRINTF(("start non-capturing bracket\n"));
887 for (;;)
888 {
889 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
890 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
891 RM2);
892 if (rrc != MATCH_NOMATCH &&
893 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
894 RRETURN(rrc);
895 ecode += GET(ecode, 1);
896 if (*ecode != OP_ALT) break;
897 }
898
899 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
900 RRETURN(MATCH_NOMATCH);
901
902 /* Handle possessive capturing brackets with an unlimited repeat. We come
903 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
904 handled similarly to the normal case above. However, the matching is
905 different. The end of these brackets will always be OP_KETRPOS, which
906 returns MATCH_KETRPOS without going further in the pattern. By this means
907 we can handle the group by iteration rather than recursion, thereby
908 reducing the amount of stack needed. */
909
910 case OP_CBRAPOS:
911 case OP_SCBRAPOS:
912 allow_zero = FALSE;
913
914 POSSESSIVE_CAPTURE:
915 number = GET2(ecode, 1+LINK_SIZE);
916 offset = number << 1;
917
918 #ifdef PCRE_DEBUG
919 printf("start possessive bracket %d\n", number);
920 printf("subject=");
921 pchars(eptr, 16, TRUE, md);
922 printf("\n");
923 #endif
924
925 if (offset < md->offset_max)
926 {
927 matched_once = FALSE;
928 code_offset = ecode - md->start_code;
929
930 save_offset1 = md->offset_vector[offset];
931 save_offset2 = md->offset_vector[offset+1];
932 save_offset3 = md->offset_vector[md->offset_end - number];
933 save_capture_last = md->capture_last;
934
935 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
936
937 /* Each time round the loop, save the current subject position for use
938 when the group matches. For MATCH_MATCH, the group has matched, so we
939 restart it with a new subject starting position, remembering that we had
940 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
941 usual. If we haven't matched any alternatives in any iteration, check to
942 see if a previous iteration matched. If so, the group has matched;
943 continue from afterwards. Otherwise it has failed; restore the previous
944 capture values before returning NOMATCH. */
945
946 for (;;)
947 {
948 md->offset_vector[md->offset_end - number] =
949 (int)(eptr - md->start_subject);
950 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
951 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
952 eptrb, RM63);
953 if (rrc == MATCH_KETRPOS)
954 {
955 offset_top = md->end_offset_top;
956 eptr = md->end_match_ptr;
957 ecode = md->start_code + code_offset;
958 save_capture_last = md->capture_last;
959 matched_once = TRUE;
960 continue;
961 }
962 if (rrc != MATCH_NOMATCH &&
963 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
964 RRETURN(rrc);
965 md->capture_last = save_capture_last;
966 ecode += GET(ecode, 1);
967 if (*ecode != OP_ALT) break;
968 }
969
970 if (!matched_once)
971 {
972 md->offset_vector[offset] = save_offset1;
973 md->offset_vector[offset+1] = save_offset2;
974 md->offset_vector[md->offset_end - number] = save_offset3;
975 }
976
977 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
978 if (allow_zero || matched_once)
979 {
980 ecode += 1 + LINK_SIZE;
981 break;
982 }
983
984 RRETURN(MATCH_NOMATCH);
985 }
986
987 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
988 as a non-capturing bracket. */
989
990 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
991 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
992
993 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
994
995 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
997
998 /* Non-capturing possessive bracket with unlimited repeat. We come here
999 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1000 without the capturing complication. It is written out separately for speed
1001 and cleanliness. */
1002
1003 case OP_BRAPOS:
1004 case OP_SBRAPOS:
1005 allow_zero = FALSE;
1006
1007 POSSESSIVE_NON_CAPTURE:
1008 matched_once = FALSE;
1009 code_offset = ecode - md->start_code;
1010
1011 for (;;)
1012 {
1013 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1014 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1015 eptrb, RM48);
1016 if (rrc == MATCH_KETRPOS)
1017 {
1018 offset_top = md->end_offset_top;
1019 eptr = md->end_match_ptr;
1020 ecode = md->start_code + code_offset;
1021 matched_once = TRUE;
1022 continue;
1023 }
1024 if (rrc != MATCH_NOMATCH &&
1025 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1026 RRETURN(rrc);
1027 ecode += GET(ecode, 1);
1028 if (*ecode != OP_ALT) break;
1029 }
1030
1031 if (matched_once || allow_zero)
1032 {
1033 ecode += 1 + LINK_SIZE;
1034 break;
1035 }
1036 RRETURN(MATCH_NOMATCH);
1037
1038 /* Control never reaches here. */
1039
1040 /* Conditional group: compilation checked that there are no more than
1041 two branches. If the condition is false, skipping the first branch takes us
1042 past the end if there is only one branch, but that's OK because that is
1043 exactly what going to the ket would do. */
1044
1045 case OP_COND:
1046 case OP_SCOND:
1047 codelink = GET(ecode, 1);
1048
1049 /* Because of the way auto-callout works during compile, a callout item is
1050 inserted between OP_COND and an assertion condition. */
1051
1052 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1053 {
1054 if (pcre_callout != NULL)
1055 {
1056 pcre_callout_block cb;
1057 cb.version = 1; /* Version 1 of the callout block */
1058 cb.callout_number = ecode[LINK_SIZE+2];
1059 cb.offset_vector = md->offset_vector;
1060 cb.subject = (PCRE_SPTR)md->start_subject;
1061 cb.subject_length = (int)(md->end_subject - md->start_subject);
1062 cb.start_match = (int)(mstart - md->start_subject);
1063 cb.current_position = (int)(eptr - md->start_subject);
1064 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1065 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1066 cb.capture_top = offset_top/2;
1067 cb.capture_last = md->capture_last;
1068 cb.callout_data = md->callout_data;
1069 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1070 if (rrc < 0) RRETURN(rrc);
1071 }
1072 ecode += _pcre_OP_lengths[OP_CALLOUT];
1073 }
1074
1075 condcode = ecode[LINK_SIZE+1];
1076
1077 /* Now see what the actual condition is */
1078
1079 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1080 {
1081 if (md->recursive == NULL) /* Not recursing => FALSE */
1082 {
1083 condition = FALSE;
1084 ecode += GET(ecode, 1);
1085 }
1086 else
1087 {
1088 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1089 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1090
1091 /* If the test is for recursion into a specific subpattern, and it is
1092 false, but the test was set up by name, scan the table to see if the
1093 name refers to any other numbers, and test them. The condition is true
1094 if any one is set. */
1095
1096 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1097 {
1098 uschar *slotA = md->name_table;
1099 for (i = 0; i < md->name_count; i++)
1100 {
1101 if (GET2(slotA, 0) == recno) break;
1102 slotA += md->name_entry_size;
1103 }
1104
1105 /* Found a name for the number - there can be only one; duplicate
1106 names for different numbers are allowed, but not vice versa. First
1107 scan down for duplicates. */
1108
1109 if (i < md->name_count)
1110 {
1111 uschar *slotB = slotA;
1112 while (slotB > md->name_table)
1113 {
1114 slotB -= md->name_entry_size;
1115 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1116 {
1117 condition = GET2(slotB, 0) == md->recursive->group_num;
1118 if (condition) break;
1119 }
1120 else break;
1121 }
1122
1123 /* Scan up for duplicates */
1124
1125 if (!condition)
1126 {
1127 slotB = slotA;
1128 for (i++; i < md->name_count; i++)
1129 {
1130 slotB += md->name_entry_size;
1131 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132 {
1133 condition = GET2(slotB, 0) == md->recursive->group_num;
1134 if (condition) break;
1135 }
1136 else break;
1137 }
1138 }
1139 }
1140 }
1141
1142 /* Choose branch according to the condition */
1143
1144 ecode += condition? 3 : GET(ecode, 1);
1145 }
1146 }
1147
1148 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1149 {
1150 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1151 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1152
1153 /* If the numbered capture is unset, but the reference was by name,
1154 scan the table to see if the name refers to any other numbers, and test
1155 them. The condition is true if any one is set. This is tediously similar
1156 to the code above, but not close enough to try to amalgamate. */
1157
1158 if (!condition && condcode == OP_NCREF)
1159 {
1160 int refno = offset >> 1;
1161 uschar *slotA = md->name_table;
1162
1163 for (i = 0; i < md->name_count; i++)
1164 {
1165 if (GET2(slotA, 0) == refno) break;
1166 slotA += md->name_entry_size;
1167 }
1168
1169 /* Found a name for the number - there can be only one; duplicate names
1170 for different numbers are allowed, but not vice versa. First scan down
1171 for duplicates. */
1172
1173 if (i < md->name_count)
1174 {
1175 uschar *slotB = slotA;
1176 while (slotB > md->name_table)
1177 {
1178 slotB -= md->name_entry_size;
1179 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1180 {
1181 offset = GET2(slotB, 0) << 1;
1182 condition = offset < offset_top &&
1183 md->offset_vector[offset] >= 0;
1184 if (condition) break;
1185 }
1186 else break;
1187 }
1188
1189 /* Scan up for duplicates */
1190
1191 if (!condition)
1192 {
1193 slotB = slotA;
1194 for (i++; i < md->name_count; i++)
1195 {
1196 slotB += md->name_entry_size;
1197 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1198 {
1199 offset = GET2(slotB, 0) << 1;
1200 condition = offset < offset_top &&
1201 md->offset_vector[offset] >= 0;
1202 if (condition) break;
1203 }
1204 else break;
1205 }
1206 }
1207 }
1208 }
1209
1210 /* Choose branch according to the condition */
1211
1212 ecode += condition? 3 : GET(ecode, 1);
1213 }
1214
1215 else if (condcode == OP_DEF) /* DEFINE - always false */
1216 {
1217 condition = FALSE;
1218 ecode += GET(ecode, 1);
1219 }
1220
1221 /* The condition is an assertion. Call match() to evaluate it - setting
1222 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1223 an assertion. */
1224
1225 else
1226 {
1227 md->match_function_type = MATCH_CONDASSERT;
1228 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1229 if (rrc == MATCH_MATCH)
1230 {
1231 condition = TRUE;
1232 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1233 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1234 }
1235 else if (rrc != MATCH_NOMATCH &&
1236 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1237 {
1238 RRETURN(rrc); /* Need braces because of following else */
1239 }
1240 else
1241 {
1242 condition = FALSE;
1243 ecode += codelink;
1244 }
1245 }
1246
1247 /* We are now at the branch that is to be obeyed. As there is only one,
1248 we used to use tail recursion to avoid using another stack frame, except
1249 when there was unlimited repeat of a possibly empty group. However, that
1250 strategy no longer works because of the possibility of (*THEN) being
1251 encountered in the branch. A recursive call to match() is always required,
1252 unless the second alternative doesn't exist, in which case we can just
1253 plough on. */
1254
1255 if (condition || *ecode == OP_ALT)
1256 {
1257 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1258 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1259 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1260 rrc = MATCH_NOMATCH;
1261 RRETURN(rrc);
1262 }
1263 else /* Condition false & no alternative */
1264 {
1265 ecode += 1 + LINK_SIZE;
1266 }
1267 break;
1268
1269
1270 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1271 to close any currently open capturing brackets. */
1272
1273 case OP_CLOSE:
1274 number = GET2(ecode, 1);
1275 offset = number << 1;
1276
1277 #ifdef PCRE_DEBUG
1278 printf("end bracket %d at *ACCEPT", number);
1279 printf("\n");
1280 #endif
1281
1282 md->capture_last = number;
1283 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1284 {
1285 md->offset_vector[offset] =
1286 md->offset_vector[md->offset_end - number];
1287 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1288 if (offset_top <= offset) offset_top = offset + 2;
1289 }
1290 ecode += 3;
1291 break;
1292
1293
1294 /* End of the pattern, either real or forced. If we are in a recursion, we
1295 should restore the offsets appropriately, and if it's a top-level
1296 recursion, continue from after the call. */
1297
1298 case OP_ACCEPT:
1299 case OP_ASSERT_ACCEPT:
1300 case OP_END:
1301 if (md->recursive != NULL)
1302 {
1303 recursion_info *rec = md->recursive;
1304 md->recursive = rec->prevrec;
1305 memmove(md->offset_vector, rec->offset_save,
1306 rec->saved_max * sizeof(int));
1307 offset_top = rec->save_offset_top;
1308 if (rec->group_num == 0)
1309 {
1310 ecode = rec->after_call;
1311 break;
1312 }
1313 }
1314
1315 /* Otherwise, if we have matched an empty string, fail if not in an
1316 assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1317 is set and we have matched at the start of the subject. In both cases,
1318 backtracking will then try other alternatives, if any. */
1319
1320 else if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1321 (md->notempty ||
1322 (md->notempty_atstart &&
1323 mstart == md->start_subject + md->start_offset)))
1324 MRRETURN(MATCH_NOMATCH);
1325
1326 /* Otherwise, we have a match. */
1327
1328 md->end_match_ptr = eptr; /* Record where we ended */
1329 md->end_offset_top = offset_top; /* and how many extracts were taken */
1330 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1331
1332 /* For some reason, the macros don't work properly if an expression is
1333 given as the argument to MRRETURN when the heap is in use. */
1334
1335 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1336 MRRETURN(rrc);
1337
1338 /* Assertion brackets. Check the alternative branches in turn - the
1339 matching won't pass the KET for an assertion. If any one branch matches,
1340 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1341 start of each branch to move the current point backwards, so the code at
1342 this level is identical to the lookahead case. When the assertion is part
1343 of a condition, we want to return immediately afterwards. The caller of
1344 this incarnation of the match() function will have set MATCH_CONDASSERT in
1345 md->match_function_type, and one of these opcodes will be the first opcode
1346 that is processed. We use a local variable that is preserved over calls to
1347 match() to remember this case. */
1348
1349 case OP_ASSERT:
1350 case OP_ASSERTBACK:
1351 if (md->match_function_type == MATCH_CONDASSERT)
1352 {
1353 condassert = TRUE;
1354 md->match_function_type = 0;
1355 }
1356 else condassert = FALSE;
1357
1358 do
1359 {
1360 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1361 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1362 {
1363 mstart = md->start_match_ptr; /* In case \K reset it */
1364 break;
1365 }
1366 if (rrc != MATCH_NOMATCH &&
1367 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1368 RRETURN(rrc);
1369 ecode += GET(ecode, 1);
1370 }
1371 while (*ecode == OP_ALT);
1372
1373 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1374
1375 /* If checking an assertion for a condition, return MATCH_MATCH. */
1376
1377 if (condassert) RRETURN(MATCH_MATCH);
1378
1379 /* Continue from after the assertion, updating the offsets high water
1380 mark, since extracts may have been taken during the assertion. */
1381
1382 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1383 ecode += 1 + LINK_SIZE;
1384 offset_top = md->end_offset_top;
1385 continue;
1386
1387 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1388 PRUNE, or COMMIT means we must assume failure without checking subsequent
1389 branches. */
1390
1391 case OP_ASSERT_NOT:
1392 case OP_ASSERTBACK_NOT:
1393 if (md->match_function_type == MATCH_CONDASSERT)
1394 {
1395 condassert = TRUE;
1396 md->match_function_type = 0;
1397 }
1398 else condassert = FALSE;
1399
1400 do
1401 {
1402 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1403 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1404 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1405 {
1406 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1407 break;
1408 }
1409 if (rrc != MATCH_NOMATCH &&
1410 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1411 RRETURN(rrc);
1412 ecode += GET(ecode,1);
1413 }
1414 while (*ecode == OP_ALT);
1415
1416 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1417
1418 ecode += 1 + LINK_SIZE;
1419 continue;
1420
1421 /* Move the subject pointer back. This occurs only at the start of
1422 each branch of a lookbehind assertion. If we are too close to the start to
1423 move back, this match function fails. When working with UTF-8 we move
1424 back a number of characters, not bytes. */
1425
1426 case OP_REVERSE:
1427 #ifdef SUPPORT_UTF8
1428 if (utf8)
1429 {
1430 i = GET(ecode, 1);
1431 while (i-- > 0)
1432 {
1433 eptr--;
1434 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1435 BACKCHAR(eptr);
1436 }
1437 }
1438 else
1439 #endif
1440
1441 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1442
1443 {
1444 eptr -= GET(ecode, 1);
1445 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1446 }
1447
1448 /* Save the earliest consulted character, then skip to next op code */
1449
1450 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1451 ecode += 1 + LINK_SIZE;
1452 break;
1453
1454 /* The callout item calls an external function, if one is provided, passing
1455 details of the match so far. This is mainly for debugging, though the
1456 function is able to force a failure. */
1457
1458 case OP_CALLOUT:
1459 if (pcre_callout != NULL)
1460 {
1461 pcre_callout_block cb;
1462 cb.version = 1; /* Version 1 of the callout block */
1463 cb.callout_number = ecode[1];
1464 cb.offset_vector = md->offset_vector;
1465 cb.subject = (PCRE_SPTR)md->start_subject;
1466 cb.subject_length = (int)(md->end_subject - md->start_subject);
1467 cb.start_match = (int)(mstart - md->start_subject);
1468 cb.current_position = (int)(eptr - md->start_subject);
1469 cb.pattern_position = GET(ecode, 2);
1470 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1471 cb.capture_top = offset_top/2;
1472 cb.capture_last = md->capture_last;
1473 cb.callout_data = md->callout_data;
1474 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1475 if (rrc < 0) RRETURN(rrc);
1476 }
1477 ecode += 2 + 2*LINK_SIZE;
1478 break;
1479
1480 /* Recursion either matches the current regex, or some subexpression. The
1481 offset data is the offset to the starting bracket from the start of the
1482 whole pattern. (This is so that it works from duplicated subpatterns.)
1483
1484 If there are any capturing brackets started but not finished, we have to
1485 save their starting points and reinstate them after the recursion. However,
1486 we don't know how many such there are (offset_top records the completed
1487 total) so we just have to save all the potential data. There may be up to
1488 65535 such values, which is too large to put on the stack, but using malloc
1489 for small numbers seems expensive. As a compromise, the stack is used when
1490 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1491 is used. A problem is what to do if the malloc fails ... there is no way of
1492 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1493 values on the stack, and accept that the rest may be wrong.
1494
1495 There are also other values that have to be saved. We use a chained
1496 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1497 for the original version of this logic. */
1498
1499 case OP_RECURSE:
1500 {
1501 callpat = md->start_code + GET(ecode, 1);
1502 new_recursive.group_num = (callpat == md->start_code)? 0 :
1503 GET2(callpat, 1 + LINK_SIZE);
1504
1505 /* Add to "recursing stack" */
1506
1507 new_recursive.prevrec = md->recursive;
1508 md->recursive = &new_recursive;
1509
1510 /* Find where to continue from afterwards */
1511
1512 ecode += 1 + LINK_SIZE;
1513 new_recursive.after_call = ecode;
1514
1515 /* Now save the offset data. */
1516
1517 new_recursive.saved_max = md->offset_end;
1518 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1519 new_recursive.offset_save = stacksave;
1520 else
1521 {
1522 new_recursive.offset_save =
1523 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1524 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1525 }
1526
1527 memcpy(new_recursive.offset_save, md->offset_vector,
1528 new_recursive.saved_max * sizeof(int));
1529 new_recursive.save_offset_top = offset_top;
1530
1531 /* OK, now we can do the recursion. For each top-level alternative we
1532 restore the offset and recursion data. */
1533
1534 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1535 cbegroup = (*callpat >= OP_SBRA);
1536 do
1537 {
1538 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1539 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1540 md, eptrb, RM6);
1541 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1542 {
1543 DPRINTF(("Recursion matched\n"));
1544 md->recursive = new_recursive.prevrec;
1545 if (new_recursive.offset_save != stacksave)
1546 (pcre_free)(new_recursive.offset_save);
1547 MRRETURN(MATCH_MATCH);
1548 }
1549 else if (rrc != MATCH_NOMATCH &&
1550 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1551 {
1552 DPRINTF(("Recursion gave error %d\n", rrc));
1553 if (new_recursive.offset_save != stacksave)
1554 (pcre_free)(new_recursive.offset_save);
1555 RRETURN(rrc);
1556 }
1557
1558 md->recursive = &new_recursive;
1559 memcpy(md->offset_vector, new_recursive.offset_save,
1560 new_recursive.saved_max * sizeof(int));
1561 callpat += GET(callpat, 1);
1562 }
1563 while (*callpat == OP_ALT);
1564
1565 DPRINTF(("Recursion didn't match\n"));
1566 md->recursive = new_recursive.prevrec;
1567 if (new_recursive.offset_save != stacksave)
1568 (pcre_free)(new_recursive.offset_save);
1569 MRRETURN(MATCH_NOMATCH);
1570 }
1571 /* Control never reaches here */
1572
1573 /* "Once" brackets are like assertion brackets except that after a match,
1574 the point in the subject string is not moved back. Thus there can never be
1575 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1576 Check the alternative branches in turn - the matching won't pass the KET
1577 for this kind of subpattern. If any one branch matches, we carry on as at
1578 the end of a normal bracket, leaving the subject pointer, but resetting
1579 the start-of-match value in case it was changed by \K. */
1580
1581 case OP_ONCE:
1582 prev = ecode;
1583 saved_eptr = eptr;
1584
1585 do
1586 {
1587 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1588 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1589 {
1590 mstart = md->start_match_ptr;
1591 break;
1592 }
1593 if (rrc != MATCH_NOMATCH &&
1594 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1595 RRETURN(rrc);
1596 ecode += GET(ecode,1);
1597 }
1598 while (*ecode == OP_ALT);
1599
1600 /* If hit the end of the group (which could be repeated), fail */
1601
1602 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1603
1604 /* Continue as from after the assertion, updating the offsets high water
1605 mark, since extracts may have been taken. */
1606
1607 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1608
1609 offset_top = md->end_offset_top;
1610 eptr = md->end_match_ptr;
1611
1612 /* For a non-repeating ket, just continue at this level. This also
1613 happens for a repeating ket if no characters were matched in the group.
1614 This is the forcible breaking of infinite loops as implemented in Perl
1615 5.005. If there is an options reset, it will get obeyed in the normal
1616 course of events. */
1617
1618 if (*ecode == OP_KET || eptr == saved_eptr)
1619 {
1620 ecode += 1+LINK_SIZE;
1621 break;
1622 }
1623
1624 /* The repeating kets try the rest of the pattern or restart from the
1625 preceding bracket, in the appropriate order. The second "call" of match()
1626 uses tail recursion, to avoid using another stack frame. */
1627
1628 if (*ecode == OP_KETRMIN)
1629 {
1630 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1632 ecode = prev;
1633 goto TAIL_RECURSE;
1634 }
1635 else /* OP_KETRMAX */
1636 {
1637 md->match_function_type = MATCH_CBEGROUP;
1638 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1640 ecode += 1 + LINK_SIZE;
1641 goto TAIL_RECURSE;
1642 }
1643 /* Control never gets here */
1644
1645 /* An alternation is the end of a branch; scan along to find the end of the
1646 bracketed group and go to there. */
1647
1648 case OP_ALT:
1649 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1650 break;
1651
1652 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1653 indicating that it may occur zero times. It may repeat infinitely, or not
1654 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1655 with fixed upper repeat limits are compiled as a number of copies, with the
1656 optional ones preceded by BRAZERO or BRAMINZERO. */
1657
1658 case OP_BRAZERO:
1659 next = ecode + 1;
1660 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1662 do next += GET(next, 1); while (*next == OP_ALT);
1663 ecode = next + 1 + LINK_SIZE;
1664 break;
1665
1666 case OP_BRAMINZERO:
1667 next = ecode + 1;
1668 do next += GET(next, 1); while (*next == OP_ALT);
1669 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1670 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1671 ecode++;
1672 break;
1673
1674 case OP_SKIPZERO:
1675 next = ecode+1;
1676 do next += GET(next,1); while (*next == OP_ALT);
1677 ecode = next + 1 + LINK_SIZE;
1678 break;
1679
1680 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1681 here; just jump to the group, with allow_zero set TRUE. */
1682
1683 case OP_BRAPOSZERO:
1684 op = *(++ecode);
1685 allow_zero = TRUE;
1686 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1687 goto POSSESSIVE_NON_CAPTURE;
1688
1689 /* End of a group, repeated or non-repeating. */
1690
1691 case OP_KET:
1692 case OP_KETRMIN:
1693 case OP_KETRMAX:
1694 case OP_KETRPOS:
1695 prev = ecode - GET(ecode, 1);
1696
1697 /* If this was a group that remembered the subject start, in order to break
1698 infinite repeats of empty string matches, retrieve the subject start from
1699 the chain. Otherwise, set it NULL. */
1700
1701 if (*prev >= OP_SBRA)
1702 {
1703 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1704 eptrb = eptrb->epb_prev; /* Backup to previous group */
1705 }
1706 else saved_eptr = NULL;
1707
1708 /* If we are at the end of an assertion group or an atomic group, stop
1709 matching and return MATCH_MATCH, but record the current high water mark for
1710 use by positive assertions. We also need to record the match start in case
1711 it was changed by \K. */
1712
1713 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1714 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1715 *prev == OP_ONCE)
1716 {
1717 md->end_match_ptr = eptr; /* For ONCE */
1718 md->end_offset_top = offset_top;
1719 md->start_match_ptr = mstart;
1720 MRRETURN(MATCH_MATCH);
1721 }
1722
1723 /* For capturing groups we have to check the group number back at the start
1724 and if necessary complete handling an extraction by setting the offsets and
1725 bumping the high water mark. Note that whole-pattern recursion is coded as
1726 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1727 when the OP_END is reached. Other recursion is handled here. */
1728
1729 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1730 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1731 {
1732 number = GET2(prev, 1+LINK_SIZE);
1733 offset = number << 1;
1734
1735 #ifdef PCRE_DEBUG
1736 printf("end bracket %d", number);
1737 printf("\n");
1738 #endif
1739
1740 md->capture_last = number;
1741 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1742 {
1743 md->offset_vector[offset] =
1744 md->offset_vector[md->offset_end - number];
1745 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1746 if (offset_top <= offset) offset_top = offset + 2;
1747 }
1748
1749 /* Handle a recursively called group. Restore the offsets
1750 appropriately and continue from after the call. */
1751
1752 if (md->recursive != NULL && md->recursive->group_num == number)
1753 {
1754 recursion_info *rec = md->recursive;
1755 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1756 md->recursive = rec->prevrec;
1757 memcpy(md->offset_vector, rec->offset_save,
1758 rec->saved_max * sizeof(int));
1759 offset_top = rec->save_offset_top;
1760 ecode = rec->after_call;
1761 break;
1762 }
1763 }
1764
1765 /* For a non-repeating ket, just continue at this level. This also
1766 happens for a repeating ket if no characters were matched in the group.
1767 This is the forcible breaking of infinite loops as implemented in Perl
1768 5.005. If there is an options reset, it will get obeyed in the normal
1769 course of events. */
1770
1771 if (*ecode == OP_KET || eptr == saved_eptr)
1772 {
1773 ecode += 1 + LINK_SIZE;
1774 break;
1775 }
1776
1777 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1778 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1779 at a time from the outer level, thus saving stack. */
1780
1781 if (*ecode == OP_KETRPOS)
1782 {
1783 md->end_match_ptr = eptr;
1784 md->end_offset_top = offset_top;
1785 RRETURN(MATCH_KETRPOS);
1786 }
1787
1788 /* The normal repeating kets try the rest of the pattern or restart from
1789 the preceding bracket, in the appropriate order. In the second case, we can
1790 use tail recursion to avoid using another stack frame, unless we have an
1791 unlimited repeat of a group that can match an empty string. */
1792
1793 if (*ecode == OP_KETRMIN)
1794 {
1795 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1797 if (*prev >= OP_SBRA) /* Could match an empty string */
1798 {
1799 md->match_function_type = MATCH_CBEGROUP;
1800 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1801 RRETURN(rrc);
1802 }
1803 ecode = prev;
1804 goto TAIL_RECURSE;
1805 }
1806 else /* OP_KETRMAX */
1807 {
1808 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1809 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811 ecode += 1 + LINK_SIZE;
1812 goto TAIL_RECURSE;
1813 }
1814 /* Control never gets here */
1815
1816 /* Not multiline mode: start of subject assertion, unless notbol. */
1817
1818 case OP_CIRC:
1819 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1820
1821 /* Start of subject assertion */
1822
1823 case OP_SOD:
1824 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1825 ecode++;
1826 break;
1827
1828 /* Multiline mode: start of subject unless notbol, or after any newline. */
1829
1830 case OP_CIRCM:
1831 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1832 if (eptr != md->start_subject &&
1833 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1834 MRRETURN(MATCH_NOMATCH);
1835 ecode++;
1836 break;
1837
1838 /* Start of match assertion */
1839
1840 case OP_SOM:
1841 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1842 ecode++;
1843 break;
1844
1845 /* Reset the start of match point */
1846
1847 case OP_SET_SOM:
1848 mstart = eptr;
1849 ecode++;
1850 break;
1851
1852 /* Multiline mode: assert before any newline, or before end of subject
1853 unless noteol is set. */
1854
1855 case OP_DOLLM:
1856 if (eptr < md->end_subject)
1857 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1858 else
1859 {
1860 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1861 SCHECK_PARTIAL();
1862 }
1863 ecode++;
1864 break;
1865
1866 /* Not multiline mode: assert before a terminating newline or before end of
1867 subject unless noteol is set. */
1868
1869 case OP_DOLL:
1870 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1871 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1872
1873 /* ... else fall through for endonly */
1874
1875 /* End of subject assertion (\z) */
1876
1877 case OP_EOD:
1878 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1879 SCHECK_PARTIAL();
1880 ecode++;
1881 break;
1882
1883 /* End of subject or ending \n assertion (\Z) */
1884
1885 case OP_EODN:
1886 ASSERT_NL_OR_EOS:
1887 if (eptr < md->end_subject &&
1888 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1889 MRRETURN(MATCH_NOMATCH);
1890
1891 /* Either at end of string or \n before end. */
1892
1893 SCHECK_PARTIAL();
1894 ecode++;
1895 break;
1896
1897 /* Word boundary assertions */
1898
1899 case OP_NOT_WORD_BOUNDARY:
1900 case OP_WORD_BOUNDARY:
1901 {
1902
1903 /* Find out if the previous and current characters are "word" characters.
1904 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1905 be "non-word" characters. Remember the earliest consulted character for
1906 partial matching. */
1907
1908 #ifdef SUPPORT_UTF8
1909 if (utf8)
1910 {
1911 /* Get status of previous character */
1912
1913 if (eptr == md->start_subject) prev_is_word = FALSE; else
1914 {
1915 USPTR lastptr = eptr - 1;
1916 while((*lastptr & 0xc0) == 0x80) lastptr--;
1917 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1918 GETCHAR(c, lastptr);
1919 #ifdef SUPPORT_UCP
1920 if (md->use_ucp)
1921 {
1922 if (c == '_') prev_is_word = TRUE; else
1923 {
1924 int cat = UCD_CATEGORY(c);
1925 prev_is_word = (cat == ucp_L || cat == ucp_N);
1926 }
1927 }
1928 else
1929 #endif
1930 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1931 }
1932
1933 /* Get status of next character */
1934
1935 if (eptr >= md->end_subject)
1936 {
1937 SCHECK_PARTIAL();
1938 cur_is_word = FALSE;
1939 }
1940 else
1941 {
1942 GETCHAR(c, eptr);
1943 #ifdef SUPPORT_UCP
1944 if (md->use_ucp)
1945 {
1946 if (c == '_') cur_is_word = TRUE; else
1947 {
1948 int cat = UCD_CATEGORY(c);
1949 cur_is_word = (cat == ucp_L || cat == ucp_N);
1950 }
1951 }
1952 else
1953 #endif
1954 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1955 }
1956 }
1957 else
1958 #endif
1959
1960 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1961 consistency with the behaviour of \w we do use it in this case. */
1962
1963 {
1964 /* Get status of previous character */
1965
1966 if (eptr == md->start_subject) prev_is_word = FALSE; else
1967 {
1968 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1969 #ifdef SUPPORT_UCP
1970 if (md->use_ucp)
1971 {
1972 c = eptr[-1];
1973 if (c == '_') prev_is_word = TRUE; else
1974 {
1975 int cat = UCD_CATEGORY(c);
1976 prev_is_word = (cat == ucp_L || cat == ucp_N);
1977 }
1978 }
1979 else
1980 #endif
1981 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1982 }
1983
1984 /* Get status of next character */
1985
1986 if (eptr >= md->end_subject)
1987 {
1988 SCHECK_PARTIAL();
1989 cur_is_word = FALSE;
1990 }
1991 else
1992 #ifdef SUPPORT_UCP
1993 if (md->use_ucp)
1994 {
1995 c = *eptr;
1996 if (c == '_') cur_is_word = TRUE; else
1997 {
1998 int cat = UCD_CATEGORY(c);
1999 cur_is_word = (cat == ucp_L || cat == ucp_N);
2000 }
2001 }
2002 else
2003 #endif
2004 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2005 }
2006
2007 /* Now see if the situation is what we want */
2008
2009 if ((*ecode++ == OP_WORD_BOUNDARY)?
2010 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2011 MRRETURN(MATCH_NOMATCH);
2012 }
2013 break;
2014
2015 /* Match a single character type; inline for speed */
2016
2017 case OP_ANY:
2018 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2019 /* Fall through */
2020
2021 case OP_ALLANY:
2022 if (eptr++ >= md->end_subject)
2023 {
2024 SCHECK_PARTIAL();
2025 MRRETURN(MATCH_NOMATCH);
2026 }
2027 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2028 ecode++;
2029 break;
2030
2031 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2032 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2033
2034 case OP_ANYBYTE:
2035 if (eptr++ >= md->end_subject)
2036 {
2037 SCHECK_PARTIAL();
2038 MRRETURN(MATCH_NOMATCH);
2039 }
2040 ecode++;
2041 break;
2042
2043 case OP_NOT_DIGIT:
2044 if (eptr >= md->end_subject)
2045 {
2046 SCHECK_PARTIAL();
2047 MRRETURN(MATCH_NOMATCH);
2048 }
2049 GETCHARINCTEST(c, eptr);
2050 if (
2051 #ifdef SUPPORT_UTF8
2052 c < 256 &&
2053 #endif
2054 (md->ctypes[c] & ctype_digit) != 0
2055 )
2056 MRRETURN(MATCH_NOMATCH);
2057 ecode++;
2058 break;
2059
2060 case OP_DIGIT:
2061 if (eptr >= md->end_subject)
2062 {
2063 SCHECK_PARTIAL();
2064 MRRETURN(MATCH_NOMATCH);
2065 }
2066 GETCHARINCTEST(c, eptr);
2067 if (
2068 #ifdef SUPPORT_UTF8
2069 c >= 256 ||
2070 #endif
2071 (md->ctypes[c] & ctype_digit) == 0
2072 )
2073 MRRETURN(MATCH_NOMATCH);
2074 ecode++;
2075 break;
2076
2077 case OP_NOT_WHITESPACE:
2078 if (eptr >= md->end_subject)
2079 {
2080 SCHECK_PARTIAL();
2081 MRRETURN(MATCH_NOMATCH);
2082 }
2083 GETCHARINCTEST(c, eptr);
2084 if (
2085 #ifdef SUPPORT_UTF8
2086 c < 256 &&
2087 #endif
2088 (md->ctypes[c] & ctype_space) != 0
2089 )
2090 MRRETURN(MATCH_NOMATCH);
2091 ecode++;
2092 break;
2093
2094 case OP_WHITESPACE:
2095 if (eptr >= md->end_subject)
2096 {
2097 SCHECK_PARTIAL();
2098 MRRETURN(MATCH_NOMATCH);
2099 }
2100 GETCHARINCTEST(c, eptr);
2101 if (
2102 #ifdef SUPPORT_UTF8
2103 c >= 256 ||
2104 #endif
2105 (md->ctypes[c] & ctype_space) == 0
2106 )
2107 MRRETURN(MATCH_NOMATCH);
2108 ecode++;
2109 break;
2110
2111 case OP_NOT_WORDCHAR:
2112 if (eptr >= md->end_subject)
2113 {
2114 SCHECK_PARTIAL();
2115 MRRETURN(MATCH_NOMATCH);
2116 }
2117 GETCHARINCTEST(c, eptr);
2118 if (
2119 #ifdef SUPPORT_UTF8
2120 c < 256 &&
2121 #endif
2122 (md->ctypes[c] & ctype_word) != 0
2123 )
2124 MRRETURN(MATCH_NOMATCH);
2125 ecode++;
2126 break;
2127
2128 case OP_WORDCHAR:
2129 if (eptr >= md->end_subject)
2130 {
2131 SCHECK_PARTIAL();
2132 MRRETURN(MATCH_NOMATCH);
2133 }
2134 GETCHARINCTEST(c, eptr);
2135 if (
2136 #ifdef SUPPORT_UTF8
2137 c >= 256 ||
2138 #endif
2139 (md->ctypes[c] & ctype_word) == 0
2140 )
2141 MRRETURN(MATCH_NOMATCH);
2142 ecode++;
2143 break;
2144
2145 case OP_ANYNL:
2146 if (eptr >= md->end_subject)
2147 {
2148 SCHECK_PARTIAL();
2149 MRRETURN(MATCH_NOMATCH);
2150 }
2151 GETCHARINCTEST(c, eptr);
2152 switch(c)
2153 {
2154 default: MRRETURN(MATCH_NOMATCH);
2155
2156 case 0x000d:
2157 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2158 break;
2159
2160 case 0x000a:
2161 break;
2162
2163 case 0x000b:
2164 case 0x000c:
2165 case 0x0085:
2166 case 0x2028:
2167 case 0x2029:
2168 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2169 break;
2170 }
2171 ecode++;
2172 break;
2173
2174 case OP_NOT_HSPACE:
2175 if (eptr >= md->end_subject)
2176 {
2177 SCHECK_PARTIAL();
2178 MRRETURN(MATCH_NOMATCH);
2179 }
2180 GETCHARINCTEST(c, eptr);
2181 switch(c)
2182 {
2183 default: break;
2184 case 0x09: /* HT */
2185 case 0x20: /* SPACE */
2186 case 0xa0: /* NBSP */
2187 case 0x1680: /* OGHAM SPACE MARK */
2188 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2189 case 0x2000: /* EN QUAD */
2190 case 0x2001: /* EM QUAD */
2191 case 0x2002: /* EN SPACE */
2192 case 0x2003: /* EM SPACE */
2193 case 0x2004: /* THREE-PER-EM SPACE */
2194 case 0x2005: /* FOUR-PER-EM SPACE */
2195 case 0x2006: /* SIX-PER-EM SPACE */
2196 case 0x2007: /* FIGURE SPACE */
2197 case 0x2008: /* PUNCTUATION SPACE */
2198 case 0x2009: /* THIN SPACE */
2199 case 0x200A: /* HAIR SPACE */
2200 case 0x202f: /* NARROW NO-BREAK SPACE */
2201 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2202 case 0x3000: /* IDEOGRAPHIC SPACE */
2203 MRRETURN(MATCH_NOMATCH);
2204 }
2205 ecode++;
2206 break;
2207
2208 case OP_HSPACE:
2209 if (eptr >= md->end_subject)
2210 {
2211 SCHECK_PARTIAL();
2212 MRRETURN(MATCH_NOMATCH);
2213 }
2214 GETCHARINCTEST(c, eptr);
2215 switch(c)
2216 {
2217 default: MRRETURN(MATCH_NOMATCH);
2218 case 0x09: /* HT */
2219 case 0x20: /* SPACE */
2220 case 0xa0: /* NBSP */
2221 case 0x1680: /* OGHAM SPACE MARK */
2222 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2223 case 0x2000: /* EN QUAD */
2224 case 0x2001: /* EM QUAD */
2225 case 0x2002: /* EN SPACE */
2226 case 0x2003: /* EM SPACE */
2227 case 0x2004: /* THREE-PER-EM SPACE */
2228 case 0x2005: /* FOUR-PER-EM SPACE */
2229 case 0x2006: /* SIX-PER-EM SPACE */
2230 case 0x2007: /* FIGURE SPACE */
2231 case 0x2008: /* PUNCTUATION SPACE */
2232 case 0x2009: /* THIN SPACE */
2233 case 0x200A: /* HAIR SPACE */
2234 case 0x202f: /* NARROW NO-BREAK SPACE */
2235 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2236 case 0x3000: /* IDEOGRAPHIC SPACE */
2237 break;
2238 }
2239 ecode++;
2240 break;
2241
2242 case OP_NOT_VSPACE:
2243 if (eptr >= md->end_subject)
2244 {
2245 SCHECK_PARTIAL();
2246 MRRETURN(MATCH_NOMATCH);
2247 }
2248 GETCHARINCTEST(c, eptr);
2249 switch(c)
2250 {
2251 default: break;
2252 case 0x0a: /* LF */
2253 case 0x0b: /* VT */
2254 case 0x0c: /* FF */
2255 case 0x0d: /* CR */
2256 case 0x85: /* NEL */
2257 case 0x2028: /* LINE SEPARATOR */
2258 case 0x2029: /* PARAGRAPH SEPARATOR */
2259 MRRETURN(MATCH_NOMATCH);
2260 }
2261 ecode++;
2262 break;
2263
2264 case OP_VSPACE:
2265 if (eptr >= md->end_subject)
2266 {
2267 SCHECK_PARTIAL();
2268 MRRETURN(MATCH_NOMATCH);
2269 }
2270 GETCHARINCTEST(c, eptr);
2271 switch(c)
2272 {
2273 default: MRRETURN(MATCH_NOMATCH);
2274 case 0x0a: /* LF */
2275 case 0x0b: /* VT */
2276 case 0x0c: /* FF */
2277 case 0x0d: /* CR */
2278 case 0x85: /* NEL */
2279 case 0x2028: /* LINE SEPARATOR */
2280 case 0x2029: /* PARAGRAPH SEPARATOR */
2281 break;
2282 }
2283 ecode++;
2284 break;
2285
2286 #ifdef SUPPORT_UCP
2287 /* Check the next character by Unicode property. We will get here only
2288 if the support is in the binary; otherwise a compile-time error occurs. */
2289
2290 case OP_PROP:
2291 case OP_NOTPROP:
2292 if (eptr >= md->end_subject)
2293 {
2294 SCHECK_PARTIAL();
2295 MRRETURN(MATCH_NOMATCH);
2296 }
2297 GETCHARINCTEST(c, eptr);
2298 {
2299 const ucd_record *prop = GET_UCD(c);
2300
2301 switch(ecode[1])
2302 {
2303 case PT_ANY:
2304 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2305 break;
2306
2307 case PT_LAMP:
2308 if ((prop->chartype == ucp_Lu ||
2309 prop->chartype == ucp_Ll ||
2310 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2311 MRRETURN(MATCH_NOMATCH);
2312 break;
2313
2314 case PT_GC:
2315 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2316 MRRETURN(MATCH_NOMATCH);
2317 break;
2318
2319 case PT_PC:
2320 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2321 MRRETURN(MATCH_NOMATCH);
2322 break;
2323
2324 case PT_SC:
2325 if ((ecode[2] != prop->script) == (op == OP_PROP))
2326 MRRETURN(MATCH_NOMATCH);
2327 break;
2328
2329 /* These are specials */
2330
2331 case PT_ALNUM:
2332 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2333 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2334 MRRETURN(MATCH_NOMATCH);
2335 break;
2336
2337 case PT_SPACE: /* Perl space */
2338 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2339 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2340 == (op == OP_NOTPROP))
2341 MRRETURN(MATCH_NOMATCH);
2342 break;
2343
2344 case PT_PXSPACE: /* POSIX space */
2345 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2346 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2347 c == CHAR_FF || c == CHAR_CR)
2348 == (op == OP_NOTPROP))
2349 MRRETURN(MATCH_NOMATCH);
2350 break;
2351
2352 case PT_WORD:
2353 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2354 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2355 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2356 MRRETURN(MATCH_NOMATCH);
2357 break;
2358
2359 /* This should never occur */
2360
2361 default:
2362 RRETURN(PCRE_ERROR_INTERNAL);
2363 }
2364
2365 ecode += 3;
2366 }
2367 break;
2368
2369 /* Match an extended Unicode sequence. We will get here only if the support
2370 is in the binary; otherwise a compile-time error occurs. */
2371
2372 case OP_EXTUNI:
2373 if (eptr >= md->end_subject)
2374 {
2375 SCHECK_PARTIAL();
2376 MRRETURN(MATCH_NOMATCH);
2377 }
2378 GETCHARINCTEST(c, eptr);
2379 {
2380 int category = UCD_CATEGORY(c);
2381 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2382 while (eptr < md->end_subject)
2383 {
2384 int len = 1;
2385 if (!utf8) c = *eptr; else
2386 {
2387 GETCHARLEN(c, eptr, len);
2388 }
2389 category = UCD_CATEGORY(c);
2390 if (category != ucp_M) break;
2391 eptr += len;
2392 }
2393 }
2394 ecode++;
2395 break;
2396 #endif
2397
2398
2399 /* Match a back reference, possibly repeatedly. Look past the end of the
2400 item to see if there is repeat information following. The code is similar
2401 to that for character classes, but repeated for efficiency. Then obey
2402 similar code to character type repeats - written out again for speed.
2403 However, if the referenced string is the empty string, always treat
2404 it as matched, any number of times (otherwise there could be infinite
2405 loops). */
2406
2407 case OP_REF:
2408 case OP_REFI:
2409 caseless = op == OP_REFI;
2410 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2411 ecode += 3;
2412
2413 /* If the reference is unset, there are two possibilities:
2414
2415 (a) In the default, Perl-compatible state, set the length negative;
2416 this ensures that every attempt at a match fails. We can't just fail
2417 here, because of the possibility of quantifiers with zero minima.
2418
2419 (b) If the JavaScript compatibility flag is set, set the length to zero
2420 so that the back reference matches an empty string.
2421
2422 Otherwise, set the length to the length of what was matched by the
2423 referenced subpattern. */
2424
2425 if (offset >= offset_top || md->offset_vector[offset] < 0)
2426 length = (md->jscript_compat)? 0 : -1;
2427 else
2428 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2429
2430 /* Set up for repetition, or handle the non-repeated case */
2431
2432 switch (*ecode)
2433 {
2434 case OP_CRSTAR:
2435 case OP_CRMINSTAR:
2436 case OP_CRPLUS:
2437 case OP_CRMINPLUS:
2438 case OP_CRQUERY:
2439 case OP_CRMINQUERY:
2440 c = *ecode++ - OP_CRSTAR;
2441 minimize = (c & 1) != 0;
2442 min = rep_min[c]; /* Pick up values from tables; */
2443 max = rep_max[c]; /* zero for max => infinity */
2444 if (max == 0) max = INT_MAX;
2445 break;
2446
2447 case OP_CRRANGE:
2448 case OP_CRMINRANGE:
2449 minimize = (*ecode == OP_CRMINRANGE);
2450 min = GET2(ecode, 1);
2451 max = GET2(ecode, 3);
2452 if (max == 0) max = INT_MAX;
2453 ecode += 5;
2454 break;
2455
2456 default: /* No repeat follows */
2457 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2458 {
2459 CHECK_PARTIAL();
2460 MRRETURN(MATCH_NOMATCH);
2461 }
2462 eptr += length;
2463 continue; /* With the main loop */
2464 }
2465
2466 /* Handle repeated back references. If the length of the reference is
2467 zero, just continue with the main loop. */
2468
2469 if (length == 0) continue;
2470
2471 /* First, ensure the minimum number of matches are present. We get back
2472 the length of the reference string explicitly rather than passing the
2473 address of eptr, so that eptr can be a register variable. */
2474
2475 for (i = 1; i <= min; i++)
2476 {
2477 int slength;
2478 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2479 {
2480 CHECK_PARTIAL();
2481 MRRETURN(MATCH_NOMATCH);
2482 }
2483 eptr += slength;
2484 }
2485
2486 /* If min = max, continue at the same level without recursion.
2487 They are not both allowed to be zero. */
2488
2489 if (min == max) continue;
2490
2491 /* If minimizing, keep trying and advancing the pointer */
2492
2493 if (minimize)
2494 {
2495 for (fi = min;; fi++)
2496 {
2497 int slength;
2498 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2499 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2500 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2501 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2502 {
2503 CHECK_PARTIAL();
2504 MRRETURN(MATCH_NOMATCH);
2505 }
2506 eptr += slength;
2507 }
2508 /* Control never gets here */
2509 }
2510
2511 /* If maximizing, find the longest string and work backwards */
2512
2513 else
2514 {
2515 pp = eptr;
2516 for (i = min; i < max; i++)
2517 {
2518 int slength;
2519 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2520 {
2521 CHECK_PARTIAL();
2522 break;
2523 }
2524 eptr += slength;
2525 }
2526 while (eptr >= pp)
2527 {
2528 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2529 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2530 eptr -= length;
2531 }
2532 MRRETURN(MATCH_NOMATCH);
2533 }
2534 /* Control never gets here */
2535
2536 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2537 used when all the characters in the class have values in the range 0-255,
2538 and either the matching is caseful, or the characters are in the range
2539 0-127 when UTF-8 processing is enabled. The only difference between
2540 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2541 encountered.
2542
2543 First, look past the end of the item to see if there is repeat information
2544 following. Then obey similar code to character type repeats - written out
2545 again for speed. */
2546
2547 case OP_NCLASS:
2548 case OP_CLASS:
2549 {
2550 data = ecode + 1; /* Save for matching */
2551 ecode += 33; /* Advance past the item */
2552
2553 switch (*ecode)
2554 {
2555 case OP_CRSTAR:
2556 case OP_CRMINSTAR:
2557 case OP_CRPLUS:
2558 case OP_CRMINPLUS:
2559 case OP_CRQUERY:
2560 case OP_CRMINQUERY:
2561 c = *ecode++ - OP_CRSTAR;
2562 minimize = (c & 1) != 0;
2563 min = rep_min[c]; /* Pick up values from tables; */
2564 max = rep_max[c]; /* zero for max => infinity */
2565 if (max == 0) max = INT_MAX;
2566 break;
2567
2568 case OP_CRRANGE:
2569 case OP_CRMINRANGE:
2570 minimize = (*ecode == OP_CRMINRANGE);
2571 min = GET2(ecode, 1);
2572 max = GET2(ecode, 3);
2573 if (max == 0) max = INT_MAX;
2574 ecode += 5;
2575 break;
2576
2577 default: /* No repeat follows */
2578 min = max = 1;
2579 break;
2580 }
2581
2582 /* First, ensure the minimum number of matches are present. */
2583
2584 #ifdef SUPPORT_UTF8
2585 /* UTF-8 mode */
2586 if (utf8)
2587 {
2588 for (i = 1; i <= min; i++)
2589 {
2590 if (eptr >= md->end_subject)
2591 {
2592 SCHECK_PARTIAL();
2593 MRRETURN(MATCH_NOMATCH);
2594 }
2595 GETCHARINC(c, eptr);
2596 if (c > 255)
2597 {
2598 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2599 }
2600 else
2601 {
2602 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2603 }
2604 }
2605 }
2606 else
2607 #endif
2608 /* Not UTF-8 mode */
2609 {
2610 for (i = 1; i <= min; i++)
2611 {
2612 if (eptr >= md->end_subject)
2613 {
2614 SCHECK_PARTIAL();
2615 MRRETURN(MATCH_NOMATCH);
2616 }
2617 c = *eptr++;
2618 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2619 }
2620 }
2621
2622 /* If max == min we can continue with the main loop without the
2623 need to recurse. */
2624
2625 if (min == max) continue;
2626
2627 /* If minimizing, keep testing the rest of the expression and advancing
2628 the pointer while it matches the class. */
2629
2630 if (minimize)
2631 {
2632 #ifdef SUPPORT_UTF8
2633 /* UTF-8 mode */
2634 if (utf8)
2635 {
2636 for (fi = min;; fi++)
2637 {
2638 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2640 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2641 if (eptr >= md->end_subject)
2642 {
2643 SCHECK_PARTIAL();
2644 MRRETURN(MATCH_NOMATCH);
2645 }
2646 GETCHARINC(c, eptr);
2647 if (c > 255)
2648 {
2649 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2650 }
2651 else
2652 {
2653 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2654 }
2655 }
2656 }
2657 else
2658 #endif
2659 /* Not UTF-8 mode */
2660 {
2661 for (fi = min;; fi++)
2662 {
2663 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2664 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2665 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2666 if (eptr >= md->end_subject)
2667 {
2668 SCHECK_PARTIAL();
2669 MRRETURN(MATCH_NOMATCH);
2670 }
2671 c = *eptr++;
2672 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2673 }
2674 }
2675 /* Control never gets here */
2676 }
2677
2678 /* If maximizing, find the longest possible run, then work backwards. */
2679
2680 else
2681 {
2682 pp = eptr;
2683
2684 #ifdef SUPPORT_UTF8
2685 /* UTF-8 mode */
2686 if (utf8)
2687 {
2688 for (i = min; i < max; i++)
2689 {
2690 int len = 1;
2691 if (eptr >= md->end_subject)
2692 {
2693 SCHECK_PARTIAL();
2694 break;
2695 }
2696 GETCHARLEN(c, eptr, len);
2697 if (c > 255)
2698 {
2699 if (op == OP_CLASS) break;
2700 }
2701 else
2702 {
2703 if ((data[c/8] & (1 << (c&7))) == 0) break;
2704 }
2705 eptr += len;
2706 }
2707 for (;;)
2708 {
2709 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2711 if (eptr-- == pp) break; /* Stop if tried at original pos */
2712 BACKCHAR(eptr);
2713 }
2714 }
2715 else
2716 #endif
2717 /* Not UTF-8 mode */
2718 {
2719 for (i = min; i < max; i++)
2720 {
2721 if (eptr >= md->end_subject)
2722 {
2723 SCHECK_PARTIAL();
2724 break;
2725 }
2726 c = *eptr;
2727 if ((data[c/8] & (1 << (c&7))) == 0) break;
2728 eptr++;
2729 }
2730 while (eptr >= pp)
2731 {
2732 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2733 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2734 eptr--;
2735 }
2736 }
2737
2738 MRRETURN(MATCH_NOMATCH);
2739 }
2740 }
2741 /* Control never gets here */
2742
2743
2744 /* Match an extended character class. This opcode is encountered only
2745 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2746 mode, because Unicode properties are supported in non-UTF-8 mode. */
2747
2748 #ifdef SUPPORT_UTF8
2749 case OP_XCLASS:
2750 {
2751 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2752 ecode += GET(ecode, 1); /* Advance past the item */
2753
2754 switch (*ecode)
2755 {
2756 case OP_CRSTAR:
2757 case OP_CRMINSTAR:
2758 case OP_CRPLUS:
2759 case OP_CRMINPLUS:
2760 case OP_CRQUERY:
2761 case OP_CRMINQUERY:
2762 c = *ecode++ - OP_CRSTAR;
2763 minimize = (c & 1) != 0;
2764 min = rep_min[c]; /* Pick up values from tables; */
2765 max = rep_max[c]; /* zero for max => infinity */
2766 if (max == 0) max = INT_MAX;
2767 break;
2768
2769 case OP_CRRANGE:
2770 case OP_CRMINRANGE:
2771 minimize = (*ecode == OP_CRMINRANGE);
2772 min = GET2(ecode, 1);
2773 max = GET2(ecode, 3);
2774 if (max == 0) max = INT_MAX;
2775 ecode += 5;
2776 break;
2777
2778 default: /* No repeat follows */
2779 min = max = 1;
2780 break;
2781 }
2782
2783 /* First, ensure the minimum number of matches are present. */
2784
2785 for (i = 1; i <= min; i++)
2786 {
2787 if (eptr >= md->end_subject)
2788 {
2789 SCHECK_PARTIAL();
2790 MRRETURN(MATCH_NOMATCH);
2791 }
2792 GETCHARINCTEST(c, eptr);
2793 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2794 }
2795
2796 /* If max == min we can continue with the main loop without the
2797 need to recurse. */
2798
2799 if (min == max) continue;
2800
2801 /* If minimizing, keep testing the rest of the expression and advancing
2802 the pointer while it matches the class. */
2803
2804 if (minimize)
2805 {
2806 for (fi = min;; fi++)
2807 {
2808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2810 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2811 if (eptr >= md->end_subject)
2812 {
2813 SCHECK_PARTIAL();
2814 MRRETURN(MATCH_NOMATCH);
2815 }
2816 GETCHARINCTEST(c, eptr);
2817 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2818 }
2819 /* Control never gets here */
2820 }
2821
2822 /* If maximizing, find the longest possible run, then work backwards. */
2823
2824 else
2825 {
2826 pp = eptr;
2827 for (i = min; i < max; i++)
2828 {
2829 int len = 1;
2830 if (eptr >= md->end_subject)
2831 {
2832 SCHECK_PARTIAL();
2833 break;
2834 }
2835 GETCHARLENTEST(c, eptr, len);
2836 if (!_pcre_xclass(c, data)) break;
2837 eptr += len;
2838 }
2839 for(;;)
2840 {
2841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2843 if (eptr-- == pp) break; /* Stop if tried at original pos */
2844 if (utf8) BACKCHAR(eptr);
2845 }
2846 MRRETURN(MATCH_NOMATCH);
2847 }
2848
2849 /* Control never gets here */
2850 }
2851 #endif /* End of XCLASS */
2852
2853 /* Match a single character, casefully */
2854
2855 case OP_CHAR:
2856 #ifdef SUPPORT_UTF8
2857 if (utf8)
2858 {
2859 length = 1;
2860 ecode++;
2861 GETCHARLEN(fc, ecode, length);
2862 if (length > md->end_subject - eptr)
2863 {
2864 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2865 MRRETURN(MATCH_NOMATCH);
2866 }
2867 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2868 }
2869 else
2870 #endif
2871
2872 /* Non-UTF-8 mode */
2873 {
2874 if (md->end_subject - eptr < 1)
2875 {
2876 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2877 MRRETURN(MATCH_NOMATCH);
2878 }
2879 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2880 ecode += 2;
2881 }
2882 break;
2883
2884 /* Match a single character, caselessly */
2885
2886 case OP_CHARI:
2887 #ifdef SUPPORT_UTF8
2888 if (utf8)
2889 {
2890 length = 1;
2891 ecode++;
2892 GETCHARLEN(fc, ecode, length);
2893
2894 if (length > md->end_subject - eptr)
2895 {
2896 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2897 MRRETURN(MATCH_NOMATCH);
2898 }
2899
2900 /* If the pattern character's value is < 128, we have only one byte, and
2901 can use the fast lookup table. */
2902
2903 if (fc < 128)
2904 {
2905 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2906 }
2907
2908 /* Otherwise we must pick up the subject character */
2909
2910 else
2911 {
2912 unsigned int dc;
2913 GETCHARINC(dc, eptr);
2914 ecode += length;
2915
2916 /* If we have Unicode property support, we can use it to test the other
2917 case of the character, if there is one. */
2918
2919 if (fc != dc)
2920 {
2921 #ifdef SUPPORT_UCP
2922 if (dc != UCD_OTHERCASE(fc))
2923 #endif
2924 MRRETURN(MATCH_NOMATCH);
2925 }
2926 }
2927 }
2928 else
2929 #endif /* SUPPORT_UTF8 */
2930
2931 /* Non-UTF-8 mode */
2932 {
2933 if (md->end_subject - eptr < 1)
2934 {
2935 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2936 MRRETURN(MATCH_NOMATCH);
2937 }
2938 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2939 ecode += 2;
2940 }
2941 break;
2942
2943 /* Match a single character repeatedly. */
2944
2945 case OP_EXACT:
2946 case OP_EXACTI:
2947 min = max = GET2(ecode, 1);
2948 ecode += 3;
2949 goto REPEATCHAR;
2950
2951 case OP_POSUPTO:
2952 case OP_POSUPTOI:
2953 possessive = TRUE;
2954 /* Fall through */
2955
2956 case OP_UPTO:
2957 case OP_UPTOI:
2958 case OP_MINUPTO:
2959 case OP_MINUPTOI:
2960 min = 0;
2961 max = GET2(ecode, 1);
2962 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2963 ecode += 3;
2964 goto REPEATCHAR;
2965
2966 case OP_POSSTAR:
2967 case OP_POSSTARI:
2968 possessive = TRUE;
2969 min = 0;
2970 max = INT_MAX;
2971 ecode++;
2972 goto REPEATCHAR;
2973
2974 case OP_POSPLUS:
2975 case OP_POSPLUSI:
2976 possessive = TRUE;
2977 min = 1;
2978 max = INT_MAX;
2979 ecode++;
2980 goto REPEATCHAR;
2981
2982 case OP_POSQUERY:
2983 case OP_POSQUERYI:
2984 possessive = TRUE;
2985 min = 0;
2986 max = 1;
2987 ecode++;
2988 goto REPEATCHAR;
2989
2990 case OP_STAR:
2991 case OP_STARI:
2992 case OP_MINSTAR:
2993 case OP_MINSTARI:
2994 case OP_PLUS:
2995 case OP_PLUSI:
2996 case OP_MINPLUS:
2997 case OP_MINPLUSI:
2998 case OP_QUERY:
2999 case OP_QUERYI:
3000 case OP_MINQUERY:
3001 case OP_MINQUERYI:
3002 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3003 minimize = (c & 1) != 0;
3004 min = rep_min[c]; /* Pick up values from tables; */
3005 max = rep_max[c]; /* zero for max => infinity */
3006 if (max == 0) max = INT_MAX;
3007
3008 /* Common code for all repeated single-character matches. */
3009
3010 REPEATCHAR:
3011 #ifdef SUPPORT_UTF8
3012 if (utf8)
3013 {
3014 length = 1;
3015 charptr = ecode;
3016 GETCHARLEN(fc, ecode, length);
3017 ecode += length;
3018
3019 /* Handle multibyte character matching specially here. There is
3020 support for caseless matching if UCP support is present. */
3021
3022 if (length > 1)
3023 {
3024 #ifdef SUPPORT_UCP
3025 unsigned int othercase;
3026 if (op >= OP_STARI && /* Caseless */
3027 (othercase = UCD_OTHERCASE(fc)) != fc)
3028 oclength = _pcre_ord2utf8(othercase, occhars);
3029 else oclength = 0;
3030 #endif /* SUPPORT_UCP */
3031
3032 for (i = 1; i <= min; i++)
3033 {
3034 if (eptr <= md->end_subject - length &&
3035 memcmp(eptr, charptr, length) == 0) eptr += length;
3036 #ifdef SUPPORT_UCP
3037 else if (oclength > 0 &&
3038 eptr <= md->end_subject - oclength &&
3039 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3040 #endif /* SUPPORT_UCP */
3041 else
3042 {
3043 CHECK_PARTIAL();
3044 MRRETURN(MATCH_NOMATCH);
3045 }
3046 }
3047
3048 if (min == max) continue;
3049
3050 if (minimize)
3051 {
3052 for (fi = min;; fi++)
3053 {
3054 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3055 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3056 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3057 if (eptr <= md->end_subject - length &&
3058 memcmp(eptr, charptr, length) == 0) eptr += length;
3059 #ifdef SUPPORT_UCP
3060 else if (oclength > 0 &&
3061 eptr <= md->end_subject - oclength &&
3062 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3063 #endif /* SUPPORT_UCP */
3064 else
3065 {
3066 CHECK_PARTIAL();
3067 MRRETURN(MATCH_NOMATCH);
3068 }
3069 }
3070 /* Control never gets here */
3071 }
3072
3073 else /* Maximize */
3074 {
3075 pp = eptr;
3076 for (i = min; i < max; i++)
3077 {
3078 if (eptr <= md->end_subject - length &&
3079 memcmp(eptr, charptr, length) == 0) eptr += length;
3080 #ifdef SUPPORT_UCP
3081 else if (oclength > 0 &&
3082 eptr <= md->end_subject - oclength &&
3083 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3084 #endif /* SUPPORT_UCP */
3085 else
3086 {
3087 CHECK_PARTIAL();
3088 break;
3089 }
3090 }
3091
3092 if (possessive) continue;
3093
3094 for(;;)
3095 {
3096 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3097 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3098 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3099 #ifdef SUPPORT_UCP
3100 eptr--;
3101 BACKCHAR(eptr);
3102 #else /* without SUPPORT_UCP */
3103 eptr -= length;
3104 #endif /* SUPPORT_UCP */
3105 }
3106 }
3107 /* Control never gets here */
3108 }
3109
3110 /* If the length of a UTF-8 character is 1, we fall through here, and
3111 obey the code as for non-UTF-8 characters below, though in this case the
3112 value of fc will always be < 128. */
3113 }
3114 else
3115 #endif /* SUPPORT_UTF8 */
3116
3117 /* When not in UTF-8 mode, load a single-byte character. */
3118
3119 fc = *ecode++;
3120
3121 /* The value of fc at this point is always less than 256, though we may or
3122 may not be in UTF-8 mode. The code is duplicated for the caseless and
3123 caseful cases, for speed, since matching characters is likely to be quite
3124 common. First, ensure the minimum number of matches are present. If min =
3125 max, continue at the same level without recursing. Otherwise, if
3126 minimizing, keep trying the rest of the expression and advancing one
3127 matching character if failing, up to the maximum. Alternatively, if
3128 maximizing, find the maximum number of characters and work backwards. */
3129
3130 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3131 max, eptr));
3132
3133 if (op >= OP_STARI) /* Caseless */
3134 {
3135 fc = md->lcc[fc];
3136 for (i = 1; i <= min; i++)
3137 {
3138 if (eptr >= md->end_subject)
3139 {
3140 SCHECK_PARTIAL();
3141 MRRETURN(MATCH_NOMATCH);
3142 }
3143 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3144 }
3145 if (min == max) continue;
3146 if (minimize)
3147 {
3148 for (fi = min;; fi++)
3149 {
3150 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3151 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3152 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3153 if (eptr >= md->end_subject)
3154 {
3155 SCHECK_PARTIAL();
3156 MRRETURN(MATCH_NOMATCH);
3157 }
3158 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3159 }
3160 /* Control never gets here */
3161 }
3162 else /* Maximize */
3163 {
3164 pp = eptr;
3165 for (i = min; i < max; i++)
3166 {
3167 if (eptr >= md->end_subject)
3168 {
3169 SCHECK_PARTIAL();
3170 break;
3171 }
3172 if (fc != md->lcc[*eptr]) break;
3173 eptr++;
3174 }
3175
3176 if (possessive) continue;
3177
3178 while (eptr >= pp)
3179 {
3180 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3181 eptr--;
3182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3183 }
3184 MRRETURN(MATCH_NOMATCH);
3185 }
3186 /* Control never gets here */
3187 }
3188
3189 /* Caseful comparisons (includes all multi-byte characters) */
3190
3191 else
3192 {
3193 for (i = 1; i <= min; i++)
3194 {
3195 if (eptr >= md->end_subject)
3196 {
3197 SCHECK_PARTIAL();
3198 MRRETURN(MATCH_NOMATCH);
3199 }
3200 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3201 }
3202
3203 if (min == max) continue;
3204
3205 if (minimize)
3206 {
3207 for (fi = min;; fi++)
3208 {
3209 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3211 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3212 if (eptr >= md->end_subject)
3213 {
3214 SCHECK_PARTIAL();
3215 MRRETURN(MATCH_NOMATCH);
3216 }
3217 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3218 }
3219 /* Control never gets here */
3220 }
3221 else /* Maximize */
3222 {
3223 pp = eptr;
3224 for (i = min; i < max; i++)
3225 {
3226 if (eptr >= md->end_subject)
3227 {
3228 SCHECK_PARTIAL();
3229 break;
3230 }
3231 if (fc != *eptr) break;
3232 eptr++;
3233 }
3234 if (possessive) continue;
3235
3236 while (eptr >= pp)
3237 {
3238 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3239 eptr--;
3240 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3241 }
3242 MRRETURN(MATCH_NOMATCH);
3243 }
3244 }
3245 /* Control never gets here */
3246
3247 /* Match a negated single one-byte character. The character we are
3248 checking can be multibyte. */
3249
3250 case OP_NOT:
3251 case OP_NOTI:
3252 if (eptr >= md->end_subject)
3253 {
3254 SCHECK_PARTIAL();
3255 MRRETURN(MATCH_NOMATCH);
3256 }
3257 ecode++;
3258 GETCHARINCTEST(c, eptr);
3259 if (op == OP_NOTI) /* The caseless case */
3260 {
3261 #ifdef SUPPORT_UTF8
3262 if (c < 256)
3263 #endif
3264 c = md->lcc[c];
3265 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3266 }
3267 else /* Caseful */
3268 {
3269 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3270 }
3271 break;
3272
3273 /* Match a negated single one-byte character repeatedly. This is almost a
3274 repeat of the code for a repeated single character, but I haven't found a
3275 nice way of commoning these up that doesn't require a test of the
3276 positive/negative option for each character match. Maybe that wouldn't add
3277 very much to the time taken, but character matching *is* what this is all
3278 about... */
3279
3280 case OP_NOTEXACT:
3281 case OP_NOTEXACTI:
3282 min = max = GET2(ecode, 1);
3283 ecode += 3;
3284 goto REPEATNOTCHAR;
3285
3286 case OP_NOTUPTO:
3287 case OP_NOTUPTOI:
3288 case OP_NOTMINUPTO:
3289 case OP_NOTMINUPTOI:
3290 min = 0;
3291 max = GET2(ecode, 1);
3292 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3293 ecode += 3;
3294 goto REPEATNOTCHAR;
3295
3296 case OP_NOTPOSSTAR:
3297 case OP_NOTPOSSTARI:
3298 possessive = TRUE;
3299 min = 0;
3300 max = INT_MAX;
3301 ecode++;
3302 goto REPEATNOTCHAR;
3303
3304 case OP_NOTPOSPLUS:
3305 case OP_NOTPOSPLUSI:
3306 possessive = TRUE;
3307 min = 1;
3308 max = INT_MAX;
3309 ecode++;
3310 goto REPEATNOTCHAR;
3311
3312 case OP_NOTPOSQUERY:
3313 case OP_NOTPOSQUERYI:
3314 possessive = TRUE;
3315 min = 0;
3316 max = 1;
3317 ecode++;
3318 goto REPEATNOTCHAR;
3319
3320 case OP_NOTPOSUPTO:
3321 case OP_NOTPOSUPTOI:
3322 possessive = TRUE;
3323 min = 0;
3324 max = GET2(ecode, 1);
3325 ecode += 3;
3326 goto REPEATNOTCHAR;
3327
3328 case OP_NOTSTAR:
3329 case OP_NOTSTARI:
3330 case OP_NOTMINSTAR:
3331 case OP_NOTMINSTARI:
3332 case OP_NOTPLUS:
3333 case OP_NOTPLUSI:
3334 case OP_NOTMINPLUS:
3335 case OP_NOTMINPLUSI:
3336 case OP_NOTQUERY:
3337 case OP_NOTQUERYI:
3338 case OP_NOTMINQUERY:
3339 case OP_NOTMINQUERYI:
3340 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3341 minimize = (c & 1) != 0;
3342 min = rep_min[c]; /* Pick up values from tables; */
3343 max = rep_max[c]; /* zero for max => infinity */
3344 if (max == 0) max = INT_MAX;
3345
3346 /* Common code for all repeated single-byte matches. */
3347
3348 REPEATNOTCHAR:
3349 fc = *ecode++;
3350
3351 /* The code is duplicated for the caseless and caseful cases, for speed,
3352 since matching characters is likely to be quite common. First, ensure the
3353 minimum number of matches are present. If min = max, continue at the same
3354 level without recursing. Otherwise, if minimizing, keep trying the rest of
3355 the expression and advancing one matching character if failing, up to the
3356 maximum. Alternatively, if maximizing, find the maximum number of
3357 characters and work backwards. */
3358
3359 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3360 max, eptr));
3361
3362 if (op >= OP_NOTSTARI) /* Caseless */
3363 {
3364 fc = md->lcc[fc];
3365
3366 #ifdef SUPPORT_UTF8
3367 /* UTF-8 mode */
3368 if (utf8)
3369 {
3370 register unsigned int d;
3371 for (i = 1; i <= min; i++)
3372 {
3373 if (eptr >= md->end_subject)
3374 {
3375 SCHECK_PARTIAL();
3376 MRRETURN(MATCH_NOMATCH);
3377 }
3378 GETCHARINC(d, eptr);
3379 if (d < 256) d = md->lcc[d];
3380 if (fc == d) MRRETURN(MATCH_NOMATCH);
3381 }
3382 }
3383 else
3384 #endif
3385
3386 /* Not UTF-8 mode */
3387 {
3388 for (i = 1; i <= min; i++)
3389 {
3390 if (eptr >= md->end_subject)
3391 {
3392 SCHECK_PARTIAL();
3393 MRRETURN(MATCH_NOMATCH);
3394 }
3395 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3396 }
3397 }
3398
3399 if (min == max) continue;
3400
3401 if (minimize)
3402 {
3403 #ifdef SUPPORT_UTF8
3404 /* UTF-8 mode */
3405 if (utf8)
3406 {
3407 register unsigned int d;
3408 for (fi = min;; fi++)
3409 {
3410 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3411 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3412 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3413 if (eptr >= md->end_subject)
3414 {
3415 SCHECK_PARTIAL();
3416 MRRETURN(MATCH_NOMATCH);
3417 }
3418 GETCHARINC(d, eptr);
3419 if (d < 256) d = md->lcc[d];
3420 if (fc == d) MRRETURN(MATCH_NOMATCH);
3421 }
3422 }
3423 else
3424 #endif
3425 /* Not UTF-8 mode */
3426 {
3427 for (fi = min;; fi++)
3428 {
3429 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3432 if (eptr >= md->end_subject)
3433 {
3434 SCHECK_PARTIAL();
3435 MRRETURN(MATCH_NOMATCH);
3436 }
3437 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3438 }
3439 }
3440 /* Control never gets here */
3441 }
3442
3443 /* Maximize case */
3444
3445 else
3446 {
3447 pp = eptr;
3448
3449 #ifdef SUPPORT_UTF8
3450 /* UTF-8 mode */
3451 if (utf8)
3452 {
3453 register unsigned int d;
3454 for (i = min; i < max; i++)
3455 {
3456 int len = 1;
3457 if (eptr >= md->end_subject)
3458 {
3459 SCHECK_PARTIAL();
3460 break;
3461 }
3462 GETCHARLEN(d, eptr, len);
3463 if (d < 256) d = md->lcc[d];
3464 if (fc == d) break;
3465 eptr += len;
3466 }
3467 if (possessive) continue;
3468 for(;;)
3469 {
3470 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3471 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3472 if (eptr-- == pp) break; /* Stop if tried at original pos */
3473 BACKCHAR(eptr);
3474 }
3475 }
3476 else
3477 #endif
3478 /* Not UTF-8 mode */
3479 {
3480 for (i = min; i < max; i++)
3481 {
3482 if (eptr >= md->end_subject)
3483 {
3484 SCHECK_PARTIAL();
3485 break;
3486 }
3487 if (fc == md->lcc[*eptr]) break;
3488 eptr++;
3489 }
3490 if (possessive) continue;
3491 while (eptr >= pp)
3492 {
3493 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3495 eptr--;
3496 }
3497 }
3498
3499 MRRETURN(MATCH_NOMATCH);
3500 }
3501 /* Control never gets here */
3502 }
3503
3504 /* Caseful comparisons */
3505
3506 else
3507 {
3508 #ifdef SUPPORT_UTF8
3509 /* UTF-8 mode */
3510 if (utf8)
3511 {
3512 register unsigned int d;
3513 for (i = 1; i <= min; i++)
3514 {
3515 if (eptr >= md->end_subject)
3516 {
3517 SCHECK_PARTIAL();
3518 MRRETURN(MATCH_NOMATCH);
3519 }
3520 GETCHARINC(d, eptr);
3521 if (fc == d) MRRETURN(MATCH_NOMATCH);
3522 }
3523 }
3524 else
3525 #endif
3526 /* Not UTF-8 mode */
3527 {
3528 for (i = 1; i <= min; i++)
3529 {
3530 if (eptr >= md->end_subject)
3531 {
3532 SCHECK_PARTIAL();
3533 MRRETURN(MATCH_NOMATCH);
3534 }
3535 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3536 }
3537 }
3538
3539 if (min == max) continue;
3540
3541 if (minimize)
3542 {
3543 #ifdef SUPPORT_UTF8
3544 /* UTF-8 mode */
3545 if (utf8)
3546 {
3547 register unsigned int d;
3548 for (fi = min;; fi++)
3549 {
3550 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3551 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3552 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3553 if (eptr >= md->end_subject)
3554 {
3555 SCHECK_PARTIAL();
3556 MRRETURN(MATCH_NOMATCH);
3557 }
3558 GETCHARINC(d, eptr);
3559 if (fc == d) MRRETURN(MATCH_NOMATCH);
3560 }
3561 }
3562 else
3563 #endif
3564 /* Not UTF-8 mode */
3565 {
3566 for (fi = min;; fi++)
3567 {
3568 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3570 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3571 if (eptr >= md->end_subject)
3572 {
3573 SCHECK_PARTIAL();
3574 MRRETURN(MATCH_NOMATCH);
3575 }
3576 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3577 }
3578 }
3579 /* Control never gets here */
3580 }
3581
3582 /* Maximize case */
3583
3584 else
3585 {
3586 pp = eptr;
3587
3588 #ifdef SUPPORT_UTF8
3589 /* UTF-8 mode */
3590 if (utf8)
3591 {
3592 register unsigned int d;
3593 for (i = min; i < max; i++)
3594 {
3595 int len = 1;
3596 if (eptr >= md->end_subject)
3597 {
3598 SCHECK_PARTIAL();
3599 break;
3600 }
3601 GETCHARLEN(d, eptr, len);
3602 if (fc == d) break;
3603 eptr += len;
3604 }
3605 if (possessive) continue;
3606 for(;;)
3607 {
3608 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3609 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3610 if (eptr-- == pp) break; /* Stop if tried at original pos */
3611 BACKCHAR(eptr);
3612 }
3613 }
3614 else
3615 #endif
3616 /* Not UTF-8 mode */
3617 {
3618 for (i = min; i < max; i++)
3619 {
3620 if (eptr >= md->end_subject)
3621 {
3622 SCHECK_PARTIAL();
3623 break;
3624 }
3625 if (fc == *eptr) break;
3626 eptr++;
3627 }
3628 if (possessive) continue;
3629 while (eptr >= pp)
3630 {
3631 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3633 eptr--;
3634 }
3635 }
3636
3637 MRRETURN(MATCH_NOMATCH);
3638 }
3639 }
3640 /* Control never gets here */
3641
3642 /* Match a single character type repeatedly; several different opcodes
3643 share code. This is very similar to the code for single characters, but we
3644 repeat it in the interests of efficiency. */
3645
3646 case OP_TYPEEXACT:
3647 min = max = GET2(ecode, 1);
3648 minimize = TRUE;
3649 ecode += 3;
3650 goto REPEATTYPE;
3651
3652 case OP_TYPEUPTO:
3653 case OP_TYPEMINUPTO:
3654 min = 0;
3655 max = GET2(ecode, 1);
3656 minimize = *ecode == OP_TYPEMINUPTO;
3657 ecode += 3;
3658 goto REPEATTYPE;
3659
3660 case OP_TYPEPOSSTAR:
3661 possessive = TRUE;
3662 min = 0;
3663 max = INT_MAX;
3664 ecode++;
3665 goto REPEATTYPE;
3666
3667 case OP_TYPEPOSPLUS:
3668 possessive = TRUE;
3669 min = 1;
3670 max = INT_MAX;
3671 ecode++;
3672 goto REPEATTYPE;
3673
3674 case OP_TYPEPOSQUERY:
3675 possessive = TRUE;
3676 min = 0;
3677 max = 1;
3678 ecode++;
3679 goto REPEATTYPE;
3680
3681 case OP_TYPEPOSUPTO:
3682 possessive = TRUE;
3683 min = 0;
3684 max = GET2(ecode, 1);
3685 ecode += 3;
3686 goto REPEATTYPE;
3687
3688 case OP_TYPESTAR:
3689 case OP_TYPEMINSTAR:
3690 case OP_TYPEPLUS:
3691 case OP_TYPEMINPLUS:
3692 case OP_TYPEQUERY:
3693 case OP_TYPEMINQUERY:
3694 c = *ecode++ - OP_TYPESTAR;
3695 minimize = (c & 1) != 0;
3696 min = rep_min[c]; /* Pick up values from tables; */
3697 max = rep_max[c]; /* zero for max => infinity */
3698 if (max == 0) max = INT_MAX;
3699
3700 /* Common code for all repeated single character type matches. Note that
3701 in UTF-8 mode, '.' matches a character of any length, but for the other
3702 character types, the valid characters are all one-byte long. */
3703
3704 REPEATTYPE:
3705 ctype = *ecode++; /* Code for the character type */
3706
3707 #ifdef SUPPORT_UCP
3708 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3709 {
3710 prop_fail_result = ctype == OP_NOTPROP;
3711 prop_type = *ecode++;
3712 prop_value = *ecode++;
3713 }
3714 else prop_type = -1;
3715 #endif
3716
3717 /* First, ensure the minimum number of matches are present. Use inline
3718 code for maximizing the speed, and do the type test once at the start
3719 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3720 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3721 and single-bytes. */
3722
3723 if (min > 0)
3724 {
3725 #ifdef SUPPORT_UCP
3726 if (prop_type >= 0)
3727 {
3728 switch(prop_type)
3729 {
3730 case PT_ANY:
3731 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3732 for (i = 1; i <= min; i++)
3733 {
3734 if (eptr >= md->end_subject)
3735 {
3736 SCHECK_PARTIAL();
3737 MRRETURN(MATCH_NOMATCH);
3738 }
3739 GETCHARINCTEST(c, eptr);
3740 }
3741 break;
3742
3743 case PT_LAMP:
3744 for (i = 1; i <= min; i++)
3745 {
3746 if (eptr >= md->end_subject)
3747 {
3748 SCHECK_PARTIAL();
3749 MRRETURN(MATCH_NOMATCH);
3750 }
3751 GETCHARINCTEST(c, eptr);
3752 prop_chartype = UCD_CHARTYPE(c);
3753 if ((prop_chartype == ucp_Lu ||
3754 prop_chartype == ucp_Ll ||
3755 prop_chartype == ucp_Lt) == prop_fail_result)
3756 MRRETURN(MATCH_NOMATCH);
3757 }
3758 break;
3759
3760 case PT_GC:
3761 for (i = 1; i <= min; i++)
3762 {
3763 if (eptr >= md->end_subject)
3764 {
3765 SCHECK_PARTIAL();
3766 MRRETURN(MATCH_NOMATCH);
3767 }
3768 GETCHARINCTEST(c, eptr);
3769 prop_category = UCD_CATEGORY(c);
3770 if ((prop_category == prop_value) == prop_fail_result)
3771 MRRETURN(MATCH_NOMATCH);
3772 }
3773 break;
3774
3775 case PT_PC:
3776 for (i = 1; i <= min; i++)
3777 {
3778 if (eptr >= md->end_subject)
3779 {
3780 SCHECK_PARTIAL();
3781 MRRETURN(MATCH_NOMATCH);
3782 }
3783 GETCHARINCTEST(c, eptr);
3784 prop_chartype = UCD_CHARTYPE(c);
3785 if ((prop_chartype == prop_value) == prop_fail_result)
3786 MRRETURN(MATCH_NOMATCH);
3787 }
3788 break;
3789
3790 case PT_SC:
3791 for (i = 1; i <= min; i++)
3792 {
3793 if (eptr >= md->end_subject)
3794 {
3795 SCHECK_PARTIAL();
3796 MRRETURN(MATCH_NOMATCH);
3797 }
3798 GETCHARINCTEST(c, eptr);
3799 prop_script = UCD_SCRIPT(c);
3800 if ((prop_script == prop_value) == prop_fail_result)
3801 MRRETURN(MATCH_NOMATCH);
3802 }
3803 break;
3804
3805 case PT_ALNUM:
3806 for (i = 1; i <= min; i++)
3807 {
3808 if (eptr >= md->end_subject)
3809 {
3810 SCHECK_PARTIAL();
3811 MRRETURN(MATCH_NOMATCH);
3812 }
3813 GETCHARINCTEST(c, eptr);
3814 prop_category = UCD_CATEGORY(c);
3815 if ((prop_category == ucp_L || prop_category == ucp_N)
3816 == prop_fail_result)
3817 MRRETURN(MATCH_NOMATCH);
3818 }
3819 break;
3820
3821 case PT_SPACE: /* Perl space */
3822 for (i = 1; i <= min; i++)
3823 {
3824 if (eptr >= md->end_subject)
3825 {
3826 SCHECK_PARTIAL();
3827 MRRETURN(MATCH_NOMATCH);
3828 }
3829 GETCHARINCTEST(c, eptr);
3830 prop_category = UCD_CATEGORY(c);
3831 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3832 c == CHAR_FF || c == CHAR_CR)
3833 == prop_fail_result)
3834 MRRETURN(MATCH_NOMATCH);
3835 }
3836 break;
3837
3838 case PT_PXSPACE: /* POSIX space */
3839 for (i = 1; i <= min; i++)
3840 {
3841 if (eptr >= md->end_subject)
3842 {
3843 SCHECK_PARTIAL();
3844 MRRETURN(MATCH_NOMATCH);
3845 }
3846 GETCHARINCTEST(c, eptr);
3847 prop_category = UCD_CATEGORY(c);
3848 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3849 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3850 == prop_fail_result)
3851 MRRETURN(MATCH_NOMATCH);
3852 }
3853 break;
3854
3855 case PT_WORD:
3856 for (i = 1; i <= min; i++)
3857 {
3858 if (eptr >= md->end_subject)
3859 {
3860 SCHECK_PARTIAL();
3861 MRRETURN(MATCH_NOMATCH);
3862 }
3863 GETCHARINCTEST(c, eptr);
3864 prop_category = UCD_CATEGORY(c);
3865 if ((prop_category == ucp_L || prop_category == ucp_N ||
3866 c == CHAR_UNDERSCORE)
3867 == prop_fail_result)
3868 MRRETURN(MATCH_NOMATCH);
3869 }
3870 break;
3871
3872 /* This should not occur */
3873
3874 default:
3875 RRETURN(PCRE_ERROR_INTERNAL);
3876 }
3877 }
3878
3879 /* Match extended Unicode sequences. We will get here only if the
3880 support is in the binary; otherwise a compile-time error occurs. */
3881
3882 else if (ctype == OP_EXTUNI)
3883 {
3884 for (i = 1; i <= min; i++)
3885 {
3886 if (eptr >= md->end_subject)
3887 {
3888 SCHECK_PARTIAL();
3889 MRRETURN(MATCH_NOMATCH);
3890 }
3891 GETCHARINCTEST(c, eptr);
3892 prop_category = UCD_CATEGORY(c);
3893 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3894 while (eptr < md->end_subject)
3895 {
3896 int len = 1;
3897 if (!utf8) c = *eptr;
3898 else { GETCHARLEN(c, eptr, len); }
3899 prop_category = UCD_CATEGORY(c);
3900 if (prop_category != ucp_M) break;
3901 eptr += len;
3902 }
3903 }
3904 }
3905
3906 else
3907 #endif /* SUPPORT_UCP */
3908
3909 /* Handle all other cases when the coding is UTF-8 */
3910
3911 #ifdef SUPPORT_UTF8
3912 if (utf8) switch(ctype)
3913 {
3914 case OP_ANY:
3915 for (i = 1; i <= min; i++)
3916 {
3917 if (eptr >= md->end_subject)
3918 {
3919 SCHECK_PARTIAL();
3920 MRRETURN(MATCH_NOMATCH);
3921 }
3922 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3923 eptr++;
3924 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3925 }
3926 break;
3927
3928 case OP_ALLANY:
3929 for (i = 1; i <= min; i++)
3930 {
3931 if (eptr >= md->end_subject)
3932 {
3933 SCHECK_PARTIAL();
3934 MRRETURN(MATCH_NOMATCH);
3935 }
3936 eptr++;
3937 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3938 }
3939 break;
3940
3941 case OP_ANYBYTE:
3942 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3943 eptr += min;
3944 break;
3945
3946 case OP_ANYNL:
3947 for (i = 1; i <= min; i++)
3948 {
3949 if (eptr >= md->end_subject)
3950 {
3951 SCHECK_PARTIAL();
3952 MRRETURN(MATCH_NOMATCH);
3953 }
3954 GETCHARINC(c, eptr);
3955 switch(c)
3956 {
3957 default: MRRETURN(MATCH_NOMATCH);
3958
3959 case 0x000d:
3960 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3961 break;
3962
3963 case 0x000a:
3964 break;
3965
3966 case 0x000b:
3967 case 0x000c:
3968 case 0x0085:
3969 case 0x2028:
3970 case 0x2029:
3971 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3972 break;
3973 }
3974 }
3975 break;
3976
3977 case OP_NOT_HSPACE:
3978 for (i = 1; i <= min; i++)
3979 {
3980 if (eptr >= md->end_subject)
3981 {
3982 SCHECK_PARTIAL();
3983 MRRETURN(MATCH_NOMATCH);
3984 }
3985 GETCHARINC(c, eptr);
3986 switch(c)
3987 {
3988 default: break;
3989 case 0x09: /* HT */
3990 case 0x20: /* SPACE */
3991 case 0xa0: /* NBSP */
3992 case 0x1680: /* OGHAM SPACE MARK */
3993 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3994 case 0x2000: /* EN QUAD */
3995 case 0x2001: /* EM QUAD */
3996 case 0x2002: /* EN SPACE */
3997 case 0x2003: /* EM SPACE */
3998 case 0x2004: /* THREE-PER-EM SPACE */
3999 case 0x2005: /* FOUR-PER-EM SPACE */
4000 case 0x2006: /* SIX-PER-EM SPACE */
4001 case 0x2007: /* FIGURE SPACE */
4002 case 0x2008: /* PUNCTUATION SPACE */
4003 case 0x2009: /* THIN SPACE */
4004 case 0x200A: /* HAIR SPACE */
4005 case 0x202f: /* NARROW NO-BREAK SPACE */
4006 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4007 case 0x3000: /* IDEOGRAPHIC SPACE */
4008 MRRETURN(MATCH_NOMATCH);
4009 }
4010 }
4011 break;
4012
4013 case OP_HSPACE:
4014 for (i = 1; i <= min; i++)
4015 {
4016 if (eptr >= md->end_subject)
4017 {
4018 SCHECK_PARTIAL();
4019 MRRETURN(MATCH_NOMATCH);
4020 }
4021 GETCHARINC(c, eptr);
4022 switch(c)
4023 {
4024 default: MRRETURN(MATCH_NOMATCH);
4025 case 0x09: /* HT */
4026 case 0x20: /* SPACE */
4027 case 0xa0: /* NBSP */
4028 case 0x1680: /* OGHAM SPACE MARK */
4029 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4030 case 0x2000: /* EN QUAD */
4031 case 0x2001: /* EM QUAD */
4032 case 0x2002: /* EN SPACE */
4033 case 0x2003: /* EM SPACE */
4034 case 0x2004: /* THREE-PER-EM SPACE */
4035 case 0x2005: /* FOUR-PER-EM SPACE */
4036 case 0x2006: /* SIX-PER-EM SPACE */
4037 case 0x2007: /* FIGURE SPACE */
4038 case 0x2008: /* PUNCTUATION SPACE */
4039 case 0x2009: /* THIN SPACE */
4040 case 0x200A: /* HAIR SPACE */
4041 case 0x202f: /* NARROW NO-BREAK SPACE */
4042 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4043 case 0x3000: /* IDEOGRAPHIC SPACE */
4044 break;
4045 }
4046 }
4047 break;
4048
4049 case OP_NOT_VSPACE:
4050 for (i = 1; i <= min; i++)
4051 {
4052 if (eptr >= md->end_subject)
4053 {
4054 SCHECK_PARTIAL();
4055 MRRETURN(MATCH_NOMATCH);
4056 }
4057 GETCHARINC(c, eptr);
4058 switch(c)
4059 {
4060 default: break;
4061 case 0x0a: /* LF */
4062 case 0x0b: /* VT */
4063 case 0x0c: /* FF */
4064 case 0x0d: /* CR */
4065 case 0x85: /* NEL */
4066 case 0x2028: /* LINE SEPARATOR */
4067 case 0x2029: /* PARAGRAPH SEPARATOR */
4068 MRRETURN(MATCH_NOMATCH);
4069 }
4070 }
4071 break;
4072
4073 case OP_VSPACE:
4074 for (i = 1; i <= min; i++)
4075 {
4076 if (eptr >= md->end_subject)
4077 {
4078 SCHECK_PARTIAL();
4079 MRRETURN(MATCH_NOMATCH);
4080 }
4081 GETCHARINC(c, eptr);
4082 switch(c)
4083 {
4084 default: MRRETURN(MATCH_NOMATCH);
4085 case 0x0a: /* LF */
4086 case 0x0b: /* VT */
4087 case 0x0c: /* FF */
4088 case 0x0d: /* CR */
4089 case 0x85: /* NEL */
4090 case 0x2028: /* LINE SEPARATOR */
4091 case 0x2029: /* PARAGRAPH SEPARATOR */
4092 break;
4093 }
4094 }
4095 break;
4096
4097 case OP_NOT_DIGIT:
4098 for (i = 1; i <= min; i++)
4099 {
4100 if (eptr >= md->end_subject)
4101 {
4102 SCHECK_PARTIAL();
4103 MRRETURN(MATCH_NOMATCH);
4104 }
4105 GETCHARINC(c, eptr);
4106 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4107 MRRETURN(MATCH_NOMATCH);
4108 }
4109 break;
4110
4111 case OP_DIGIT:
4112 for (i = 1; i <= min; i++)
4113 {
4114 if (eptr >= md->end_subject)
4115 {
4116 SCHECK_PARTIAL();
4117 MRRETURN(MATCH_NOMATCH);
4118 }
4119 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4120 MRRETURN(MATCH_NOMATCH);
4121 /* No need to skip more bytes - we know it's a 1-byte character */
4122 }
4123 break;
4124
4125 case OP_NOT_WHITESPACE:
4126 for (i = 1; i <= min; i++)
4127 {
4128 if (eptr >= md->end_subject)
4129 {
4130 SCHECK_PARTIAL();
4131 MRRETURN(MATCH_NOMATCH);
4132 }
4133 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4134 MRRETURN(MATCH_NOMATCH);
4135 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4136 }
4137 break;
4138
4139 case OP_WHITESPACE:
4140 for (i = 1; i <= min; i++)
4141 {
4142 if (eptr >= md->end_subject)
4143 {
4144 SCHECK_PARTIAL();
4145 MRRETURN(MATCH_NOMATCH);
4146 }
4147 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4148 MRRETURN(MATCH_NOMATCH);
4149 /* No need to skip more bytes - we know it's a 1-byte character */
4150 }
4151 break;
4152
4153 case OP_NOT_WORDCHAR:
4154 for (i = 1; i <= min; i++)
4155 {
4156 if (eptr >= md->end_subject)
4157 {
4158 SCHECK_PARTIAL();
4159 MRRETURN(MATCH_NOMATCH);
4160 }
4161 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4162 MRRETURN(MATCH_NOMATCH);
4163 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4164 }
4165 break;
4166
4167 case OP_WORDCHAR:
4168 for (i = 1; i <= min; i++)
4169 {
4170 if (eptr >= md->end_subject)
4171 {
4172 SCHECK_PARTIAL();
4173 MRRETURN(MATCH_NOMATCH);
4174 }
4175 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4176 MRRETURN(MATCH_NOMATCH);
4177 /* No need to skip more bytes - we know it's a 1-byte character */
4178 }
4179 break;
4180
4181 default:
4182 RRETURN(PCRE_ERROR_INTERNAL);
4183 } /* End switch(ctype) */
4184
4185 else
4186 #endif /* SUPPORT_UTF8 */
4187
4188 /* Code for the non-UTF-8 case for minimum matching of operators other
4189 than OP_PROP and OP_NOTPROP. */
4190
4191 switch(ctype)
4192 {
4193 case OP_ANY:
4194 for (i = 1; i <= min; i++)
4195 {
4196 if (eptr >= md->end_subject)
4197 {
4198 SCHECK_PARTIAL();
4199 MRRETURN(MATCH_NOMATCH);
4200 }
4201 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4202 eptr++;
4203 }
4204 break;
4205
4206 case OP_ALLANY:
4207 if (eptr > md->end_subject - min)
4208 {
4209 SCHECK_PARTIAL();
4210 MRRETURN(MATCH_NOMATCH);
4211 }
4212 eptr += min;
4213 break;
4214
4215 case OP_ANYBYTE:
4216 if (eptr > md->end_subject - min)
4217 {
4218 SCHECK_PARTIAL();
4219 MRRETURN(MATCH_NOMATCH);
4220 }
4221 eptr += min;
4222 break;
4223
4224 case OP_ANYNL:
4225 for (i = 1; i <= min; i++)
4226 {
4227 if (eptr >= md->end_subject)
4228 {
4229 SCHECK_PARTIAL();
4230 MRRETURN(MATCH_NOMATCH);
4231 }
4232 switch(*eptr++)
4233 {
4234 default: MRRETURN(MATCH_NOMATCH);
4235
4236 case 0x000d:
4237 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4238 break;
4239
4240 case 0x000a:
4241 break;
4242
4243 case 0x000b:
4244 case 0x000c:
4245 case 0x0085:
4246 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4247 break;
4248 }
4249 }
4250 break;
4251
4252 case OP_NOT_HSPACE:
4253 for (i = 1; i <= min; i++)
4254 {
4255 if (eptr >= md->end_subject)
4256 {
4257 SCHECK_PARTIAL();
4258 MRRETURN(MATCH_NOMATCH);
4259 }
4260 switch(*eptr++)
4261 {
4262 default: break;
4263 case 0x09: /* HT */
4264 case 0x20: /* SPACE */
4265 case 0xa0: /* NBSP */
4266 MRRETURN(MATCH_NOMATCH);
4267 }
4268 }
4269 break;
4270
4271 case OP_HSPACE:
4272 for (i = 1; i <= min; i++)
4273 {
4274 if (eptr >= md->end_subject)
4275 {
4276 SCHECK_PARTIAL();
4277 MRRETURN(MATCH_NOMATCH);
4278 }
4279 switch(*eptr++)
4280 {
4281 default: MRRETURN(MATCH_NOMATCH);
4282 case 0x09: /* HT */
4283 case 0x20: /* SPACE */
4284 case 0xa0: /* NBSP */
4285 break;
4286 }
4287 }
4288 break;
4289
4290 case OP_NOT_VSPACE:
4291 for (i = 1; i <= min; i++)
4292 {
4293 if (eptr >= md->end_subject)
4294 {
4295 SCHECK_PARTIAL();
4296 MRRETURN(MATCH_NOMATCH);
4297 }
4298 switch(*eptr++)
4299 {
4300 default: break;
4301 case 0x0a: /* LF */
4302 case 0x0b: /* VT */
4303 case 0x0c: /* FF */
4304 case 0x0d: /* CR */
4305 case 0x85: /* NEL */
4306 MRRETURN(MATCH_NOMATCH);
4307 }
4308 }
4309 break;
4310
4311 case OP_VSPACE:
4312 for (i = 1; i <= min; i++)
4313 {
4314 if (eptr >= md->end_subject)
4315 {
4316 SCHECK_PARTIAL();
4317 MRRETURN(MATCH_NOMATCH);
4318 }
4319 switch(*eptr++)
4320 {
4321 default: MRRETURN(MATCH_NOMATCH);
4322 case 0x0a: /* LF */
4323 case 0x0b: /* VT */
4324 case 0x0c: /* FF */
4325 case 0x0d: /* CR */
4326 case 0x85: /* NEL */
4327 break;
4328 }
4329 }
4330 break;
4331
4332 case OP_NOT_DIGIT:
4333 for (i = 1; i <= min; i++)
4334 {
4335 if (eptr >= md->end_subject)
4336 {
4337 SCHECK_PARTIAL();
4338 MRRETURN(MATCH_NOMATCH);
4339 }
4340 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4341 }
4342 break;
4343
4344 case OP_DIGIT:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 MRRETURN(MATCH_NOMATCH);
4351 }
4352 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4353 }
4354 break;
4355
4356 case OP_NOT_WHITESPACE:
4357 for (i = 1; i <= min; i++)
4358 {
4359 if (eptr >= md->end_subject)
4360 {
4361 SCHECK_PARTIAL();
4362 MRRETURN(MATCH_NOMATCH);
4363 }
4364 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4365 }
4366 break;
4367
4368 case OP_WHITESPACE:
4369 for (i = 1; i <= min; i++)
4370 {
4371 if (eptr >= md->end_subject)
4372 {
4373 SCHECK_PARTIAL();
4374 MRRETURN(MATCH_NOMATCH);
4375 }
4376 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4377 }
4378 break;
4379
4380 case OP_NOT_WORDCHAR:
4381 for (i = 1; i <= min; i++)
4382 {
4383 if (eptr >= md->end_subject)
4384 {
4385 SCHECK_PARTIAL();
4386 MRRETURN(MATCH_NOMATCH);
4387 }
4388 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4389 MRRETURN(MATCH_NOMATCH);
4390 }
4391 break;
4392
4393 case OP_WORDCHAR:
4394 for (i = 1; i <= min; i++)
4395 {
4396 if (eptr >= md->end_subject)
4397 {
4398 SCHECK_PARTIAL();
4399 MRRETURN(MATCH_NOMATCH);
4400 }
4401 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4402 MRRETURN(MATCH_NOMATCH);
4403 }
4404 break;
4405
4406 default:
4407 RRETURN(PCRE_ERROR_INTERNAL);
4408 }
4409 }
4410
4411 /* If min = max, continue at the same level without recursing */
4412
4413 if (min == max) continue;
4414
4415 /* If minimizing, we have to test the rest of the pattern before each
4416 subsequent match. Again, separate the UTF-8 case for speed, and also
4417 separate the UCP cases. */
4418
4419 if (minimize)
4420 {
4421 #ifdef SUPPORT_UCP
4422 if (prop_type >= 0)
4423 {
4424 switch(prop_type)
4425 {
4426 case PT_ANY:
4427 for (fi = min;; fi++)
4428 {
4429 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4431 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4432 if (eptr >= md->end_subject)
4433 {
4434 SCHECK_PARTIAL();
4435 MRRETURN(MATCH_NOMATCH);
4436 }
4437 GETCHARINCTEST(c, eptr);
4438 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4439 }
4440 /* Control never gets here */
4441
4442 case PT_LAMP:
4443 for (fi = min;; fi++)
4444 {
4445 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4446 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4447 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4448 if (eptr >= md->end_subject)
4449 {
4450 SCHECK_PARTIAL();
4451 MRRETURN(MATCH_NOMATCH);
4452 }
4453 GETCHARINCTEST(c, eptr);
4454 prop_chartype = UCD_CHARTYPE(c);
4455 if ((prop_chartype == ucp_Lu ||
4456 prop_chartype == ucp_Ll ||
4457 prop_chartype == ucp_Lt) == prop_fail_result)
4458 MRRETURN(MATCH_NOMATCH);
4459 }
4460 /* Control never gets here */
4461
4462 case PT_GC:
4463 for (fi = min;; fi++)
4464 {
4465 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4467 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4468 if (eptr >= md->end_subject)
4469 {
4470 SCHECK_PARTIAL();
4471 MRRETURN(MATCH_NOMATCH);
4472 }
4473 GETCHARINCTEST(c, eptr);
4474 prop_category = UCD_CATEGORY(c);
4475 if ((prop_category == prop_value) == prop_fail_result)
4476 MRRETURN(MATCH_NOMATCH);
4477 }
4478 /* Control never gets here */
4479
4480 case PT_PC:
4481 for (fi = min;; fi++)
4482 {
4483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4485 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 MRRETURN(MATCH_NOMATCH);
4490 }
4491 GETCHARINCTEST(c, eptr);
4492 prop_chartype = UCD_CHARTYPE(c);
4493 if ((prop_chartype == prop_value) == prop_fail_result)
4494 MRRETURN(MATCH_NOMATCH);
4495 }
4496 /* Control never gets here */
4497
4498 case PT_SC:
4499 for (fi = min;; fi++)
4500 {
4501 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4502 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4503 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 MRRETURN(MATCH_NOMATCH);
4508 }
4509 GETCHARINCTEST(c, eptr);
4510 prop_script = UCD_SCRIPT(c);
4511 if ((prop_script == prop_value) == prop_fail_result)
4512 MRRETURN(MATCH_NOMATCH);
4513 }
4514 /* Control never gets here */
4515
4516 case PT_ALNUM:
4517 for (fi = min;; fi++)
4518 {
4519 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4520 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4521 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4522 if (eptr >= md->end_subject)
4523 {
4524 SCHECK_PARTIAL();
4525 MRRETURN(MATCH_NOMATCH);
4526 }
4527 GETCHARINCTEST(c, eptr);
4528 prop_category = UCD_CATEGORY(c);
4529 if ((prop_category == ucp_L || prop_category == ucp_N)
4530 == prop_fail_result)
4531 MRRETURN(MATCH_NOMATCH);
4532 }
4533 /* Control never gets here */
4534
4535 case PT_SPACE: /* Perl space */
4536 for (fi = min;; fi++)
4537 {
4538 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4540 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 MRRETURN(MATCH_NOMATCH);
4545 }
4546 GETCHARINCTEST(c, eptr);
4547 prop_category = UCD_CATEGORY(c);
4548 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4549 c == CHAR_FF || c == CHAR_CR)
4550 == prop_fail_result)
4551 MRRETURN(MATCH_NOMATCH);
4552 }
4553 /* Control never gets here */
4554
4555 case PT_PXSPACE: /* POSIX space */
4556 for (fi = min;; fi++)
4557 {
4558 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4560 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4561 if (eptr >= md->end_subject)
4562 {
4563 SCHECK_PARTIAL();
4564 MRRETURN(MATCH_NOMATCH);
4565 }
4566 GETCHARINCTEST(c, eptr);
4567 prop_category = UCD_CATEGORY(c);
4568 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4569 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4570 == prop_fail_result)
4571 MRRETURN(MATCH_NOMATCH);
4572 }
4573 /* Control never gets here */
4574
4575 case PT_WORD:
4576 for (fi = min;; fi++)
4577 {
4578 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4579 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4580 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4581 if (eptr >= md->end_subject)
4582 {
4583 SCHECK_PARTIAL();
4584 MRRETURN(MATCH_NOMATCH);
4585 }
4586 GETCHARINCTEST(c, eptr);
4587 prop_category = UCD_CATEGORY(c);
4588 if ((prop_category == ucp_L ||
4589 prop_category == ucp_N ||
4590 c == CHAR_UNDERSCORE)
4591 == prop_fail_result)
4592 MRRETURN(MATCH_NOMATCH);
4593 }
4594 /* Control never gets here */
4595
4596 /* This should never occur */
4597
4598 default:
4599 RRETURN(PCRE_ERROR_INTERNAL);
4600 }
4601 }
4602
4603 /* Match extended Unicode sequences. We will get here only if the
4604 support is in the binary; otherwise a compile-time error occurs. */
4605
4606 else if (ctype == OP_EXTUNI)
4607 {
4608 for (fi = min;; fi++)
4609 {
4610 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4611 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4612 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4613 if (eptr >= md->end_subject)
4614 {
4615 SCHECK_PARTIAL();
4616 MRRETURN(MATCH_NOMATCH);
4617 }
4618 GETCHARINCTEST(c, eptr);
4619 prop_category = UCD_CATEGORY(c);
4620 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4621 while (eptr < md->end_subject)
4622 {
4623 int len = 1;
4624 if (!utf8) c = *eptr;
4625 else { GETCHARLEN(c, eptr, len); }
4626 prop_category = UCD_CATEGORY(c);
4627 if (prop_category != ucp_M) break;
4628 eptr += len;
4629 }
4630 }
4631 }
4632
4633 else
4634 #endif /* SUPPORT_UCP */
4635
4636 #ifdef SUPPORT_UTF8
4637 /* UTF-8 mode */
4638 if (utf8)
4639 {
4640 for (fi = min;; fi++)
4641 {
4642 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4643 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4644 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4645 if (eptr >= md->end_subject)
4646 {
4647 SCHECK_PARTIAL();
4648 MRRETURN(MATCH_NOMATCH);
4649 }
4650 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4651 MRRETURN(MATCH_NOMATCH);
4652 GETCHARINC(c, eptr);
4653 switch(ctype)
4654 {
4655 case OP_ANY: /* This is the non-NL case */
4656 case OP_ALLANY:
4657 case OP_ANYBYTE:
4658 break;
4659
4660 case OP_ANYNL:
4661 switch(c)
4662 {
4663 default: MRRETURN(MATCH_NOMATCH);
4664 case 0x000d:
4665 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4666 break;
4667 case 0x000a:
4668 break;
4669
4670 case 0x000b:
4671 case 0x000c:
4672 case 0x0085:
4673 case 0x2028:
4674 case 0x2029:
4675 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4676 break;
4677 }
4678 break;
4679
4680 case OP_NOT_HSPACE:
4681 switch(c)
4682 {
4683 default: break;
4684 case 0x09: /* HT */
4685 case 0x20: /* SPACE */
4686 case 0xa0: /* NBSP */
4687 case 0x1680: /* OGHAM SPACE MARK */
4688 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4689 case 0x2000: /* EN QUAD */
4690 case 0x2001: /* EM QUAD */
4691 case 0x2002: /* EN SPACE */
4692 case 0x2003: /* EM SPACE */
4693 case 0x2004: /* THREE-PER-EM SPACE */
4694 case 0x2005: /* FOUR-PER-EM SPACE */
4695 case 0x2006: /* SIX-PER-EM SPACE */
4696 case 0x2007: /* FIGURE SPACE */
4697 case 0x2008: /* PUNCTUATION SPACE */
4698 case 0x2009: /* THIN SPACE */
4699 case 0x200A: /* HAIR SPACE */
4700 case 0x202f: /* NARROW NO-BREAK SPACE */
4701 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4702 case 0x3000: /* IDEOGRAPHIC SPACE */
4703 MRRETURN(MATCH_NOMATCH);
4704 }
4705 break;
4706
4707 case OP_HSPACE:
4708 switch(c)
4709 {
4710 default: MRRETURN(MATCH_NOMATCH);
4711 case 0x09: /* HT */
4712 case 0x20: /* SPACE */
4713 case 0xa0: /* NBSP */
4714 case 0x1680: /* OGHAM SPACE MARK */
4715 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4716 case 0x2000: /* EN QUAD */
4717 case 0x2001: /* EM QUAD */
4718 case 0x2002: /* EN SPACE */
4719 case 0x2003: /* EM SPACE */
4720 case 0x2004: /* THREE-PER-EM SPACE */
4721 case 0x2005: /* FOUR-PER-EM SPACE */
4722 case 0x2006: /* SIX-PER-EM SPACE */
4723 case 0x2007: /* FIGURE SPACE */
4724 case 0x2008: /* PUNCTUATION SPACE */
4725 case 0x2009: /* THIN SPACE */
4726 case 0x200A: /* HAIR SPACE */
4727 case 0x202f: /* NARROW NO-BREAK SPACE */
4728 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4729 case 0x3000: /* IDEOGRAPHIC SPACE */
4730 break;
4731 }
4732 break;
4733
4734 case OP_NOT_VSPACE:
4735 switch(c)
4736 {
4737 default: break;
4738 case 0x0a: /* LF */
4739 case 0x0b: /* VT */
4740 case 0x0c: /* FF */
4741 case 0x0d: /* CR */
4742 case 0x85: /* NEL */
4743 case 0x2028: /* LINE SEPARATOR */
4744 case 0x2029: /* PARAGRAPH SEPARATOR */
4745 MRRETURN(MATCH_NOMATCH);
4746 }
4747 break;
4748
4749 case OP_VSPACE:
4750 switch(c)
4751 {
4752 default: MRRETURN(MATCH_NOMATCH);
4753 case 0x0a: /* LF */
4754 case 0x0b: /* VT */
4755 case 0x0c: /* FF */
4756 case 0x0d: /* CR */
4757 case 0x85: /* NEL */
4758 case 0x2028: /* LINE SEPARATOR */
4759 case 0x2029: /* PARAGRAPH SEPARATOR */
4760 break;
4761 }
4762 break;
4763
4764 case OP_NOT_DIGIT:
4765 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4766 MRRETURN(MATCH_NOMATCH);
4767 break;
4768
4769 case OP_DIGIT:
4770 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4771 MRRETURN(MATCH_NOMATCH);
4772 break;
4773
4774 case OP_NOT_WHITESPACE:
4775 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4776 MRRETURN(MATCH_NOMATCH);
4777 break;
4778
4779 case OP_WHITESPACE:
4780 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4781 MRRETURN(MATCH_NOMATCH);
4782 break;
4783
4784 case OP_NOT_WORDCHAR:
4785 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4786 MRRETURN(MATCH_NOMATCH);
4787 break;
4788
4789 case OP_WORDCHAR:
4790 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4791 MRRETURN(MATCH_NOMATCH);
4792 break;
4793
4794 default:
4795 RRETURN(PCRE_ERROR_INTERNAL);
4796 }
4797 }
4798 }
4799 else
4800 #endif
4801 /* Not UTF-8 mode */
4802 {
4803 for (fi = min;; fi++)
4804 {
4805 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4807 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4808 if (eptr >= md->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 MRRETURN(MATCH_NOMATCH);
4812 }
4813 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4814 MRRETURN(MATCH_NOMATCH);
4815 c = *eptr++;
4816 switch(ctype)
4817 {
4818 case OP_ANY: /* This is the non-NL case */
4819 case OP_ALLANY:
4820 case OP_ANYBYTE:
4821 break;
4822
4823 case OP_ANYNL:
4824 switch(c)
4825 {
4826 default: MRRETURN(MATCH_NOMATCH);
4827 case 0x000d:
4828 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4829 break;
4830
4831 case 0x000a:
4832 break;
4833
4834 case 0x000b:
4835 case 0x000c:
4836 case 0x0085:
4837 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4838 break;
4839 }
4840 break;
4841
4842 case OP_NOT_HSPACE:
4843 switch(c)
4844 {
4845 default: break;
4846 case 0x09: /* HT */
4847 case 0x20: /* SPACE */
4848 case 0xa0: /* NBSP */
4849 MRRETURN(MATCH_NOMATCH);
4850 }
4851 break;
4852
4853 case OP_HSPACE:
4854 switch(c)
4855 {
4856 default: MRRETURN(MATCH_NOMATCH);
4857 case 0x09: /* HT */
4858 case 0x20: /* SPACE */
4859 case 0xa0: /* NBSP */
4860 break;
4861 }
4862 break;
4863
4864 case OP_NOT_VSPACE:
4865 switch(c)
4866 {
4867 default: break;
4868 case 0x0a: /* LF */
4869 case 0x0b: /* VT */
4870 case 0x0c: /* FF */
4871 case 0x0d: /* CR */
4872 case 0x85: /* NEL */
4873 MRRETURN(MATCH_NOMATCH);
4874 }
4875 break;
4876
4877 case OP_VSPACE:
4878 switch(c)
4879 {
4880 default: MRRETURN(MATCH_NOMATCH);
4881 case 0x0a: /* LF */
4882 case 0x0b: /* VT */
4883 case 0x0c: /* FF */
4884 case 0x0d: /* CR */
4885 case 0x85: /* NEL */
4886 break;
4887 }
4888 break;
4889
4890 case OP_NOT_DIGIT:
4891 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4892 break;
4893
4894 case OP_DIGIT:
4895 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4896 break;
4897
4898 case OP_NOT_WHITESPACE:
4899 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4900 break;
4901
4902 case OP_WHITESPACE:
4903 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4904 break;
4905
4906 case OP_NOT_WORDCHAR:
4907 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4908 break;
4909
4910 case OP_WORDCHAR:
4911 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4912 break;
4913
4914 default:
4915 RRETURN(PCRE_ERROR_INTERNAL);
4916 }
4917 }
4918 }
4919 /* Control never gets here */
4920 }
4921
4922 /* If maximizing, it is worth using inline code for speed, doing the type
4923 test once at the start (i.e. keep it out of the loop). Again, keep the
4924 UTF-8 and UCP stuff separate. */
4925
4926 else
4927 {
4928 pp = eptr; /* Remember where we started */
4929
4930 #ifdef SUPPORT_UCP
4931 if (prop_type >= 0)
4932 {
4933 switch(prop_type)
4934 {
4935 case PT_ANY:
4936 for (i = min; i < max; i++)
4937 {
4938 int len = 1;
4939 if (eptr >= md->end_subject)
4940 {
4941 SCHECK_PARTIAL();
4942 break;
4943 }
4944 GETCHARLENTEST(c, eptr, len);
4945 if (prop_fail_result) break;
4946 eptr+= len;
4947 }
4948 break;
4949
4950 case PT_LAMP:
4951 for (i = min; i < max; i++)
4952 {
4953 int len = 1;
4954 if (eptr >= md->end_subject)
4955 {
4956 SCHECK_PARTIAL();
4957 break;
4958 }
4959 GETCHARLENTEST(c, eptr, len);
4960 prop_chartype = UCD_CHARTYPE(c);
4961 if ((prop_chartype == ucp_Lu ||
4962 prop_chartype == ucp_Ll ||
4963 prop_chartype == ucp_Lt) == prop_fail_result)
4964 break;
4965 eptr+= len;
4966 }
4967 break;
4968
4969 case PT_GC:
4970 for (i = min; i < max; i++)
4971 {
4972 int len = 1;
4973 if (eptr >= md->end_subject)
4974 {
4975 SCHECK_PARTIAL();
4976 break;
4977 }
4978 GETCHARLENTEST(c, eptr, len);
4979 prop_category = UCD_CATEGORY(c);
4980 if ((prop_category == prop_value) == prop_fail_result)
4981 break;
4982 eptr+= len;
4983 }
4984 break;
4985
4986 case PT_PC:
4987 for (i = min; i < max; i++)
4988 {
4989 int len = 1;
4990 if (eptr >= md->end_subject)
4991 {
4992 SCHECK_PARTIAL();
4993 break;
4994 }
4995 GETCHARLENTEST(c, eptr, len);
4996 prop_chartype = UCD_CHARTYPE(c);
4997 if ((prop_chartype == prop_value) == prop_fail_result)
4998 break;
4999 eptr+= len;
5000 }
5001 break;
5002
5003 case PT_SC:
5004 for (i = min; i < max; i++)
5005 {
5006 int len = 1;
5007 if (eptr >= md->end_subject)
5008 {
5009 SCHECK_PARTIAL();
5010 break;
5011 }
5012 GETCHARLENTEST(c, eptr, len);
5013 prop_script = UCD_SCRIPT(c);
5014 if ((prop_script == prop_value) == prop_fail_result)
5015 break;
5016 eptr+= len;
5017 }
5018 break;
5019
5020 case PT_ALNUM:
5021 for (i = min; i < max; i++)
5022 {
5023 int len = 1;
5024 if (eptr >= md->end_subject)
5025 {
5026 SCHECK_PARTIAL();
5027 break;
5028 }
5029 GETCHARLENTEST(c, eptr, len);
5030 prop_category = UCD_CATEGORY(c);
5031 if ((prop_category == ucp_L || prop_category == ucp_N)
5032 == prop_fail_result)
5033 break;
5034 eptr+= len;
5035 }
5036 break;
5037
5038 case PT_SPACE: /* Perl space */
5039 for (i = min; i < max; i++)
5040 {
5041 int len = 1;
5042 if (eptr >= md->end_subject)
5043 {
5044 SCHECK_PARTIAL();
5045 break;
5046 }
5047 GETCHARLENTEST(c, eptr, len);
5048 prop_category = UCD_CATEGORY(c);
5049 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5050 c == CHAR_FF || c == CHAR_CR)
5051 == prop_fail_result)
5052 break;
5053 eptr+= len;
5054 }
5055 break;
5056
5057 case PT_PXSPACE: /* POSIX space */
5058 for (i = min; i < max; i++)
5059 {
5060 int len = 1;
5061 if (eptr >= md->end_subject)
5062 {
5063 SCHECK_PARTIAL();
5064 break;
5065 }
5066 GETCHARLENTEST(c, eptr, len);
5067 prop_category = UCD_CATEGORY(c);
5068 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5069 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5070 == prop_fail_result)
5071 break;
5072 eptr+= len;
5073 }
5074 break;
5075
5076 case PT_WORD:
5077 for (i = min; i < max; i++)
5078 {
5079 int len = 1;
5080 if (eptr >= md->end_subject)
5081 {
5082 SCHECK_PARTIAL();
5083 break;
5084 }
5085 GETCHARLENTEST(c, eptr, len);
5086 prop_category = UCD_CATEGORY(c);
5087 if ((prop_category == ucp_L || prop_category == ucp_N ||
5088 c == CHAR_UNDERSCORE) == prop_fail_result)
5089 break;
5090 eptr+= len;
5091 }
5092 break;
5093
5094 default:
5095 RRETURN(PCRE_ERROR_INTERNAL);
5096 }
5097
5098 /* eptr is now past the end of the maximum run */
5099
5100 if (possessive) continue;
5101 for(;;)
5102 {
5103 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5104 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5105 if (eptr-- == pp) break; /* Stop if tried at original pos */
5106 if (utf8) BACKCHAR(eptr);
5107 }
5108 }
5109
5110 /* Match extended Unicode sequences. We will get here only if the
5111 support is in the binary; otherwise a compile-time error occurs. */
5112
5113 else if (ctype == OP_EXTUNI)
5114 {
5115 for (i = min; i < max; i++)
5116 {
5117 if (eptr >= md->end_subject)
5118 {
5119 SCHECK_PARTIAL();
5120 break;
5121 }
5122 GETCHARINCTEST(c, eptr);
5123 prop_category = UCD_CATEGORY(c);
5124 if (prop_category == ucp_M) break;
5125 while (eptr < md->end_subject)
5126 {
5127 int len = 1;
5128 if (!utf8) c = *eptr; else
5129 {
5130 GETCHARLEN(c, eptr, len);
5131 }
5132 prop_category = UCD_CATEGORY(c);
5133 if (prop_category != ucp_M) break;
5134 eptr += len;
5135 }
5136 }
5137
5138 /* eptr is now past the end of the maximum run */
5139
5140 if (possessive) continue;
5141
5142 for(;;)
5143 {
5144 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5145 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5146 if (eptr-- == pp) break; /* Stop if tried at original pos */
5147 for (;;) /* Move back over one extended */
5148 {
5149 int len = 1;
5150 if (!utf8) c = *eptr; else
5151 {
5152 BACKCHAR(eptr);
5153 GETCHARLEN(c, eptr, len);
5154 }
5155 prop_category = UCD_CATEGORY(c);
5156 if (prop_category != ucp_M) break;
5157 eptr--;
5158 }
5159 }
5160 }
5161
5162 else
5163 #endif /* SUPPORT_UCP */
5164
5165 #ifdef SUPPORT_UTF8
5166 /* UTF-8 mode */
5167
5168 if (utf8)
5169 {
5170 switch(ctype)
5171 {
5172 case OP_ANY:
5173 if (max < INT_MAX)
5174 {
5175 for (i = min; i < max; i++)
5176 {
5177 if (eptr >= md->end_subject)
5178 {
5179 SCHECK_PARTIAL();
5180 break;
5181 }
5182 if (IS_NEWLINE(eptr)) break;
5183 eptr++;
5184 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5185 }
5186 }
5187
5188 /* Handle unlimited UTF-8 repeat */
5189
5190 else
5191 {
5192 for (i = min; i < max; i++)
5193 {
5194 if (eptr >= md->end_subject)
5195 {
5196 SCHECK_PARTIAL();
5197 break;
5198 }
5199 if (IS_NEWLINE(eptr)) break;
5200 eptr++;
5201 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5202 }
5203 }
5204 break;
5205
5206 case OP_ALLANY:
5207 if (max < INT_MAX)
5208 {
5209 for (i = min; i < max; i++)
5210 {
5211 if (eptr >= md->end_subject)
5212 {
5213 SCHECK_PARTIAL();
5214 break;
5215 }
5216 eptr++;
5217 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5218 }
5219 }
5220 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5221 break;
5222
5223 /* The byte case is the same as non-UTF8 */
5224
5225 case OP_ANYBYTE:
5226 c = max - min;
5227 if (c > (unsigned int)(md->end_subject - eptr))
5228 {
5229 eptr = md->end_subject;
5230 SCHECK_PARTIAL();
5231 }
5232 else eptr += c;
5233 break;
5234
5235 case OP_ANYNL:
5236 for (i = min; i < max; i++)
5237 {
5238 int len = 1;
5239 if (eptr >= md->end_subject)
5240 {
5241 SCHECK_PARTIAL();
5242 break;
5243 }
5244 GETCHARLEN(c, eptr, len);
5245 if (c == 0x000d)
5246 {
5247 if (++eptr >= md->end_subject) break;
5248 if (*eptr == 0x000a) eptr++;
5249 }
5250 else
5251 {
5252 if (c != 0x000a &&
5253 (md->bsr_anycrlf ||
5254 (c != 0x000b && c != 0x000c &&
5255 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5256 break;
5257 eptr += len;
5258 }
5259 }
5260 break;
5261
5262 case OP_NOT_HSPACE:
5263 case OP_HSPACE:
5264 for (i = min; i < max; i++)
5265 {
5266 BOOL gotspace;
5267 int len = 1;
5268 if (eptr >= md->end_subject)
5269 {
5270 SCHECK_PARTIAL();
5271 break;
5272 }
5273 GETCHARLEN(c, eptr, len);
5274 switch(c)
5275 {
5276 default: gotspace = FALSE; break;
5277 case 0x09: /* HT */
5278 case 0x20: /* SPACE */
5279 case 0xa0: /* NBSP */
5280 case 0x1680: /* OGHAM SPACE MARK */
5281 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5282 case 0x2000: /* EN QUAD */
5283 case 0x2001: /* EM QUAD */
5284 case 0x2002: /* EN SPACE */
5285 case 0x2003: /* EM SPACE */
5286 case 0x2004: /* THREE-PER-EM SPACE */
5287 case 0x2005: /* FOUR-PER-EM SPACE */
5288 case 0x2006: /* SIX-PER-EM SPACE */
5289 case 0x2007: /* FIGURE SPACE */
5290 case 0x2008: /* PUNCTUATION SPACE */
5291 case 0x2009: /* THIN SPACE */
5292 case 0x200A: /* HAIR SPACE */
5293 case 0x202f: /* NARROW NO-BREAK SPACE */
5294 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5295 case 0x3000: /* IDEOGRAPHIC SPACE */
5296 gotspace = TRUE;
5297 break;
5298 }
5299 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5300 eptr += len;
5301 }
5302 break;
5303
5304 case OP_NOT_VSPACE:
5305 case OP_VSPACE:
5306 for (i = min; i < max; i++)
5307 {
5308 BOOL gotspace;
5309 int len = 1;
5310 if (eptr >= md->end_subject)
5311 {
5312 SCHECK_PARTIAL();
5313 break;
5314 }
5315 GETCHARLEN(c, eptr, len);
5316 switch(c)
5317 {
5318 default: gotspace = FALSE; break;
5319 case 0x0a: /* LF */
5320 case 0x0b: /* VT */
5321 case 0x0c: /* FF */
5322 case 0x0d: /* CR */
5323 case 0x85: /* NEL */
5324 case 0x2028: /* LINE SEPARATOR */
5325 case 0x2029: /* PARAGRAPH SEPARATOR */
5326 gotspace = TRUE;
5327 break;
5328 }
5329 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5330 eptr += len;
5331 }
5332 break;
5333
5334 case OP_NOT_DIGIT:
5335 for (i = min; i < max; i++)
5336 {
5337 int len = 1;
5338 if (eptr >= md->end_subject)
5339 {
5340 SCHECK_PARTIAL();
5341 break;
5342 }
5343 GETCHARLEN(c, eptr, len);
5344 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5345 eptr+= len;
5346 }
5347 break;
5348
5349 case OP_DIGIT:
5350 for (i = min; i < max; i++)
5351 {
5352 int len = 1;
5353 if (eptr >= md->end_subject)
5354 {
5355 SCHECK_PARTIAL();
5356 break;
5357 }
5358 GETCHARLEN(c, eptr, len);
5359 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5360 eptr+= len;
5361 }
5362 break;
5363
5364 case OP_NOT_WHITESPACE:
5365 for (i = min; i < max; i++)
5366 {
5367 int len = 1;
5368 if (eptr >= md->end_subject)
5369 {
5370 SCHECK_PARTIAL();
5371 break;
5372 }
5373 GETCHARLEN(c, eptr, len);
5374 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5375 eptr+= len;
5376 }
5377 break;
5378
5379 case OP_WHITESPACE:
5380 for (i = min; i < max; i++)
5381 {
5382 int len = 1;
5383 if (eptr >= md->end_subject)
5384 {
5385 SCHECK_PARTIAL();
5386 break;
5387 }
5388 GETCHARLEN(c, eptr, len);
5389 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5390 eptr+= len;
5391 }
5392 break;
5393
5394 case OP_NOT_WORDCHAR:
5395 for (i = min; i < max; i++)
5396 {
5397 int len = 1;
5398 if (eptr >= md->end_subject)
5399 {
5400 SCHECK_PARTIAL();
5401 break;
5402 }
5403 GETCHARLEN(c, eptr, len);
5404 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5405 eptr+= len;
5406 }
5407 break;
5408
5409 case OP_WORDCHAR:
5410 for (i = min; i < max; i++)
5411 {
5412 int len = 1;
5413 if (eptr >= md->end_subject)
5414 {
5415 SCHECK_PARTIAL();
5416 break;
5417 }
5418 GETCHARLEN(c, eptr, len);
5419 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5420 eptr+= len;
5421 }
5422 break;
5423
5424 default:
5425 RRETURN(PCRE_ERROR_INTERNAL);
5426 }
5427
5428 /* eptr is now past the end of the maximum run. If possessive, we are
5429 done (no backing up). Otherwise, match at this position; anything other
5430 than no match is immediately returned. For nomatch, back up one
5431 character, unless we are matching \R and the last thing matched was
5432 \r\n, in which case, back up two bytes. */
5433
5434 if (possessive) continue;
5435 for(;;)
5436 {
5437 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5438 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5439 if (eptr-- == pp) break; /* Stop if tried at original pos */
5440 BACKCHAR(eptr);
5441 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5442 eptr[-1] == '\r') eptr--;
5443 }
5444 }
5445 else
5446 #endif /* SUPPORT_UTF8 */
5447
5448 /* Not UTF-8 mode */
5449 {
5450 switch(ctype)
5451 {
5452 case OP_ANY:
5453 for (i = min; i < max; i++)
5454 {
5455 if (eptr >= md->end_subject)
5456 {
5457 SCHECK_PARTIAL();
5458 break;
5459 }
5460 if (IS_NEWLINE(eptr)) break;
5461 eptr++;
5462 }
5463 break;
5464
5465 case OP_ALLANY:
5466 case OP_ANYBYTE:
5467 c = max - min;
5468 if (c > (unsigned int)(md->end_subject - eptr))
5469 {
5470 eptr = md->end_subject;
5471 SCHECK_PARTIAL();
5472 }
5473 else eptr += c;
5474 break;
5475
5476 case OP_ANYNL:
5477 for (i = min; i < max; i++)
5478 {
5479 if (eptr >= md->end_subject)
5480 {
5481 SCHECK_PARTIAL();
5482 break;
5483 }
5484 c = *eptr;
5485 if (c == 0x000d)
5486 {
5487 if (++eptr >= md->end_subject) break;
5488 if (*eptr == 0x000a) eptr++;
5489 }
5490 else
5491 {
5492 if (c != 0x000a &&
5493 (md->bsr_anycrlf ||
5494 (c != 0x000b && c != 0x000c && c != 0x0085)))
5495 break;
5496 eptr++;
5497 }
5498 }
5499 break;
5500
5501 case OP_NOT_HSPACE:
5502 for (i = min; i < max; i++)
5503 {
5504 if (eptr >= md->end_subject)
5505 {
5506 SCHECK_PARTIAL();
5507 break;
5508 }
5509 c = *eptr;
5510 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5511 eptr++;
5512 }
5513 break;
5514
5515 case OP_HSPACE:
5516 for (i = min; i < max; i++)
5517 {
5518 if (eptr >= md->end_subject)
5519 {
5520 SCHECK_PARTIAL();
5521 break;
5522 }
5523 c = *eptr;
5524 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5525 eptr++;
5526 }
5527 break;
5528
5529 case OP_NOT_VSPACE:
5530 for (i = min; i < max; i++)
5531 {
5532 if (eptr >= md->end_subject)
5533 {
5534 SCHECK_PARTIAL();
5535 break;
5536 }
5537 c = *eptr;
5538 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5539 break;
5540 eptr++;
5541 }
5542 break;
5543
5544 case OP_VSPACE:
5545 for (i = min; i < max; i++)
5546 {
5547 if (eptr >= md->end_subject)
5548 {
5549 SCHECK_PARTIAL();
5550 break;
5551 }
5552 c = *eptr;
5553 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5554 break;
5555 eptr++;
5556 }
5557 break;
5558
5559 case OP_NOT_DIGIT:
5560 for (i = min; i < max; i++)
5561 {
5562 if (eptr >= md->end_subject)
5563 {
5564 SCHECK_PARTIAL();
5565 break;
5566 }
5567 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5568 eptr++;
5569 }
5570 break;
5571
5572 case OP_DIGIT:
5573 for (i = min; i < max; i++)
5574 {
5575 if (eptr >= md->end_subject)
5576 {
5577 SCHECK_PARTIAL();
5578 break;
5579 }
5580 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5581 eptr++;
5582 }
5583 break;
5584
5585 case OP_NOT_WHITESPACE:
5586 for (i = min; i < max; i++)
5587 {
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5594 eptr++;
5595 }
5596 break;
5597
5598 case OP_WHITESPACE:
5599 for (i = min; i < max; i++)
5600 {
5601 if (eptr >= md->end_subject)
5602 {
5603 SCHECK_PARTIAL();
5604 break;
5605 }
5606 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5607 eptr++;
5608 }
5609 break;
5610
5611 case OP_NOT_WORDCHAR:
5612 for (i = min; i < max; i++)
5613 {
5614 if (eptr >= md->end_subject)
5615 {
5616 SCHECK_PARTIAL();
5617 break;
5618 }
5619 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5620 eptr++;
5621 }
5622 break;
5623
5624 case OP_WORDCHAR:
5625 for (i = min; i < max; i++)
5626 {
5627 if (eptr >= md->end_subject)
5628 {
5629 SCHECK_PARTIAL();
5630 break;
5631 }
5632 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5633 eptr++;
5634 }
5635 break;
5636
5637 default:
5638 RRETURN(PCRE_ERROR_INTERNAL);
5639 }
5640
5641 /* eptr is now past the end of the maximum run. If possessive, we are
5642 done (no backing up). Otherwise, match at this position; anything other
5643 than no match is immediately returned. For nomatch, back up one
5644 character (byte), unless we are matching \R and the last thing matched
5645 was \r\n, in which case, back up two bytes. */
5646
5647 if (possessive) continue;
5648 while (eptr >= pp)
5649 {
5650 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5652 eptr--;
5653 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5654 eptr[-1] == '\r') eptr--;
5655 }
5656 }
5657
5658 /* Get here if we can't make it match with any permitted repetitions */
5659
5660 MRRETURN(MATCH_NOMATCH);
5661 }
5662 /* Control never gets here */
5663
5664 /* There's been some horrible disaster. Arrival here can only mean there is
5665 something seriously wrong in the code above or the OP_xxx definitions. */
5666
5667 default:
5668 DPRINTF(("Unknown opcode %d\n", *ecode));
5669 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5670 }
5671
5672 /* Do not stick any code in here without much thought; it is assumed
5673 that "continue" in the code above comes out to here to repeat the main
5674 loop. */
5675
5676 } /* End of main loop */
5677 /* Control never reaches here */
5678
5679
5680 /* When compiling to use the heap rather than the stack for recursive calls to
5681 match(), the RRETURN() macro jumps here. The number that is saved in
5682 frame->Xwhere indicates which label we actually want to return to. */
5683
5684 #ifdef NO_RECURSE
5685 #define LBL(val) case val: goto L_RM##val;
5686 HEAP_RETURN:
5687 switch (frame->Xwhere)
5688 {
5689 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5690 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5691 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5692 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5693 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5694 #ifdef SUPPORT_UTF8
5695 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5696 LBL(32) LBL(34) LBL(42) LBL(46)
5697 #ifdef SUPPORT_UCP
5698 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5699 LBL(59) LBL(60) LBL(61) LBL(62)
5700 #endif /* SUPPORT_UCP */
5701 #endif /* SUPPORT_UTF8 */
5702 default:
5703 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5704 return PCRE_ERROR_INTERNAL;
5705 }
5706 #undef LBL
5707 #endif /* NO_RECURSE */
5708 }
5709
5710
5711 /***************************************************************************
5712 ****************************************************************************
5713 RECURSION IN THE match() FUNCTION
5714
5715 Undefine all the macros that were defined above to handle this. */
5716
5717 #ifdef NO_RECURSE /* heap-frame build only: the names below are macros in this case */
5718 #undef eptr /* eptr..flags: presumably match()'s argument names - TODO confirm */
5719 #undef ecode
5720 #undef mstart
5721 #undef offset_top
5722 #undef eptrb
5723 #undef flags
5724
5725 #undef callpat /* from here on: presumably match()'s local variables - TODO confirm */
5726 #undef charptr
5727 #undef data
5728 #undef next
5729 #undef pp
5730 #undef prev
5731 #undef saved_eptr
5732
5733 #undef new_recursive
5734
5735 #undef cur_is_word
5736 #undef condition
5737 #undef prev_is_word
5738
5739 #undef ctype
5740 #undef length
5741 #undef max
5742 #undef min
5743 #undef number
5744 #undef offset
5745 #undef op
5746 #undef save_capture_last
5747 #undef save_offset1
5748 #undef save_offset2
5749 #undef save_offset3
5750 #undef stacksave
5751
5752 #undef newptrb
5753
5754 #endif /* NO_RECURSE */
5755
5756 /* These two are defined as macros in both cases */
5757
5758 #undef fc
5759 #undef fi
5760
5761 /***************************************************************************
5762 ***************************************************************************/
5763
5764
5765
5766 /*************************************************
5767 * Execute a Regular Expression *
5768 *************************************************/
5769
5770 /* This function applies a compiled re to a subject string and picks out
5771 portions of the string if it matches. Two elements in the vector are set for
5772 each substring: the offsets to the start and end of the substring.
5773
5774 Arguments:
5775 argument_re points to the compiled expression
5776 extra_data points to extra data or is NULL
5777 subject points to the subject string
5778 length length of subject string (may contain binary zeros)
5779 start_offset where to start in the subject string
5780 options option bits
5781 offsets points to a vector of ints to be filled in with offsets
5782 offsetcount the number of elements in the vector
5783
5784 Returns: > 0 => success; value is one more than the highest-numbered offset pair that was set
5785 = 0 => success, but offsets is not big enough
5786 -1 => failed to match
5787 < -1 => some kind of unexpected problem
5788 */
5789
5790 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5791 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5792 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5793 int offsetcount)
5794 {
5795 int rc, resetcount, ocount;
5796 int first_byte = -1;
5797 int req_byte = -1;
5798 int req_byte2 = -1;
5799 int newline;
5800 BOOL using_temporary_offsets = FALSE;
5801 BOOL anchored;
5802 BOOL startline;
5803 BOOL firstline;
5804 BOOL first_byte_caseless = FALSE;
5805 BOOL req_byte_caseless = FALSE;
5806 BOOL utf8;
5807 match_data match_block;
5808 match_data *md = &match_block;
5809 const uschar *tables;
5810 const uschar *start_bits = NULL;
5811 USPTR start_match = (USPTR)subject + start_offset;
5812 USPTR end_subject;
5813 USPTR start_partial = NULL;
5814 USPTR req_byte_ptr = start_match - 1;
5815
5816 pcre_study_data internal_study;
5817 const pcre_study_data *study;
5818
5819 real_pcre internal_re;
5820 const real_pcre *external_re = (const real_pcre *)argument_re;
5821 const real_pcre *re = external_re;
5822
5823 /* Plausibility checks */
5824
5825 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5826 if (re == NULL || subject == NULL ||
5827 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5828 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5829 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5830
5831 /* This information is for finding all the numbers associated with a given
5832 name, for condition testing. */
5833
5834 md->name_table = (uschar *)re + re->name_table_offset;
5835 md->name_count = re->name_count;
5836 md->name_entry_size = re->name_entry_size;
5837
5838 /* Fish out the optional data from the extra_data structure, first setting
5839 the default values. */
5840
5841 study = NULL;
5842 md->match_limit = MATCH_LIMIT;
5843 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5844 md->callout_data = NULL;
5845
5846 /* The table pointer is always in native byte order. */
5847
5848 tables = external_re->tables;
5849
5850 if (extra_data != NULL)
5851 {
5852 register unsigned int flags = extra_data->flags;
5853 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5854 study = (const pcre_study_data *)extra_data->study_data;
5855 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5856 md->match_limit = extra_data->match_limit;
5857 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5858 md->match_limit_recursion = extra_data->match_limit_recursion;
5859 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5860 md->callout_data = extra_data->callout_data;
5861 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5862 }
5863
5864 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5865 is a feature that makes it possible to save compiled regexes and re-use them
5866 in other programs later. */
5867
5868 if (tables == NULL) tables = _pcre_default_tables;
5869
5870 /* Check that the first field in the block is the magic number. If it is not,
5871 test for a regex that was compiled on a host of opposite endianness. If this is
5872 the case, flipped values are put in internal_re and internal_study if there was
5873 study data too. */
5874
5875 if (re->magic_number != MAGIC_NUMBER)
5876 {
5877 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5878 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5879 if (study != NULL) study = &internal_study;
5880 }
5881
5882 /* Set up other data */
5883
5884 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5885 startline = (re->flags & PCRE_STARTLINE) != 0;
5886 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5887
5888 /* The code starts after the real_pcre block and the capture name table. */
5889
5890 md->start_code = (const uschar *)external_re + re->name_table_offset +
5891 re->name_count * re->name_entry_size;
5892
5893 md->start_subject = (USPTR)subject;
5894 md->start_offset = start_offset;
5895 md->end_subject = md->start_subject + length;
5896 end_subject = md->end_subject;
5897
5898 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5899 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5900 md->use_ucp = (re->options & PCRE_UCP) != 0;
5901 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5902
5903 /* Some options are unpacked into BOOL variables in the hope that testing
5904 them will be faster than individual option bits. */
5905
5906 md->notbol = (options & PCRE_NOTBOL) != 0;
5907 md->noteol = (options & PCRE_NOTEOL) != 0;
5908 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5909 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5910 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5911 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5912
5913
5914 md->hitend = FALSE;
5915 md->mark = NULL; /* In case never set */
5916
5917 md->recursive = NULL; /* No recursion at top level */
5918
5919 md->lcc = tables + lcc_offset;
5920 md->ctypes = tables + ctypes_offset;
5921
5922 /* Handle different \R options. */
5923
5924 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5925 {
5926 case 0:
5927 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5928 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5929 else
5930 #ifdef BSR_ANYCRLF
5931 md->bsr_anycrlf = TRUE;
5932 #else
5933 md->bsr_anycrlf = FALSE;
5934 #endif
5935 break;
5936
5937 case PCRE_BSR_ANYCRLF:
5938 md->bsr_anycrlf = TRUE;
5939 break;
5940
5941 case PCRE_BSR_UNICODE:
5942 md->bsr_anycrlf = FALSE;
5943 break;
5944
5945 default: return PCRE_ERROR_BADNEWLINE;
5946 }
5947
5948 /* Handle different types of newline. The three bits give eight cases. If
5949 nothing is set at run time, whatever was used at compile time applies. */
5950
5951 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5952 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5953 {
5954 case 0: newline = NEWLINE; break; /* Compile-time default */
5955 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5956 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5957 case PCRE_NEWLINE_CR+
5958 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5959 case PCRE_NEWLINE_ANY: newline = -1; break;
5960 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5961 default: return PCRE_ERROR_BADNEWLINE;
5962 }
5963
5964 if (newline == -2)
5965 {
5966 md->nltype = NLTYPE_ANYCRLF;
5967 }
5968 else if (newline < 0)
5969 {
5970 md->nltype = NLTYPE_ANY;
5971 }
5972 else
5973 {
5974 md->nltype = NLTYPE_FIXED;
5975 if (newline > 255)
5976 {
5977 md->nllen = 2;
5978 md->nl[0] = (newline >> 8) & 255;
5979 md->nl[1] = newline & 255;
5980 }
5981 else
5982 {
5983 md->nllen = 1;
5984 md->nl[0] = newline;
5985 }
5986 }
5987
5988 /* Partial matching was originally supported only for a restricted set of
5989 regexes; from release 8.00 there are no restrictions, but the bits are still
5990 defined (though never set). So there's no harm in leaving this code. */
5991
5992 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5993 return PCRE_ERROR_BADPARTIAL;
5994
5995 /* Check a UTF-8 string if required. Pass back the character offset and error
5996 code for an invalid string if a results vector is available. */
5997
5998 #ifdef SUPPORT_UTF8
5999 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6000 {
6001 int erroroffset;
6002 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
6003 if (errorcode != 0)
6004 {
6005 if (offsetcount >= 2)
6006 {
6007 offsets[0] = erroroffset;
6008 offsets[1] = errorcode;
6009 }
6010 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6011 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6012 }
6013
6014 /* Check that a start_offset points to the start of a UTF-8 character. */
6015
6016 if (start_offset > 0 && start_offset < length &&
6017 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6018 return PCRE_ERROR_BADUTF8_OFFSET;
6019 }
6020 #endif
6021
6022 /* If the expression has got more back references than the offsets supplied can
6023 hold, we get a temporary chunk of working store to use during the matching.
6024 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6025 of 3. */
6026
6027 ocount = offsetcount - (offsetcount % 3);
6028
6029 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6030 {
6031 ocount = re->top_backref * 3 + 3;
6032 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6033 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6034 using_temporary_offsets = TRUE;
6035 DPRINTF(("Got memory to hold back references\n"));
6036 }
6037 else md->offset_vector = offsets;
6038
6039 md->offset_end = ocount;
6040 md->offset_max = (2*ocount)/3;
6041 md->offset_overflow = FALSE;
6042 md->capture_last = -1;
6043
6044 /* Compute the minimum number of offsets that we need to reset each time. Doing
6045 this makes a huge difference to execution time when there aren't many brackets
6046 in the pattern. */
6047
6048 resetcount = 2 + re->top_bracket * 2;
6049 if (resetcount > offsetcount) resetcount = ocount;
6050
6051 /* Reset the working variable associated with each extraction. These should
6052 never be used unless previously set, but they get saved and restored, and so we
6053 initialize them to avoid reading uninitialized locations. */
6054
6055 if (md->offset_vector != NULL)
6056 {
6057 register int *iptr = md->offset_vector + ocount;
6058 register int *iend = iptr - resetcount/2 + 1;
6059 while (--iptr >= iend) *iptr = -1;
6060 }
6061
6062 /* Set up the first character to match, if available. The first_byte value is
6063 never set for an anchored regular expression, but the anchoring may be forced
6064 at run time, so we have to test for anchoring. The first char may be unset for
6065 an unanchored pattern, of course. If there's no first char and the pattern was
6066 studied, there may be a bitmap of possible first characters. */
6067
6068 if (!anchored)
6069 {
6070 if ((re->flags & PCRE_FIRSTSET) != 0)
6071 {
6072 first_byte = re->first_byte & 255;
6073 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6074 first_byte = md->lcc[first_byte];
6075 }
6076 else
6077 if (!startline && study != NULL &&
6078 (study->flags & PCRE_STUDY_MAPPED) != 0)
6079 start_bits = study->start_bits;
6080 }
6081
6082 /* For anchored or unanchored matches, there may be a "last known required
6083 character" set. */
6084
6085 if ((re->flags & PCRE_REQCHSET) != 0)
6086 {
6087 req_byte = re->req_byte & 255;
6088 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6089 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6090 }
6091
6092
6093 /* ==========================================================================*/
6094
6095 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6096 the loop runs just once. */
6097
6098 for(;;)
6099 {
6100 USPTR save_end_subject = end_subject;
6101 USPTR new_start_match;
6102
6103 /* Reset the maximum number of extractions we might see. */
6104
6105 if (md->offset_vector != NULL)
6106 {
6107 register int *iptr = md->offset_vector;
6108 register int *iend = iptr + resetcount;
6109 while (iptr < iend) *iptr++ = -1;
6110 }
6111
6112 /* If firstline is TRUE, the start of the match is constrained to the first
6113 line of a multiline string. That is, the match must be before or at the first
6114 newline. Implement this by temporarily adjusting end_subject so that we stop
6115 scanning at a newline. If the match fails at the newline, later code breaks
6116 this loop. */
6117
6118 if (firstline)
6119 {
6120 USPTR t = start_match;
6121 #ifdef SUPPORT_UTF8
6122 if (utf8)
6123 {
6124 while (t < md->end_subject && !IS_NEWLINE(t))
6125 {
6126 t++;
6127 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6128 }
6129 }
6130 else
6131 #endif
6132 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6133 end_subject = t;
6134 }
6135
6136 /* There are some optimizations that avoid running the match if a known
6137 starting point is not found, or if a known later character is not present.
6138 However, there is an option that disables these, for testing and for ensuring
6139 that all callouts do actually occur. The option can be set in the regex by
6140 (*NO_START_OPT) or passed in match-time options. */
6141
6142 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6143 {
6144 /* Advance to a unique first byte if there is one. */
6145
6146 if (first_byte >= 0)
6147 {
6148 if (first_byte_caseless)
6149 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6150 start_match++;
6151 else
6152 while (start_match < end_subject && *start_match != first_byte)
6153 start_match++;
6154 }
6155
6156 /* Or to just after a linebreak for a multiline match */
6157
6158 else if (startline)
6159 {
6160 if (start_match > md->start_subject + start_offset)
6161 {
6162 #ifdef SUPPORT_UTF8
6163 if (utf8)
6164 {
6165 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6166 {
6167 start_match++;
6168 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6169 start_match++;
6170 }
6171 }
6172 else
6173 #endif
6174 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6175 start_match++;
6176
6177 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6178 and we are now at a LF, advance the match position by one more character.
6179 */
6180
6181 if (start_match[-1] == CHAR_CR &&
6182 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6183 start_match < end_subject &&
6184 *start_match == CHAR_NL)
6185 start_match++;
6186 }
6187 }
6188
6189 /* Or to a non-unique first byte after study */
6190
6191 else if (start_bits != NULL)
6192 {
6193 while (start_match < end_subject)
6194 {
6195 register unsigned int c = *start_match;
6196 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6197 {
6198 start_match++;
6199 #ifdef SUPPORT_UTF8
6200 if (utf8)
6201 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6202 start_match++;
6203 #endif
6204 }
6205 else break;
6206 }
6207 }
6208 } /* Starting optimizations */
6209
6210 /* Restore fudged end_subject */
6211
6212 end_subject = save_end_subject;
6213
6214 /* The following two optimizations are disabled for partial matching or if
6215 disabling is explicitly requested. */
6216
6217 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6218 {
6219 /* If the pattern was studied, a minimum subject length may be set. This is
6220 a lower bound; there may be no string of exactly that length that matches the
6221 pattern. Although the value is, strictly, in characters, we treat it as
6222 bytes to avoid spending too much time in this optimization. */
6223
6224 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6225 (pcre_uint32)(end_subject - start_match) < study->minlength)
6226 {
6227 rc = MATCH_NOMATCH;
6228 break;
6229 }
6230
6231 /* If req_byte is set, we know that that character must appear in the
6232 subject for the match to succeed. If the first character is set, req_byte
6233 must be later in the subject; otherwise the test starts at the match point.
6234 This optimization can save a huge amount of backtracking in patterns with
6235 nested unlimited repeats that aren't going to match. Writing separate code
6236 for cased/caseless versions makes it go faster, as does using an
6237 autoincrement and backing off on a match.
6238
6239 HOWEVER: when the subject string is very, very long, searching to its end
6240 can take a long time, and give bad performance on quite ordinary patterns.
6241 This showed up when somebody was matching something like /^\d+C/ on a
6242 32-megabyte string... so we don't do this when the string is sufficiently
6243 long. */
6244
6245 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6246 {
6247 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6248
6249 /* We don't need to repeat the search if we haven't yet reached the
6250 place we found it at last time. */
6251
6252 if (p > req_byte_ptr)
6253 {
6254 if (req_byte_caseless)
6255 {
6256 while (p < end_subject)
6257 {
6258 register int pp = *p++;
6259 if (pp == req_byte || pp == req_byte2) { p--; break; }
6260 }
6261 }
6262 else
6263 {
6264 while (p < end_subject)
6265 {
6266 if (*p++ == req_byte) { p--; break; }
6267 }
6268 }
6269
6270 /* If we can't find the required character, break the matching loop,
6271 forcing a match failure. */
6272
6273 if (p >= end_subject)
6274 {
6275 rc = MATCH_NOMATCH;
6276 break;
6277 }
6278
6279 /* If we have found the required character, save the point where we
6280 found it, so that we don't search again next time round the loop if
6281 the start hasn't passed this character yet. */
6282
6283 req_byte_ptr = p;
6284 }
6285 }
6286 }
6287
6288 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6289 printf(">>>> Match against: ");
6290 pchars(start_match, end_subject - start_match, TRUE, md);
6291 printf("\n");
6292 #endif
6293
6294 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6295 first starting point for which a partial match was found. */
6296
6297 md->start_match_ptr = start_match;
6298 md->start_used_ptr = start_match;
6299 md->match_call_count = 0;
6300 md->match_function_type = 0;
6301 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6302 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6303
6304 switch(rc)
6305 {
6306 /* SKIP passes back the next starting point explicitly, but if it is the
6307 same as the match we have just done, treat it as NOMATCH. */
6308
6309 case MATCH_SKIP:
6310 if (md->start_match_ptr != start_match)
6311 {
6312 new_start_match = md->start_match_ptr;
6313 break;
6314 }
6315 /* Fall through */
6316
6317 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6318 the SKIP's arg was not found. We also treat this as NOMATCH. */
6319
6320 case MATCH_SKIP_ARG:
6321 /* Fall through */
6322
6323 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6324 exactly like PRUNE. */
6325
6326 case MATCH_NOMATCH:
6327 case MATCH_PRUNE:
6328 case MATCH_THEN:
6329 new_start_match = start_match + 1;
6330 #ifdef SUPPORT_UTF8
6331 if (utf8)
6332 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6333 new_start_match++;
6334 #endif
6335 break;
6336
6337 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6338
6339 case MATCH_COMMIT:
6340 rc = MATCH_NOMATCH;
6341 goto ENDLOOP;
6342
6343 /* Any other return is either a match, or some kind of error. */
6344
6345 default:
6346 goto ENDLOOP;
6347 }
6348
6349 /* Control reaches here for the various types of "no match at this point"
6350 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6351
6352 rc = MATCH_NOMATCH;
6353
6354 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6355 newline in the subject (though it may continue over the newline). Therefore,
6356 if we have just failed to match, starting at a newline, do not continue. */
6357
6358 if (firstline && IS_NEWLINE(start_match)) break;
6359
6360 /* Advance to new matching position */
6361
6362 start_match = new_start_match;
6363
6364 /* Break the loop if the pattern is anchored or if we have passed the end of
6365 the subject. */
6366
6367 if (anchored || start_match > end_subject) break;
6368
6369 /* If we have just passed a CR and we are now at a LF, and the pattern does
6370 not contain any explicit matches for \r or \n, and the newline option is CRLF
6371 or ANY or ANYCRLF, advance the match position by one more character. */
6372
6373 if (start_match[-1] == CHAR_CR &&
6374 start_match < end_subject &&
6375 *start_match == CHAR_NL &&
6376 (re->flags & PCRE_HASCRORLF) == 0 &&
6377 (md->nltype == NLTYPE_ANY ||
6378 md->nltype == NLTYPE_ANYCRLF ||
6379 md->nllen == 2))
6380 start_match++;
6381
6382 md->mark = NULL; /* Reset for start of next match attempt */
6383 } /* End of for(;;) "bumpalong" loop */
6384
6385 /* ==========================================================================*/
6386
6387 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6388 conditions is true:
6389
6390 (1) The pattern is anchored or the match was failed by (*COMMIT);
6391
6392 (2) We are past the end of the subject;
6393
6394 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6395 this option requests that a match occur at or before the first newline in
6396 the subject.
6397
6398 When we have a match and the offset vector is big enough to deal with any
6399 backreferences, captured substring offsets will already be set up. In the case
6400 where we had to get some local store to hold offsets for backreference
6401 processing, copy those that we can. In this case there need not be overflow if
6402 certain parts of the pattern were not used, even though there are more
6403 capturing parentheses than vector slots. */
6404
6405 ENDLOOP:
6406
6407 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6408 {
6409 if (using_temporary_offsets)
6410 {
6411 if (offsetcount >= 4)
6412 {
6413 memcpy(offsets + 2, md->offset_vector + 2,
6414 (offsetcount - 2) * sizeof(int));
6415 DPRINTF(("Copied offsets from temporary memory\n"));
6416 }
6417 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6418 DPRINTF(("Freeing temporary memory\n"));
6419 (pcre_free)(md->offset_vector);
6420 }
6421
6422 /* Set the return code to the number of captured strings, or 0 if there are
6423 too many to fit into the vector. */
6424
6425 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6426
6427 /* If there is space, set up the whole thing as substring 0. The value of
6428 md->start_match_ptr might be modified if \K was encountered on the successful
6429 matching path. */
6430
6431 if (offsetcount < 2) rc = 0; else
6432 {
6433 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6434 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6435 }
6436
6437 DPRINTF((">>>> returning %d\n", rc));
6438 goto RETURN_MARK;
6439 }
6440
6441 /* Control gets here if there has been an error, or if the overall match
6442 attempt has failed at all permitted starting positions. */
6443
6444 if (using_temporary_offsets)
6445 {
6446 DPRINTF(("Freeing temporary memory\n"));
6447 (pcre_free)(md->offset_vector);
6448 }
6449
6450 /* For anything other than nomatch or partial match, just return the code. */
6451
6452 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6453 {
6454 DPRINTF((">>>> error: returning %d\n", rc));
6455 return rc;
6456 }
6457
6458 /* Handle partial matches - disable any mark data */
6459
6460 if (start_partial != NULL)
6461 {
6462 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6463 md->mark = NULL;
6464 if (offsetcount > 1)
6465 {
6466 offsets[0] = (int)(start_partial - (USPTR)subject);
6467 offsets[1] = (int)(end_subject - (USPTR)subject);
6468 }
6469 rc = PCRE_ERROR_PARTIAL;
6470 }
6471
6472 /* This is the classic nomatch case */
6473
6474 else
6475 {
6476 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6477 rc = PCRE_ERROR_NOMATCH;
6478 }
6479
6480 /* Return the MARK data if it has been requested. */
6481
6482 RETURN_MARK:
6483
6484 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6485 *(extra_data->mark) = (unsigned char *)(md->mark);
6486 return rc;
6487 }
6488
6489 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5