/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 609 - (show annotations)
Wed Jun 15 18:09:23 2011 UTC (8 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 194331 byte(s)
Fix bug with /\A.*?(?:a|b(*THEN)c)/ by removing the tail recursion optimization 
for the final branch. Also fix a similar bug for conditional subpatterns.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_PRUNE (-996)
80 #define MATCH_SKIP (-995)
81 #define MATCH_SKIP_ARG (-994)
82 #define MATCH_THEN (-993)
83
84 /* This is a convenience macro for code that occurs many times. */
85
86 #define MRRETURN(ra) \
87 { \
88 md->mark = markptr; \
89 RRETURN(ra); \
90 }
91
92 /* Maximum number of ints of offset to save on the stack for recursive calls.
93 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94 because the offset vector is always a multiple of 3 long. */
95
96 #define REC_STACK_SAVE_MAX 30
97
98 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99
100 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102
103
104
105 #ifdef PCRE_DEBUG
106 /*************************************************
107 * Debugging function to print chars *
108 *************************************************/
109
110 /* Print a sequence of chars in printable format, stopping at the end of the
111 subject if so requested.
112
113 Arguments:
114 p points to characters
115 length number to print
116 is_subject TRUE if printing from within md->start_subject
117 md pointer to matching data block, if is_subject is TRUE
118
119 Returns: nothing
120 */
121
122 static void
123 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124 {
125 unsigned int c;
126 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127 while (length-- > 0)
128 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129 }
130 #endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
270 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271 below must be updated in sync. */
272
273 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 RM61, RM62, RM63};
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
286 #define REGISTER register
287
288 #ifdef PCRE_DEBUG
289 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 { \
291 printf("match() called in line %d\n", __LINE__); \
292 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 printf("to line %d\n", __LINE__); \
294 }
295 #define RRETURN(ra) \
296 { \
297 printf("match() returned %d from line %d ", ra, __LINE__); \
298 return ra; \
299 }
300 #else
301 #define RMATCH(ra,rb,rc,rd,re,rw) \
302 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 #define RRETURN(ra) return ra
304 #endif
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. Note that
310 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311 argument of match(), which never changes. */
312
313 #define REGISTER
314
315 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 {\
317 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 frame->Xwhere = rw; \
320 newframe->Xeptr = ra;\
321 newframe->Xecode = rb;\
322 newframe->Xmstart = mstart;\
323 newframe->Xmarkptr = markptr;\
324 newframe->Xoffset_top = rc;\
325 newframe->Xeptrb = re;\
326 newframe->Xrdepth = frame->Xrdepth + 1;\
327 newframe->Xprevframe = frame;\
328 frame = newframe;\
329 DPRINTF(("restarting from line %d\n", __LINE__));\
330 goto HEAP_RECURSE;\
331 L_##rw:\
332 DPRINTF(("jumped back to line %d\n", __LINE__));\
333 }
334
335 #define RRETURN(ra)\
336 {\
337 heapframe *oldframe = frame;\
338 frame = oldframe->Xprevframe;\
339 (pcre_stack_free)(oldframe);\
340 if (frame != NULL)\
341 {\
342 rrc = ra;\
343 goto HEAP_RETURN;\
344 }\
345 return ra;\
346 }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
351 typedef struct heapframe {
352 struct heapframe *Xprevframe;
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr;
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth;
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word;
379 BOOL Xcondition;
380 BOOL Xprev_is_word;
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink;
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb;
408
409 /* Where to jump back to */
410
411 int Xwhere;
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past the start of the subject (i.e.
434 something has been matched). For hard partial matching, we then return
435 immediately. The second one is used when we already know we are past the end of
436 the subject. */
437
438 #define CHECK_PARTIAL()\
439 if (md->partial != 0 && eptr >= md->end_subject && \
440 eptr > md->start_used_ptr) \
441 { \
442 md->hitend = TRUE; \
443 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 }
445
446 #define SCHECK_PARTIAL()\
447 if (md->partial != 0 && eptr > md->start_used_ptr) \
448 { \
449 md->hitend = TRUE; \
450 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to in order to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. If we fail to match, we need to restore this value and also the
812 values of the final offsets, in case they were set by a previous iteration
813 of the same bracket.
814
815 If there isn't enough space in the offset vector, treat this as if it were
816 a non-capturing bracket. Don't worry about setting the flag for the error
817 case here; that is handled in the code for KET. */
818
819 case OP_CBRA:
820 case OP_SCBRA:
821 number = GET2(ecode, 1+LINK_SIZE);
822 offset = number << 1;
823
824 #ifdef PCRE_DEBUG
825 printf("start bracket %d\n", number);
826 printf("subject=");
827 pchars(eptr, 16, TRUE, md);
828 printf("\n");
829 #endif
830
831 if (offset < md->offset_max)
832 {
833 save_offset1 = md->offset_vector[offset];
834 save_offset2 = md->offset_vector[offset+1];
835 save_offset3 = md->offset_vector[md->offset_end - number];
836 save_capture_last = md->capture_last;
837
838 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 md->offset_vector[md->offset_end - number] =
840 (int)(eptr - md->start_subject);
841
842 for (;;)
843 {
844 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846 eptrb, RM1);
847 if (rrc != MATCH_NOMATCH &&
848 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849 RRETURN(rrc);
850 md->capture_last = save_capture_last;
851 ecode += GET(ecode, 1);
852 if (*ecode != OP_ALT) break;
853 }
854
855 DPRINTF(("bracket %d failed\n", number));
856
857 md->offset_vector[offset] = save_offset1;
858 md->offset_vector[offset+1] = save_offset2;
859 md->offset_vector[md->offset_end - number] = save_offset3;
860
861 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
862 RRETURN(MATCH_NOMATCH);
863 }
864
865 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866 as a non-capturing bracket. */
867
868 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870
871 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877 for all the alternatives. When we get to the final alternative within the
878 brackets, we used to return the result of a recursive call to match()
879 whatever happened so it was possible to reduce stack usage by turning this
880 into a tail recursion, except in the case of a possibly empty group.
881 However, now that there is the possibility of (*THEN) occurring in the final
882 alternative, this optimization is no longer possible. */
883
884 case OP_BRA:
885 case OP_SBRA:
886 DPRINTF(("start non-capturing bracket\n"));
887 for (;;)
888 {
889 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
890 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
891 RM2);
892 if (rrc != MATCH_NOMATCH &&
893 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
894 RRETURN(rrc);
895 ecode += GET(ecode, 1);
896 if (*ecode != OP_ALT) break;
897 }
898
899 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
900 RRETURN(MATCH_NOMATCH);
901
902 /* Handle possessive capturing brackets with an unlimited repeat. We come
903 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
904 handled similarly to the normal case above. However, the matching is
905 different. The end of these brackets will always be OP_KETRPOS, which
906 returns MATCH_KETRPOS without going further in the pattern. By this means
907 we can handle the group by iteration rather than recursion, thereby
908 reducing the amount of stack needed. */
909
910 case OP_CBRAPOS:
911 case OP_SCBRAPOS:
912 allow_zero = FALSE;
913
914 POSSESSIVE_CAPTURE:
915 number = GET2(ecode, 1+LINK_SIZE);
916 offset = number << 1;
917
918 #ifdef PCRE_DEBUG
919 printf("start possessive bracket %d\n", number);
920 printf("subject=");
921 pchars(eptr, 16, TRUE, md);
922 printf("\n");
923 #endif
924
925 if (offset < md->offset_max)
926 {
927 matched_once = FALSE;
928 code_offset = ecode - md->start_code;
929
930 save_offset1 = md->offset_vector[offset];
931 save_offset2 = md->offset_vector[offset+1];
932 save_offset3 = md->offset_vector[md->offset_end - number];
933 save_capture_last = md->capture_last;
934
935 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
936
937 /* Each time round the loop, save the current subject position for use
938 when the group matches. For MATCH_MATCH, the group has matched, so we
939 restart it with a new subject starting position, remembering that we had
940 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
941 usual. If we haven't matched any alternatives in any iteration, check to
942 see if a previous iteration matched. If so, the group has matched;
943 continue from afterwards. Otherwise it has failed; restore the previous
944 capture values before returning NOMATCH. */
945
946 for (;;)
947 {
948 md->offset_vector[md->offset_end - number] =
949 (int)(eptr - md->start_subject);
950 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
951 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
952 eptrb, RM63);
953 if (rrc == MATCH_KETRPOS)
954 {
955 offset_top = md->end_offset_top;
956 eptr = md->end_match_ptr;
957 ecode = md->start_code + code_offset;
958 save_capture_last = md->capture_last;
959 matched_once = TRUE;
960 continue;
961 }
962 if (rrc != MATCH_NOMATCH &&
963 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
964 RRETURN(rrc);
965 md->capture_last = save_capture_last;
966 ecode += GET(ecode, 1);
967 if (*ecode != OP_ALT) break;
968 }
969
970 if (!matched_once)
971 {
972 md->offset_vector[offset] = save_offset1;
973 md->offset_vector[offset+1] = save_offset2;
974 md->offset_vector[md->offset_end - number] = save_offset3;
975 }
976
977 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
978 if (allow_zero || matched_once)
979 {
980 ecode += 1 + LINK_SIZE;
981 break;
982 }
983
984 RRETURN(MATCH_NOMATCH);
985 }
986
987 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
988 as a non-capturing bracket. */
989
990 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
991 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
992
993 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
994
995 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
997
998 /* Non-capturing possessive bracket with unlimited repeat. We come here
999 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1000 without the capturing complication. It is written out separately for speed
1001 and cleanliness. */
1002
1003 case OP_BRAPOS:
1004 case OP_SBRAPOS:
1005 allow_zero = FALSE;
1006
1007 POSSESSIVE_NON_CAPTURE:
1008 matched_once = FALSE;
1009 code_offset = ecode - md->start_code;
1010
1011 for (;;)
1012 {
1013 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1014 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1015 eptrb, RM48);
1016 if (rrc == MATCH_KETRPOS)
1017 {
1018 eptr = md->end_match_ptr;
1019 ecode = md->start_code + code_offset;
1020 matched_once = TRUE;
1021 continue;
1022 }
1023 if (rrc != MATCH_NOMATCH &&
1024 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1025 RRETURN(rrc);
1026 ecode += GET(ecode, 1);
1027 if (*ecode != OP_ALT) break;
1028 }
1029
1030 if (matched_once || allow_zero)
1031 {
1032 ecode += 1 + LINK_SIZE;
1033 break;
1034 }
1035 RRETURN(MATCH_NOMATCH);
1036
1037 /* Control never reaches here. */
1038
1039 /* Conditional group: compilation checked that there are no more than
1040 two branches. If the condition is false, skipping the first branch takes us
1041 past the end if there is only one branch, but that's OK because that is
1042 exactly what going to the ket would do. */
1043
1044 case OP_COND:
1045 case OP_SCOND:
1046 codelink = GET(ecode, 1);
1047
1048 /* Because of the way auto-callout works during compile, a callout item is
1049 inserted between OP_COND and an assertion condition. */
1050
1051 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1052 {
1053 if (pcre_callout != NULL)
1054 {
1055 pcre_callout_block cb;
1056 cb.version = 1; /* Version 1 of the callout block */
1057 cb.callout_number = ecode[LINK_SIZE+2];
1058 cb.offset_vector = md->offset_vector;
1059 cb.subject = (PCRE_SPTR)md->start_subject;
1060 cb.subject_length = (int)(md->end_subject - md->start_subject);
1061 cb.start_match = (int)(mstart - md->start_subject);
1062 cb.current_position = (int)(eptr - md->start_subject);
1063 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1064 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1065 cb.capture_top = offset_top/2;
1066 cb.capture_last = md->capture_last;
1067 cb.callout_data = md->callout_data;
1068 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1069 if (rrc < 0) RRETURN(rrc);
1070 }
1071 ecode += _pcre_OP_lengths[OP_CALLOUT];
1072 }
1073
1074 condcode = ecode[LINK_SIZE+1];
1075
1076 /* Now see what the actual condition is */
1077
1078 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1079 {
1080 if (md->recursive == NULL) /* Not recursing => FALSE */
1081 {
1082 condition = FALSE;
1083 ecode += GET(ecode, 1);
1084 }
1085 else
1086 {
1087 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1088 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1089
1090 /* If the test is for recursion into a specific subpattern, and it is
1091 false, but the test was set up by name, scan the table to see if the
1092 name refers to any other numbers, and test them. The condition is true
1093 if any one is set. */
1094
1095 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1096 {
1097 uschar *slotA = md->name_table;
1098 for (i = 0; i < md->name_count; i++)
1099 {
1100 if (GET2(slotA, 0) == recno) break;
1101 slotA += md->name_entry_size;
1102 }
1103
1104 /* Found a name for the number - there can be only one; duplicate
1105 names for different numbers are allowed, but not vice versa. First
1106 scan down for duplicates. */
1107
1108 if (i < md->name_count)
1109 {
1110 uschar *slotB = slotA;
1111 while (slotB > md->name_table)
1112 {
1113 slotB -= md->name_entry_size;
1114 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1115 {
1116 condition = GET2(slotB, 0) == md->recursive->group_num;
1117 if (condition) break;
1118 }
1119 else break;
1120 }
1121
1122 /* Scan up for duplicates */
1123
1124 if (!condition)
1125 {
1126 slotB = slotA;
1127 for (i++; i < md->name_count; i++)
1128 {
1129 slotB += md->name_entry_size;
1130 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1131 {
1132 condition = GET2(slotB, 0) == md->recursive->group_num;
1133 if (condition) break;
1134 }
1135 else break;
1136 }
1137 }
1138 }
1139 }
1140
1141 /* Choose branch according to the condition */
1142
1143 ecode += condition? 3 : GET(ecode, 1);
1144 }
1145 }
1146
1147 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1148 {
1149 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1150 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1151
1152 /* If the numbered capture is unset, but the reference was by name,
1153 scan the table to see if the name refers to any other numbers, and test
1154 them. The condition is true if any one is set. This is tediously similar
1155 to the code above, but not close enough to try to amalgamate. */
1156
1157 if (!condition && condcode == OP_NCREF)
1158 {
1159 int refno = offset >> 1;
1160 uschar *slotA = md->name_table;
1161
1162 for (i = 0; i < md->name_count; i++)
1163 {
1164 if (GET2(slotA, 0) == refno) break;
1165 slotA += md->name_entry_size;
1166 }
1167
1168 /* Found a name for the number - there can be only one; duplicate names
1169 for different numbers are allowed, but not vice versa. First scan down
1170 for duplicates. */
1171
1172 if (i < md->name_count)
1173 {
1174 uschar *slotB = slotA;
1175 while (slotB > md->name_table)
1176 {
1177 slotB -= md->name_entry_size;
1178 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1179 {
1180 offset = GET2(slotB, 0) << 1;
1181 condition = offset < offset_top &&
1182 md->offset_vector[offset] >= 0;
1183 if (condition) break;
1184 }
1185 else break;
1186 }
1187
1188 /* Scan up for duplicates */
1189
1190 if (!condition)
1191 {
1192 slotB = slotA;
1193 for (i++; i < md->name_count; i++)
1194 {
1195 slotB += md->name_entry_size;
1196 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1197 {
1198 offset = GET2(slotB, 0) << 1;
1199 condition = offset < offset_top &&
1200 md->offset_vector[offset] >= 0;
1201 if (condition) break;
1202 }
1203 else break;
1204 }
1205 }
1206 }
1207 }
1208
1209 /* Choose branch according to the condition */
1210
1211 ecode += condition? 3 : GET(ecode, 1);
1212 }
1213
1214 else if (condcode == OP_DEF) /* DEFINE - always false */
1215 {
1216 condition = FALSE;
1217 ecode += GET(ecode, 1);
1218 }
1219
1220 /* The condition is an assertion. Call match() to evaluate it - setting
1221 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1222 an assertion. */
1223
1224 else
1225 {
1226 md->match_function_type = MATCH_CONDASSERT;
1227 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1228 if (rrc == MATCH_MATCH)
1229 {
1230 condition = TRUE;
1231 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1232 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1233 }
1234 else if (rrc != MATCH_NOMATCH &&
1235 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1236 {
1237 RRETURN(rrc); /* Need braces because of following else */
1238 }
1239 else
1240 {
1241 condition = FALSE;
1242 ecode += codelink;
1243 }
1244 }
1245
1246 /* We are now at the branch that is to be obeyed. As there is only one,
1247 we used to use tail recursion to avoid using another stack frame, except
1248 when there was unlimited repeat of a possibly empty group. However, that
1249 strategy no longer works because of the possibility of (*THEN) being
1250 encountered in the branch. A recursive call to match() is always required,
1251 unless the second alternative doesn't exist, in which case we can just
1252 plough on. */
1253
1254 if (condition || *ecode == OP_ALT)
1255 {
1256 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1257 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1258 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1259 rrc = MATCH_NOMATCH;
1260 RRETURN(rrc);
1261 }
1262 else /* Condition false & no alternative */
1263 {
1264 ecode += 1 + LINK_SIZE;
1265 }
1266 break;
1267
1268
1269 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1270 to close any currently open capturing brackets. */
1271
1272 case OP_CLOSE:
1273 number = GET2(ecode, 1);
1274 offset = number << 1;
1275
1276 #ifdef PCRE_DEBUG
1277 printf("end bracket %d at *ACCEPT", number);
1278 printf("\n");
1279 #endif
1280
1281 md->capture_last = number;
1282 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1283 {
1284 md->offset_vector[offset] =
1285 md->offset_vector[md->offset_end - number];
1286 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1287 if (offset_top <= offset) offset_top = offset + 2;
1288 }
1289 ecode += 3;
1290 break;
1291
1292
1293 /* End of the pattern, either real or forced. If we are in a recursion, we
1294 should restore the offsets appropriately, and if it's a top-level
1295 recursion, continue from after the call. */
1296
1297 case OP_ACCEPT:
1298 case OP_END:
1299 if (md->recursive != NULL)
1300 {
1301 recursion_info *rec = md->recursive;
1302 md->recursive = rec->prevrec;
1303 memmove(md->offset_vector, rec->offset_save,
1304 rec->saved_max * sizeof(int));
1305 offset_top = rec->save_offset_top;
1306 if (rec->group_num == 0)
1307 {
1308 ecode = rec->after_call;
1309 break;
1310 }
1311 }
1312
1313 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1314 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1315 the subject. In both cases, backtracking will then try other alternatives,
1316 if any. */
1317
1318 else if (eptr == mstart &&
1319 (md->notempty ||
1320 (md->notempty_atstart &&
1321 mstart == md->start_subject + md->start_offset)))
1322 MRRETURN(MATCH_NOMATCH);
1323
1324 /* Otherwise, we have a match. */
1325
1326 md->end_match_ptr = eptr; /* Record where we ended */
1327 md->end_offset_top = offset_top; /* and how many extracts were taken */
1328 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1329
1330 /* For some reason, the macros don't work properly if an expression is
1331 given as the argument to MRRETURN when the heap is in use. */
1332
1333 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1334 MRRETURN(rrc);
1335
1336 /* Assertion brackets. Check the alternative branches in turn - the
1337 matching won't pass the KET for an assertion. If any one branch matches,
1338 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1339 start of each branch to move the current point backwards, so the code at
1340 this level is identical to the lookahead case. When the assertion is part
1341 of a condition, we want to return immediately afterwards. The caller of
1342 this incarnation of the match() function will have set MATCH_CONDASSERT in
1343 md->match_function_type, and one of these opcodes will be the first opcode
1344 that is processed. We use a local variable that is preserved over calls to
1345 match() to remember this case. */
1346
1347 case OP_ASSERT:
1348 case OP_ASSERTBACK:
1349 if (md->match_function_type == MATCH_CONDASSERT)
1350 {
1351 condassert = TRUE;
1352 md->match_function_type = 0;
1353 }
1354 else condassert = FALSE;
1355
1356 do
1357 {
1358 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1359 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1360 {
1361 mstart = md->start_match_ptr; /* In case \K reset it */
1362 break;
1363 }
1364 if (rrc != MATCH_NOMATCH &&
1365 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1366 RRETURN(rrc);
1367 ecode += GET(ecode, 1);
1368 }
1369 while (*ecode == OP_ALT);
1370
1371 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1372
1373 /* If checking an assertion for a condition, return MATCH_MATCH. */
1374
1375 if (condassert) RRETURN(MATCH_MATCH);
1376
1377 /* Continue from after the assertion, updating the offsets high water
1378 mark, since extracts may have been taken during the assertion. */
1379
1380 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1381 ecode += 1 + LINK_SIZE;
1382 offset_top = md->end_offset_top;
1383 continue;
1384
1385 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1386 PRUNE, or COMMIT means we must assume failure without checking subsequent
1387 branches. */
1388
1389 case OP_ASSERT_NOT:
1390 case OP_ASSERTBACK_NOT:
1391 if (md->match_function_type == MATCH_CONDASSERT)
1392 {
1393 condassert = TRUE;
1394 md->match_function_type = 0;
1395 }
1396 else condassert = FALSE;
1397
1398 do
1399 {
1400 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1401 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1402 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1403 {
1404 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1405 break;
1406 }
1407 if (rrc != MATCH_NOMATCH &&
1408 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1409 RRETURN(rrc);
1410 ecode += GET(ecode,1);
1411 }
1412 while (*ecode == OP_ALT);
1413
1414 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1415
1416 ecode += 1 + LINK_SIZE;
1417 continue;
1418
1419 /* Move the subject pointer back. This occurs only at the start of
1420 each branch of a lookbehind assertion. If we are too close to the start to
1421 move back, this match function fails. When working with UTF-8 we move
1422 back a number of characters, not bytes. */
1423
1424 case OP_REVERSE:
1425 #ifdef SUPPORT_UTF8
1426 if (utf8)
1427 {
1428 i = GET(ecode, 1);
1429 while (i-- > 0)
1430 {
1431 eptr--;
1432 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1433 BACKCHAR(eptr);
1434 }
1435 }
1436 else
1437 #endif
1438
1439 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1440
1441 {
1442 eptr -= GET(ecode, 1);
1443 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1444 }
1445
1446 /* Save the earliest consulted character, then skip to next op code */
1447
1448 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1449 ecode += 1 + LINK_SIZE;
1450 break;
1451
1452 /* The callout item calls an external function, if one is provided, passing
1453 details of the match so far. This is mainly for debugging, though the
1454 function is able to force a failure. */
1455
1456 case OP_CALLOUT:
1457 if (pcre_callout != NULL)
1458 {
1459 pcre_callout_block cb;
1460 cb.version = 1; /* Version 1 of the callout block */
1461 cb.callout_number = ecode[1];
1462 cb.offset_vector = md->offset_vector;
1463 cb.subject = (PCRE_SPTR)md->start_subject;
1464 cb.subject_length = (int)(md->end_subject - md->start_subject);
1465 cb.start_match = (int)(mstart - md->start_subject);
1466 cb.current_position = (int)(eptr - md->start_subject);
1467 cb.pattern_position = GET(ecode, 2);
1468 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1469 cb.capture_top = offset_top/2;
1470 cb.capture_last = md->capture_last;
1471 cb.callout_data = md->callout_data;
1472 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1473 if (rrc < 0) RRETURN(rrc);
1474 }
1475 ecode += 2 + 2*LINK_SIZE;
1476 break;
1477
1478 /* Recursion either matches the current regex, or some subexpression. The
1479 offset data is the offset to the starting bracket from the start of the
1480 whole pattern. (This is so that it works from duplicated subpatterns.)
1481
1482 If there are any capturing brackets started but not finished, we have to
1483 save their starting points and reinstate them after the recursion. However,
1484 we don't know how many such there are (offset_top records the completed
1485 total) so we just have to save all the potential data. There may be up to
1486 65535 such values, which is too large to put on the stack, but using malloc
1487 for small numbers seems expensive. As a compromise, the stack is used when
1488 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1489 is used. If the malloc fails, this call of match() gives up at once and
1490 returns PCRE_ERROR_NOMEMORY; no attempt is made to carry on with only part
1491 of the offset data saved.
1492
1493 There are also other values that have to be saved. We use a chained
1494 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1495 for the original version of this logic. */
1496
1497 case OP_RECURSE:
1498 {
1499 callpat = md->start_code + GET(ecode, 1);
1500 new_recursive.group_num = (callpat == md->start_code)? 0 :
1501 GET2(callpat, 1 + LINK_SIZE);
1502
1503 /* Add to "recursing stack" */
1504
1505 new_recursive.prevrec = md->recursive;
1506 md->recursive = &new_recursive;
1507
1508 /* Find where to continue from afterwards */
1509
1510 ecode += 1 + LINK_SIZE;
1511 new_recursive.after_call = ecode;
1512
1513 /* Now save the offset data. */
1514
1515 new_recursive.saved_max = md->offset_end;
1516 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1517 new_recursive.offset_save = stacksave;
1518 else
1519 {
1520 new_recursive.offset_save =
1521 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1522 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1523 }
1524
1525 memcpy(new_recursive.offset_save, md->offset_vector,
1526 new_recursive.saved_max * sizeof(int));
1527 new_recursive.save_offset_top = offset_top;
1528
1529 /* OK, now we can do the recursion. For each top-level alternative we
1530 restore the offset and recursion data. */
1531
1532 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1533 cbegroup = (*callpat >= OP_SBRA);
1534 do
1535 {
1536 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1537 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1538 md, eptrb, RM6);
1539 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1540 {
1541 DPRINTF(("Recursion matched\n"));
1542 md->recursive = new_recursive.prevrec;
1543 if (new_recursive.offset_save != stacksave)
1544 (pcre_free)(new_recursive.offset_save);
1545 MRRETURN(MATCH_MATCH);
1546 }
1547 else if (rrc != MATCH_NOMATCH &&
1548 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1549 {
1550 DPRINTF(("Recursion gave error %d\n", rrc));
1551 if (new_recursive.offset_save != stacksave)
1552 (pcre_free)(new_recursive.offset_save);
1553 RRETURN(rrc);
1554 }
1555
1556 md->recursive = &new_recursive;
1557 memcpy(md->offset_vector, new_recursive.offset_save,
1558 new_recursive.saved_max * sizeof(int));
1559 callpat += GET(callpat, 1);
1560 }
1561 while (*callpat == OP_ALT);
1562
1563 DPRINTF(("Recursion didn't match\n"));
1564 md->recursive = new_recursive.prevrec;
1565 if (new_recursive.offset_save != stacksave)
1566 (pcre_free)(new_recursive.offset_save);
1567 MRRETURN(MATCH_NOMATCH);
1568 }
1569 /* Control never reaches here */
1570
1571 /* "Once" brackets are like assertion brackets except that after a match,
1572 the point in the subject string is not moved back. Thus there can never be
1573 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1574 Check the alternative branches in turn - the matching won't pass the KET
1575 for this kind of subpattern. If any one branch matches, we carry on as at
1576 the end of a normal bracket, leaving the subject pointer, but resetting
1577 the start-of-match value in case it was changed by \K. */
1578
1579 case OP_ONCE:
1580 prev = ecode;
1581 saved_eptr = eptr;
1582
1583 do
1584 {
1585 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1586 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1587 {
1588 mstart = md->start_match_ptr;
1589 break;
1590 }
1591 if (rrc != MATCH_NOMATCH &&
1592 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1593 RRETURN(rrc);
1594 ecode += GET(ecode,1);
1595 }
1596 while (*ecode == OP_ALT);
1597
1598 /* If hit the end of the group (which could be repeated), fail */
1599
1600 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1601
1602 /* Continue as from after the assertion, updating the offsets high water
1603 mark, since extracts may have been taken. */
1604
1605 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1606
1607 offset_top = md->end_offset_top;
1608 eptr = md->end_match_ptr;
1609
1610 /* For a non-repeating ket, just continue at this level. This also
1611 happens for a repeating ket if no characters were matched in the group.
1612 This is the forcible breaking of infinite loops as implemented in Perl
1613 5.005. If there is an options reset, it will get obeyed in the normal
1614 course of events. */
1615
1616 if (*ecode == OP_KET || eptr == saved_eptr)
1617 {
1618 ecode += 1+LINK_SIZE;
1619 break;
1620 }
1621
1622 /* The repeating kets try the rest of the pattern or restart from the
1623 preceding bracket, in the appropriate order. The second "call" of match()
1624 uses tail recursion, to avoid using another stack frame. */
1625
1626 if (*ecode == OP_KETRMIN)
1627 {
1628 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1630 ecode = prev;
1631 goto TAIL_RECURSE;
1632 }
1633 else /* OP_KETRMAX */
1634 {
1635 md->match_function_type = MATCH_CBEGROUP;
1636 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1638 ecode += 1 + LINK_SIZE;
1639 goto TAIL_RECURSE;
1640 }
1641 /* Control never gets here */
1642
1643 /* An alternation is the end of a branch; scan along to find the end of the
1644 bracketed group and go to there. */
1645
1646 case OP_ALT:
1647 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1648 break;
1649
1650 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1651 indicating that it may occur zero times. It may repeat infinitely, or not
1652 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1653 with fixed upper repeat limits are compiled as a number of copies, with the
1654 optional ones preceded by BRAZERO or BRAMINZERO. */
1655
1656 case OP_BRAZERO:
1657 next = ecode + 1;
1658 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1660 do next += GET(next, 1); while (*next == OP_ALT);
1661 ecode = next + 1 + LINK_SIZE;
1662 break;
1663
1664 case OP_BRAMINZERO:
1665 next = ecode + 1;
1666 do next += GET(next, 1); while (*next == OP_ALT);
1667 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1669 ecode++;
1670 break;
1671
1672 case OP_SKIPZERO:
1673 next = ecode+1;
1674 do next += GET(next,1); while (*next == OP_ALT);
1675 ecode = next + 1 + LINK_SIZE;
1676 break;
1677
1678 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1679 here; just jump to the group, with allow_zero set TRUE. */
1680
1681 case OP_BRAPOSZERO:
1682 op = *(++ecode);
1683 allow_zero = TRUE;
1684 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1685 goto POSSESSIVE_NON_CAPTURE;
1686
1687 /* End of a group, repeated or non-repeating. */
1688
1689 case OP_KET:
1690 case OP_KETRMIN:
1691 case OP_KETRMAX:
1692 case OP_KETRPOS:
1693 prev = ecode - GET(ecode, 1);
1694
1695 /* If this was a group that remembered the subject start, in order to break
1696 infinite repeats of empty string matches, retrieve the subject start from
1697 the chain. Otherwise, set it NULL. */
1698
1699 if (*prev >= OP_SBRA)
1700 {
1701 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1702 eptrb = eptrb->epb_prev; /* Backup to previous group */
1703 }
1704 else saved_eptr = NULL;
1705
1706 /* If we are at the end of an assertion group or an atomic group, stop
1707 matching and return MATCH_MATCH, but record the current high water mark for
1708 use by positive assertions. We also need to record the match start in case
1709 it was changed by \K. */
1710
1711 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1712 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1713 *prev == OP_ONCE)
1714 {
1715 md->end_match_ptr = eptr; /* For ONCE */
1716 md->end_offset_top = offset_top;
1717 md->start_match_ptr = mstart;
1718 MRRETURN(MATCH_MATCH);
1719 }
1720
1721 /* For capturing groups we have to check the group number back at the start
1722 and if necessary complete handling an extraction by setting the offsets and
1723 bumping the high water mark. Note that whole-pattern recursion is coded as
1724 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1725 when the OP_END is reached. Other recursion is handled here. */
1726
1727 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1728 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1729 {
1730 number = GET2(prev, 1+LINK_SIZE);
1731 offset = number << 1;
1732
1733 #ifdef PCRE_DEBUG
1734 printf("end bracket %d", number);
1735 printf("\n");
1736 #endif
1737
1738 md->capture_last = number;
1739 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1740 {
1741 md->offset_vector[offset] =
1742 md->offset_vector[md->offset_end - number];
1743 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1744 if (offset_top <= offset) offset_top = offset + 2;
1745 }
1746
1747 /* Handle a recursively called group. Restore the offsets
1748 appropriately and continue from after the call. */
1749
1750 if (md->recursive != NULL && md->recursive->group_num == number)
1751 {
1752 recursion_info *rec = md->recursive;
1753 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1754 md->recursive = rec->prevrec;
1755 memcpy(md->offset_vector, rec->offset_save,
1756 rec->saved_max * sizeof(int));
1757 offset_top = rec->save_offset_top;
1758 ecode = rec->after_call;
1759 break;
1760 }
1761 }
1762
1763 /* For a non-repeating ket, just continue at this level. This also
1764 happens for a repeating ket if no characters were matched in the group.
1765 This is the forcible breaking of infinite loops as implemented in Perl
1766 5.005. If there is an options reset, it will get obeyed in the normal
1767 course of events. */
1768
1769 if (*ecode == OP_KET || eptr == saved_eptr)
1770 {
1771 ecode += 1 + LINK_SIZE;
1772 break;
1773 }
1774
1775 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1776 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1777 at a time from the outer level, thus saving stack. */
1778
1779 if (*ecode == OP_KETRPOS)
1780 {
1781 md->end_match_ptr = eptr;
1782 md->end_offset_top = offset_top;
1783 RRETURN(MATCH_KETRPOS);
1784 }
1785
1786 /* The normal repeating kets try the rest of the pattern or restart from
1787 the preceding bracket, in the appropriate order. In the second case, we can
1788 use tail recursion to avoid using another stack frame, unless we have an
1789 unlimited repeat of a group that can match an empty string. */
1790
1791 if (*ecode == OP_KETRMIN)
1792 {
1793 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1795 if (*prev >= OP_SBRA) /* Could match an empty string */
1796 {
1797 md->match_function_type = MATCH_CBEGROUP;
1798 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1799 RRETURN(rrc);
1800 }
1801 ecode = prev;
1802 goto TAIL_RECURSE;
1803 }
1804 else /* OP_KETRMAX */
1805 {
1806 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1807 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1809 ecode += 1 + LINK_SIZE;
1810 goto TAIL_RECURSE;
1811 }
1812 /* Control never gets here */
1813
1814 /* Not multiline mode: start of subject assertion, unless notbol. */
1815
1816 case OP_CIRC:
1817 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1818
1819 /* Start of subject assertion */
1820
1821 case OP_SOD:
1822 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1823 ecode++;
1824 break;
1825
1826 /* Multiline mode: start of subject unless notbol, or after any newline. */
1827
1828 case OP_CIRCM:
1829 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1830 if (eptr != md->start_subject &&
1831 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1832 MRRETURN(MATCH_NOMATCH);
1833 ecode++;
1834 break;
1835
1836 /* Start of match assertion */
1837
1838 case OP_SOM:
1839 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1840 ecode++;
1841 break;
1842
1843 /* Reset the start of match point */
1844
1845 case OP_SET_SOM:
1846 mstart = eptr;
1847 ecode++;
1848 break;
1849
1850 /* Multiline mode: assert before any newline, or before end of subject
1851 unless noteol is set. */
1852
1853 case OP_DOLLM:
1854 if (eptr < md->end_subject)
1855 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1856 else
1857 {
1858 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1859 SCHECK_PARTIAL();
1860 }
1861 ecode++;
1862 break;
1863
1864 /* Not multiline mode: assert before a terminating newline or before end of
1865 subject unless noteol is set. */
1866
1867 case OP_DOLL:
1868 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1869 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1870
1871 /* ... else fall through for endonly */
1872
1873 /* End of subject assertion (\z) */
1874
1875 case OP_EOD:
1876 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1877 SCHECK_PARTIAL();
1878 ecode++;
1879 break;
1880
1881 /* End of subject or ending \n assertion (\Z) */
1882
1883 case OP_EODN:
1884 ASSERT_NL_OR_EOS:
1885 if (eptr < md->end_subject &&
1886 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1887 MRRETURN(MATCH_NOMATCH);
1888
1889 /* Either at end of string or \n before end. */
1890
1891 SCHECK_PARTIAL();
1892 ecode++;
1893 break;
1894
1895 /* Word boundary assertions */
1896
1897 case OP_NOT_WORD_BOUNDARY:
1898 case OP_WORD_BOUNDARY:
1899 {
1900
1901 /* Find out if the previous and current characters are "word" characters.
1902 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1903 be "non-word" characters. Remember the earliest consulted character for
1904 partial matching. */
1905
1906 #ifdef SUPPORT_UTF8
1907 if (utf8)
1908 {
1909 /* Get status of previous character */
1910
1911 if (eptr == md->start_subject) prev_is_word = FALSE; else
1912 {
1913 USPTR lastptr = eptr - 1;
1914 while((*lastptr & 0xc0) == 0x80) lastptr--;
1915 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1916 GETCHAR(c, lastptr);
1917 #ifdef SUPPORT_UCP
1918 if (md->use_ucp)
1919 {
1920 if (c == '_') prev_is_word = TRUE; else
1921 {
1922 int cat = UCD_CATEGORY(c);
1923 prev_is_word = (cat == ucp_L || cat == ucp_N);
1924 }
1925 }
1926 else
1927 #endif
1928 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1929 }
1930
1931 /* Get status of next character */
1932
1933 if (eptr >= md->end_subject)
1934 {
1935 SCHECK_PARTIAL();
1936 cur_is_word = FALSE;
1937 }
1938 else
1939 {
1940 GETCHAR(c, eptr);
1941 #ifdef SUPPORT_UCP
1942 if (md->use_ucp)
1943 {
1944 if (c == '_') cur_is_word = TRUE; else
1945 {
1946 int cat = UCD_CATEGORY(c);
1947 cur_is_word = (cat == ucp_L || cat == ucp_N);
1948 }
1949 }
1950 else
1951 #endif
1952 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1953 }
1954 }
1955 else
1956 #endif
1957
1958 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1959 consistency with the behaviour of \w we do use it in this case. */
1960
1961 {
1962 /* Get status of previous character */
1963
1964 if (eptr == md->start_subject) prev_is_word = FALSE; else
1965 {
1966 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1967 #ifdef SUPPORT_UCP
1968 if (md->use_ucp)
1969 {
1970 c = eptr[-1];
1971 if (c == '_') prev_is_word = TRUE; else
1972 {
1973 int cat = UCD_CATEGORY(c);
1974 prev_is_word = (cat == ucp_L || cat == ucp_N);
1975 }
1976 }
1977 else
1978 #endif
1979 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1980 }
1981
1982 /* Get status of next character */
1983
1984 if (eptr >= md->end_subject)
1985 {
1986 SCHECK_PARTIAL();
1987 cur_is_word = FALSE;
1988 }
1989 else
1990 #ifdef SUPPORT_UCP
1991 if (md->use_ucp)
1992 {
1993 c = *eptr;
1994 if (c == '_') cur_is_word = TRUE; else
1995 {
1996 int cat = UCD_CATEGORY(c);
1997 cur_is_word = (cat == ucp_L || cat == ucp_N);
1998 }
1999 }
2000 else
2001 #endif
2002 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2003 }
2004
2005 /* Now see if the situation is what we want */
2006
2007 if ((*ecode++ == OP_WORD_BOUNDARY)?
2008 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2009 MRRETURN(MATCH_NOMATCH);
2010 }
2011 break;
2012
2013 /* Match a single character type; inline for speed */
2014
2015 case OP_ANY:
2016 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2017 /* Fall through */
2018
2019 case OP_ALLANY:
2020 if (eptr++ >= md->end_subject)
2021 {
2022 SCHECK_PARTIAL();
2023 MRRETURN(MATCH_NOMATCH);
2024 }
2025 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2026 ecode++;
2027 break;
2028
2029 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2030 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2031
2032 case OP_ANYBYTE:
2033 if (eptr++ >= md->end_subject)
2034 {
2035 SCHECK_PARTIAL();
2036 MRRETURN(MATCH_NOMATCH);
2037 }
2038 ecode++;
2039 break;
2040
2041 case OP_NOT_DIGIT:
2042 if (eptr >= md->end_subject)
2043 {
2044 SCHECK_PARTIAL();
2045 MRRETURN(MATCH_NOMATCH);
2046 }
2047 GETCHARINCTEST(c, eptr);
2048 if (
2049 #ifdef SUPPORT_UTF8
2050 c < 256 &&
2051 #endif
2052 (md->ctypes[c] & ctype_digit) != 0
2053 )
2054 MRRETURN(MATCH_NOMATCH);
2055 ecode++;
2056 break;
2057
2058 case OP_DIGIT:
2059 if (eptr >= md->end_subject)
2060 {
2061 SCHECK_PARTIAL();
2062 MRRETURN(MATCH_NOMATCH);
2063 }
2064 GETCHARINCTEST(c, eptr);
2065 if (
2066 #ifdef SUPPORT_UTF8
2067 c >= 256 ||
2068 #endif
2069 (md->ctypes[c] & ctype_digit) == 0
2070 )
2071 MRRETURN(MATCH_NOMATCH);
2072 ecode++;
2073 break;
2074
2075 case OP_NOT_WHITESPACE:
2076 if (eptr >= md->end_subject)
2077 {
2078 SCHECK_PARTIAL();
2079 MRRETURN(MATCH_NOMATCH);
2080 }
2081 GETCHARINCTEST(c, eptr);
2082 if (
2083 #ifdef SUPPORT_UTF8
2084 c < 256 &&
2085 #endif
2086 (md->ctypes[c] & ctype_space) != 0
2087 )
2088 MRRETURN(MATCH_NOMATCH);
2089 ecode++;
2090 break;
2091
2092 case OP_WHITESPACE:
2093 if (eptr >= md->end_subject)
2094 {
2095 SCHECK_PARTIAL();
2096 MRRETURN(MATCH_NOMATCH);
2097 }
2098 GETCHARINCTEST(c, eptr);
2099 if (
2100 #ifdef SUPPORT_UTF8
2101 c >= 256 ||
2102 #endif
2103 (md->ctypes[c] & ctype_space) == 0
2104 )
2105 MRRETURN(MATCH_NOMATCH);
2106 ecode++;
2107 break;
2108
2109 case OP_NOT_WORDCHAR:
2110 if (eptr >= md->end_subject)
2111 {
2112 SCHECK_PARTIAL();
2113 MRRETURN(MATCH_NOMATCH);
2114 }
2115 GETCHARINCTEST(c, eptr);
2116 if (
2117 #ifdef SUPPORT_UTF8
2118 c < 256 &&
2119 #endif
2120 (md->ctypes[c] & ctype_word) != 0
2121 )
2122 MRRETURN(MATCH_NOMATCH);
2123 ecode++;
2124 break;
2125
2126 case OP_WORDCHAR:
2127 if (eptr >= md->end_subject)
2128 {
2129 SCHECK_PARTIAL();
2130 MRRETURN(MATCH_NOMATCH);
2131 }
2132 GETCHARINCTEST(c, eptr);
2133 if (
2134 #ifdef SUPPORT_UTF8
2135 c >= 256 ||
2136 #endif
2137 (md->ctypes[c] & ctype_word) == 0
2138 )
2139 MRRETURN(MATCH_NOMATCH);
2140 ecode++;
2141 break;
2142
2143 case OP_ANYNL:
2144 if (eptr >= md->end_subject)
2145 {
2146 SCHECK_PARTIAL();
2147 MRRETURN(MATCH_NOMATCH);
2148 }
2149 GETCHARINCTEST(c, eptr);
2150 switch(c)
2151 {
2152 default: MRRETURN(MATCH_NOMATCH);
2153
2154 case 0x000d:
2155 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2156 break;
2157
2158 case 0x000a:
2159 break;
2160
2161 case 0x000b:
2162 case 0x000c:
2163 case 0x0085:
2164 case 0x2028:
2165 case 0x2029:
2166 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2167 break;
2168 }
2169 ecode++;
2170 break;
2171
2172 case OP_NOT_HSPACE:
2173 if (eptr >= md->end_subject)
2174 {
2175 SCHECK_PARTIAL();
2176 MRRETURN(MATCH_NOMATCH);
2177 }
2178 GETCHARINCTEST(c, eptr);
2179 switch(c)
2180 {
2181 default: break;
2182 case 0x09: /* HT */
2183 case 0x20: /* SPACE */
2184 case 0xa0: /* NBSP */
2185 case 0x1680: /* OGHAM SPACE MARK */
2186 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2187 case 0x2000: /* EN QUAD */
2188 case 0x2001: /* EM QUAD */
2189 case 0x2002: /* EN SPACE */
2190 case 0x2003: /* EM SPACE */
2191 case 0x2004: /* THREE-PER-EM SPACE */
2192 case 0x2005: /* FOUR-PER-EM SPACE */
2193 case 0x2006: /* SIX-PER-EM SPACE */
2194 case 0x2007: /* FIGURE SPACE */
2195 case 0x2008: /* PUNCTUATION SPACE */
2196 case 0x2009: /* THIN SPACE */
2197 case 0x200A: /* HAIR SPACE */
2198 case 0x202f: /* NARROW NO-BREAK SPACE */
2199 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2200 case 0x3000: /* IDEOGRAPHIC SPACE */
2201 MRRETURN(MATCH_NOMATCH);
2202 }
2203 ecode++;
2204 break;
2205
2206 case OP_HSPACE:
2207 if (eptr >= md->end_subject)
2208 {
2209 SCHECK_PARTIAL();
2210 MRRETURN(MATCH_NOMATCH);
2211 }
2212 GETCHARINCTEST(c, eptr);
2213 switch(c)
2214 {
2215 default: MRRETURN(MATCH_NOMATCH);
2216 case 0x09: /* HT */
2217 case 0x20: /* SPACE */
2218 case 0xa0: /* NBSP */
2219 case 0x1680: /* OGHAM SPACE MARK */
2220 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2221 case 0x2000: /* EN QUAD */
2222 case 0x2001: /* EM QUAD */
2223 case 0x2002: /* EN SPACE */
2224 case 0x2003: /* EM SPACE */
2225 case 0x2004: /* THREE-PER-EM SPACE */
2226 case 0x2005: /* FOUR-PER-EM SPACE */
2227 case 0x2006: /* SIX-PER-EM SPACE */
2228 case 0x2007: /* FIGURE SPACE */
2229 case 0x2008: /* PUNCTUATION SPACE */
2230 case 0x2009: /* THIN SPACE */
2231 case 0x200A: /* HAIR SPACE */
2232 case 0x202f: /* NARROW NO-BREAK SPACE */
2233 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2234 case 0x3000: /* IDEOGRAPHIC SPACE */
2235 break;
2236 }
2237 ecode++;
2238 break;
2239
2240 case OP_NOT_VSPACE:
2241 if (eptr >= md->end_subject)
2242 {
2243 SCHECK_PARTIAL();
2244 MRRETURN(MATCH_NOMATCH);
2245 }
2246 GETCHARINCTEST(c, eptr);
2247 switch(c)
2248 {
2249 default: break;
2250 case 0x0a: /* LF */
2251 case 0x0b: /* VT */
2252 case 0x0c: /* FF */
2253 case 0x0d: /* CR */
2254 case 0x85: /* NEL */
2255 case 0x2028: /* LINE SEPARATOR */
2256 case 0x2029: /* PARAGRAPH SEPARATOR */
2257 MRRETURN(MATCH_NOMATCH);
2258 }
2259 ecode++;
2260 break;
2261
2262 case OP_VSPACE:
2263 if (eptr >= md->end_subject)
2264 {
2265 SCHECK_PARTIAL();
2266 MRRETURN(MATCH_NOMATCH);
2267 }
2268 GETCHARINCTEST(c, eptr);
2269 switch(c)
2270 {
2271 default: MRRETURN(MATCH_NOMATCH);
2272 case 0x0a: /* LF */
2273 case 0x0b: /* VT */
2274 case 0x0c: /* FF */
2275 case 0x0d: /* CR */
2276 case 0x85: /* NEL */
2277 case 0x2028: /* LINE SEPARATOR */
2278 case 0x2029: /* PARAGRAPH SEPARATOR */
2279 break;
2280 }
2281 ecode++;
2282 break;
2283
2284 #ifdef SUPPORT_UCP
2285 /* Check the next character by Unicode property. We will get here only
2286 if the support is in the binary; otherwise a compile-time error occurs. */
2287
2288 case OP_PROP:
2289 case OP_NOTPROP:
2290 if (eptr >= md->end_subject)
2291 {
2292 SCHECK_PARTIAL();
2293 MRRETURN(MATCH_NOMATCH);
2294 }
2295 GETCHARINCTEST(c, eptr);
2296 {
2297 const ucd_record *prop = GET_UCD(c);
2298
2299 switch(ecode[1])
2300 {
2301 case PT_ANY:
2302 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2303 break;
2304
2305 case PT_LAMP:
2306 if ((prop->chartype == ucp_Lu ||
2307 prop->chartype == ucp_Ll ||
2308 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2309 MRRETURN(MATCH_NOMATCH);
2310 break;
2311
2312 case PT_GC:
2313 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2314 MRRETURN(MATCH_NOMATCH);
2315 break;
2316
2317 case PT_PC:
2318 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2319 MRRETURN(MATCH_NOMATCH);
2320 break;
2321
2322 case PT_SC:
2323 if ((ecode[2] != prop->script) == (op == OP_PROP))
2324 MRRETURN(MATCH_NOMATCH);
2325 break;
2326
2327 /* These are specials */
2328
2329 case PT_ALNUM:
2330 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2331 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2332 MRRETURN(MATCH_NOMATCH);
2333 break;
2334
2335 case PT_SPACE: /* Perl space */
2336 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2337 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2338 == (op == OP_NOTPROP))
2339 MRRETURN(MATCH_NOMATCH);
2340 break;
2341
2342 case PT_PXSPACE: /* POSIX space */
2343 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2344 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2345 c == CHAR_FF || c == CHAR_CR)
2346 == (op == OP_NOTPROP))
2347 MRRETURN(MATCH_NOMATCH);
2348 break;
2349
2350 case PT_WORD:
2351 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2352 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2353 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2354 MRRETURN(MATCH_NOMATCH);
2355 break;
2356
2357 /* This should never occur */
2358
2359 default:
2360 RRETURN(PCRE_ERROR_INTERNAL);
2361 }
2362
2363 ecode += 3;
2364 }
2365 break;
2366
2367 /* Match an extended Unicode sequence. We will get here only if the support
2368 is in the binary; otherwise a compile-time error occurs. */
2369
2370 case OP_EXTUNI:
2371 if (eptr >= md->end_subject)
2372 {
2373 SCHECK_PARTIAL();
2374 MRRETURN(MATCH_NOMATCH);
2375 }
2376 GETCHARINCTEST(c, eptr);
2377 {
2378 int category = UCD_CATEGORY(c);
2379 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2380 while (eptr < md->end_subject)
2381 {
2382 int len = 1;
2383 if (!utf8) c = *eptr; else
2384 {
2385 GETCHARLEN(c, eptr, len);
2386 }
2387 category = UCD_CATEGORY(c);
2388 if (category != ucp_M) break;
2389 eptr += len;
2390 }
2391 }
2392 ecode++;
2393 break;
2394 #endif
2395
2396
2397 /* Match a back reference, possibly repeatedly. Look past the end of the
2398 item to see if there is repeat information following. The code is similar
2399 to that for character classes, but repeated for efficiency. Then obey
2400 similar code to character type repeats - written out again for speed.
2401 However, if the referenced string is the empty string, always treat
2402 it as matched, any number of times (otherwise there could be infinite
2403 loops). */
2404
2405 case OP_REF:
2406 case OP_REFI:
2407 caseless = op == OP_REFI;
2408 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2409 ecode += 3;
2410
2411 /* If the reference is unset, there are two possibilities:
2412
2413 (a) In the default, Perl-compatible state, set the length negative;
2414 this ensures that every attempt at a match fails. We can't just fail
2415 here, because of the possibility of quantifiers with zero minima.
2416
2417 (b) If the JavaScript compatibility flag is set, set the length to zero
2418 so that the back reference matches an empty string.
2419
2420 Otherwise, set the length to the length of what was matched by the
2421 referenced subpattern. */
2422
2423 if (offset >= offset_top || md->offset_vector[offset] < 0)
2424 length = (md->jscript_compat)? 0 : -1;
2425 else
2426 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2427
2428 /* Set up for repetition, or handle the non-repeated case */
2429
2430 switch (*ecode)
2431 {
2432 case OP_CRSTAR:
2433 case OP_CRMINSTAR:
2434 case OP_CRPLUS:
2435 case OP_CRMINPLUS:
2436 case OP_CRQUERY:
2437 case OP_CRMINQUERY:
2438 c = *ecode++ - OP_CRSTAR;
2439 minimize = (c & 1) != 0;
2440 min = rep_min[c]; /* Pick up values from tables; */
2441 max = rep_max[c]; /* zero for max => infinity */
2442 if (max == 0) max = INT_MAX;
2443 break;
2444
2445 case OP_CRRANGE:
2446 case OP_CRMINRANGE:
2447 minimize = (*ecode == OP_CRMINRANGE);
2448 min = GET2(ecode, 1);
2449 max = GET2(ecode, 3);
2450 if (max == 0) max = INT_MAX;
2451 ecode += 5;
2452 break;
2453
2454 default: /* No repeat follows */
2455 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2456 {
2457 CHECK_PARTIAL();
2458 MRRETURN(MATCH_NOMATCH);
2459 }
2460 eptr += length;
2461 continue; /* With the main loop */
2462 }
2463
2464 /* Handle repeated back references. If the length of the reference is
2465 zero, just continue with the main loop. */
2466
2467 if (length == 0) continue;
2468
2469 /* First, ensure the minimum number of matches are present. We get back
2470 the length of the reference string explicitly rather than passing the
2471 address of eptr, so that eptr can be a register variable. */
2472
2473 for (i = 1; i <= min; i++)
2474 {
2475 int slength;
2476 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2477 {
2478 CHECK_PARTIAL();
2479 MRRETURN(MATCH_NOMATCH);
2480 }
2481 eptr += slength;
2482 }
2483
2484 /* If min = max, continue at the same level without recursion.
2485 They are not both allowed to be zero. */
2486
2487 if (min == max) continue;
2488
2489 /* If minimizing, keep trying and advancing the pointer */
2490
2491 if (minimize)
2492 {
2493 for (fi = min;; fi++)
2494 {
2495 int slength;
2496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2498 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2499 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2500 {
2501 CHECK_PARTIAL();
2502 MRRETURN(MATCH_NOMATCH);
2503 }
2504 eptr += slength;
2505 }
2506 /* Control never gets here */
2507 }
2508
2509 /* If maximizing, find the longest string and work backwards */
2510
2511 else
2512 {
2513 pp = eptr;
2514 for (i = min; i < max; i++)
2515 {
2516 int slength;
2517 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2518 {
2519 CHECK_PARTIAL();
2520 break;
2521 }
2522 eptr += slength;
2523 }
2524 while (eptr >= pp)
2525 {
2526 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2527 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2528 eptr -= length;
2529 }
2530 MRRETURN(MATCH_NOMATCH);
2531 }
2532 /* Control never gets here */
2533
2534 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2535 used when all the characters in the class have values in the range 0-255,
2536 and either the matching is caseful, or the characters are in the range
2537 0-127 when UTF-8 processing is enabled. The only difference between
2538 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2539 encountered.
2540
2541 First, look past the end of the item to see if there is repeat information
2542 following. Then obey similar code to character type repeats - written out
2543 again for speed. */
2544
2545 case OP_NCLASS:
2546 case OP_CLASS:
2547 {
2548 data = ecode + 1; /* Save for matching */
2549 ecode += 33; /* Advance past the item */
2550
2551 switch (*ecode)
2552 {
2553 case OP_CRSTAR:
2554 case OP_CRMINSTAR:
2555 case OP_CRPLUS:
2556 case OP_CRMINPLUS:
2557 case OP_CRQUERY:
2558 case OP_CRMINQUERY:
2559 c = *ecode++ - OP_CRSTAR;
2560 minimize = (c & 1) != 0;
2561 min = rep_min[c]; /* Pick up values from tables; */
2562 max = rep_max[c]; /* zero for max => infinity */
2563 if (max == 0) max = INT_MAX;
2564 break;
2565
2566 case OP_CRRANGE:
2567 case OP_CRMINRANGE:
2568 minimize = (*ecode == OP_CRMINRANGE);
2569 min = GET2(ecode, 1);
2570 max = GET2(ecode, 3);
2571 if (max == 0) max = INT_MAX;
2572 ecode += 5;
2573 break;
2574
2575 default: /* No repeat follows */
2576 min = max = 1;
2577 break;
2578 }
2579
2580 /* First, ensure the minimum number of matches are present. */
2581
2582 #ifdef SUPPORT_UTF8
2583 /* UTF-8 mode */
2584 if (utf8)
2585 {
2586 for (i = 1; i <= min; i++)
2587 {
2588 if (eptr >= md->end_subject)
2589 {
2590 SCHECK_PARTIAL();
2591 MRRETURN(MATCH_NOMATCH);
2592 }
2593 GETCHARINC(c, eptr);
2594 if (c > 255)
2595 {
2596 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2597 }
2598 else
2599 {
2600 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2601 }
2602 }
2603 }
2604 else
2605 #endif
2606 /* Not UTF-8 mode */
2607 {
2608 for (i = 1; i <= min; i++)
2609 {
2610 if (eptr >= md->end_subject)
2611 {
2612 SCHECK_PARTIAL();
2613 MRRETURN(MATCH_NOMATCH);
2614 }
2615 c = *eptr++;
2616 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2617 }
2618 }
2619
2620 /* If max == min we can continue with the main loop without the
2621 need to recurse. */
2622
2623 if (min == max) continue;
2624
2625 /* If minimizing, keep testing the rest of the expression and advancing
2626 the pointer while it matches the class. */
2627
2628 if (minimize)
2629 {
2630 #ifdef SUPPORT_UTF8
2631 /* UTF-8 mode */
2632 if (utf8)
2633 {
2634 for (fi = min;; fi++)
2635 {
2636 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2638 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2639 if (eptr >= md->end_subject)
2640 {
2641 SCHECK_PARTIAL();
2642 MRRETURN(MATCH_NOMATCH);
2643 }
2644 GETCHARINC(c, eptr);
2645 if (c > 255)
2646 {
2647 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2648 }
2649 else
2650 {
2651 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2652 }
2653 }
2654 }
2655 else
2656 #endif
2657 /* Not UTF-8 mode */
2658 {
2659 for (fi = min;; fi++)
2660 {
2661 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2663 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2664 if (eptr >= md->end_subject)
2665 {
2666 SCHECK_PARTIAL();
2667 MRRETURN(MATCH_NOMATCH);
2668 }
2669 c = *eptr++;
2670 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2671 }
2672 }
2673 /* Control never gets here */
2674 }
2675
2676 /* If maximizing, find the longest possible run, then work backwards. */
2677
2678 else
2679 {
2680 pp = eptr;
2681
2682 #ifdef SUPPORT_UTF8
2683 /* UTF-8 mode */
2684 if (utf8)
2685 {
2686 for (i = min; i < max; i++)
2687 {
2688 int len = 1;
2689 if (eptr >= md->end_subject)
2690 {
2691 SCHECK_PARTIAL();
2692 break;
2693 }
2694 GETCHARLEN(c, eptr, len);
2695 if (c > 255)
2696 {
2697 if (op == OP_CLASS) break;
2698 }
2699 else
2700 {
2701 if ((data[c/8] & (1 << (c&7))) == 0) break;
2702 }
2703 eptr += len;
2704 }
2705 for (;;)
2706 {
2707 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2709 if (eptr-- == pp) break; /* Stop if tried at original pos */
2710 BACKCHAR(eptr);
2711 }
2712 }
2713 else
2714 #endif
2715 /* Not UTF-8 mode */
2716 {
2717 for (i = min; i < max; i++)
2718 {
2719 if (eptr >= md->end_subject)
2720 {
2721 SCHECK_PARTIAL();
2722 break;
2723 }
2724 c = *eptr;
2725 if ((data[c/8] & (1 << (c&7))) == 0) break;
2726 eptr++;
2727 }
2728 while (eptr >= pp)
2729 {
2730 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2731 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2732 eptr--;
2733 }
2734 }
2735
2736 MRRETURN(MATCH_NOMATCH);
2737 }
2738 }
2739 /* Control never gets here */
2740
2741
2742 /* Match an extended character class. This opcode is encountered only
2743 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2744 mode, because Unicode properties are supported in non-UTF-8 mode. */
2745
2746 #ifdef SUPPORT_UTF8
2747 case OP_XCLASS:
2748 {
2749 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2750 ecode += GET(ecode, 1); /* Advance past the item */
2751
2752 switch (*ecode)
2753 {
2754 case OP_CRSTAR:
2755 case OP_CRMINSTAR:
2756 case OP_CRPLUS:
2757 case OP_CRMINPLUS:
2758 case OP_CRQUERY:
2759 case OP_CRMINQUERY:
2760 c = *ecode++ - OP_CRSTAR;
2761 minimize = (c & 1) != 0;
2762 min = rep_min[c]; /* Pick up values from tables; */
2763 max = rep_max[c]; /* zero for max => infinity */
2764 if (max == 0) max = INT_MAX;
2765 break;
2766
2767 case OP_CRRANGE:
2768 case OP_CRMINRANGE:
2769 minimize = (*ecode == OP_CRMINRANGE);
2770 min = GET2(ecode, 1);
2771 max = GET2(ecode, 3);
2772 if (max == 0) max = INT_MAX;
2773 ecode += 5;
2774 break;
2775
2776 default: /* No repeat follows */
2777 min = max = 1;
2778 break;
2779 }
2780
2781 /* First, ensure the minimum number of matches are present. */
2782
2783 for (i = 1; i <= min; i++)
2784 {
2785 if (eptr >= md->end_subject)
2786 {
2787 SCHECK_PARTIAL();
2788 MRRETURN(MATCH_NOMATCH);
2789 }
2790 GETCHARINCTEST(c, eptr);
2791 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2792 }
2793
2794 /* If max == min we can continue with the main loop without the
2795 need to recurse. */
2796
2797 if (min == max) continue;
2798
2799 /* If minimizing, keep testing the rest of the expression and advancing
2800 the pointer while it matches the class. */
2801
2802 if (minimize)
2803 {
2804 for (fi = min;; fi++)
2805 {
2806 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2808 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2809 if (eptr >= md->end_subject)
2810 {
2811 SCHECK_PARTIAL();
2812 MRRETURN(MATCH_NOMATCH);
2813 }
2814 GETCHARINCTEST(c, eptr);
2815 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2816 }
2817 /* Control never gets here */
2818 }
2819
2820 /* If maximizing, find the longest possible run, then work backwards. */
2821
2822 else
2823 {
2824 pp = eptr;
2825 for (i = min; i < max; i++)
2826 {
2827 int len = 1;
2828 if (eptr >= md->end_subject)
2829 {
2830 SCHECK_PARTIAL();
2831 break;
2832 }
2833 GETCHARLENTEST(c, eptr, len);
2834 if (!_pcre_xclass(c, data)) break;
2835 eptr += len;
2836 }
2837 for(;;)
2838 {
2839 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2840 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2841 if (eptr-- == pp) break; /* Stop if tried at original pos */
2842 if (utf8) BACKCHAR(eptr);
2843 }
2844 MRRETURN(MATCH_NOMATCH);
2845 }
2846
2847 /* Control never gets here */
2848 }
2849 #endif /* End of XCLASS */
2850
2851 /* Match a single character, casefully */
2852
2853 case OP_CHAR:
2854 #ifdef SUPPORT_UTF8
2855 if (utf8)
2856 {
2857 length = 1;
2858 ecode++;
2859 GETCHARLEN(fc, ecode, length);
2860 if (length > md->end_subject - eptr)
2861 {
2862 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2863 MRRETURN(MATCH_NOMATCH);
2864 }
2865 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2866 }
2867 else
2868 #endif
2869
2870 /* Non-UTF-8 mode */
2871 {
2872 if (md->end_subject - eptr < 1)
2873 {
2874 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2875 MRRETURN(MATCH_NOMATCH);
2876 }
2877 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2878 ecode += 2;
2879 }
2880 break;
2881
2882 /* Match a single character, caselessly */
2883
2884 case OP_CHARI:
2885 #ifdef SUPPORT_UTF8
2886 if (utf8)
2887 {
2888 length = 1;
2889 ecode++;
2890 GETCHARLEN(fc, ecode, length);
2891
2892 if (length > md->end_subject - eptr)
2893 {
2894 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2895 MRRETURN(MATCH_NOMATCH);
2896 }
2897
2898 /* If the pattern character's value is < 128, we have only one byte, and
2899 can use the fast lookup table. */
2900
2901 if (fc < 128)
2902 {
2903 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2904 }
2905
2906 /* Otherwise we must pick up the subject character */
2907
2908 else
2909 {
2910 unsigned int dc;
2911 GETCHARINC(dc, eptr);
2912 ecode += length;
2913
2914 /* If we have Unicode property support, we can use it to test the other
2915 case of the character, if there is one. */
2916
2917 if (fc != dc)
2918 {
2919 #ifdef SUPPORT_UCP
2920 if (dc != UCD_OTHERCASE(fc))
2921 #endif
2922 MRRETURN(MATCH_NOMATCH);
2923 }
2924 }
2925 }
2926 else
2927 #endif /* SUPPORT_UTF8 */
2928
2929 /* Non-UTF-8 mode */
2930 {
2931 if (md->end_subject - eptr < 1)
2932 {
2933 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2934 MRRETURN(MATCH_NOMATCH);
2935 }
2936 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2937 ecode += 2;
2938 }
2939 break;
2940
2941 /* Match a single character repeatedly. */
2942
2943 case OP_EXACT:
2944 case OP_EXACTI:
2945 min = max = GET2(ecode, 1);
2946 ecode += 3;
2947 goto REPEATCHAR;
2948
2949 case OP_POSUPTO:
2950 case OP_POSUPTOI:
2951 possessive = TRUE;
2952 /* Fall through */
2953
2954 case OP_UPTO:
2955 case OP_UPTOI:
2956 case OP_MINUPTO:
2957 case OP_MINUPTOI:
2958 min = 0;
2959 max = GET2(ecode, 1);
2960 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2961 ecode += 3;
2962 goto REPEATCHAR;
2963
2964 case OP_POSSTAR:
2965 case OP_POSSTARI:
2966 possessive = TRUE;
2967 min = 0;
2968 max = INT_MAX;
2969 ecode++;
2970 goto REPEATCHAR;
2971
2972 case OP_POSPLUS:
2973 case OP_POSPLUSI:
2974 possessive = TRUE;
2975 min = 1;
2976 max = INT_MAX;
2977 ecode++;
2978 goto REPEATCHAR;
2979
2980 case OP_POSQUERY:
2981 case OP_POSQUERYI:
2982 possessive = TRUE;
2983 min = 0;
2984 max = 1;
2985 ecode++;
2986 goto REPEATCHAR;
2987
2988 case OP_STAR:
2989 case OP_STARI:
2990 case OP_MINSTAR:
2991 case OP_MINSTARI:
2992 case OP_PLUS:
2993 case OP_PLUSI:
2994 case OP_MINPLUS:
2995 case OP_MINPLUSI:
2996 case OP_QUERY:
2997 case OP_QUERYI:
2998 case OP_MINQUERY:
2999 case OP_MINQUERYI:
3000 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3001 minimize = (c & 1) != 0;
3002 min = rep_min[c]; /* Pick up values from tables; */
3003 max = rep_max[c]; /* zero for max => infinity */
3004 if (max == 0) max = INT_MAX;
3005
3006 /* Common code for all repeated single-character matches. */
3007
3008 REPEATCHAR:
3009 #ifdef SUPPORT_UTF8
3010 if (utf8)
3011 {
3012 length = 1;
3013 charptr = ecode;
3014 GETCHARLEN(fc, ecode, length);
3015 ecode += length;
3016
3017 /* Handle multibyte character matching specially here. There is
3018 support for caseless matching if UCP support is present. */
3019
3020 if (length > 1)
3021 {
3022 #ifdef SUPPORT_UCP
3023 unsigned int othercase;
3024 if (op >= OP_STARI && /* Caseless */
3025 (othercase = UCD_OTHERCASE(fc)) != fc)
3026 oclength = _pcre_ord2utf8(othercase, occhars);
3027 else oclength = 0;
3028 #endif /* SUPPORT_UCP */
3029
3030 for (i = 1; i <= min; i++)
3031 {
3032 if (eptr <= md->end_subject - length &&
3033 memcmp(eptr, charptr, length) == 0) eptr += length;
3034 #ifdef SUPPORT_UCP
3035 else if (oclength > 0 &&
3036 eptr <= md->end_subject - oclength &&
3037 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3038 #endif /* SUPPORT_UCP */
3039 else
3040 {
3041 CHECK_PARTIAL();
3042 MRRETURN(MATCH_NOMATCH);
3043 }
3044 }
3045
3046 if (min == max) continue;
3047
3048 if (minimize)
3049 {
3050 for (fi = min;; fi++)
3051 {
3052 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3054 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3055 if (eptr <= md->end_subject - length &&
3056 memcmp(eptr, charptr, length) == 0) eptr += length;
3057 #ifdef SUPPORT_UCP
3058 else if (oclength > 0 &&
3059 eptr <= md->end_subject - oclength &&
3060 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3061 #endif /* SUPPORT_UCP */
3062 else
3063 {
3064 CHECK_PARTIAL();
3065 MRRETURN(MATCH_NOMATCH);
3066 }
3067 }
3068 /* Control never gets here */
3069 }
3070
3071 else /* Maximize */
3072 {
3073 pp = eptr;
3074 for (i = min; i < max; i++)
3075 {
3076 if (eptr <= md->end_subject - length &&
3077 memcmp(eptr, charptr, length) == 0) eptr += length;
3078 #ifdef SUPPORT_UCP
3079 else if (oclength > 0 &&
3080 eptr <= md->end_subject - oclength &&
3081 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3082 #endif /* SUPPORT_UCP */
3083 else
3084 {
3085 CHECK_PARTIAL();
3086 break;
3087 }
3088 }
3089
3090 if (possessive) continue;
3091
3092 for(;;)
3093 {
3094 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3096 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3097 #ifdef SUPPORT_UCP
3098 eptr--;
3099 BACKCHAR(eptr);
3100 #else /* without SUPPORT_UCP */
3101 eptr -= length;
3102 #endif /* SUPPORT_UCP */
3103 }
3104 }
3105 /* Control never gets here */
3106 }
3107
3108 /* If the length of a UTF-8 character is 1, we fall through here, and
3109 obey the code as for non-UTF-8 characters below, though in this case the
3110 value of fc will always be < 128. */
3111 }
3112 else
3113 #endif /* SUPPORT_UTF8 */
3114
3115 /* When not in UTF-8 mode, load a single-byte character. */
3116
3117 fc = *ecode++;
3118
3119 /* The value of fc at this point is always less than 256, though we may or
3120 may not be in UTF-8 mode. The code is duplicated for the caseless and
3121 caseful cases, for speed, since matching characters is likely to be quite
3122 common. First, ensure the minimum number of matches are present. If min =
3123 max, continue at the same level without recursing. Otherwise, if
3124 minimizing, keep trying the rest of the expression and advancing one
3125 matching character if failing, up to the maximum. Alternatively, if
3126 maximizing, find the maximum number of characters and work backwards. */
3127
3128 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3129 max, eptr));
3130
3131 if (op >= OP_STARI) /* Caseless */
3132 {
3133 fc = md->lcc[fc];
3134 for (i = 1; i <= min; i++)
3135 {
3136 if (eptr >= md->end_subject)
3137 {
3138 SCHECK_PARTIAL();
3139 MRRETURN(MATCH_NOMATCH);
3140 }
3141 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3142 }
3143 if (min == max) continue;
3144 if (minimize)
3145 {
3146 for (fi = min;; fi++)
3147 {
3148 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3149 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3150 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3151 if (eptr >= md->end_subject)
3152 {
3153 SCHECK_PARTIAL();
3154 MRRETURN(MATCH_NOMATCH);
3155 }
3156 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3157 }
3158 /* Control never gets here */
3159 }
3160 else /* Maximize */
3161 {
3162 pp = eptr;
3163 for (i = min; i < max; i++)
3164 {
3165 if (eptr >= md->end_subject)
3166 {
3167 SCHECK_PARTIAL();
3168 break;
3169 }
3170 if (fc != md->lcc[*eptr]) break;
3171 eptr++;
3172 }
3173
3174 if (possessive) continue;
3175
3176 while (eptr >= pp)
3177 {
3178 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3179 eptr--;
3180 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3181 }
3182 MRRETURN(MATCH_NOMATCH);
3183 }
3184 /* Control never gets here */
3185 }
3186
3187 /* Caseful comparisons (includes all multi-byte characters) */
3188
3189 else
3190 {
3191 for (i = 1; i <= min; i++)
3192 {
3193 if (eptr >= md->end_subject)
3194 {
3195 SCHECK_PARTIAL();
3196 MRRETURN(MATCH_NOMATCH);
3197 }
3198 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3199 }
3200
3201 if (min == max) continue;
3202
3203 if (minimize)
3204 {
3205 for (fi = min;; fi++)
3206 {
3207 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3209 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3210 if (eptr >= md->end_subject)
3211 {
3212 SCHECK_PARTIAL();
3213 MRRETURN(MATCH_NOMATCH);
3214 }
3215 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3216 }
3217 /* Control never gets here */
3218 }
3219 else /* Maximize */
3220 {
3221 pp = eptr;
3222 for (i = min; i < max; i++)
3223 {
3224 if (eptr >= md->end_subject)
3225 {
3226 SCHECK_PARTIAL();
3227 break;
3228 }
3229 if (fc != *eptr) break;
3230 eptr++;
3231 }
3232 if (possessive) continue;
3233
3234 while (eptr >= pp)
3235 {
3236 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3237 eptr--;
3238 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3239 }
3240 MRRETURN(MATCH_NOMATCH);
3241 }
3242 }
3243 /* Control never gets here */
3244
3245 /* Match a negated single one-byte character. The character we are
3246 checking can be multibyte. */
3247
3248 case OP_NOT:
3249 case OP_NOTI:
3250 if (eptr >= md->end_subject)
3251 {
3252 SCHECK_PARTIAL();
3253 MRRETURN(MATCH_NOMATCH);
3254 }
3255 ecode++;
3256 GETCHARINCTEST(c, eptr);
3257 if (op == OP_NOTI) /* The caseless case */
3258 {
3259 #ifdef SUPPORT_UTF8
3260 if (c < 256)
3261 #endif
3262 c = md->lcc[c];
3263 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3264 }
3265 else /* Caseful */
3266 {
3267 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3268 }
3269 break;
3270
3271 /* Match a negated single one-byte character repeatedly. This is almost a
3272 repeat of the code for a repeated single character, but I haven't found a
3273 nice way of commoning these up that doesn't require a test of the
3274 positive/negative option for each character match. Maybe that wouldn't add
3275 very much to the time taken, but character matching *is* what this is all
3276 about... */
3277
3278 case OP_NOTEXACT:
3279 case OP_NOTEXACTI:
3280 min = max = GET2(ecode, 1);
3281 ecode += 3;
3282 goto REPEATNOTCHAR;
3283
3284 case OP_NOTUPTO:
3285 case OP_NOTUPTOI:
3286 case OP_NOTMINUPTO:
3287 case OP_NOTMINUPTOI:
3288 min = 0;
3289 max = GET2(ecode, 1);
3290 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3291 ecode += 3;
3292 goto REPEATNOTCHAR;
3293
3294 case OP_NOTPOSSTAR:
3295 case OP_NOTPOSSTARI:
3296 possessive = TRUE;
3297 min = 0;
3298 max = INT_MAX;
3299 ecode++;
3300 goto REPEATNOTCHAR;
3301
3302 case OP_NOTPOSPLUS:
3303 case OP_NOTPOSPLUSI:
3304 possessive = TRUE;
3305 min = 1;
3306 max = INT_MAX;
3307 ecode++;
3308 goto REPEATNOTCHAR;
3309
3310 case OP_NOTPOSQUERY:
3311 case OP_NOTPOSQUERYI:
3312 possessive = TRUE;
3313 min = 0;
3314 max = 1;
3315 ecode++;
3316 goto REPEATNOTCHAR;
3317
3318 case OP_NOTPOSUPTO:
3319 case OP_NOTPOSUPTOI:
3320 possessive = TRUE;
3321 min = 0;
3322 max = GET2(ecode, 1);
3323 ecode += 3;
3324 goto REPEATNOTCHAR;
3325
3326 case OP_NOTSTAR:
3327 case OP_NOTSTARI:
3328 case OP_NOTMINSTAR:
3329 case OP_NOTMINSTARI:
3330 case OP_NOTPLUS:
3331 case OP_NOTPLUSI:
3332 case OP_NOTMINPLUS:
3333 case OP_NOTMINPLUSI:
3334 case OP_NOTQUERY:
3335 case OP_NOTQUERYI:
3336 case OP_NOTMINQUERY:
3337 case OP_NOTMINQUERYI:
3338 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3339 minimize = (c & 1) != 0;
3340 min = rep_min[c]; /* Pick up values from tables; */
3341 max = rep_max[c]; /* zero for max => infinity */
3342 if (max == 0) max = INT_MAX;
3343
3344 /* Common code for all repeated single-byte matches. */
3345
3346 REPEATNOTCHAR:
3347 fc = *ecode++;
3348
3349 /* The code is duplicated for the caseless and caseful cases, for speed,
3350 since matching characters is likely to be quite common. First, ensure the
3351 minimum number of matches are present. If min = max, continue at the same
3352 level without recursing. Otherwise, if minimizing, keep trying the rest of
3353 the expression and advancing one matching character if failing, up to the
3354 maximum. Alternatively, if maximizing, find the maximum number of
3355 characters and work backwards. */
3356
3357 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3358 max, eptr));
3359
3360 if (op >= OP_NOTSTARI) /* Caseless */
3361 {
3362 fc = md->lcc[fc];
3363
3364 #ifdef SUPPORT_UTF8
3365 /* UTF-8 mode */
3366 if (utf8)
3367 {
3368 register unsigned int d;
3369 for (i = 1; i <= min; i++)
3370 {
3371 if (eptr >= md->end_subject)
3372 {
3373 SCHECK_PARTIAL();
3374 MRRETURN(MATCH_NOMATCH);
3375 }
3376 GETCHARINC(d, eptr);
3377 if (d < 256) d = md->lcc[d];
3378 if (fc == d) MRRETURN(MATCH_NOMATCH);
3379 }
3380 }
3381 else
3382 #endif
3383
3384 /* Not UTF-8 mode */
3385 {
3386 for (i = 1; i <= min; i++)
3387 {
3388 if (eptr >= md->end_subject)
3389 {
3390 SCHECK_PARTIAL();
3391 MRRETURN(MATCH_NOMATCH);
3392 }
3393 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3394 }
3395 }
3396
3397 if (min == max) continue;
3398
3399 if (minimize)
3400 {
3401 #ifdef SUPPORT_UTF8
3402 /* UTF-8 mode */
3403 if (utf8)
3404 {
3405 register unsigned int d;
3406 for (fi = min;; fi++)
3407 {
3408 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3409 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3410 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3411 if (eptr >= md->end_subject)
3412 {
3413 SCHECK_PARTIAL();
3414 MRRETURN(MATCH_NOMATCH);
3415 }
3416 GETCHARINC(d, eptr);
3417 if (d < 256) d = md->lcc[d];
3418 if (fc == d) MRRETURN(MATCH_NOMATCH);
3419 }
3420 }
3421 else
3422 #endif
3423 /* Not UTF-8 mode */
3424 {
3425 for (fi = min;; fi++)
3426 {
3427 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3430 if (eptr >= md->end_subject)
3431 {
3432 SCHECK_PARTIAL();
3433 MRRETURN(MATCH_NOMATCH);
3434 }
3435 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3436 }
3437 }
3438 /* Control never gets here */
3439 }
3440
3441 /* Maximize case */
3442
3443 else
3444 {
3445 pp = eptr;
3446
3447 #ifdef SUPPORT_UTF8
3448 /* UTF-8 mode */
3449 if (utf8)
3450 {
3451 register unsigned int d;
3452 for (i = min; i < max; i++)
3453 {
3454 int len = 1;
3455 if (eptr >= md->end_subject)
3456 {
3457 SCHECK_PARTIAL();
3458 break;
3459 }
3460 GETCHARLEN(d, eptr, len);
3461 if (d < 256) d = md->lcc[d];
3462 if (fc == d) break;
3463 eptr += len;
3464 }
3465 if (possessive) continue;
3466 for(;;)
3467 {
3468 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3469 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3470 if (eptr-- == pp) break; /* Stop if tried at original pos */
3471 BACKCHAR(eptr);
3472 }
3473 }
3474 else
3475 #endif
3476 /* Not UTF-8 mode */
3477 {
3478 for (i = min; i < max; i++)
3479 {
3480 if (eptr >= md->end_subject)
3481 {
3482 SCHECK_PARTIAL();
3483 break;
3484 }
3485 if (fc == md->lcc[*eptr]) break;
3486 eptr++;
3487 }
3488 if (possessive) continue;
3489 while (eptr >= pp)
3490 {
3491 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3492 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3493 eptr--;
3494 }
3495 }
3496
3497 MRRETURN(MATCH_NOMATCH);
3498 }
3499 /* Control never gets here */
3500 }
3501
3502 /* Caseful comparisons */
3503
3504 else
3505 {
3506 #ifdef SUPPORT_UTF8
3507 /* UTF-8 mode */
3508 if (utf8)
3509 {
3510 register unsigned int d;
3511 for (i = 1; i <= min; i++)
3512 {
3513 if (eptr >= md->end_subject)
3514 {
3515 SCHECK_PARTIAL();
3516 MRRETURN(MATCH_NOMATCH);
3517 }
3518 GETCHARINC(d, eptr);
3519 if (fc == d) MRRETURN(MATCH_NOMATCH);
3520 }
3521 }
3522 else
3523 #endif
3524 /* Not UTF-8 mode */
3525 {
3526 for (i = 1; i <= min; i++)
3527 {
3528 if (eptr >= md->end_subject)
3529 {
3530 SCHECK_PARTIAL();
3531 MRRETURN(MATCH_NOMATCH);
3532 }
3533 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3534 }
3535 }
3536
3537 if (min == max) continue;
3538
3539 if (minimize)
3540 {
3541 #ifdef SUPPORT_UTF8
3542 /* UTF-8 mode */
3543 if (utf8)
3544 {
3545 register unsigned int d;
3546 for (fi = min;; fi++)
3547 {
3548 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3549 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3550 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3551 if (eptr >= md->end_subject)
3552 {
3553 SCHECK_PARTIAL();
3554 MRRETURN(MATCH_NOMATCH);
3555 }
3556 GETCHARINC(d, eptr);
3557 if (fc == d) MRRETURN(MATCH_NOMATCH);
3558 }
3559 }
3560 else
3561 #endif
3562 /* Not UTF-8 mode */
3563 {
3564 for (fi = min;; fi++)
3565 {
3566 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3568 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3569 if (eptr >= md->end_subject)
3570 {
3571 SCHECK_PARTIAL();
3572 MRRETURN(MATCH_NOMATCH);
3573 }
3574 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3575 }
3576 }
3577 /* Control never gets here */
3578 }
3579
3580 /* Maximize case */
3581
3582 else
3583 {
3584 pp = eptr;
3585
3586 #ifdef SUPPORT_UTF8
3587 /* UTF-8 mode */
3588 if (utf8)
3589 {
3590 register unsigned int d;
3591 for (i = min; i < max; i++)
3592 {
3593 int len = 1;
3594 if (eptr >= md->end_subject)
3595 {
3596 SCHECK_PARTIAL();
3597 break;
3598 }
3599 GETCHARLEN(d, eptr, len);
3600 if (fc == d) break;
3601 eptr += len;
3602 }
3603 if (possessive) continue;
3604 for(;;)
3605 {
3606 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3608 if (eptr-- == pp) break; /* Stop if tried at original pos */
3609 BACKCHAR(eptr);
3610 }
3611 }
3612 else
3613 #endif
3614 /* Not UTF-8 mode */
3615 {
3616 for (i = min; i < max; i++)
3617 {
3618 if (eptr >= md->end_subject)
3619 {
3620 SCHECK_PARTIAL();
3621 break;
3622 }
3623 if (fc == *eptr) break;
3624 eptr++;
3625 }
3626 if (possessive) continue;
3627 while (eptr >= pp)
3628 {
3629 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3630 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3631 eptr--;
3632 }
3633 }
3634
3635 MRRETURN(MATCH_NOMATCH);
3636 }
3637 }
3638 /* Control never gets here */
3639
3640 /* Match a single character type repeatedly; several different opcodes
3641 share code. This is very similar to the code for single characters, but we
3642 repeat it in the interests of efficiency. */
3643
3644 case OP_TYPEEXACT:
3645 min = max = GET2(ecode, 1);
3646 minimize = TRUE;
3647 ecode += 3;
3648 goto REPEATTYPE;
3649
3650 case OP_TYPEUPTO:
3651 case OP_TYPEMINUPTO:
3652 min = 0;
3653 max = GET2(ecode, 1);
3654 minimize = *ecode == OP_TYPEMINUPTO;
3655 ecode += 3;
3656 goto REPEATTYPE;
3657
3658 case OP_TYPEPOSSTAR:
3659 possessive = TRUE;
3660 min = 0;
3661 max = INT_MAX;
3662 ecode++;
3663 goto REPEATTYPE;
3664
3665 case OP_TYPEPOSPLUS:
3666 possessive = TRUE;
3667 min = 1;
3668 max = INT_MAX;
3669 ecode++;
3670 goto REPEATTYPE;
3671
3672 case OP_TYPEPOSQUERY:
3673 possessive = TRUE;
3674 min = 0;
3675 max = 1;
3676 ecode++;
3677 goto REPEATTYPE;
3678
3679 case OP_TYPEPOSUPTO:
3680 possessive = TRUE;
3681 min = 0;
3682 max = GET2(ecode, 1);
3683 ecode += 3;
3684 goto REPEATTYPE;
3685
3686 case OP_TYPESTAR:
3687 case OP_TYPEMINSTAR:
3688 case OP_TYPEPLUS:
3689 case OP_TYPEMINPLUS:
3690 case OP_TYPEQUERY:
3691 case OP_TYPEMINQUERY:
3692 c = *ecode++ - OP_TYPESTAR;
3693 minimize = (c & 1) != 0;
3694 min = rep_min[c]; /* Pick up values from tables; */
3695 max = rep_max[c]; /* zero for max => infinity */
3696 if (max == 0) max = INT_MAX;
3697
3698 /* Common code for all repeated single character type matches. Note that
3699 in UTF-8 mode, '.' matches a character of any length. OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match only one-byte characters,
3700 but their negations, OP_HSPACE, OP_VSPACE and OP_ANYNL can also match multibyte characters. */
3701
3702 REPEATTYPE:
3703 ctype = *ecode++; /* Code for the character type */
3704
3705 #ifdef SUPPORT_UCP
3706 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3707 {
3708 prop_fail_result = ctype == OP_NOTPROP;
3709 prop_type = *ecode++;
3710 prop_value = *ecode++;
3711 }
3712 else prop_type = -1;
3713 #endif
3714
3715 /* First, ensure the minimum number of matches are present. Use inline
3716 code for maximizing the speed, and do the type test once at the start
3717 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3718 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3719 and single-bytes. */
3720
3721 if (min > 0)
3722 {
3723 #ifdef SUPPORT_UCP
3724 if (prop_type >= 0)
3725 {
3726 switch(prop_type)
3727 {
3728 case PT_ANY:
3729 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3730 for (i = 1; i <= min; i++)
3731 {
3732 if (eptr >= md->end_subject)
3733 {
3734 SCHECK_PARTIAL();
3735 MRRETURN(MATCH_NOMATCH);
3736 }
3737 GETCHARINCTEST(c, eptr);
3738 }
3739 break;
3740
3741 case PT_LAMP:
3742 for (i = 1; i <= min; i++)
3743 {
3744 if (eptr >= md->end_subject)
3745 {
3746 SCHECK_PARTIAL();
3747 MRRETURN(MATCH_NOMATCH);
3748 }
3749 GETCHARINCTEST(c, eptr);
3750 prop_chartype = UCD_CHARTYPE(c);
3751 if ((prop_chartype == ucp_Lu ||
3752 prop_chartype == ucp_Ll ||
3753 prop_chartype == ucp_Lt) == prop_fail_result)
3754 MRRETURN(MATCH_NOMATCH);
3755 }
3756 break;
3757
3758 case PT_GC:
3759 for (i = 1; i <= min; i++)
3760 {
3761 if (eptr >= md->end_subject)
3762 {
3763 SCHECK_PARTIAL();
3764 MRRETURN(MATCH_NOMATCH);
3765 }
3766 GETCHARINCTEST(c, eptr);
3767 prop_category = UCD_CATEGORY(c);
3768 if ((prop_category == prop_value) == prop_fail_result)
3769 MRRETURN(MATCH_NOMATCH);
3770 }
3771 break;
3772
3773 case PT_PC:
3774 for (i = 1; i <= min; i++)
3775 {
3776 if (eptr >= md->end_subject)
3777 {
3778 SCHECK_PARTIAL();
3779 MRRETURN(MATCH_NOMATCH);
3780 }
3781 GETCHARINCTEST(c, eptr);
3782 prop_chartype = UCD_CHARTYPE(c);
3783 if ((prop_chartype == prop_value) == prop_fail_result)
3784 MRRETURN(MATCH_NOMATCH);
3785 }
3786 break;
3787
3788 case PT_SC:
3789 for (i = 1; i <= min; i++)
3790 {
3791 if (eptr >= md->end_subject)
3792 {
3793 SCHECK_PARTIAL();
3794 MRRETURN(MATCH_NOMATCH);
3795 }
3796 GETCHARINCTEST(c, eptr);
3797 prop_script = UCD_SCRIPT(c);
3798 if ((prop_script == prop_value) == prop_fail_result)
3799 MRRETURN(MATCH_NOMATCH);
3800 }
3801 break;
3802
3803 case PT_ALNUM:
3804 for (i = 1; i <= min; i++)
3805 {
3806 if (eptr >= md->end_subject)
3807 {
3808 SCHECK_PARTIAL();
3809 MRRETURN(MATCH_NOMATCH);
3810 }
3811 GETCHARINCTEST(c, eptr);
3812 prop_category = UCD_CATEGORY(c);
3813 if ((prop_category == ucp_L || prop_category == ucp_N)
3814 == prop_fail_result)
3815 MRRETURN(MATCH_NOMATCH);
3816 }
3817 break;
3818
3819 case PT_SPACE: /* Perl space */
3820 for (i = 1; i <= min; i++)
3821 {
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 MRRETURN(MATCH_NOMATCH);
3826 }
3827 GETCHARINCTEST(c, eptr);
3828 prop_category = UCD_CATEGORY(c);
3829 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3830 c == CHAR_FF || c == CHAR_CR)
3831 == prop_fail_result)
3832 MRRETURN(MATCH_NOMATCH);
3833 }
3834 break;
3835
3836 case PT_PXSPACE: /* POSIX space */
3837 for (i = 1; i <= min; i++)
3838 {
3839 if (eptr >= md->end_subject)
3840 {
3841 SCHECK_PARTIAL();
3842 MRRETURN(MATCH_NOMATCH);
3843 }
3844 GETCHARINCTEST(c, eptr);
3845 prop_category = UCD_CATEGORY(c);
3846 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3847 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3848 == prop_fail_result)
3849 MRRETURN(MATCH_NOMATCH);
3850 }
3851 break;
3852
3853 case PT_WORD:
3854 for (i = 1; i <= min; i++)
3855 {
3856 if (eptr >= md->end_subject)
3857 {
3858 SCHECK_PARTIAL();
3859 MRRETURN(MATCH_NOMATCH);
3860 }
3861 GETCHARINCTEST(c, eptr);
3862 prop_category = UCD_CATEGORY(c);
3863 if ((prop_category == ucp_L || prop_category == ucp_N ||
3864 c == CHAR_UNDERSCORE)
3865 == prop_fail_result)
3866 MRRETURN(MATCH_NOMATCH);
3867 }
3868 break;
3869
3870 /* This should not occur */
3871
3872 default:
3873 RRETURN(PCRE_ERROR_INTERNAL);
3874 }
3875 }
3876
3877 /* Match extended Unicode sequences. We will get here only if the
3878 support is in the binary; otherwise a compile-time error occurs. */
3879
3880 else if (ctype == OP_EXTUNI)
3881 {
3882 for (i = 1; i <= min; i++)
3883 {
3884 if (eptr >= md->end_subject)
3885 {
3886 SCHECK_PARTIAL();
3887 MRRETURN(MATCH_NOMATCH);
3888 }
3889 GETCHARINCTEST(c, eptr);
3890 prop_category = UCD_CATEGORY(c);
3891 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3892 while (eptr < md->end_subject)
3893 {
3894 int len = 1;
3895 if (!utf8) c = *eptr;
3896 else { GETCHARLEN(c, eptr, len); }
3897 prop_category = UCD_CATEGORY(c);
3898 if (prop_category != ucp_M) break;
3899 eptr += len;
3900 }
3901 }
3902 }
3903
3904 else
3905 #endif /* SUPPORT_UCP */
3906
3907 /* Handle all other cases when the coding is UTF-8 */
3908
3909 #ifdef SUPPORT_UTF8
3910 if (utf8) switch(ctype)
3911 {
3912 case OP_ANY:
3913 for (i = 1; i <= min; i++)
3914 {
3915 if (eptr >= md->end_subject)
3916 {
3917 SCHECK_PARTIAL();
3918 MRRETURN(MATCH_NOMATCH);
3919 }
3920 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3921 eptr++;
3922 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3923 }
3924 break;
3925
3926 case OP_ALLANY:
3927 for (i = 1; i <= min; i++)
3928 {
3929 if (eptr >= md->end_subject)
3930 {
3931 SCHECK_PARTIAL();
3932 MRRETURN(MATCH_NOMATCH);
3933 }
3934 eptr++;
3935 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3936 }
3937 break;
3938
3939 case OP_ANYBYTE:
3940 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3941 eptr += min;
3942 break;
3943
3944 case OP_ANYNL:
3945 for (i = 1; i <= min; i++)
3946 {
3947 if (eptr >= md->end_subject)
3948 {
3949 SCHECK_PARTIAL();
3950 MRRETURN(MATCH_NOMATCH);
3951 }
3952 GETCHARINC(c, eptr);
3953 switch(c)
3954 {
3955 default: MRRETURN(MATCH_NOMATCH);
3956
3957 case 0x000d:
3958 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3959 break;
3960
3961 case 0x000a:
3962 break;
3963
3964 case 0x000b:
3965 case 0x000c:
3966 case 0x0085:
3967 case 0x2028:
3968 case 0x2029:
3969 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3970 break;
3971 }
3972 }
3973 break;
3974
3975 case OP_NOT_HSPACE:
3976 for (i = 1; i <= min; i++)
3977 {
3978 if (eptr >= md->end_subject)
3979 {
3980 SCHECK_PARTIAL();
3981 MRRETURN(MATCH_NOMATCH);
3982 }
3983 GETCHARINC(c, eptr);
3984 switch(c)
3985 {
3986 default: break;
3987 case 0x09: /* HT */
3988 case 0x20: /* SPACE */
3989 case 0xa0: /* NBSP */
3990 case 0x1680: /* OGHAM SPACE MARK */
3991 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3992 case 0x2000: /* EN QUAD */
3993 case 0x2001: /* EM QUAD */
3994 case 0x2002: /* EN SPACE */
3995 case 0x2003: /* EM SPACE */
3996 case 0x2004: /* THREE-PER-EM SPACE */
3997 case 0x2005: /* FOUR-PER-EM SPACE */
3998 case 0x2006: /* SIX-PER-EM SPACE */
3999 case 0x2007: /* FIGURE SPACE */
4000 case 0x2008: /* PUNCTUATION SPACE */
4001 case 0x2009: /* THIN SPACE */
4002 case 0x200A: /* HAIR SPACE */
4003 case 0x202f: /* NARROW NO-BREAK SPACE */
4004 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4005 case 0x3000: /* IDEOGRAPHIC SPACE */
4006 MRRETURN(MATCH_NOMATCH);
4007 }
4008 }
4009 break;
4010
4011 case OP_HSPACE:
4012 for (i = 1; i <= min; i++)
4013 {
4014 if (eptr >= md->end_subject)
4015 {
4016 SCHECK_PARTIAL();
4017 MRRETURN(MATCH_NOMATCH);
4018 }
4019 GETCHARINC(c, eptr);
4020 switch(c)
4021 {
4022 default: MRRETURN(MATCH_NOMATCH);
4023 case 0x09: /* HT */
4024 case 0x20: /* SPACE */
4025 case 0xa0: /* NBSP */
4026 case 0x1680: /* OGHAM SPACE MARK */
4027 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4028 case 0x2000: /* EN QUAD */
4029 case 0x2001: /* EM QUAD */
4030 case 0x2002: /* EN SPACE */
4031 case 0x2003: /* EM SPACE */
4032 case 0x2004: /* THREE-PER-EM SPACE */
4033 case 0x2005: /* FOUR-PER-EM SPACE */
4034 case 0x2006: /* SIX-PER-EM SPACE */
4035 case 0x2007: /* FIGURE SPACE */
4036 case 0x2008: /* PUNCTUATION SPACE */
4037 case 0x2009: /* THIN SPACE */
4038 case 0x200A: /* HAIR SPACE */
4039 case 0x202f: /* NARROW NO-BREAK SPACE */
4040 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4041 case 0x3000: /* IDEOGRAPHIC SPACE */
4042 break;
4043 }
4044 }
4045 break;
4046
4047 case OP_NOT_VSPACE:
4048 for (i = 1; i <= min; i++)
4049 {
4050 if (eptr >= md->end_subject)
4051 {
4052 SCHECK_PARTIAL();
4053 MRRETURN(MATCH_NOMATCH);
4054 }
4055 GETCHARINC(c, eptr);
4056 switch(c)
4057 {
4058 default: break;
4059 case 0x0a: /* LF */
4060 case 0x0b: /* VT */
4061 case 0x0c: /* FF */
4062 case 0x0d: /* CR */
4063 case 0x85: /* NEL */
4064 case 0x2028: /* LINE SEPARATOR */
4065 case 0x2029: /* PARAGRAPH SEPARATOR */
4066 MRRETURN(MATCH_NOMATCH);
4067 }
4068 }
4069 break;
4070
4071 case OP_VSPACE:
4072 for (i = 1; i <= min; i++)
4073 {
4074 if (eptr >= md->end_subject)
4075 {
4076 SCHECK_PARTIAL();
4077 MRRETURN(MATCH_NOMATCH);
4078 }
4079 GETCHARINC(c, eptr);
4080 switch(c)
4081 {
4082 default: MRRETURN(MATCH_NOMATCH);
4083 case 0x0a: /* LF */
4084 case 0x0b: /* VT */
4085 case 0x0c: /* FF */
4086 case 0x0d: /* CR */
4087 case 0x85: /* NEL */
4088 case 0x2028: /* LINE SEPARATOR */
4089 case 0x2029: /* PARAGRAPH SEPARATOR */
4090 break;
4091 }
4092 }
4093 break;
4094
4095 case OP_NOT_DIGIT:
4096 for (i = 1; i <= min; i++)
4097 {
4098 if (eptr >= md->end_subject)
4099 {
4100 SCHECK_PARTIAL();
4101 MRRETURN(MATCH_NOMATCH);
4102 }
4103 GETCHARINC(c, eptr);
4104 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4105 MRRETURN(MATCH_NOMATCH);
4106 }
4107 break;
4108
4109 case OP_DIGIT:
4110 for (i = 1; i <= min; i++)
4111 {
4112 if (eptr >= md->end_subject)
4113 {
4114 SCHECK_PARTIAL();
4115 MRRETURN(MATCH_NOMATCH);
4116 }
4117 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4118 MRRETURN(MATCH_NOMATCH);
4119 /* No need to skip more bytes - we know it's a 1-byte character */
4120 }
4121 break;
4122
4123 case OP_NOT_WHITESPACE:
4124 for (i = 1; i <= min; i++)
4125 {
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 MRRETURN(MATCH_NOMATCH);
4130 }
4131 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4132 MRRETURN(MATCH_NOMATCH);
4133 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4134 }
4135 break;
4136
4137 case OP_WHITESPACE:
4138 for (i = 1; i <= min; i++)
4139 {
4140 if (eptr >= md->end_subject)
4141 {
4142 SCHECK_PARTIAL();
4143 MRRETURN(MATCH_NOMATCH);
4144 }
4145 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4146 MRRETURN(MATCH_NOMATCH);
4147 /* No need to skip more bytes - we know it's a 1-byte character */
4148 }
4149 break;
4150
4151 case OP_NOT_WORDCHAR:
4152 for (i = 1; i <= min; i++)
4153 {
4154 if (eptr >= md->end_subject)
4155 {
4156 SCHECK_PARTIAL();
4157 MRRETURN(MATCH_NOMATCH);
4158 }
4159 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4160 MRRETURN(MATCH_NOMATCH);
4161 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4162 }
4163 break;
4164
4165 case OP_WORDCHAR:
4166 for (i = 1; i <= min; i++)
4167 {
4168 if (eptr >= md->end_subject)
4169 {
4170 SCHECK_PARTIAL();
4171 MRRETURN(MATCH_NOMATCH);
4172 }
4173 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4174 MRRETURN(MATCH_NOMATCH);
4175 /* No need to skip more bytes - we know it's a 1-byte character */
4176 }
4177 break;
4178
4179 default:
4180 RRETURN(PCRE_ERROR_INTERNAL);
4181 } /* End switch(ctype) */
4182
4183 else
4184 #endif /* SUPPORT_UTF8 */
4185
4186 /* Code for the non-UTF-8 case for minimum matching of operators other
4187 than OP_PROP and OP_NOTPROP. */
4188
4189 switch(ctype)
4190 {
4191 case OP_ANY:
4192 for (i = 1; i <= min; i++)
4193 {
4194 if (eptr >= md->end_subject)
4195 {
4196 SCHECK_PARTIAL();
4197 MRRETURN(MATCH_NOMATCH);
4198 }
4199 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4200 eptr++;
4201 }
4202 break;
4203
4204 case OP_ALLANY:
4205 if (eptr > md->end_subject - min)
4206 {
4207 SCHECK_PARTIAL();
4208 MRRETURN(MATCH_NOMATCH);
4209 }
4210 eptr += min;
4211 break;
4212
4213 case OP_ANYBYTE:
4214 if (eptr > md->end_subject - min)
4215 {
4216 SCHECK_PARTIAL();
4217 MRRETURN(MATCH_NOMATCH);
4218 }
4219 eptr += min;
4220 break;
4221
4222 case OP_ANYNL:
4223 for (i = 1; i <= min; i++)
4224 {
4225 if (eptr >= md->end_subject)
4226 {
4227 SCHECK_PARTIAL();
4228 MRRETURN(MATCH_NOMATCH);
4229 }
4230 switch(*eptr++)
4231 {
4232 default: MRRETURN(MATCH_NOMATCH);
4233
4234 case 0x000d:
4235 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4236 break;
4237
4238 case 0x000a:
4239 break;
4240
4241 case 0x000b:
4242 case 0x000c:
4243 case 0x0085:
4244 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4245 break;
4246 }
4247 }
4248 break;
4249
4250 case OP_NOT_HSPACE:
4251 for (i = 1; i <= min; i++)
4252 {
4253 if (eptr >= md->end_subject)
4254 {
4255 SCHECK_PARTIAL();
4256 MRRETURN(MATCH_NOMATCH);
4257 }
4258 switch(*eptr++)
4259 {
4260 default: break;
4261 case 0x09: /* HT */
4262 case 0x20: /* SPACE */
4263 case 0xa0: /* NBSP */
4264 MRRETURN(MATCH_NOMATCH);
4265 }
4266 }
4267 break;
4268
4269 case OP_HSPACE:
4270 for (i = 1; i <= min; i++)
4271 {
4272 if (eptr >= md->end_subject)
4273 {
4274 SCHECK_PARTIAL();
4275 MRRETURN(MATCH_NOMATCH);
4276 }
4277 switch(*eptr++)
4278 {
4279 default: MRRETURN(MATCH_NOMATCH);
4280 case 0x09: /* HT */
4281 case 0x20: /* SPACE */
4282 case 0xa0: /* NBSP */
4283 break;
4284 }
4285 }
4286 break;
4287
4288 case OP_NOT_VSPACE:
4289 for (i = 1; i <= min; i++)
4290 {
4291 if (eptr >= md->end_subject)
4292 {
4293 SCHECK_PARTIAL();
4294 MRRETURN(MATCH_NOMATCH);
4295 }
4296 switch(*eptr++)
4297 {
4298 default: break;
4299 case 0x0a: /* LF */
4300 case 0x0b: /* VT */
4301 case 0x0c: /* FF */
4302 case 0x0d: /* CR */
4303 case 0x85: /* NEL */
4304 MRRETURN(MATCH_NOMATCH);
4305 }
4306 }
4307 break;
4308
4309 case OP_VSPACE:
4310 for (i = 1; i <= min; i++)
4311 {
4312 if (eptr >= md->end_subject)
4313 {
4314 SCHECK_PARTIAL();
4315 MRRETURN(MATCH_NOMATCH);
4316 }
4317 switch(*eptr++)
4318 {
4319 default: MRRETURN(MATCH_NOMATCH);
4320 case 0x0a: /* LF */
4321 case 0x0b: /* VT */
4322 case 0x0c: /* FF */
4323 case 0x0d: /* CR */
4324 case 0x85: /* NEL */
4325 break;
4326 }
4327 }
4328 break;
4329
4330 case OP_NOT_DIGIT:
4331 for (i = 1; i <= min; i++)
4332 {
4333 if (eptr >= md->end_subject)
4334 {
4335 SCHECK_PARTIAL();
4336 MRRETURN(MATCH_NOMATCH);
4337 }
4338 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4339 }
4340 break;
4341
4342 case OP_DIGIT:
4343 for (i = 1; i <= min; i++)
4344 {
4345 if (eptr >= md->end_subject)
4346 {
4347 SCHECK_PARTIAL();
4348 MRRETURN(MATCH_NOMATCH);
4349 }
4350 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4351 }
4352 break;
4353
4354 case OP_NOT_WHITESPACE:
4355 for (i = 1; i <= min; i++)
4356 {
4357 if (eptr >= md->end_subject)
4358 {
4359 SCHECK_PARTIAL();
4360 MRRETURN(MATCH_NOMATCH);
4361 }
4362 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4363 }
4364 break;
4365
4366 case OP_WHITESPACE:
4367 for (i = 1; i <= min; i++)
4368 {
4369 if (eptr >= md->end_subject)
4370 {
4371 SCHECK_PARTIAL();
4372 MRRETURN(MATCH_NOMATCH);
4373 }
4374 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4375 }
4376 break;
4377
4378 case OP_NOT_WORDCHAR:
4379 for (i = 1; i <= min; i++)
4380 {
4381 if (eptr >= md->end_subject)
4382 {
4383 SCHECK_PARTIAL();
4384 MRRETURN(MATCH_NOMATCH);
4385 }
4386 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4387 MRRETURN(MATCH_NOMATCH);
4388 }
4389 break;
4390
4391 case OP_WORDCHAR:
4392 for (i = 1; i <= min; i++)
4393 {
4394 if (eptr >= md->end_subject)
4395 {
4396 SCHECK_PARTIAL();
4397 MRRETURN(MATCH_NOMATCH);
4398 }
4399 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4400 MRRETURN(MATCH_NOMATCH);
4401 }
4402 break;
4403
4404 default:
4405 RRETURN(PCRE_ERROR_INTERNAL);
4406 }
4407 }
4408
4409 /* If min = max, continue at the same level without recursing */
4410
4411 if (min == max) continue;
4412
4413 /* If minimizing, we have to test the rest of the pattern before each
4414 subsequent match. Again, separate the UTF-8 case for speed, and also
4415 separate the UCP cases. */
4416
4417 if (minimize)
4418 {
4419 #ifdef SUPPORT_UCP
4420 if (prop_type >= 0)
4421 {
4422 switch(prop_type)
4423 {
4424 case PT_ANY:
4425 for (fi = min;; fi++)
4426 {
4427 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4429 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4430 if (eptr >= md->end_subject)
4431 {
4432 SCHECK_PARTIAL();
4433 MRRETURN(MATCH_NOMATCH);
4434 }
4435 GETCHARINCTEST(c, eptr);
4436 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4437 }
4438 /* Control never gets here */
4439
4440 case PT_LAMP:
4441 for (fi = min;; fi++)
4442 {
4443 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4445 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4446 if (eptr >= md->end_subject)
4447 {
4448 SCHECK_PARTIAL();
4449 MRRETURN(MATCH_NOMATCH);
4450 }
4451 GETCHARINCTEST(c, eptr);
4452 prop_chartype = UCD_CHARTYPE(c);
4453 if ((prop_chartype == ucp_Lu ||
4454 prop_chartype == ucp_Ll ||
4455 prop_chartype == ucp_Lt) == prop_fail_result)
4456 MRRETURN(MATCH_NOMATCH);
4457 }
4458 /* Control never gets here */
4459
4460 case PT_GC:
4461 for (fi = min;; fi++)
4462 {
4463 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4464 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4465 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4466 if (eptr >= md->end_subject)
4467 {
4468 SCHECK_PARTIAL();
4469 MRRETURN(MATCH_NOMATCH);
4470 }
4471 GETCHARINCTEST(c, eptr);
4472 prop_category = UCD_CATEGORY(c);
4473 if ((prop_category == prop_value) == prop_fail_result)
4474 MRRETURN(MATCH_NOMATCH);
4475 }
4476 /* Control never gets here */
4477
4478 case PT_PC:
4479 for (fi = min;; fi++)
4480 {
4481 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4483 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4484 if (eptr >= md->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 MRRETURN(MATCH_NOMATCH);
4488 }
4489 GETCHARINCTEST(c, eptr);
4490 prop_chartype = UCD_CHARTYPE(c);
4491 if ((prop_chartype == prop_value) == prop_fail_result)
4492 MRRETURN(MATCH_NOMATCH);
4493 }
4494 /* Control never gets here */
4495
4496 case PT_SC:
4497 for (fi = min;; fi++)
4498 {
4499 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4500 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4501 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4502 if (eptr >= md->end_subject)
4503 {
4504 SCHECK_PARTIAL();
4505 MRRETURN(MATCH_NOMATCH);
4506 }
4507 GETCHARINCTEST(c, eptr);
4508 prop_script = UCD_SCRIPT(c);
4509 if ((prop_script == prop_value) == prop_fail_result)
4510 MRRETURN(MATCH_NOMATCH);
4511 }
4512 /* Control never gets here */
4513
4514 case PT_ALNUM:
4515 for (fi = min;; fi++)
4516 {
4517 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4518 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4519 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4520 if (eptr >= md->end_subject)
4521 {
4522 SCHECK_PARTIAL();
4523 MRRETURN(MATCH_NOMATCH);
4524 }
4525 GETCHARINCTEST(c, eptr);
4526 prop_category = UCD_CATEGORY(c);
4527 if ((prop_category == ucp_L || prop_category == ucp_N)
4528 == prop_fail_result)
4529 MRRETURN(MATCH_NOMATCH);
4530 }
4531 /* Control never gets here */
4532
4533 case PT_SPACE: /* Perl space */
4534 for (fi = min;; fi++)
4535 {
4536 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4538 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4539 if (eptr >= md->end_subject)
4540 {
4541 SCHECK_PARTIAL();
4542 MRRETURN(MATCH_NOMATCH);
4543 }
4544 GETCHARINCTEST(c, eptr);
4545 prop_category = UCD_CATEGORY(c);
4546 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4547 c == CHAR_FF || c == CHAR_CR)
4548 == prop_fail_result)
4549 MRRETURN(MATCH_NOMATCH);
4550 }
4551 /* Control never gets here */
4552
4553 case PT_PXSPACE: /* POSIX space */
4554 for (fi = min;; fi++)
4555 {
4556 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4558 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4559 if (eptr >= md->end_subject)
4560 {
4561 SCHECK_PARTIAL();
4562 MRRETURN(MATCH_NOMATCH);
4563 }
4564 GETCHARINCTEST(c, eptr);
4565 prop_category = UCD_CATEGORY(c);
4566 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4567 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4568 == prop_fail_result)
4569 MRRETURN(MATCH_NOMATCH);
4570 }
4571 /* Control never gets here */
4572
4573 case PT_WORD:
4574 for (fi = min;; fi++)
4575 {
4576 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4577 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4578 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4579 if (eptr >= md->end_subject)
4580 {
4581 SCHECK_PARTIAL();
4582 MRRETURN(MATCH_NOMATCH);
4583 }
4584 GETCHARINCTEST(c, eptr);
4585 prop_category = UCD_CATEGORY(c);
4586 if ((prop_category == ucp_L ||
4587 prop_category == ucp_N ||
4588 c == CHAR_UNDERSCORE)
4589 == prop_fail_result)
4590 MRRETURN(MATCH_NOMATCH);
4591 }
4592 /* Control never gets here */
4593
4594 /* This should never occur */
4595
4596 default:
4597 RRETURN(PCRE_ERROR_INTERNAL);
4598 }
4599 }
4600
4601 /* Match extended Unicode sequences. We will get here only if the
4602 support is in the binary; otherwise a compile-time error occurs. */
4603
4604 else if (ctype == OP_EXTUNI)
4605 {
4606 for (fi = min;; fi++)
4607 {
4608 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4609 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4610 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4611 if (eptr >= md->end_subject)
4612 {
4613 SCHECK_PARTIAL();
4614 MRRETURN(MATCH_NOMATCH);
4615 }
4616 GETCHARINCTEST(c, eptr);
4617 prop_category = UCD_CATEGORY(c);
4618 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4619 while (eptr < md->end_subject)
4620 {
4621 int len = 1;
4622 if (!utf8) c = *eptr;
4623 else { GETCHARLEN(c, eptr, len); }
4624 prop_category = UCD_CATEGORY(c);
4625 if (prop_category != ucp_M) break;
4626 eptr += len;
4627 }
4628 }
4629 }
4630
4631 else
4632 #endif /* SUPPORT_UCP */
4633
4634 #ifdef SUPPORT_UTF8
4635 /* UTF-8 mode */
4636 if (utf8)
4637 {
4638 for (fi = min;; fi++)
4639 {
4640 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4642 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4643 if (eptr >= md->end_subject)
4644 {
4645 SCHECK_PARTIAL();
4646 MRRETURN(MATCH_NOMATCH);
4647 }
4648 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4649 MRRETURN(MATCH_NOMATCH);
4650 GETCHARINC(c, eptr);
4651 switch(ctype)
4652 {
4653 case OP_ANY: /* This is the non-NL case */
4654 case OP_ALLANY:
4655 case OP_ANYBYTE:
4656 break;
4657
4658 case OP_ANYNL:
4659 switch(c)
4660 {
4661 default: MRRETURN(MATCH_NOMATCH);
4662 case 0x000d:
4663 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4664 break;
4665 case 0x000a:
4666 break;
4667
4668 case 0x000b:
4669 case 0x000c:
4670 case 0x0085:
4671 case 0x2028:
4672 case 0x2029:
4673 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4674 break;
4675 }
4676 break;
4677
4678 case OP_NOT_HSPACE:
4679 switch(c)
4680 {
4681 default: break;
4682 case 0x09: /* HT */
4683 case 0x20: /* SPACE */
4684 case 0xa0: /* NBSP */
4685 case 0x1680: /* OGHAM SPACE MARK */
4686 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4687 case 0x2000: /* EN QUAD */
4688 case 0x2001: /* EM QUAD */
4689 case 0x2002: /* EN SPACE */
4690 case 0x2003: /* EM SPACE */
4691 case 0x2004: /* THREE-PER-EM SPACE */
4692 case 0x2005: /* FOUR-PER-EM SPACE */
4693 case 0x2006: /* SIX-PER-EM SPACE */
4694 case 0x2007: /* FIGURE SPACE */
4695 case 0x2008: /* PUNCTUATION SPACE */
4696 case 0x2009: /* THIN SPACE */
4697 case 0x200A: /* HAIR SPACE */
4698 case 0x202f: /* NARROW NO-BREAK SPACE */
4699 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4700 case 0x3000: /* IDEOGRAPHIC SPACE */
4701 MRRETURN(MATCH_NOMATCH);
4702 }
4703 break;
4704
4705 case OP_HSPACE:
4706 switch(c)
4707 {
4708 default: MRRETURN(MATCH_NOMATCH);
4709 case 0x09: /* HT */
4710 case 0x20: /* SPACE */
4711 case 0xa0: /* NBSP */
4712 case 0x1680: /* OGHAM SPACE MARK */
4713 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4714 case 0x2000: /* EN QUAD */
4715 case 0x2001: /* EM QUAD */
4716 case 0x2002: /* EN SPACE */
4717 case 0x2003: /* EM SPACE */
4718 case 0x2004: /* THREE-PER-EM SPACE */
4719 case 0x2005: /* FOUR-PER-EM SPACE */
4720 case 0x2006: /* SIX-PER-EM SPACE */
4721 case 0x2007: /* FIGURE SPACE */
4722 case 0x2008: /* PUNCTUATION SPACE */
4723 case 0x2009: /* THIN SPACE */
4724 case 0x200A: /* HAIR SPACE */
4725 case 0x202f: /* NARROW NO-BREAK SPACE */
4726 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4727 case 0x3000: /* IDEOGRAPHIC SPACE */
4728 break;
4729 }
4730 break;
4731
4732 case OP_NOT_VSPACE:
4733 switch(c)
4734 {
4735 default: break;
4736 case 0x0a: /* LF */
4737 case 0x0b: /* VT */
4738 case 0x0c: /* FF */
4739 case 0x0d: /* CR */
4740 case 0x85: /* NEL */
4741 case 0x2028: /* LINE SEPARATOR */
4742 case 0x2029: /* PARAGRAPH SEPARATOR */
4743 MRRETURN(MATCH_NOMATCH);
4744 }
4745 break;
4746
4747 case OP_VSPACE:
4748 switch(c)
4749 {
4750 default: MRRETURN(MATCH_NOMATCH);
4751 case 0x0a: /* LF */
4752 case 0x0b: /* VT */
4753 case 0x0c: /* FF */
4754 case 0x0d: /* CR */
4755 case 0x85: /* NEL */
4756 case 0x2028: /* LINE SEPARATOR */
4757 case 0x2029: /* PARAGRAPH SEPARATOR */
4758 break;
4759 }
4760 break;
4761
4762 case OP_NOT_DIGIT:
4763 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4764 MRRETURN(MATCH_NOMATCH);
4765 break;
4766
4767 case OP_DIGIT:
4768 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4769 MRRETURN(MATCH_NOMATCH);
4770 break;
4771
4772 case OP_NOT_WHITESPACE:
4773 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4774 MRRETURN(MATCH_NOMATCH);
4775 break;
4776
4777 case OP_WHITESPACE:
4778 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4779 MRRETURN(MATCH_NOMATCH);
4780 break;
4781
4782 case OP_NOT_WORDCHAR:
4783 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4784 MRRETURN(MATCH_NOMATCH);
4785 break;
4786
4787 case OP_WORDCHAR:
4788 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4789 MRRETURN(MATCH_NOMATCH);
4790 break;
4791
4792 default:
4793 RRETURN(PCRE_ERROR_INTERNAL);
4794 }
4795 }
4796 }
4797 else
4798 #endif
4799 /* Not UTF-8 mode */
4800 {
4801 for (fi = min;; fi++)
4802 {
4803 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4805 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4806 if (eptr >= md->end_subject)
4807 {
4808 SCHECK_PARTIAL();
4809 MRRETURN(MATCH_NOMATCH);
4810 }
4811 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4812 MRRETURN(MATCH_NOMATCH);
4813 c = *eptr++;
4814 switch(ctype)
4815 {
4816 case OP_ANY: /* This is the non-NL case */
4817 case OP_ALLANY:
4818 case OP_ANYBYTE:
4819 break;
4820
4821 case OP_ANYNL:
4822 switch(c)
4823 {
4824 default: MRRETURN(MATCH_NOMATCH);
4825 case 0x000d:
4826 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4827 break;
4828
4829 case 0x000a:
4830 break;
4831
4832 case 0x000b:
4833 case 0x000c:
4834 case 0x0085:
4835 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4836 break;
4837 }
4838 break;
4839
4840 case OP_NOT_HSPACE:
4841 switch(c)
4842 {
4843 default: break;
4844 case 0x09: /* HT */
4845 case 0x20: /* SPACE */
4846 case 0xa0: /* NBSP */
4847 MRRETURN(MATCH_NOMATCH);
4848 }
4849 break;
4850
4851 case OP_HSPACE:
4852 switch(c)
4853 {
4854 default: MRRETURN(MATCH_NOMATCH);
4855 case 0x09: /* HT */
4856 case 0x20: /* SPACE */
4857 case 0xa0: /* NBSP */
4858 break;
4859 }
4860 break;
4861
4862 case OP_NOT_VSPACE:
4863 switch(c)
4864 {
4865 default: break;
4866 case 0x0a: /* LF */
4867 case 0x0b: /* VT */
4868 case 0x0c: /* FF */
4869 case 0x0d: /* CR */
4870 case 0x85: /* NEL */
4871 MRRETURN(MATCH_NOMATCH);
4872 }
4873 break;
4874
4875 case OP_VSPACE:
4876 switch(c)
4877 {
4878 default: MRRETURN(MATCH_NOMATCH);
4879 case 0x0a: /* LF */
4880 case 0x0b: /* VT */
4881 case 0x0c: /* FF */
4882 case 0x0d: /* CR */
4883 case 0x85: /* NEL */
4884 break;
4885 }
4886 break;
4887
4888 case OP_NOT_DIGIT:
4889 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4890 break;
4891
4892 case OP_DIGIT:
4893 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4894 break;
4895
4896 case OP_NOT_WHITESPACE:
4897 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4898 break;
4899
4900 case OP_WHITESPACE:
4901 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4902 break;
4903
4904 case OP_NOT_WORDCHAR:
4905 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4906 break;
4907
4908 case OP_WORDCHAR:
4909 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4910 break;
4911
4912 default:
4913 RRETURN(PCRE_ERROR_INTERNAL);
4914 }
4915 }
4916 }
4917 /* Control never gets here */
4918 }
4919
4920 /* If maximizing, it is worth using inline code for speed, doing the type
4921 test once at the start (i.e. keep it out of the loop). Again, keep the
4922 UTF-8 and UCP stuff separate. */
4923
4924 else
4925 {
4926 pp = eptr; /* Remember where we started */
4927
4928 #ifdef SUPPORT_UCP
4929 if (prop_type >= 0)
4930 {
4931 switch(prop_type)
4932 {
4933 case PT_ANY:
4934 for (i = min; i < max; i++)
4935 {
4936 int len = 1;
4937 if (eptr >= md->end_subject)
4938 {
4939 SCHECK_PARTIAL();
4940 break;
4941 }
4942 GETCHARLENTEST(c, eptr, len);
4943 if (prop_fail_result) break;
4944 eptr+= len;
4945 }
4946 break;
4947
4948 case PT_LAMP:
4949 for (i = min; i < max; i++)
4950 {
4951 int len = 1;
4952 if (eptr >= md->end_subject)
4953 {
4954 SCHECK_PARTIAL();
4955 break;
4956 }
4957 GETCHARLENTEST(c, eptr, len);
4958 prop_chartype = UCD_CHARTYPE(c);
4959 if ((prop_chartype == ucp_Lu ||
4960 prop_chartype == ucp_Ll ||
4961 prop_chartype == ucp_Lt) == prop_fail_result)
4962 break;
4963 eptr+= len;
4964 }
4965 break;
4966
4967 case PT_GC:
4968 for (i = min; i < max; i++)
4969 {
4970 int len = 1;
4971 if (eptr >= md->end_subject)
4972 {
4973 SCHECK_PARTIAL();
4974 break;
4975 }
4976 GETCHARLENTEST(c, eptr, len);
4977 prop_category = UCD_CATEGORY(c);
4978 if ((prop_category == prop_value) == prop_fail_result)
4979 break;
4980 eptr+= len;
4981 }
4982 break;
4983
4984 case PT_PC:
4985 for (i = min; i < max; i++)
4986 {
4987 int len = 1;
4988 if (eptr >= md->end_subject)
4989 {
4990 SCHECK_PARTIAL();
4991 break;
4992 }
4993 GETCHARLENTEST(c, eptr, len);
4994 prop_chartype = UCD_CHARTYPE(c);
4995 if ((prop_chartype == prop_value) == prop_fail_result)
4996 break;
4997 eptr+= len;
4998 }
4999 break;
5000
5001 case PT_SC:
5002 for (i = min; i < max; i++)
5003 {
5004 int len = 1;
5005 if (eptr >= md->end_subject)
5006 {
5007 SCHECK_PARTIAL();
5008 break;
5009 }
5010 GETCHARLENTEST(c, eptr, len);
5011 prop_script = UCD_SCRIPT(c);
5012 if ((prop_script == prop_value) == prop_fail_result)
5013 break;
5014 eptr+= len;
5015 }
5016 break;
5017
5018 case PT_ALNUM:
5019 for (i = min; i < max; i++)
5020 {
5021 int len = 1;
5022 if (eptr >= md->end_subject)
5023 {
5024 SCHECK_PARTIAL();
5025 break;
5026 }
5027 GETCHARLENTEST(c, eptr, len);
5028 prop_category = UCD_CATEGORY(c);
5029 if ((prop_category == ucp_L || prop_category == ucp_N)
5030 == prop_fail_result)
5031 break;
5032 eptr+= len;
5033 }
5034 break;
5035
5036 case PT_SPACE: /* Perl space */
5037 for (i = min; i < max; i++)
5038 {
5039 int len = 1;
5040 if (eptr >= md->end_subject)
5041 {
5042 SCHECK_PARTIAL();
5043 break;
5044 }
5045 GETCHARLENTEST(c, eptr, len);
5046 prop_category = UCD_CATEGORY(c);
5047 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5048 c == CHAR_FF || c == CHAR_CR)
5049 == prop_fail_result)
5050 break;
5051 eptr+= len;
5052 }
5053 break;
5054
5055 case PT_PXSPACE: /* POSIX space */
5056 for (i = min; i < max; i++)
5057 {
5058 int len = 1;
5059 if (eptr >= md->end_subject)
5060 {
5061 SCHECK_PARTIAL();
5062 break;
5063 }
5064 GETCHARLENTEST(c, eptr, len);
5065 prop_category = UCD_CATEGORY(c);
5066 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5067 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5068 == prop_fail_result)
5069 break;
5070 eptr+= len;
5071 }
5072 break;
5073
5074 case PT_WORD:
5075 for (i = min; i < max; i++)
5076 {
5077 int len = 1;
5078 if (eptr >= md->end_subject)
5079 {
5080 SCHECK_PARTIAL();
5081 break;
5082 }
5083 GETCHARLENTEST(c, eptr, len);
5084 prop_category = UCD_CATEGORY(c);
5085 if ((prop_category == ucp_L || prop_category == ucp_N ||
5086 c == CHAR_UNDERSCORE) == prop_fail_result)
5087 break;
5088 eptr+= len;
5089 }
5090 break;
5091
5092 default:
5093 RRETURN(PCRE_ERROR_INTERNAL);
5094 }
5095
5096 /* eptr is now past the end of the maximum run */
5097
5098 if (possessive) continue;
5099 for(;;)
5100 {
5101 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5102 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5103 if (eptr-- == pp) break; /* Stop if tried at original pos */
5104 if (utf8) BACKCHAR(eptr);
5105 }
5106 }
5107
5108 /* Match extended Unicode sequences. We will get here only if the
5109 support is in the binary; otherwise a compile-time error occurs. */
5110
5111 else if (ctype == OP_EXTUNI)
5112 {
5113 for (i = min; i < max; i++)
5114 {
5115 if (eptr >= md->end_subject)
5116 {
5117 SCHECK_PARTIAL();
5118 break;
5119 }
5120 GETCHARINCTEST(c, eptr);
5121 prop_category = UCD_CATEGORY(c);
5122 if (prop_category == ucp_M) break;
5123 while (eptr < md->end_subject)
5124 {
5125 int len = 1;
5126 if (!utf8) c = *eptr; else
5127 {
5128 GETCHARLEN(c, eptr, len);
5129 }
5130 prop_category = UCD_CATEGORY(c);
5131 if (prop_category != ucp_M) break;
5132 eptr += len;
5133 }
5134 }
5135
5136 /* eptr is now past the end of the maximum run */
5137
5138 if (possessive) continue;
5139
5140 for(;;)
5141 {
5142 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5143 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5144 if (eptr-- == pp) break; /* Stop if tried at original pos */
5145 for (;;) /* Move back over one extended */
5146 {
5147 int len = 1;
5148 if (!utf8) c = *eptr; else
5149 {
5150 BACKCHAR(eptr);
5151 GETCHARLEN(c, eptr, len);
5152 }
5153 prop_category = UCD_CATEGORY(c);
5154 if (prop_category != ucp_M) break;
5155 eptr--;
5156 }
5157 }
5158 }
5159
5160 else
5161 #endif /* SUPPORT_UCP */
5162
5163 #ifdef SUPPORT_UTF8
5164 /* UTF-8 mode */
5165
5166 if (utf8)
5167 {
5168 switch(ctype)
5169 {
5170 case OP_ANY:
5171 if (max < INT_MAX)
5172 {
5173 for (i = min; i < max; i++)
5174 {
5175 if (eptr >= md->end_subject)
5176 {
5177 SCHECK_PARTIAL();
5178 break;
5179 }
5180 if (IS_NEWLINE(eptr)) break;
5181 eptr++;
5182 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5183 }
5184 }
5185
5186 /* Handle unlimited UTF-8 repeat */
5187
5188 else
5189 {
5190 for (i = min; i < max; i++)
5191 {
5192 if (eptr >= md->end_subject)
5193 {
5194 SCHECK_PARTIAL();
5195 break;
5196 }
5197 if (IS_NEWLINE(eptr)) break;
5198 eptr++;
5199 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5200 }
5201 }
5202 break;
5203
5204 case OP_ALLANY:
5205 if (max < INT_MAX)
5206 {
5207 for (i = min; i < max; i++)
5208 {
5209 if (eptr >= md->end_subject)
5210 {
5211 SCHECK_PARTIAL();
5212 break;
5213 }
5214 eptr++;
5215 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5216 }
5217 }
5218 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5219 break;
5220
5221 /* The byte case is the same as non-UTF8 */
5222
5223 case OP_ANYBYTE:
5224 c = max - min;
5225 if (c > (unsigned int)(md->end_subject - eptr))
5226 {
5227 eptr = md->end_subject;
5228 SCHECK_PARTIAL();
5229 }
5230 else eptr += c;
5231 break;
5232
5233 case OP_ANYNL:
5234 for (i = min; i < max; i++)
5235 {
5236 int len = 1;
5237 if (eptr >= md->end_subject)
5238 {
5239 SCHECK_PARTIAL();
5240 break;
5241 }
5242 GETCHARLEN(c, eptr, len);
5243 if (c == 0x000d)
5244 {
5245 if (++eptr >= md->end_subject) break;
5246 if (*eptr == 0x000a) eptr++;
5247 }
5248 else
5249 {
5250 if (c != 0x000a &&
5251 (md->bsr_anycrlf ||
5252 (c != 0x000b && c != 0x000c &&
5253 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5254 break;
5255 eptr += len;
5256 }
5257 }
5258 break;
5259
5260 case OP_NOT_HSPACE:
5261 case OP_HSPACE:
5262 for (i = min; i < max; i++)
5263 {
5264 BOOL gotspace;
5265 int len = 1;
5266 if (eptr >= md->end_subject)
5267 {
5268 SCHECK_PARTIAL();
5269 break;
5270 }
5271 GETCHARLEN(c, eptr, len);
5272 switch(c)
5273 {
5274 default: gotspace = FALSE; break;
5275 case 0x09: /* HT */
5276 case 0x20: /* SPACE */
5277 case 0xa0: /* NBSP */
5278 case 0x1680: /* OGHAM SPACE MARK */
5279 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5280 case 0x2000: /* EN QUAD */
5281 case 0x2001: /* EM QUAD */
5282 case 0x2002: /* EN SPACE */
5283 case 0x2003: /* EM SPACE */
5284 case 0x2004: /* THREE-PER-EM SPACE */
5285 case 0x2005: /* FOUR-PER-EM SPACE */
5286 case 0x2006: /* SIX-PER-EM SPACE */
5287 case 0x2007: /* FIGURE SPACE */
5288 case 0x2008: /* PUNCTUATION SPACE */
5289 case 0x2009: /* THIN SPACE */
5290 case 0x200A: /* HAIR SPACE */
5291 case 0x202f: /* NARROW NO-BREAK SPACE */
5292 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5293 case 0x3000: /* IDEOGRAPHIC SPACE */
5294 gotspace = TRUE;
5295 break;
5296 }
5297 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5298 eptr += len;
5299 }
5300 break;
5301
5302 case OP_NOT_VSPACE:
5303 case OP_VSPACE:
5304 for (i = min; i < max; i++)
5305 {
5306 BOOL gotspace;
5307 int len = 1;
5308 if (eptr >= md->end_subject)
5309 {
5310 SCHECK_PARTIAL();
5311 break;
5312 }
5313 GETCHARLEN(c, eptr, len);
5314 switch(c)
5315 {
5316 default: gotspace = FALSE; break;
5317 case 0x0a: /* LF */
5318 case 0x0b: /* VT */
5319 case 0x0c: /* FF */
5320 case 0x0d: /* CR */
5321 case 0x85: /* NEL */
5322 case 0x2028: /* LINE SEPARATOR */
5323 case 0x2029: /* PARAGRAPH SEPARATOR */
5324 gotspace = TRUE;
5325 break;
5326 }
5327 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5328 eptr += len;
5329 }
5330 break;
5331
5332 case OP_NOT_DIGIT:
5333 for (i = min; i < max; i++)
5334 {
5335 int len = 1;
5336 if (eptr >= md->end_subject)
5337 {
5338 SCHECK_PARTIAL();
5339 break;
5340 }
5341 GETCHARLEN(c, eptr, len);
5342 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5343 eptr+= len;
5344 }
5345 break;
5346
5347 case OP_DIGIT:
5348 for (i = min; i < max; i++)
5349 {
5350 int len = 1;
5351 if (eptr >= md->end_subject)
5352 {
5353 SCHECK_PARTIAL();
5354 break;
5355 }
5356 GETCHARLEN(c, eptr, len);
5357 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5358 eptr+= len;
5359 }
5360 break;
5361
5362 case OP_NOT_WHITESPACE:
5363 for (i = min; i < max; i++)
5364 {
5365 int len = 1;
5366 if (eptr >= md->end_subject)
5367 {
5368 SCHECK_PARTIAL();
5369 break;
5370 }
5371 GETCHARLEN(c, eptr, len);
5372 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5373 eptr+= len;
5374 }
5375 break;
5376
5377 case OP_WHITESPACE:
5378 for (i = min; i < max; i++)
5379 {
5380 int len = 1;
5381 if (eptr >= md->end_subject)
5382 {
5383 SCHECK_PARTIAL();
5384 break;
5385 }
5386 GETCHARLEN(c, eptr, len);
5387 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5388 eptr+= len;
5389 }
5390 break;
5391
5392 case OP_NOT_WORDCHAR:
5393 for (i = min; i < max; i++)
5394 {
5395 int len = 1;
5396 if (eptr >= md->end_subject)
5397 {
5398 SCHECK_PARTIAL();
5399 break;
5400 }
5401 GETCHARLEN(c, eptr, len);
5402 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5403 eptr+= len;
5404 }
5405 break;
5406
5407 case OP_WORDCHAR:
5408 for (i = min; i < max; i++)
5409 {
5410 int len = 1;
5411 if (eptr >= md->end_subject)
5412 {
5413 SCHECK_PARTIAL();
5414 break;
5415 }
5416 GETCHARLEN(c, eptr, len);
5417 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5418 eptr+= len;
5419 }
5420 break;
5421
5422 default:
5423 RRETURN(PCRE_ERROR_INTERNAL);
5424 }
5425
5426 /* eptr is now past the end of the maximum run. If possessive, we are
5427 done (no backing up). Otherwise, match at this position; anything other
5428 than no match is immediately returned. For nomatch, back up one
5429 character, unless we are matching \R and the last thing matched was
5430 \r\n, in which case, back up two bytes. */
5431
5432 if (possessive) continue;
5433 for(;;)
5434 {
5435 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5436 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5437 if (eptr-- == pp) break; /* Stop if tried at original pos */
5438 BACKCHAR(eptr);
5439 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5440 eptr[-1] == '\r') eptr--;
5441 }
5442 }
5443 else
5444 #endif /* SUPPORT_UTF8 */
5445
5446 /* Not UTF-8 mode */
5447 {
5448 switch(ctype)
5449 {
5450 case OP_ANY:
5451 for (i = min; i < max; i++)
5452 {
5453 if (eptr >= md->end_subject)
5454 {
5455 SCHECK_PARTIAL();
5456 break;
5457 }
5458 if (IS_NEWLINE(eptr)) break;
5459 eptr++;
5460 }
5461 break;
5462
5463 case OP_ALLANY:
5464 case OP_ANYBYTE:
5465 c = max - min;
5466 if (c > (unsigned int)(md->end_subject - eptr))
5467 {
5468 eptr = md->end_subject;
5469 SCHECK_PARTIAL();
5470 }
5471 else eptr += c;
5472 break;
5473
5474 case OP_ANYNL:
5475 for (i = min; i < max; i++)
5476 {
5477 if (eptr >= md->end_subject)
5478 {
5479 SCHECK_PARTIAL();
5480 break;
5481 }
5482 c = *eptr;
5483 if (c == 0x000d)
5484 {
5485 if (++eptr >= md->end_subject) break;
5486 if (*eptr == 0x000a) eptr++;
5487 }
5488 else
5489 {
5490 if (c != 0x000a &&
5491 (md->bsr_anycrlf ||
5492 (c != 0x000b && c != 0x000c && c != 0x0085)))
5493 break;
5494 eptr++;
5495 }
5496 }
5497 break;
5498
5499 case OP_NOT_HSPACE:
5500 for (i = min; i < max; i++)
5501 {
5502 if (eptr >= md->end_subject)
5503 {
5504 SCHECK_PARTIAL();
5505 break;
5506 }
5507 c = *eptr;
5508 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5509 eptr++;
5510 }
5511 break;
5512
5513 case OP_HSPACE:
5514 for (i = min; i < max; i++)
5515 {
5516 if (eptr >= md->end_subject)
5517 {
5518 SCHECK_PARTIAL();
5519 break;
5520 }
5521 c = *eptr;
5522 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5523 eptr++;
5524 }
5525 break;
5526
5527 case OP_NOT_VSPACE:
5528 for (i = min; i < max; i++)
5529 {
5530 if (eptr >= md->end_subject)
5531 {
5532 SCHECK_PARTIAL();
5533 break;
5534 }
5535 c = *eptr;
5536 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5537 break;
5538 eptr++;
5539 }
5540 break;
5541
5542 case OP_VSPACE:
5543 for (i = min; i < max; i++)
5544 {
5545 if (eptr >= md->end_subject)
5546 {
5547 SCHECK_PARTIAL();
5548 break;
5549 }
5550 c = *eptr;
5551 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5552 break;
5553 eptr++;
5554 }
5555 break;
5556
5557 case OP_NOT_DIGIT:
5558 for (i = min; i < max; i++)
5559 {
5560 if (eptr >= md->end_subject)
5561 {
5562 SCHECK_PARTIAL();
5563 break;
5564 }
5565 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5566 eptr++;
5567 }
5568 break;
5569
5570 case OP_DIGIT:
5571 for (i = min; i < max; i++)
5572 {
5573 if (eptr >= md->end_subject)
5574 {
5575 SCHECK_PARTIAL();
5576 break;
5577 }
5578 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5579 eptr++;
5580 }
5581 break;
5582
5583 case OP_NOT_WHITESPACE:
5584 for (i = min; i < max; i++)
5585 {
5586 if (eptr >= md->end_subject)
5587 {
5588 SCHECK_PARTIAL();
5589 break;
5590 }
5591 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5592 eptr++;
5593 }
5594 break;
5595
5596 case OP_WHITESPACE:
5597 for (i = min; i < max; i++)
5598 {
5599 if (eptr >= md->end_subject)
5600 {
5601 SCHECK_PARTIAL();
5602 break;
5603 }
5604 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5605 eptr++;
5606 }
5607 break;
5608
5609 case OP_NOT_WORDCHAR:
5610 for (i = min; i < max; i++)
5611 {
5612 if (eptr >= md->end_subject)
5613 {
5614 SCHECK_PARTIAL();
5615 break;
5616 }
5617 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5618 eptr++;
5619 }
5620 break;
5621
5622 case OP_WORDCHAR:
5623 for (i = min; i < max; i++)
5624 {
5625 if (eptr >= md->end_subject)
5626 {
5627 SCHECK_PARTIAL();
5628 break;
5629 }
5630 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5631 eptr++;
5632 }
5633 break;
5634
5635 default:
5636 RRETURN(PCRE_ERROR_INTERNAL);
5637 }
5638
5639 /* eptr is now past the end of the maximum run. If possessive, we are
5640 done (no backing up). Otherwise, match at this position; anything other
5641 than no match is immediately returned. For nomatch, back up one
5642 character (byte), unless we are matching \R and the last thing matched
5643 was \r\n, in which case, back up two bytes. */
5644
5645 if (possessive) continue;
5646 while (eptr >= pp)
5647 {
5648 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5650 eptr--;
5651 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5652 eptr[-1] == '\r') eptr--;
5653 }
5654 }
5655
5656 /* Get here if we can't make it match with any permitted repetitions */
5657
5658 MRRETURN(MATCH_NOMATCH);
5659 }
5660 /* Control never gets here */
5661
5662 /* There's been some horrible disaster. Arrival here can only mean there is
5663 something seriously wrong in the code above or the OP_xxx definitions. */
5664
5665 default:
5666 DPRINTF(("Unknown opcode %d\n", *ecode));
5667 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5668 }
5669
5670 /* Do not stick any code in here without much thought; it is assumed
5671 that "continue" in the code above comes out to here to repeat the main
5672 loop. */
5673
5674 } /* End of main loop */
5675 /* Control never reaches here */
5676
5677
5678 /* When compiling to use the heap rather than the stack for recursive calls to
5679 match(), the RRETURN() macro jumps here. The number that is saved in
5680 frame->Xwhere indicates which label we actually want to return to. */
5681
5682 #ifdef NO_RECURSE
5683 #define LBL(val) case val: goto L_RM##val;
5684 HEAP_RETURN:
5685 switch (frame->Xwhere)
5686 {
5687 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5688 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5689 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5690 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5691 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5692 #ifdef SUPPORT_UTF8
5693 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5694 LBL(32) LBL(34) LBL(42) LBL(46)
5695 #ifdef SUPPORT_UCP
5696 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5697 LBL(59) LBL(60) LBL(61) LBL(62)
5698 #endif /* SUPPORT_UCP */
5699 #endif /* SUPPORT_UTF8 */
5700 default:
5701 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5702 return PCRE_ERROR_INTERNAL;
5703 }
5704 #undef LBL
5705 #endif /* NO_RECURSE */
5706 }
5707
5708
5709 /***************************************************************************
5710 ****************************************************************************
5711 RECURSION IN THE match() FUNCTION
5712
5713 Undefine all the macros that were defined above to handle this. */
5714
5715 #ifdef NO_RECURSE /* These macros exist only in the NO_RECURSE (heap instead of stack) build */
5716 #undef eptr
5717 #undef ecode
5718 #undef mstart
5719 #undef offset_top
5720 #undef eptrb
5721 #undef flags
5722
5723 #undef callpat
5724 #undef charptr
5725 #undef data
5726 #undef next
5727 #undef pp
5728 #undef prev
5729 #undef saved_eptr
5730
5731 #undef new_recursive
5732
5733 #undef cur_is_word
5734 #undef condition
5735 #undef prev_is_word
5736
5737 #undef ctype
5738 #undef length
5739 #undef max
5740 #undef min
5741 #undef number
5742 #undef offset
5743 #undef op
5744 #undef save_capture_last
5745 #undef save_offset1
5746 #undef save_offset2
5747 #undef save_offset3
5748 #undef stacksave
5749
5750 #undef newptrb
5751
5752 #endif /* NO_RECURSE */
5753
5754 /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they are undefined unconditionally */
5755
5756 #undef fc
5757 #undef fi
5758
5759 /***************************************************************************
5760 ***************************************************************************/
5761
5762
5763
5764 /*************************************************
5765 * Execute a Regular Expression *
5766 *************************************************/
5767
5768 /* This function applies a compiled regular expression to a subject string and,
5769 if it matches, picks out the matched portions. Two elements in the offsets
5770 vector are set for each substring: the offsets to its start and end.
5771
5772 Arguments:
5773 argument_re points to the compiled expression
5774 extra_data points to extra data or is NULL
5775 subject points to the subject string
5776 length length of subject string (the string may contain binary zeros)
5777 start_offset where to start in the subject string
5778 options option bits
5779 offsets points to a vector of ints to be filled in with offsets
5780 offsetcount the number of elements in the vector
5781
5782 Returns: > 0 => success; value is the number of elements filled in
5783 = 0 => success, but offsets is not big enough
5784 -1 => failed to match
5785 < -1 => some kind of unexpected problem
5786 */
5787
5788 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5789 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5790 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5791 int offsetcount)
5792 {
5793 int rc, resetcount, ocount;
5794 int first_byte = -1;
5795 int req_byte = -1;
5796 int req_byte2 = -1;
5797 int newline;
5798 BOOL using_temporary_offsets = FALSE;
5799 BOOL anchored;
5800 BOOL startline;
5801 BOOL firstline;
5802 BOOL first_byte_caseless = FALSE;
5803 BOOL req_byte_caseless = FALSE;
5804 BOOL utf8;
5805 match_data match_block;
5806 match_data *md = &match_block;
5807 const uschar *tables;
5808 const uschar *start_bits = NULL;
5809 USPTR start_match = (USPTR)subject + start_offset;
5810 USPTR end_subject;
5811 USPTR start_partial = NULL;
5812 USPTR req_byte_ptr = start_match - 1;
5813
5814 pcre_study_data internal_study;
5815 const pcre_study_data *study;
5816
5817 real_pcre internal_re;
5818 const real_pcre *external_re = (const real_pcre *)argument_re;
5819 const real_pcre *re = external_re;
5820
5821 /* Plausibility checks */
5822
5823 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5824 if (re == NULL || subject == NULL ||
5825 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5826 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5827 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5828
5829 /* This information is for finding all the numbers associated with a given
5830 name, for condition testing. */
5831
5832 md->name_table = (uschar *)re + re->name_table_offset;
5833 md->name_count = re->name_count;
5834 md->name_entry_size = re->name_entry_size;
5835
5836 /* Fish out the optional data from the extra_data structure, first setting
5837 the default values. */
5838
5839 study = NULL;
5840 md->match_limit = MATCH_LIMIT;
5841 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5842 md->callout_data = NULL;
5843
5844 /* The table pointer is always in native byte order. */
5845
5846 tables = external_re->tables;
5847
5848 if (extra_data != NULL)
5849 {
5850 register unsigned int flags = extra_data->flags;
5851 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5852 study = (const pcre_study_data *)extra_data->study_data;
5853 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5854 md->match_limit = extra_data->match_limit;
5855 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5856 md->match_limit_recursion = extra_data->match_limit_recursion;
5857 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5858 md->callout_data = extra_data->callout_data;
5859 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5860 }
5861
5862 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5863 is a feature that makes it possible to save compiled regexes and re-use them
5864 in other programs later. */
5865
5866 if (tables == NULL) tables = _pcre_default_tables;
5867
5868 /* Check that the first field in the block is the magic number. If it is not,
5869 test for a regex that was compiled on a host of opposite endianness. If this is
5870 the case, flipped values are put in internal_re and internal_study if there was
5871 study data too. */
5872
5873 if (re->magic_number != MAGIC_NUMBER)
5874 {
5875 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5876 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5877 if (study != NULL) study = &internal_study;
5878 }
5879
5880 /* Set up other data */
5881
5882 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5883 startline = (re->flags & PCRE_STARTLINE) != 0;
5884 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5885
5886 /* The code starts after the real_pcre block and the capture name table. */
5887
5888 md->start_code = (const uschar *)external_re + re->name_table_offset +
5889 re->name_count * re->name_entry_size;
5890
5891 md->start_subject = (USPTR)subject;
5892 md->start_offset = start_offset;
5893 md->end_subject = md->start_subject + length;
5894 end_subject = md->end_subject;
5895
5896 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5897 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5898 md->use_ucp = (re->options & PCRE_UCP) != 0;
5899 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5900
5901 md->notbol = (options & PCRE_NOTBOL) != 0;
5902 md->noteol = (options & PCRE_NOTEOL) != 0;
5903 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5904 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5905 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5906 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5907 md->hitend = FALSE;
5908 md->mark = NULL; /* In case never set */
5909
5910 md->recursive = NULL; /* No recursion at top level */
5911
5912 md->lcc = tables + lcc_offset;
5913 md->ctypes = tables + ctypes_offset;
5914
5915 /* Handle different \R options. */
5916
5917 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5918 {
5919 case 0:
5920 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5921 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5922 else
5923 #ifdef BSR_ANYCRLF
5924 md->bsr_anycrlf = TRUE;
5925 #else
5926 md->bsr_anycrlf = FALSE;
5927 #endif
5928 break;
5929
5930 case PCRE_BSR_ANYCRLF:
5931 md->bsr_anycrlf = TRUE;
5932 break;
5933
5934 case PCRE_BSR_UNICODE:
5935 md->bsr_anycrlf = FALSE;
5936 break;
5937
5938 default: return PCRE_ERROR_BADNEWLINE;
5939 }
5940
5941 /* Handle different types of newline. The three bits give eight cases. If
5942 nothing is set at run time, whatever was used at compile time applies. */
5943
5944 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5945 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5946 {
5947 case 0: newline = NEWLINE; break; /* Compile-time default */
5948 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5949 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5950 case PCRE_NEWLINE_CR+
5951 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5952 case PCRE_NEWLINE_ANY: newline = -1; break;
5953 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5954 default: return PCRE_ERROR_BADNEWLINE;
5955 }
5956
5957 if (newline == -2)
5958 {
5959 md->nltype = NLTYPE_ANYCRLF;
5960 }
5961 else if (newline < 0)
5962 {
5963 md->nltype = NLTYPE_ANY;
5964 }
5965 else
5966 {
5967 md->nltype = NLTYPE_FIXED;
5968 if (newline > 255)
5969 {
5970 md->nllen = 2;
5971 md->nl[0] = (newline >> 8) & 255;
5972 md->nl[1] = newline & 255;
5973 }
5974 else
5975 {
5976 md->nllen = 1;
5977 md->nl[0] = newline;
5978 }
5979 }
5980
5981 /* Partial matching was originally supported only for a restricted set of
5982 regexes; from release 8.00 there are no restrictions, but the bits are still
5983 defined (though never set). So there's no harm in leaving this code. */
5984
5985 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5986 return PCRE_ERROR_BADPARTIAL;
5987
5988 /* Check a UTF-8 string if required. Pass back the character offset and error
5989 code for an invalid string if a results vector is available. */
5990
5991 #ifdef SUPPORT_UTF8
5992 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5993 {
5994 int erroroffset;
5995 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5996 if (errorcode != 0)
5997 {
5998 if (offsetcount >= 2)
5999 {
6000 offsets[0] = erroroffset;
6001 offsets[1] = errorcode;
6002 }
6003 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6004 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6005 }
6006
6007 /* Check that a start_offset points to the start of a UTF-8 character. */
6008
6009 if (start_offset > 0 && start_offset < length &&
6010 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6011 return PCRE_ERROR_BADUTF8_OFFSET;
6012 }
6013 #endif
6014
6015 /* If the expression has got more back references than the offsets supplied can
6016 hold, we get a temporary chunk of working store to use during the matching.
6017 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6018 of 3. */
6019
6020 ocount = offsetcount - (offsetcount % 3);
6021
6022 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6023 {
6024 ocount = re->top_backref * 3 + 3;
6025 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6026 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6027 using_temporary_offsets = TRUE;
6028 DPRINTF(("Got memory to hold back references\n"));
6029 }
6030 else md->offset_vector = offsets;
6031
6032 md->offset_end = ocount;
6033 md->offset_max = (2*ocount)/3;
6034 md->offset_overflow = FALSE;
6035 md->capture_last = -1;
6036
6037 /* Compute the minimum number of offsets that we need to reset each time. Doing
6038 this makes a huge difference to execution time when there aren't many brackets
6039 in the pattern. */
6040
6041 resetcount = 2 + re->top_bracket * 2;
6042 if (resetcount > offsetcount) resetcount = ocount;
6043
6044 /* Reset the working variable associated with each extraction. These should
6045 never be used unless previously set, but they get saved and restored, and so we
6046 initialize them to avoid reading uninitialized locations. */
6047
6048 if (md->offset_vector != NULL)
6049 {
6050 register int *iptr = md->offset_vector + ocount;
6051 register int *iend = iptr - resetcount/2 + 1;
6052 while (--iptr >= iend) *iptr = -1;
6053 }
6054
6055 /* Set up the first character to match, if available. The first_byte value is
6056 never set for an anchored regular expression, but the anchoring may be forced
6057 at run time, so we have to test for anchoring. The first char may be unset for
6058 an unanchored pattern, of course. If there's no first char and the pattern was
6059 studied, there may be a bitmap of possible first characters. */
6060
6061 if (!anchored)
6062 {
6063 if ((re->flags & PCRE_FIRSTSET) != 0)
6064 {
6065 first_byte = re->first_byte & 255;
6066 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6067 first_byte = md->lcc[first_byte];
6068 }
6069 else
6070 if (!startline && study != NULL &&
6071 (study->flags & PCRE_STUDY_MAPPED) != 0)
6072 start_bits = study->start_bits;
6073 }
6074
6075 /* For anchored or unanchored matches, there may be a "last known required
6076 character" set. */
6077
6078 if ((re->flags & PCRE_REQCHSET) != 0)
6079 {
6080 req_byte = re->req_byte & 255;
6081 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6082 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6083 }
6084
6085
6086 /* ==========================================================================*/
6087
6088 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6089 the loop runs just once. */
6090
6091 for(;;)
6092 {
6093 USPTR save_end_subject = end_subject;
6094 USPTR new_start_match;
6095
6096 /* Reset the maximum number of extractions we might see. */
6097
6098 if (md->offset_vector != NULL)
6099 {
6100 register int *iptr = md->offset_vector;
6101 register int *iend = iptr + resetcount;
6102 while (iptr < iend) *iptr++ = -1;
6103 }
6104
6105 /* If firstline is TRUE, the start of the match is constrained to the first
6106 line of a multiline string. That is, the match must be before or at the first
6107 newline. Implement this by temporarily adjusting end_subject so that we stop
6108 scanning at a newline. If the match fails at the newline, later code breaks
6109 this loop. */
6110
6111 if (firstline)
6112 {
6113 USPTR t = start_match;
6114 #ifdef SUPPORT_UTF8
6115 if (utf8)
6116 {
6117 while (t < md->end_subject && !IS_NEWLINE(t))
6118 {
6119 t++;
6120 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6121 }
6122 }
6123 else
6124 #endif
6125 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6126 end_subject = t;
6127 }
6128
6129 /* There are some optimizations that avoid running the match if a known
6130 starting point is not found, or if a known later character is not present.
6131 However, there is an option that disables these, for testing and for ensuring
6132 that all callouts do actually occur. The option can be set in the regex by
6133 (*NO_START_OPT) or passed in match-time options. */
6134
6135 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6136 {
6137 /* Advance to a unique first byte if there is one. */
6138
6139 if (first_byte >= 0)
6140 {
6141 if (first_byte_caseless)
6142 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6143 start_match++;
6144 else
6145 while (start_match < end_subject && *start_match != first_byte)
6146 start_match++;
6147 }
6148
6149 /* Or to just after a linebreak for a multiline match */
6150
6151 else if (startline)
6152 {
6153 if (start_match > md->start_subject + start_offset)
6154 {
6155 #ifdef SUPPORT_UTF8
6156 if (utf8)
6157 {
6158 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6159 {
6160 start_match++;
6161 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6162 start_match++;
6163 }
6164 }
6165 else
6166 #endif
6167 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6168 start_match++;
6169
6170 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6171 and we are now at a LF, advance the match position by one more character.
6172 */
6173
6174 if (start_match[-1] == CHAR_CR &&
6175 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6176 start_match < end_subject &&
6177 *start_match == CHAR_NL)
6178 start_match++;
6179 }
6180 }
6181
6182 /* Or to a non-unique first byte after study */
6183
6184 else if (start_bits != NULL)
6185 {
6186 while (start_match < end_subject)
6187 {
6188 register unsigned int c = *start_match;
6189 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6190 {
6191 start_match++;
6192 #ifdef SUPPORT_UTF8
6193 if (utf8)
6194 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6195 start_match++;
6196 #endif
6197 }
6198 else break;
6199 }
6200 }
6201 } /* Starting optimizations */
6202
6203 /* Restore fudged end_subject */
6204
6205 end_subject = save_end_subject;
6206
6207 /* The following two optimizations are disabled for partial matching or if
6208 disabling is explicitly requested. */
6209
6210 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6211 {
6212 /* If the pattern was studied, a minimum subject length may be set. This is
6213 a lower bound; no actual string of that length may actually match the
6214 pattern. Although the value is, strictly, in characters, we treat it as
6215 bytes to avoid spending too much time in this optimization. */
6216
6217 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6218 (pcre_uint32)(end_subject - start_match) < study->minlength)
6219 {
6220 rc = MATCH_NOMATCH;
6221 break;
6222 }
6223
6224 /* If req_byte is set, we know that that character must appear in the
6225 subject for the match to succeed. If the first character is set, req_byte
6226 must be later in the subject; otherwise the test starts at the match point.
6227 This optimization can save a huge amount of backtracking in patterns with
6228 nested unlimited repeats that aren't going to match. Writing separate code
6229 for cased/caseless versions makes it go faster, as does using an
6230 autoincrement and backing off on a match.
6231
6232 HOWEVER: when the subject string is very, very long, searching to its end
6233 can take a long time, and give bad performance on quite ordinary patterns.
6234 This showed up when somebody was matching something like /^\d+C/ on a
6235 32-megabyte string... so we don't do this when the string is sufficiently
6236 long. */
6237
6238 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6239 {
6240 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6241
6242 /* We don't need to repeat the search if we haven't yet reached the
6243 place we found it at last time. */
6244
6245 if (p > req_byte_ptr)
6246 {
6247 if (req_byte_caseless)
6248 {
6249 while (p < end_subject)
6250 {
6251 register int pp = *p++;
6252 if (pp == req_byte || pp == req_byte2) { p--; break; }
6253 }
6254 }
6255 else
6256 {
6257 while (p < end_subject)
6258 {
6259 if (*p++ == req_byte) { p--; break; }
6260 }
6261 }
6262
6263 /* If we can't find the required character, break the matching loop,
6264 forcing a match failure. */
6265
6266 if (p >= end_subject)
6267 {
6268 rc = MATCH_NOMATCH;
6269 break;
6270 }
6271
6272 /* If we have found the required character, save the point where we
6273 found it, so that we don't search again next time round the loop if
6274 the start hasn't passed this character yet. */
6275
6276 req_byte_ptr = p;
6277 }
6278 }
6279 }
6280
6281 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6282 printf(">>>> Match against: ");
6283 pchars(start_match, end_subject - start_match, TRUE, md);
6284 printf("\n");
6285 #endif
6286
6287 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6288 first starting point for which a partial match was found. */
6289
6290 md->start_match_ptr = start_match;
6291 md->start_used_ptr = start_match;
6292 md->match_call_count = 0;
6293 md->match_function_type = 0;
6294 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6295 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6296
6297 switch(rc)
6298 {
6299 /* SKIP passes back the next starting point explicitly, but if it is the
6300 same as the match we have just done, treat it as NOMATCH. */
6301
6302 case MATCH_SKIP:
6303 if (md->start_match_ptr != start_match)
6304 {
6305 new_start_match = md->start_match_ptr;
6306 break;
6307 }
6308 /* Fall through */
6309
6310 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6311 the SKIP's arg was not found. We also treat this as NOMATCH. */
6312
6313 case MATCH_SKIP_ARG:
6314 /* Fall through */
6315
6316 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6317 exactly like PRUNE. */
6318
6319 case MATCH_NOMATCH:
6320 case MATCH_PRUNE:
6321 case MATCH_THEN:
6322 new_start_match = start_match + 1;
6323 #ifdef SUPPORT_UTF8
6324 if (utf8)
6325 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6326 new_start_match++;
6327 #endif
6328 break;
6329
6330 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6331
6332 case MATCH_COMMIT:
6333 rc = MATCH_NOMATCH;
6334 goto ENDLOOP;
6335
6336 /* Any other return is either a match, or some kind of error. */
6337
6338 default:
6339 goto ENDLOOP;
6340 }
6341
6342 /* Control reaches here for the various types of "no match at this point"
6343 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6344
6345 rc = MATCH_NOMATCH;
6346
6347 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6348 newline in the subject (though it may continue over the newline). Therefore,
6349 if we have just failed to match, starting at a newline, do not continue. */
6350
6351 if (firstline && IS_NEWLINE(start_match)) break;
6352
6353 /* Advance to new matching position */
6354
6355 start_match = new_start_match;
6356
6357 /* Break the loop if the pattern is anchored or if we have passed the end of
6358 the subject. */
6359
6360 if (anchored || start_match > end_subject) break;
6361
6362 /* If we have just passed a CR and we are now at a LF, and the pattern does
6363 not contain any explicit matches for \r or \n, and the newline option is CRLF
6364 or ANY or ANYCRLF, advance the match position by one more character. */
6365
6366 if (start_match[-1] == CHAR_CR &&
6367 start_match < end_subject &&
6368 *start_match == CHAR_NL &&
6369 (re->flags & PCRE_HASCRORLF) == 0 &&
6370 (md->nltype == NLTYPE_ANY ||
6371 md->nltype == NLTYPE_ANYCRLF ||
6372 md->nllen == 2))
6373 start_match++;
6374
6375 md->mark = NULL; /* Reset for start of next match attempt */
6376 } /* End of for(;;) "bumpalong" loop */
6377
6378 /* ==========================================================================*/
6379
6380 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6381 conditions is true:
6382
6383 (1) The pattern is anchored or the match was failed by (*COMMIT);
6384
6385 (2) We are past the end of the subject;
6386
6387 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6388 this option requests that a match occur at or before the first newline in
6389 the subject.
6390
6391 When we have a match and the offset vector is big enough to deal with any
6392 backreferences, captured substring offsets will already be set up. In the case
6393 where we had to get some local store to hold offsets for backreference
6394 processing, copy those that we can. In this case there need not be overflow if
6395 certain parts of the pattern were not used, even though there are more
6396 capturing parentheses than vector slots. */
6397
6398 ENDLOOP:
6399
6400 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6401 {
6402 if (using_temporary_offsets)
6403 {
6404 if (offsetcount >= 4)
6405 {
6406 memcpy(offsets + 2, md->offset_vector + 2,
6407 (offsetcount - 2) * sizeof(int));
6408 DPRINTF(("Copied offsets from temporary memory\n"));
6409 }
6410 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6411 DPRINTF(("Freeing temporary memory\n"));
6412 (pcre_free)(md->offset_vector);
6413 }
6414
6415 /* Set the return code to the number of captured strings, or 0 if there are
6416 too many to fit into the vector. */
6417
6418 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6419
6420 /* If there is space, set up the whole thing as substring 0. The value of
6421 md->start_match_ptr might be modified if \K was encountered on the
6422 successful matching path. */
6423
6424 if (offsetcount < 2) rc = 0; else
6425 {
6426 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6427 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6428 }
6429
6430 DPRINTF((">>>> returning %d\n", rc));
6431 goto RETURN_MARK;
6432 }
6433
6434 /* Control gets here if there has been an error, or if the overall match
6435 attempt has failed at all permitted starting positions. */
6436
6437 if (using_temporary_offsets)
6438 {
6439 DPRINTF(("Freeing temporary memory\n"));
6440 (pcre_free)(md->offset_vector);
6441 }
6442
6443 /* For anything other than nomatch or partial match, just return the code. */
6444
6445 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6446 {
6447 DPRINTF((">>>> error: returning %d\n", rc));
6448 return rc;
6449 }
6450
6451 /* Handle partial matches - disable any mark data */
6452
6453 if (start_partial != NULL)
6454 {
6455 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6456 md->mark = NULL;
6457 if (offsetcount > 1)
6458 {
6459 offsets[0] = (int)(start_partial - (USPTR)subject);
6460 offsets[1] = (int)(end_subject - (USPTR)subject);
6461 }
6462 rc = PCRE_ERROR_PARTIAL;
6463 }
6464
6465 /* This is the classic nomatch case */
6466
6467 else
6468 {
6469 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6470 rc = PCRE_ERROR_NOMATCH;
6471 }
6472
6473 /* Return the MARK data if it has been requested. */
6474
6475 RETURN_MARK:
6476
6477 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6478 *(extra_data->mark) = (unsigned char *)(md->mark);
6479 return rc;
6480 }
6481
6482 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5