/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 610 - (show annotations)
Tue Jun 28 15:58:34 2011 UTC (8 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 194372 byte(s)
Fixed newly introduced bug for patterns like /(?:(b))++/, where the capturing 
was happening, but not setting the correct return code.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group; match() records eptr on the eptrb chain and then resets the flag to 0 */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1 /* Matched */
71 #define MATCH_NOMATCH 0 /* Failed to match */
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999) /* NOTE(review): producer is outside this view; presumably (*ACCEPT) - confirm */
77 #define MATCH_COMMIT (-998) /* From OP_COMMIT when what follows it fails */
78 #define MATCH_KETRPOS (-997) /* From OP_KETRPOS: ends one pass of a possessive repeated group */
79 #define MATCH_PRUNE (-996) /* From OP_PRUNE and OP_PRUNE_ARG when what follows fails */
80 #define MATCH_SKIP (-995) /* From OP_SKIP; md->start_match_ptr holds the subject position */
81 #define MATCH_SKIP_ARG (-994) /* From OP_SKIP_ARG; md->start_match_ptr holds the skip name */
82 #define MATCH_THEN (-993) /* From OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the start of the current branch */
83
84 /* This is a convenience macro for code that occurs many times. */
85
86 #define MRRETURN(ra) /* expands to code that uses md and markptr, so both must be in scope */ \
87 { \
88 md->mark = markptr; /* pass the current mark name back through the match data */ \
89 RRETURN(ra); /* then return ra exactly as RRETURN does */ \
90 }
91
92 /* Maximum number of ints of offset to save on the stack for recursive calls.
93 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94 because the offset vector is always a multiple of 3 long. */
95
96 #define REC_STACK_SAVE_MAX 30 /* Dimension of stacksave[] in match(); 30 ints is 10 groups of 3 */
97
98 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99
100 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* Entry i pairs with rep_max[i]: (0,inf) twice, (1,inf) twice, (0,1) twice */
101 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* NOTE(review): presumably ordered as *, *?, +, +?, ?, ?? - the indexing code is outside this view */
102
103
104
105 #ifdef PCRE_DEBUG
106 /*************************************************
107 * Debugging function to print chars *
108 *************************************************/
109
110 /* Print a sequence of chars in printable format, stopping at the end of the
111 subject if requested.
112
113 Arguments:
114 p points to characters
115 length number to print
116 is_subject TRUE if printing from within md->start_subject
117 md pointer to matching data block, if is_subject is TRUE
118
119 Returns: nothing
120 */
121
122 static void
123 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124 {
125 unsigned int c;
126 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127 while (length-- > 0)
128 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129 }
130 #endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
270 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271 below must be updated in sync. */
272
273 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, /* one name per RMATCH call site, passed as its last ("rw") argument */
274 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 RM61, RM62, RM63}; /* the heap version of RMATCH stores the value in Xwhere and defines the label L_<name> */
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
286 #define REGISTER register /* real recursion: match()'s first two arguments are declared "register" */
287
288 #ifdef PCRE_DEBUG
289 #define RMATCH(ra,rb,rc,rd,re,rw) /* ra=eptr, rb=ecode, rc=offset_top, rd=md, re=eptrb; rw is ignored here */ \
290 { \
291 printf("match() called in line %d\n", __LINE__); \
292 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); /* mstart, markptr and rdepth are taken from the caller's scope */ \
293 printf("to line %d\n", __LINE__); \
294 }
295 #define RRETURN(ra) \
296 { \
297 printf("match() returned %d from line %d ", ra, __LINE__); /* note: ra is expanded twice in this version */ \
298 return ra; \
299 }
300 #else /* production versions: a plain recursive call and a plain return */
301 #define RMATCH(ra,rb,rc,rd,re,rw) \
302 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 #define RRETURN(ra) return ra
304 #endif /* PCRE_DEBUG */
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. Note that
310 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311 argument of match(), which never changes. */
312
313 #define REGISTER /* empty: no "register" qualifier in the heap-frame build */
314
315 #define RMATCH(ra,rb,rc,rd,re,rw) /* ra=eptr, rb=ecode, rc=offset_top, re=eptrb, rw=RMn resume point; rd is ignored */\
316 {\
317 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 frame->Xwhere = rw; /* where the current frame resumes once the new one returns */ \
320 newframe->Xeptr = ra;\
321 newframe->Xecode = rb;\
322 newframe->Xmstart = mstart;\
323 newframe->Xmarkptr = markptr;\
324 newframe->Xoffset_top = rc;\
325 newframe->Xeptrb = re;\
326 newframe->Xrdepth = frame->Xrdepth + 1;\
327 newframe->Xprevframe = frame; /* link back to the calling frame */\
328 frame = newframe; /* only the argument fields are filled in; the local-variable fields are left unset */\
329 DPRINTF(("restarting from line %d\n", __LINE__));\
330 goto HEAP_RECURSE;\
331 L_##rw: /* resume label; the HEAP_RETURN code that jumps here is outside this view */\
332 DPRINTF(("jumped back to line %d\n", __LINE__));\
333 }
334
335 #define RRETURN(ra) /* pop and free the current frame; return for real only from the top-level frame */\
336 {\
337 heapframe *oldframe = frame; /* frame is dereferenced on the next line, so it must not be NULL here */\
338 frame = oldframe->Xprevframe; /* NOTE(review): match() invokes RRETURN straight after its first allocation fails, i.e. with frame == NULL - confirm */\
339 (pcre_stack_free)(oldframe);\
340 if (frame != NULL)\
341 {\
342 rrc = ra; /* the result is handed to the calling frame in rrc */\
343 goto HEAP_RETURN;\
344 }\
345 return ra; /* Xprevframe was NULL: this was the top-level frame */\
346 }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
351 typedef struct heapframe { /* one per simulated match() call when NO_RECURSE is defined */
352 struct heapframe *Xprevframe; /* calling frame; NULL marks the top level */
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr; /* each X<name> field is reached in match() through "#define <name> frame->X<name>" */
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth; /* RMATCH sets this to the caller's Xrdepth + 1 */
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word; /* also used under the name allow_zero */
379 BOOL Xcondition; /* also used under the names cbegroup and condassert */
380 BOOL Xprev_is_word; /* also used under the name matched_once */
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink; /* also used under the name code_offset */
394 int Xctype;
395 unsigned int Xfc; /* fc and fi are separate fields here; with real recursion they alias c and i */
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb; /* storage for the eptrb chain entry added for a MATCH_CBEGROUP call */
408
409 /* Where to jump back to */
410
411 int Xwhere; /* RMn value written by RMATCH just before it switches to a new frame */
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past the start of the subject (i.e.
434 something has been matched). For hard partial matching, we then return
435 immediately. The second one is used when we already know we are past the end of
436 the subject. */
437
438 #define CHECK_PARTIAL() /* expands to a bare "if" statement that uses md, eptr and markptr from the caller's scope */\
439 if (md->partial != 0 && eptr >= md->end_subject && /* partial matching is on and eptr is at or past the end */ \
440 eptr > md->start_used_ptr) /* and eptr has moved beyond md->start_used_ptr */ \
441 { \
442 md->hitend = TRUE; \
443 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); /* partial > 1 is the "hard" case: return at once */ \
444 }
445
446 #define SCHECK_PARTIAL() /* as CHECK_PARTIAL, but without the end-of-subject test: the caller already knows */\
447 if (md->partial != 0 && eptr > md->start_used_ptr) \
448 { \
449 md->hitend = TRUE; \
450 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); /* partial > 1 is the "hard" case: return at once */ \
451 }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. If we fail to match, we need to restore this value and also the
812 values of the final offsets, in case they were set by a previous iteration
813 of the same bracket.
814
815 If there isn't enough space in the offset vector, treat this as if it were
816 a non-capturing bracket. Don't worry about setting the flag for the error
817 case here; that is handled in the code for KET. */
818
819 case OP_CBRA:
820 case OP_SCBRA:
821 number = GET2(ecode, 1+LINK_SIZE);
822 offset = number << 1;
823
824 #ifdef PCRE_DEBUG
825 printf("start bracket %d\n", number);
826 printf("subject=");
827 pchars(eptr, 16, TRUE, md);
828 printf("\n");
829 #endif
830
831 if (offset < md->offset_max)
832 {
833 save_offset1 = md->offset_vector[offset];
834 save_offset2 = md->offset_vector[offset+1];
835 save_offset3 = md->offset_vector[md->offset_end - number];
836 save_capture_last = md->capture_last;
837
838 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 md->offset_vector[md->offset_end - number] =
840 (int)(eptr - md->start_subject);
841
842 for (;;)
843 {
844 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846 eptrb, RM1);
847 if (rrc != MATCH_NOMATCH &&
848 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849 RRETURN(rrc);
850 md->capture_last = save_capture_last;
851 ecode += GET(ecode, 1);
852 if (*ecode != OP_ALT) break;
853 }
854
855 DPRINTF(("bracket %d failed\n", number));
856
857 md->offset_vector[offset] = save_offset1;
858 md->offset_vector[offset+1] = save_offset2;
859 md->offset_vector[md->offset_end - number] = save_offset3;
860
861 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
862 RRETURN(MATCH_NOMATCH);
863 }
864
865 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866 as a non-capturing bracket. */
867
868 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870
871 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877 for all the alternatives. When we get to the final alternative within the
878 brackets, we used to return the result of a recursive call to match()
879 whatever happened so it was possible to reduce stack usage by turning this
880 into a tail recursion, except in the case of a possibly empty group.
881 However, now that there is the possibility of (*THEN) occurring in the final
882 alternative, this optimization is no longer possible. */
883
884 case OP_BRA:
885 case OP_SBRA:
886 DPRINTF(("start non-capturing bracket\n"));
887 for (;;)
888 {
889 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
890 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
891 RM2);
892 if (rrc != MATCH_NOMATCH &&
893 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
894 RRETURN(rrc);
895 ecode += GET(ecode, 1);
896 if (*ecode != OP_ALT) break;
897 }
898
899 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
900 RRETURN(MATCH_NOMATCH);
901
902 /* Handle possessive capturing brackets with an unlimited repeat. We come
903 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
904 handled similarly to the normal case above. However, the matching is
905 different. The end of these brackets will always be OP_KETRPOS, which
906 returns MATCH_KETRPOS without going further in the pattern. By this means
907 we can handle the group by iteration rather than recursion, thereby
908 reducing the amount of stack needed. */
909
910 case OP_CBRAPOS:
911 case OP_SCBRAPOS:
912 allow_zero = FALSE;
913
914 POSSESSIVE_CAPTURE:
915 number = GET2(ecode, 1+LINK_SIZE);
916 offset = number << 1;
917
918 #ifdef PCRE_DEBUG
919 printf("start possessive bracket %d\n", number);
920 printf("subject=");
921 pchars(eptr, 16, TRUE, md);
922 printf("\n");
923 #endif
924
925 if (offset < md->offset_max)
926 {
927 matched_once = FALSE;
928 code_offset = ecode - md->start_code;
929
930 save_offset1 = md->offset_vector[offset];
931 save_offset2 = md->offset_vector[offset+1];
932 save_offset3 = md->offset_vector[md->offset_end - number];
933 save_capture_last = md->capture_last;
934
935 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
936
937 /* Each time round the loop, save the current subject position for use
938 when the group matches. For MATCH_MATCH, the group has matched, so we
939 restart it with a new subject starting position, remembering that we had
940 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
941 usual. If we haven't matched any alternatives in any iteration, check to
942 see if a previous iteration matched. If so, the group has matched;
943 continue from afterwards. Otherwise it has failed; restore the previous
944 capture values before returning NOMATCH. */
945
946 for (;;)
947 {
948 md->offset_vector[md->offset_end - number] =
949 (int)(eptr - md->start_subject);
950 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
951 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
952 eptrb, RM63);
953 if (rrc == MATCH_KETRPOS)
954 {
955 offset_top = md->end_offset_top;
956 eptr = md->end_match_ptr;
957 ecode = md->start_code + code_offset;
958 save_capture_last = md->capture_last;
959 matched_once = TRUE;
960 continue;
961 }
962 if (rrc != MATCH_NOMATCH &&
963 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
964 RRETURN(rrc);
965 md->capture_last = save_capture_last;
966 ecode += GET(ecode, 1);
967 if (*ecode != OP_ALT) break;
968 }
969
970 if (!matched_once)
971 {
972 md->offset_vector[offset] = save_offset1;
973 md->offset_vector[offset+1] = save_offset2;
974 md->offset_vector[md->offset_end - number] = save_offset3;
975 }
976
977 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
978 if (allow_zero || matched_once)
979 {
980 ecode += 1 + LINK_SIZE;
981 break;
982 }
983
984 RRETURN(MATCH_NOMATCH);
985 }
986
987 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
988 as a non-capturing bracket. */
989
990 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
991 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
992
993 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
994
995 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
997
998 /* Non-capturing possessive bracket with unlimited repeat. We come here
999 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1000 without the capturing complication. It is written out separately for speed
1001 and cleanliness. */
1002
1003 case OP_BRAPOS:
1004 case OP_SBRAPOS:
1005 allow_zero = FALSE;
1006
1007 POSSESSIVE_NON_CAPTURE:
1008 matched_once = FALSE;
1009 code_offset = ecode - md->start_code;
1010
1011 for (;;)
1012 {
1013 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1014 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1015 eptrb, RM48);
1016 if (rrc == MATCH_KETRPOS)
1017 {
1018 offset_top = md->end_offset_top;
1019 eptr = md->end_match_ptr;
1020 ecode = md->start_code + code_offset;
1021 matched_once = TRUE;
1022 continue;
1023 }
1024 if (rrc != MATCH_NOMATCH &&
1025 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1026 RRETURN(rrc);
1027 ecode += GET(ecode, 1);
1028 if (*ecode != OP_ALT) break;
1029 }
1030
1031 if (matched_once || allow_zero)
1032 {
1033 ecode += 1 + LINK_SIZE;
1034 break;
1035 }
1036 RRETURN(MATCH_NOMATCH);
1037
1038 /* Control never reaches here. */
1039
1040 /* Conditional group: compilation checked that there are no more than
1041 two branches. If the condition is false, skipping the first branch takes us
1042 past the end if there is only one branch, but that's OK because that is
1043 exactly what going to the ket would do. */
1044
1045 case OP_COND:
1046 case OP_SCOND:
1047 codelink = GET(ecode, 1);
1048
1049 /* Because of the way auto-callout works during compile, a callout item is
1050 inserted between OP_COND and an assertion condition. */
1051
1052 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1053 {
1054 if (pcre_callout != NULL)
1055 {
1056 pcre_callout_block cb;
1057 cb.version = 1; /* Version 1 of the callout block */
1058 cb.callout_number = ecode[LINK_SIZE+2];
1059 cb.offset_vector = md->offset_vector;
1060 cb.subject = (PCRE_SPTR)md->start_subject;
1061 cb.subject_length = (int)(md->end_subject - md->start_subject);
1062 cb.start_match = (int)(mstart - md->start_subject);
1063 cb.current_position = (int)(eptr - md->start_subject);
1064 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1065 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1066 cb.capture_top = offset_top/2;
1067 cb.capture_last = md->capture_last;
1068 cb.callout_data = md->callout_data;
1069 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1070 if (rrc < 0) RRETURN(rrc);
1071 }
1072 ecode += _pcre_OP_lengths[OP_CALLOUT];
1073 }
1074
1075 condcode = ecode[LINK_SIZE+1];
1076
1077 /* Now see what the actual condition is */
1078
1079 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1080 {
1081 if (md->recursive == NULL) /* Not recursing => FALSE */
1082 {
1083 condition = FALSE;
1084 ecode += GET(ecode, 1);
1085 }
1086 else
1087 {
1088 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1089 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1090
1091 /* If the test is for recursion into a specific subpattern, and it is
1092 false, but the test was set up by name, scan the table to see if the
1093 name refers to any other numbers, and test them. The condition is true
1094 if any one is set. */
1095
1096 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1097 {
1098 uschar *slotA = md->name_table;
1099 for (i = 0; i < md->name_count; i++)
1100 {
1101 if (GET2(slotA, 0) == recno) break;
1102 slotA += md->name_entry_size;
1103 }
1104
1105 /* Found a name for the number - there can be only one; duplicate
1106 names for different numbers are allowed, but not vice versa. First
1107 scan down for duplicates. */
1108
1109 if (i < md->name_count)
1110 {
1111 uschar *slotB = slotA;
1112 while (slotB > md->name_table)
1113 {
1114 slotB -= md->name_entry_size;
1115 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1116 {
1117 condition = GET2(slotB, 0) == md->recursive->group_num;
1118 if (condition) break;
1119 }
1120 else break;
1121 }
1122
1123 /* Scan up for duplicates */
1124
1125 if (!condition)
1126 {
1127 slotB = slotA;
1128 for (i++; i < md->name_count; i++)
1129 {
1130 slotB += md->name_entry_size;
1131 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132 {
1133 condition = GET2(slotB, 0) == md->recursive->group_num;
1134 if (condition) break;
1135 }
1136 else break;
1137 }
1138 }
1139 }
1140 }
1141
1142 /* Choose branch according to the condition */
1143
1144 ecode += condition? 3 : GET(ecode, 1);
1145 }
1146 }
1147
1148 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1149 {
1150 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1151 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1152
1153 /* If the numbered capture is unset, but the reference was by name,
1154 scan the table to see if the name refers to any other numbers, and test
1155 them. The condition is true if any one is set. This is tediously similar
1156 to the code above, but not close enough to try to amalgamate. */
1157
1158 if (!condition && condcode == OP_NCREF)
1159 {
1160 int refno = offset >> 1;
1161 uschar *slotA = md->name_table;
1162
1163 for (i = 0; i < md->name_count; i++)
1164 {
1165 if (GET2(slotA, 0) == refno) break;
1166 slotA += md->name_entry_size;
1167 }
1168
1169 /* Found a name for the number - there can be only one; duplicate names
1170 for different numbers are allowed, but not vice versa. First scan down
1171 for duplicates. */
1172
1173 if (i < md->name_count)
1174 {
1175 uschar *slotB = slotA;
1176 while (slotB > md->name_table)
1177 {
1178 slotB -= md->name_entry_size;
1179 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1180 {
1181 offset = GET2(slotB, 0) << 1;
1182 condition = offset < offset_top &&
1183 md->offset_vector[offset] >= 0;
1184 if (condition) break;
1185 }
1186 else break;
1187 }
1188
1189 /* Scan up for duplicates */
1190
1191 if (!condition)
1192 {
1193 slotB = slotA;
1194 for (i++; i < md->name_count; i++)
1195 {
1196 slotB += md->name_entry_size;
1197 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1198 {
1199 offset = GET2(slotB, 0) << 1;
1200 condition = offset < offset_top &&
1201 md->offset_vector[offset] >= 0;
1202 if (condition) break;
1203 }
1204 else break;
1205 }
1206 }
1207 }
1208 }
1209
1210 /* Choose branch according to the condition */
1211
1212 ecode += condition? 3 : GET(ecode, 1);
1213 }
1214
1215 else if (condcode == OP_DEF) /* DEFINE - always false */
1216 {
1217 condition = FALSE;
1218 ecode += GET(ecode, 1);
1219 }
1220
1221 /* The condition is an assertion. Call match() to evaluate it - setting
1222 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1223 an assertion. */
1224
1225 else
1226 {
1227 md->match_function_type = MATCH_CONDASSERT;
1228 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1229 if (rrc == MATCH_MATCH)
1230 {
1231 condition = TRUE;
1232 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1233 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1234 }
1235 else if (rrc != MATCH_NOMATCH &&
1236 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1237 {
1238 RRETURN(rrc); /* Need braces because of following else */
1239 }
1240 else
1241 {
1242 condition = FALSE;
1243 ecode += codelink;
1244 }
1245 }
1246
1247 /* We are now at the branch that is to be obeyed. As there is only one,
1248 we used to use tail recursion to avoid using another stack frame, except
1249 when there was unlimited repeat of a possibly empty group. However, that
1250 strategy no longer works because of the possibility of (*THEN) being
1251 encountered in the branch. A recursive call to match() is always required,
1252 unless the second alternative doesn't exist, in which case we can just
1253 plough on. */
1254
1255 if (condition || *ecode == OP_ALT)
1256 {
1257 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1258 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1259 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1260 rrc = MATCH_NOMATCH;
1261 RRETURN(rrc);
1262 }
1263 else /* Condition false & no alternative */
1264 {
1265 ecode += 1 + LINK_SIZE;
1266 }
1267 break;
1268
1269
1270 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1271 to close any currently open capturing brackets. */
1272
1273 case OP_CLOSE:
1274 number = GET2(ecode, 1);
1275 offset = number << 1;
1276
1277 #ifdef PCRE_DEBUG
1278 printf("end bracket %d at *ACCEPT", number);
1279 printf("\n");
1280 #endif
1281
1282 md->capture_last = number;
1283 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1284 {
1285 md->offset_vector[offset] =
1286 md->offset_vector[md->offset_end - number];
1287 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1288 if (offset_top <= offset) offset_top = offset + 2;
1289 }
1290 ecode += 3;
1291 break;
1292
1293
1294 /* End of the pattern, either real or forced. If we are in a recursion, we
1295 should restore the offsets appropriately, and if it's a top-level
1296 recursion, continue from after the call. */
1297
1298 case OP_ACCEPT:
1299 case OP_END:
1300 if (md->recursive != NULL)
1301 {
1302 recursion_info *rec = md->recursive;
1303 md->recursive = rec->prevrec;
1304 memmove(md->offset_vector, rec->offset_save,
1305 rec->saved_max * sizeof(int));
1306 offset_top = rec->save_offset_top;
1307 if (rec->group_num == 0)
1308 {
1309 ecode = rec->after_call;
1310 break;
1311 }
1312 }
1313
1314 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1315 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1316 the subject. In both cases, backtracking will then try other alternatives,
1317 if any. */
1318
1319 else if (eptr == mstart &&
1320 (md->notempty ||
1321 (md->notempty_atstart &&
1322 mstart == md->start_subject + md->start_offset)))
1323 MRRETURN(MATCH_NOMATCH);
1324
1325 /* Otherwise, we have a match. */
1326
1327 md->end_match_ptr = eptr; /* Record where we ended */
1328 md->end_offset_top = offset_top; /* and how many extracts were taken */
1329 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1330
1331 /* For some reason, the macros don't work properly if an expression is
1332 given as the argument to MRRETURN when the heap is in use. */
1333
1334 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1335 MRRETURN(rrc);
1336
1337 /* Assertion brackets. Check the alternative branches in turn - the
1338 matching won't pass the KET for an assertion. If any one branch matches,
1339 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1340 start of each branch to move the current point backwards, so the code at
1341 this level is identical to the lookahead case. When the assertion is part
1342 of a condition, we want to return immediately afterwards. The caller of
1343 this incarnation of the match() function will have set MATCH_CONDASSERT in
1344 md->match_function_type, and one of these opcodes will be the first opcode
1345 that is processed. We use a local variable that is preserved over calls to
1346 match() to remember this case. */
1347
1348 case OP_ASSERT:
1349 case OP_ASSERTBACK:
1350 if (md->match_function_type == MATCH_CONDASSERT)
1351 {
1352 condassert = TRUE;
1353 md->match_function_type = 0;
1354 }
1355 else condassert = FALSE;
1356
1357 do
1358 {
1359 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1360 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1361 {
1362 mstart = md->start_match_ptr; /* In case \K reset it */
1363 break;
1364 }
1365 if (rrc != MATCH_NOMATCH &&
1366 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1367 RRETURN(rrc);
1368 ecode += GET(ecode, 1);
1369 }
1370 while (*ecode == OP_ALT);
1371
1372 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1373
1374 /* If checking an assertion for a condition, return MATCH_MATCH. */
1375
1376 if (condassert) RRETURN(MATCH_MATCH);
1377
1378 /* Continue from after the assertion, updating the offsets high water
1379 mark, since extracts may have been taken during the assertion. */
1380
1381 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1382 ecode += 1 + LINK_SIZE;
1383 offset_top = md->end_offset_top;
1384 continue;
1385
1386 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1387 PRUNE, or COMMIT means we must assume failure without checking subsequent
1388 branches. */
1389
1390 case OP_ASSERT_NOT:
1391 case OP_ASSERTBACK_NOT:
1392 if (md->match_function_type == MATCH_CONDASSERT)
1393 {
1394 condassert = TRUE;
1395 md->match_function_type = 0;
1396 }
1397 else condassert = FALSE;
1398
1399 do
1400 {
1401 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1402 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1403 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1404 {
1405 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1406 break;
1407 }
1408 if (rrc != MATCH_NOMATCH &&
1409 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1410 RRETURN(rrc);
1411 ecode += GET(ecode,1);
1412 }
1413 while (*ecode == OP_ALT);
1414
1415 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1416
1417 ecode += 1 + LINK_SIZE;
1418 continue;
1419
1420 /* Move the subject pointer back. This occurs only at the start of
1421 each branch of a lookbehind assertion. If we are too close to the start to
1422 move back, this match function fails. When working with UTF-8 we move
1423 back a number of characters, not bytes. */
1424
1425 case OP_REVERSE:
1426 #ifdef SUPPORT_UTF8
1427 if (utf8)
1428 {
1429 i = GET(ecode, 1);
1430 while (i-- > 0)
1431 {
1432 eptr--;
1433 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1434 BACKCHAR(eptr);
1435 }
1436 }
1437 else
1438 #endif
1439
1440 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1441
1442 {
1443 eptr -= GET(ecode, 1);
1444 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1445 }
1446
1447 /* Save the earliest consulted character, then skip to next op code */
1448
1449 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1450 ecode += 1 + LINK_SIZE;
1451 break;
1452
1453 /* The callout item calls an external function, if one is provided, passing
1454 details of the match so far. This is mainly for debugging, though the
1455 function is able to force a failure. */
1456
1457 case OP_CALLOUT:
1458 if (pcre_callout != NULL)
1459 {
1460 pcre_callout_block cb;
1461 cb.version = 1; /* Version 1 of the callout block */
1462 cb.callout_number = ecode[1];
1463 cb.offset_vector = md->offset_vector;
1464 cb.subject = (PCRE_SPTR)md->start_subject;
1465 cb.subject_length = (int)(md->end_subject - md->start_subject);
1466 cb.start_match = (int)(mstart - md->start_subject);
1467 cb.current_position = (int)(eptr - md->start_subject);
1468 cb.pattern_position = GET(ecode, 2);
1469 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1470 cb.capture_top = offset_top/2;
1471 cb.capture_last = md->capture_last;
1472 cb.callout_data = md->callout_data;
1473 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1474 if (rrc < 0) RRETURN(rrc);
1475 }
1476 ecode += 2 + 2*LINK_SIZE;
1477 break;
1478
1479 /* Recursion either matches the current regex, or some subexpression. The
1480 offset data is the offset to the starting bracket from the start of the
1481 whole pattern. (This is so that it works from duplicated subpatterns.)
1482
1483 If there are any capturing brackets started but not finished, we have to
1484 save their starting points and reinstate them after the recursion. However,
1485 we don't know how many such there are (offset_top records the completed
1486 total) so we just have to save all the potential data. There may be up to
1487 65535 such values, which is too large to put on the stack, but using malloc
1488 for small numbers seems expensive. As a compromise, the stack is used when
1489 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1490 is used. If the malloc fails, the match is abandoned at once and
1491 PCRE_ERROR_NOMEMORY is returned; no attempt is made to carry on with only
1492 part of the offset data saved.
1493
1494 There are also other values that have to be saved. We use a chained
1495 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1496 for the original version of this logic. */
1497
1498 case OP_RECURSE:
1499 {
1500 callpat = md->start_code + GET(ecode, 1);
1501 new_recursive.group_num = (callpat == md->start_code)? 0 :
1502 GET2(callpat, 1 + LINK_SIZE);
1503
1504 /* Add to "recursing stack" */
1505
1506 new_recursive.prevrec = md->recursive;
1507 md->recursive = &new_recursive;
1508
1509 /* Find where to continue from afterwards */
1510
1511 ecode += 1 + LINK_SIZE;
1512 new_recursive.after_call = ecode;
1513
1514 /* Now save the offset data. */
1515
1516 new_recursive.saved_max = md->offset_end;
1517 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1518 new_recursive.offset_save = stacksave;
1519 else
1520 {
1521 new_recursive.offset_save =
1522 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1523 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1524 }
1525
1526 memcpy(new_recursive.offset_save, md->offset_vector,
1527 new_recursive.saved_max * sizeof(int));
1528 new_recursive.save_offset_top = offset_top;
1529
1530 /* OK, now we can do the recursion. For each top-level alternative we
1531 restore the offset and recursion data. */
1532
1533 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1534 cbegroup = (*callpat >= OP_SBRA);
1535 do
1536 {
1537 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1538 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1539 md, eptrb, RM6);
1540 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1541 {
1542 DPRINTF(("Recursion matched\n"));
1543 md->recursive = new_recursive.prevrec;
1544 if (new_recursive.offset_save != stacksave)
1545 (pcre_free)(new_recursive.offset_save);
1546 MRRETURN(MATCH_MATCH);
1547 }
1548 else if (rrc != MATCH_NOMATCH &&
1549 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1550 {
1551 DPRINTF(("Recursion gave error %d\n", rrc));
1552 if (new_recursive.offset_save != stacksave)
1553 (pcre_free)(new_recursive.offset_save);
1554 RRETURN(rrc);
1555 }
1556
1557 md->recursive = &new_recursive;
1558 memcpy(md->offset_vector, new_recursive.offset_save,
1559 new_recursive.saved_max * sizeof(int));
1560 callpat += GET(callpat, 1);
1561 }
1562 while (*callpat == OP_ALT);
1563
1564 DPRINTF(("Recursion didn't match\n"));
1565 md->recursive = new_recursive.prevrec;
1566 if (new_recursive.offset_save != stacksave)
1567 (pcre_free)(new_recursive.offset_save);
1568 MRRETURN(MATCH_NOMATCH);
1569 }
1570 /* Control never reaches here */
1571
1572 /* "Once" brackets are like assertion brackets except that after a match,
1573 the point in the subject string is not moved back. Thus there can never be
1574 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1575 Check the alternative branches in turn - the matching won't pass the KET
1576 for this kind of subpattern. If any one branch matches, we carry on as at
1577 the end of a normal bracket, leaving the subject pointer, but resetting
1578 the start-of-match value in case it was changed by \K. */
1579
1580 case OP_ONCE:
1581 prev = ecode;
1582 saved_eptr = eptr;
1583
1584 do
1585 {
1586 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1587 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1588 {
1589 mstart = md->start_match_ptr;
1590 break;
1591 }
1592 if (rrc != MATCH_NOMATCH &&
1593 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1594 RRETURN(rrc);
1595 ecode += GET(ecode,1);
1596 }
1597 while (*ecode == OP_ALT);
1598
1599 /* If hit the end of the group (which could be repeated), fail */
1600
1601 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1602
1603 /* Continue as from after the assertion, updating the offsets high water
1604 mark, since extracts may have been taken. */
1605
1606 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1607
1608 offset_top = md->end_offset_top;
1609 eptr = md->end_match_ptr;
1610
1611 /* For a non-repeating ket, just continue at this level. This also
1612 happens for a repeating ket if no characters were matched in the group.
1613 This is the forcible breaking of infinite loops as implemented in Perl
1614 5.005. If there is an options reset, it will get obeyed in the normal
1615 course of events. */
1616
1617 if (*ecode == OP_KET || eptr == saved_eptr)
1618 {
1619 ecode += 1+LINK_SIZE;
1620 break;
1621 }
1622
1623 /* The repeating kets try the rest of the pattern or restart from the
1624 preceding bracket, in the appropriate order. The second "call" of match()
1625 uses tail recursion, to avoid using another stack frame. */
1626
1627 if (*ecode == OP_KETRMIN)
1628 {
1629 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1630 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1631 ecode = prev;
1632 goto TAIL_RECURSE;
1633 }
1634 else /* OP_KETRMAX */
1635 {
1636 md->match_function_type = MATCH_CBEGROUP;
1637 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1638 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1639 ecode += 1 + LINK_SIZE;
1640 goto TAIL_RECURSE;
1641 }
1642 /* Control never gets here */
1643
1644 /* An alternation is the end of a branch; scan along to find the end of the
1645 bracketed group and go to there. */
1646
1647 case OP_ALT:
1648 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1649 break;
1650
1651 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1652 indicating that it may occur zero times. It may repeat infinitely, or not
1653 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1654 with fixed upper repeat limits are compiled as a number of copies, with the
1655 optional ones preceded by BRAZERO or BRAMINZERO. */
1656
1657 case OP_BRAZERO:
1658 next = ecode + 1;
1659 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1660 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1661 do next += GET(next, 1); while (*next == OP_ALT);
1662 ecode = next + 1 + LINK_SIZE;
1663 break;
1664
1665 case OP_BRAMINZERO:
1666 next = ecode + 1;
1667 do next += GET(next, 1); while (*next == OP_ALT);
1668 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1670 ecode++;
1671 break;
1672
1673 case OP_SKIPZERO:
1674 next = ecode+1;
1675 do next += GET(next,1); while (*next == OP_ALT);
1676 ecode = next + 1 + LINK_SIZE;
1677 break;
1678
1679 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1680 here; just jump to the group, with allow_zero set TRUE. */
1681
1682 case OP_BRAPOSZERO:
1683 op = *(++ecode);
1684 allow_zero = TRUE;
1685 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1686 goto POSSESSIVE_NON_CAPTURE;
1687
1688 /* End of a group, repeated or non-repeating. */
1689
1690 case OP_KET:
1691 case OP_KETRMIN:
1692 case OP_KETRMAX:
1693 case OP_KETRPOS:
1694 prev = ecode - GET(ecode, 1);
1695
1696 /* If this was a group that remembered the subject start, in order to break
1697 infinite repeats of empty string matches, retrieve the subject start from
1698 the chain. Otherwise, set it NULL. */
1699
1700 if (*prev >= OP_SBRA)
1701 {
1702 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1703 eptrb = eptrb->epb_prev; /* Backup to previous group */
1704 }
1705 else saved_eptr = NULL;
1706
1707 /* If we are at the end of an assertion group or an atomic group, stop
1708 matching and return MATCH_MATCH, but record the current high water mark for
1709 use by positive assertions. We also need to record the match start in case
1710 it was changed by \K. */
1711
1712 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1713 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1714 *prev == OP_ONCE)
1715 {
1716 md->end_match_ptr = eptr; /* For ONCE */
1717 md->end_offset_top = offset_top;
1718 md->start_match_ptr = mstart;
1719 MRRETURN(MATCH_MATCH);
1720 }
1721
1722 /* For capturing groups we have to check the group number back at the start
1723 and if necessary complete handling an extraction by setting the offsets and
1724 bumping the high water mark. Note that whole-pattern recursion is coded as
1725 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1726 when the OP_END is reached. Other recursion is handled here. */
1727
1728 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1729 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1730 {
1731 number = GET2(prev, 1+LINK_SIZE);
1732 offset = number << 1;
1733
1734 #ifdef PCRE_DEBUG
1735 printf("end bracket %d", number);
1736 printf("\n");
1737 #endif
1738
1739 md->capture_last = number;
1740 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1741 {
1742 md->offset_vector[offset] =
1743 md->offset_vector[md->offset_end - number];
1744 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1745 if (offset_top <= offset) offset_top = offset + 2;
1746 }
1747
1748 /* Handle a recursively called group. Restore the offsets
1749 appropriately and continue from after the call. */
1750
1751 if (md->recursive != NULL && md->recursive->group_num == number)
1752 {
1753 recursion_info *rec = md->recursive;
1754 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1755 md->recursive = rec->prevrec;
1756 memcpy(md->offset_vector, rec->offset_save,
1757 rec->saved_max * sizeof(int));
1758 offset_top = rec->save_offset_top;
1759 ecode = rec->after_call;
1760 break;
1761 }
1762 }
1763
1764 /* For a non-repeating ket, just continue at this level. This also
1765 happens for a repeating ket if no characters were matched in the group.
1766 This is the forcible breaking of infinite loops as implemented in Perl
1767 5.005. If there is an options reset, it will get obeyed in the normal
1768 course of events. */
1769
1770 if (*ecode == OP_KET || eptr == saved_eptr)
1771 {
1772 ecode += 1 + LINK_SIZE;
1773 break;
1774 }
1775
1776 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1777 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1778 at a time from the outer level, thus saving stack. */
1779
1780 if (*ecode == OP_KETRPOS)
1781 {
1782 md->end_match_ptr = eptr;
1783 md->end_offset_top = offset_top;
1784 RRETURN(MATCH_KETRPOS);
1785 }
1786
1787 /* The normal repeating kets try the rest of the pattern or restart from
1788 the preceding bracket, in the appropriate order. In the second case, we can
1789 use tail recursion to avoid using another stack frame, unless we have an
1790 unlimited repeat of a group that can match an empty string. */
1791
1792 if (*ecode == OP_KETRMIN)
1793 {
1794 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1796 if (*prev >= OP_SBRA) /* Could match an empty string */
1797 {
1798 md->match_function_type = MATCH_CBEGROUP;
1799 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1800 RRETURN(rrc);
1801 }
1802 ecode = prev;
1803 goto TAIL_RECURSE;
1804 }
1805 else /* OP_KETRMAX */
1806 {
1807 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1808 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1810 ecode += 1 + LINK_SIZE;
1811 goto TAIL_RECURSE;
1812 }
1813 /* Control never gets here */
1814
1815 /* Not multiline mode: start of subject assertion, unless notbol. */
1816
1817 case OP_CIRC:
1818 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1819
1820 /* Start of subject assertion */
1821
1822 case OP_SOD:
1823 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1824 ecode++;
1825 break;
1826
1827 /* Multiline mode: start of subject unless notbol, or after any newline. */
1828
1829 case OP_CIRCM:
1830 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1831 if (eptr != md->start_subject &&
1832 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1833 MRRETURN(MATCH_NOMATCH);
1834 ecode++;
1835 break;
1836
1837 /* Start of match assertion */
1838
1839 case OP_SOM:
1840 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1841 ecode++;
1842 break;
1843
1844 /* Reset the start of match point */
1845
1846 case OP_SET_SOM:
1847 mstart = eptr;
1848 ecode++;
1849 break;
1850
1851 /* Multiline mode: assert before any newline, or before end of subject
1852 unless noteol is set. */
1853
1854 case OP_DOLLM:
1855 if (eptr < md->end_subject)
1856 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1857 else
1858 {
1859 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1860 SCHECK_PARTIAL();
1861 }
1862 ecode++;
1863 break;
1864
1865 /* Not multiline mode: assert before a terminating newline or before end of
1866 subject unless noteol is set. */
1867
1868 case OP_DOLL:
1869 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1870 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1871
1872 /* ... else fall through for endonly */
1873
1874 /* End of subject assertion (\z) */
1875
1876 case OP_EOD:
1877 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1878 SCHECK_PARTIAL();
1879 ecode++;
1880 break;
1881
1882 /* End of subject or ending \n assertion (\Z) */
1883
1884 case OP_EODN:
1885 ASSERT_NL_OR_EOS:
1886 if (eptr < md->end_subject &&
1887 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1888 MRRETURN(MATCH_NOMATCH);
1889
1890 /* Either at end of string or \n before end. */
1891
1892 SCHECK_PARTIAL();
1893 ecode++;
1894 break;
1895
1896 /* Word boundary assertions */
1897
1898 case OP_NOT_WORD_BOUNDARY:
1899 case OP_WORD_BOUNDARY:
1900 {
1901
1902 /* Find out if the previous and current characters are "word" characters.
1903 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1904 be "non-word" characters. Remember the earliest consulted character for
1905 partial matching. */
1906
1907 #ifdef SUPPORT_UTF8
1908 if (utf8)
1909 {
1910 /* Get status of previous character */
1911
1912 if (eptr == md->start_subject) prev_is_word = FALSE; else
1913 {
1914 USPTR lastptr = eptr - 1;
1915 while((*lastptr & 0xc0) == 0x80) lastptr--;
1916 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1917 GETCHAR(c, lastptr);
1918 #ifdef SUPPORT_UCP
1919 if (md->use_ucp)
1920 {
1921 if (c == '_') prev_is_word = TRUE; else
1922 {
1923 int cat = UCD_CATEGORY(c);
1924 prev_is_word = (cat == ucp_L || cat == ucp_N);
1925 }
1926 }
1927 else
1928 #endif
1929 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1930 }
1931
1932 /* Get status of next character */
1933
1934 if (eptr >= md->end_subject)
1935 {
1936 SCHECK_PARTIAL();
1937 cur_is_word = FALSE;
1938 }
1939 else
1940 {
1941 GETCHAR(c, eptr);
1942 #ifdef SUPPORT_UCP
1943 if (md->use_ucp)
1944 {
1945 if (c == '_') cur_is_word = TRUE; else
1946 {
1947 int cat = UCD_CATEGORY(c);
1948 cur_is_word = (cat == ucp_L || cat == ucp_N);
1949 }
1950 }
1951 else
1952 #endif
1953 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1954 }
1955 }
1956 else
1957 #endif
1958
1959 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1960 consistency with the behaviour of \w we do use it in this case. */
1961
1962 {
1963 /* Get status of previous character */
1964
1965 if (eptr == md->start_subject) prev_is_word = FALSE; else
1966 {
1967 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1968 #ifdef SUPPORT_UCP
1969 if (md->use_ucp)
1970 {
1971 c = eptr[-1];
1972 if (c == '_') prev_is_word = TRUE; else
1973 {
1974 int cat = UCD_CATEGORY(c);
1975 prev_is_word = (cat == ucp_L || cat == ucp_N);
1976 }
1977 }
1978 else
1979 #endif
1980 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1981 }
1982
1983 /* Get status of next character */
1984
1985 if (eptr >= md->end_subject)
1986 {
1987 SCHECK_PARTIAL();
1988 cur_is_word = FALSE;
1989 }
1990 else
1991 #ifdef SUPPORT_UCP
1992 if (md->use_ucp)
1993 {
1994 c = *eptr;
1995 if (c == '_') cur_is_word = TRUE; else
1996 {
1997 int cat = UCD_CATEGORY(c);
1998 cur_is_word = (cat == ucp_L || cat == ucp_N);
1999 }
2000 }
2001 else
2002 #endif
2003 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2004 }
2005
2006 /* Now see if the situation is what we want */
2007
2008 if ((*ecode++ == OP_WORD_BOUNDARY)?
2009 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2010 MRRETURN(MATCH_NOMATCH);
2011 }
2012 break;
2013
2014 /* Match a single character type; inline for speed */
2015
2016 case OP_ANY:
2017 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2018 /* Fall through */
2019
2020 case OP_ALLANY:
2021 if (eptr++ >= md->end_subject)
2022 {
2023 SCHECK_PARTIAL();
2024 MRRETURN(MATCH_NOMATCH);
2025 }
2026 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2027 ecode++;
2028 break;
2029
2030 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2031 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2032
2033 case OP_ANYBYTE:
2034 if (eptr++ >= md->end_subject)
2035 {
2036 SCHECK_PARTIAL();
2037 MRRETURN(MATCH_NOMATCH);
2038 }
2039 ecode++;
2040 break;
2041
2042 case OP_NOT_DIGIT:
2043 if (eptr >= md->end_subject)
2044 {
2045 SCHECK_PARTIAL();
2046 MRRETURN(MATCH_NOMATCH);
2047 }
2048 GETCHARINCTEST(c, eptr);
2049 if (
2050 #ifdef SUPPORT_UTF8
2051 c < 256 &&
2052 #endif
2053 (md->ctypes[c] & ctype_digit) != 0
2054 )
2055 MRRETURN(MATCH_NOMATCH);
2056 ecode++;
2057 break;
2058
2059 case OP_DIGIT:
2060 if (eptr >= md->end_subject)
2061 {
2062 SCHECK_PARTIAL();
2063 MRRETURN(MATCH_NOMATCH);
2064 }
2065 GETCHARINCTEST(c, eptr);
2066 if (
2067 #ifdef SUPPORT_UTF8
2068 c >= 256 ||
2069 #endif
2070 (md->ctypes[c] & ctype_digit) == 0
2071 )
2072 MRRETURN(MATCH_NOMATCH);
2073 ecode++;
2074 break;
2075
2076 case OP_NOT_WHITESPACE:
2077 if (eptr >= md->end_subject)
2078 {
2079 SCHECK_PARTIAL();
2080 MRRETURN(MATCH_NOMATCH);
2081 }
2082 GETCHARINCTEST(c, eptr);
2083 if (
2084 #ifdef SUPPORT_UTF8
2085 c < 256 &&
2086 #endif
2087 (md->ctypes[c] & ctype_space) != 0
2088 )
2089 MRRETURN(MATCH_NOMATCH);
2090 ecode++;
2091 break;
2092
2093 case OP_WHITESPACE:
2094 if (eptr >= md->end_subject)
2095 {
2096 SCHECK_PARTIAL();
2097 MRRETURN(MATCH_NOMATCH);
2098 }
2099 GETCHARINCTEST(c, eptr);
2100 if (
2101 #ifdef SUPPORT_UTF8
2102 c >= 256 ||
2103 #endif
2104 (md->ctypes[c] & ctype_space) == 0
2105 )
2106 MRRETURN(MATCH_NOMATCH);
2107 ecode++;
2108 break;
2109
2110 case OP_NOT_WORDCHAR:
2111 if (eptr >= md->end_subject)
2112 {
2113 SCHECK_PARTIAL();
2114 MRRETURN(MATCH_NOMATCH);
2115 }
2116 GETCHARINCTEST(c, eptr);
2117 if (
2118 #ifdef SUPPORT_UTF8
2119 c < 256 &&
2120 #endif
2121 (md->ctypes[c] & ctype_word) != 0
2122 )
2123 MRRETURN(MATCH_NOMATCH);
2124 ecode++;
2125 break;
2126
2127 case OP_WORDCHAR:
2128 if (eptr >= md->end_subject)
2129 {
2130 SCHECK_PARTIAL();
2131 MRRETURN(MATCH_NOMATCH);
2132 }
2133 GETCHARINCTEST(c, eptr);
2134 if (
2135 #ifdef SUPPORT_UTF8
2136 c >= 256 ||
2137 #endif
2138 (md->ctypes[c] & ctype_word) == 0
2139 )
2140 MRRETURN(MATCH_NOMATCH);
2141 ecode++;
2142 break;
2143
2144 case OP_ANYNL:
2145 if (eptr >= md->end_subject)
2146 {
2147 SCHECK_PARTIAL();
2148 MRRETURN(MATCH_NOMATCH);
2149 }
2150 GETCHARINCTEST(c, eptr);
2151 switch(c)
2152 {
2153 default: MRRETURN(MATCH_NOMATCH);
2154
2155 case 0x000d:
2156 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2157 break;
2158
2159 case 0x000a:
2160 break;
2161
2162 case 0x000b:
2163 case 0x000c:
2164 case 0x0085:
2165 case 0x2028:
2166 case 0x2029:
2167 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2168 break;
2169 }
2170 ecode++;
2171 break;
2172
2173 case OP_NOT_HSPACE:
2174 if (eptr >= md->end_subject)
2175 {
2176 SCHECK_PARTIAL();
2177 MRRETURN(MATCH_NOMATCH);
2178 }
2179 GETCHARINCTEST(c, eptr);
2180 switch(c)
2181 {
2182 default: break;
2183 case 0x09: /* HT */
2184 case 0x20: /* SPACE */
2185 case 0xa0: /* NBSP */
2186 case 0x1680: /* OGHAM SPACE MARK */
2187 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2188 case 0x2000: /* EN QUAD */
2189 case 0x2001: /* EM QUAD */
2190 case 0x2002: /* EN SPACE */
2191 case 0x2003: /* EM SPACE */
2192 case 0x2004: /* THREE-PER-EM SPACE */
2193 case 0x2005: /* FOUR-PER-EM SPACE */
2194 case 0x2006: /* SIX-PER-EM SPACE */
2195 case 0x2007: /* FIGURE SPACE */
2196 case 0x2008: /* PUNCTUATION SPACE */
2197 case 0x2009: /* THIN SPACE */
2198 case 0x200A: /* HAIR SPACE */
2199 case 0x202f: /* NARROW NO-BREAK SPACE */
2200 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2201 case 0x3000: /* IDEOGRAPHIC SPACE */
2202 MRRETURN(MATCH_NOMATCH);
2203 }
2204 ecode++;
2205 break;
2206
2207 case OP_HSPACE:
2208 if (eptr >= md->end_subject)
2209 {
2210 SCHECK_PARTIAL();
2211 MRRETURN(MATCH_NOMATCH);
2212 }
2213 GETCHARINCTEST(c, eptr);
2214 switch(c)
2215 {
2216 default: MRRETURN(MATCH_NOMATCH);
2217 case 0x09: /* HT */
2218 case 0x20: /* SPACE */
2219 case 0xa0: /* NBSP */
2220 case 0x1680: /* OGHAM SPACE MARK */
2221 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2222 case 0x2000: /* EN QUAD */
2223 case 0x2001: /* EM QUAD */
2224 case 0x2002: /* EN SPACE */
2225 case 0x2003: /* EM SPACE */
2226 case 0x2004: /* THREE-PER-EM SPACE */
2227 case 0x2005: /* FOUR-PER-EM SPACE */
2228 case 0x2006: /* SIX-PER-EM SPACE */
2229 case 0x2007: /* FIGURE SPACE */
2230 case 0x2008: /* PUNCTUATION SPACE */
2231 case 0x2009: /* THIN SPACE */
2232 case 0x200A: /* HAIR SPACE */
2233 case 0x202f: /* NARROW NO-BREAK SPACE */
2234 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2235 case 0x3000: /* IDEOGRAPHIC SPACE */
2236 break;
2237 }
2238 ecode++;
2239 break;
2240
2241 case OP_NOT_VSPACE:
2242 if (eptr >= md->end_subject)
2243 {
2244 SCHECK_PARTIAL();
2245 MRRETURN(MATCH_NOMATCH);
2246 }
2247 GETCHARINCTEST(c, eptr);
2248 switch(c)
2249 {
2250 default: break;
2251 case 0x0a: /* LF */
2252 case 0x0b: /* VT */
2253 case 0x0c: /* FF */
2254 case 0x0d: /* CR */
2255 case 0x85: /* NEL */
2256 case 0x2028: /* LINE SEPARATOR */
2257 case 0x2029: /* PARAGRAPH SEPARATOR */
2258 MRRETURN(MATCH_NOMATCH);
2259 }
2260 ecode++;
2261 break;
2262
2263 case OP_VSPACE:
2264 if (eptr >= md->end_subject)
2265 {
2266 SCHECK_PARTIAL();
2267 MRRETURN(MATCH_NOMATCH);
2268 }
2269 GETCHARINCTEST(c, eptr);
2270 switch(c)
2271 {
2272 default: MRRETURN(MATCH_NOMATCH);
2273 case 0x0a: /* LF */
2274 case 0x0b: /* VT */
2275 case 0x0c: /* FF */
2276 case 0x0d: /* CR */
2277 case 0x85: /* NEL */
2278 case 0x2028: /* LINE SEPARATOR */
2279 case 0x2029: /* PARAGRAPH SEPARATOR */
2280 break;
2281 }
2282 ecode++;
2283 break;
2284
2285 #ifdef SUPPORT_UCP
2286 /* Check the next character by Unicode property. We will get here only
2287 if the support is in the binary; otherwise a compile-time error occurs. */
2288
2289 case OP_PROP:
2290 case OP_NOTPROP:
2291 if (eptr >= md->end_subject)
2292 {
2293 SCHECK_PARTIAL();
2294 MRRETURN(MATCH_NOMATCH);
2295 }
2296 GETCHARINCTEST(c, eptr);
2297 {
2298 const ucd_record *prop = GET_UCD(c);
2299
2300 switch(ecode[1])
2301 {
2302 case PT_ANY:
2303 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2304 break;
2305
2306 case PT_LAMP:
2307 if ((prop->chartype == ucp_Lu ||
2308 prop->chartype == ucp_Ll ||
2309 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2310 MRRETURN(MATCH_NOMATCH);
2311 break;
2312
2313 case PT_GC:
2314 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2315 MRRETURN(MATCH_NOMATCH);
2316 break;
2317
2318 case PT_PC:
2319 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2320 MRRETURN(MATCH_NOMATCH);
2321 break;
2322
2323 case PT_SC:
2324 if ((ecode[2] != prop->script) == (op == OP_PROP))
2325 MRRETURN(MATCH_NOMATCH);
2326 break;
2327
2328 /* These are specials */
2329
2330 case PT_ALNUM:
2331 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2332 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2333 MRRETURN(MATCH_NOMATCH);
2334 break;
2335
2336 case PT_SPACE: /* Perl space */
2337 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2338 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2339 == (op == OP_NOTPROP))
2340 MRRETURN(MATCH_NOMATCH);
2341 break;
2342
2343 case PT_PXSPACE: /* POSIX space */
2344 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2345 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2346 c == CHAR_FF || c == CHAR_CR)
2347 == (op == OP_NOTPROP))
2348 MRRETURN(MATCH_NOMATCH);
2349 break;
2350
2351 case PT_WORD:
2352 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2353 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2354 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2355 MRRETURN(MATCH_NOMATCH);
2356 break;
2357
2358 /* This should never occur */
2359
2360 default:
2361 RRETURN(PCRE_ERROR_INTERNAL);
2362 }
2363
2364 ecode += 3;
2365 }
2366 break;
2367
2368 /* Match an extended Unicode sequence. We will get here only if the support
2369 is in the binary; otherwise a compile-time error occurs. */
2370
2371 case OP_EXTUNI:
2372 if (eptr >= md->end_subject)
2373 {
2374 SCHECK_PARTIAL();
2375 MRRETURN(MATCH_NOMATCH);
2376 }
2377 GETCHARINCTEST(c, eptr);
2378 {
2379 int category = UCD_CATEGORY(c);
2380 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2381 while (eptr < md->end_subject)
2382 {
2383 int len = 1;
2384 if (!utf8) c = *eptr; else
2385 {
2386 GETCHARLEN(c, eptr, len);
2387 }
2388 category = UCD_CATEGORY(c);
2389 if (category != ucp_M) break;
2390 eptr += len;
2391 }
2392 }
2393 ecode++;
2394 break;
2395 #endif
2396
2397
2398 /* Match a back reference, possibly repeatedly. Look past the end of the
2399 item to see if there is repeat information following. The code is similar
2400 to that for character classes, but repeated for efficiency. Then obey
2401 similar code to character type repeats - written out again for speed.
2402 However, if the referenced string is the empty string, always treat
2403 it as matched, any number of times (otherwise there could be infinite
2404 loops). */
2405
2406 case OP_REF:
2407 case OP_REFI:
2408 caseless = op == OP_REFI;
2409 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2410 ecode += 3;
2411
2412 /* If the reference is unset, there are two possibilities:
2413
2414 (a) In the default, Perl-compatible state, set the length negative;
2415 this ensures that every attempt at a match fails. We can't just fail
2416 here, because of the possibility of quantifiers with zero minima.
2417
2418 (b) If the JavaScript compatibility flag is set, set the length to zero
2419 so that the back reference matches an empty string.
2420
2421 Otherwise, set the length to the length of what was matched by the
2422 referenced subpattern. */
2423
2424 if (offset >= offset_top || md->offset_vector[offset] < 0)
2425 length = (md->jscript_compat)? 0 : -1;
2426 else
2427 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2428
2429 /* Set up for repetition, or handle the non-repeated case */
2430
2431 switch (*ecode)
2432 {
2433 case OP_CRSTAR:
2434 case OP_CRMINSTAR:
2435 case OP_CRPLUS:
2436 case OP_CRMINPLUS:
2437 case OP_CRQUERY:
2438 case OP_CRMINQUERY:
2439 c = *ecode++ - OP_CRSTAR;
2440 minimize = (c & 1) != 0;
2441 min = rep_min[c]; /* Pick up values from tables; */
2442 max = rep_max[c]; /* zero for max => infinity */
2443 if (max == 0) max = INT_MAX;
2444 break;
2445
2446 case OP_CRRANGE:
2447 case OP_CRMINRANGE:
2448 minimize = (*ecode == OP_CRMINRANGE);
2449 min = GET2(ecode, 1);
2450 max = GET2(ecode, 3);
2451 if (max == 0) max = INT_MAX;
2452 ecode += 5;
2453 break;
2454
2455 default: /* No repeat follows */
2456 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2457 {
2458 CHECK_PARTIAL();
2459 MRRETURN(MATCH_NOMATCH);
2460 }
2461 eptr += length;
2462 continue; /* With the main loop */
2463 }
2464
2465 /* Handle repeated back references. If the length of the reference is
2466 zero, just continue with the main loop. */
2467
2468 if (length == 0) continue;
2469
2470 /* First, ensure the minimum number of matches are present. We get back
2471 the length of the reference string explicitly rather than passing the
2472 address of eptr, so that eptr can be a register variable. */
2473
2474 for (i = 1; i <= min; i++)
2475 {
2476 int slength;
2477 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2478 {
2479 CHECK_PARTIAL();
2480 MRRETURN(MATCH_NOMATCH);
2481 }
2482 eptr += slength;
2483 }
2484
2485 /* If min = max, continue at the same level without recursion.
2486 They are not both allowed to be zero. */
2487
2488 if (min == max) continue;
2489
2490 /* If minimizing, keep trying and advancing the pointer */
2491
2492 if (minimize)
2493 {
2494 for (fi = min;; fi++)
2495 {
2496 int slength;
2497 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2498 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2499 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2500 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2501 {
2502 CHECK_PARTIAL();
2503 MRRETURN(MATCH_NOMATCH);
2504 }
2505 eptr += slength;
2506 }
2507 /* Control never gets here */
2508 }
2509
2510 /* If maximizing, find the longest string and work backwards */
2511
2512 else
2513 {
2514 pp = eptr;
2515 for (i = min; i < max; i++)
2516 {
2517 int slength;
2518 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2519 {
2520 CHECK_PARTIAL();
2521 break;
2522 }
2523 eptr += slength;
2524 }
2525 while (eptr >= pp)
2526 {
2527 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2528 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2529 eptr -= length;
2530 }
2531 MRRETURN(MATCH_NOMATCH);
2532 }
2533 /* Control never gets here */
2534
2535 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2536 used when all the characters in the class have values in the range 0-255,
2537 and either the matching is caseful, or the characters are in the range
2538 0-127 when UTF-8 processing is enabled. The only difference between
2539 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2540 encountered.
2541
2542 First, look past the end of the item to see if there is repeat information
2543 following. Then obey similar code to character type repeats - written out
2544 again for speed. */
2545
2546 case OP_NCLASS:
2547 case OP_CLASS:
2548 {
2549 data = ecode + 1; /* Save for matching */
2550 ecode += 33; /* Advance past the item */
2551
2552 switch (*ecode)
2553 {
2554 case OP_CRSTAR:
2555 case OP_CRMINSTAR:
2556 case OP_CRPLUS:
2557 case OP_CRMINPLUS:
2558 case OP_CRQUERY:
2559 case OP_CRMINQUERY:
2560 c = *ecode++ - OP_CRSTAR;
2561 minimize = (c & 1) != 0;
2562 min = rep_min[c]; /* Pick up values from tables; */
2563 max = rep_max[c]; /* zero for max => infinity */
2564 if (max == 0) max = INT_MAX;
2565 break;
2566
2567 case OP_CRRANGE:
2568 case OP_CRMINRANGE:
2569 minimize = (*ecode == OP_CRMINRANGE);
2570 min = GET2(ecode, 1);
2571 max = GET2(ecode, 3);
2572 if (max == 0) max = INT_MAX;
2573 ecode += 5;
2574 break;
2575
2576 default: /* No repeat follows */
2577 min = max = 1;
2578 break;
2579 }
2580
2581 /* First, ensure the minimum number of matches are present. */
2582
2583 #ifdef SUPPORT_UTF8
2584 /* UTF-8 mode */
2585 if (utf8)
2586 {
2587 for (i = 1; i <= min; i++)
2588 {
2589 if (eptr >= md->end_subject)
2590 {
2591 SCHECK_PARTIAL();
2592 MRRETURN(MATCH_NOMATCH);
2593 }
2594 GETCHARINC(c, eptr);
2595 if (c > 255)
2596 {
2597 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2598 }
2599 else
2600 {
2601 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2602 }
2603 }
2604 }
2605 else
2606 #endif
2607 /* Not UTF-8 mode */
2608 {
2609 for (i = 1; i <= min; i++)
2610 {
2611 if (eptr >= md->end_subject)
2612 {
2613 SCHECK_PARTIAL();
2614 MRRETURN(MATCH_NOMATCH);
2615 }
2616 c = *eptr++;
2617 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2618 }
2619 }
2620
2621 /* If max == min we can continue with the main loop without the
2622 need to recurse. */
2623
2624 if (min == max) continue;
2625
2626 /* If minimizing, keep testing the rest of the expression and advancing
2627 the pointer while it matches the class. */
2628
2629 if (minimize)
2630 {
2631 #ifdef SUPPORT_UTF8
2632 /* UTF-8 mode */
2633 if (utf8)
2634 {
2635 for (fi = min;; fi++)
2636 {
2637 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2638 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2639 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2640 if (eptr >= md->end_subject)
2641 {
2642 SCHECK_PARTIAL();
2643 MRRETURN(MATCH_NOMATCH);
2644 }
2645 GETCHARINC(c, eptr);
2646 if (c > 255)
2647 {
2648 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2649 }
2650 else
2651 {
2652 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2653 }
2654 }
2655 }
2656 else
2657 #endif
2658 /* Not UTF-8 mode */
2659 {
2660 for (fi = min;; fi++)
2661 {
2662 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2664 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2665 if (eptr >= md->end_subject)
2666 {
2667 SCHECK_PARTIAL();
2668 MRRETURN(MATCH_NOMATCH);
2669 }
2670 c = *eptr++;
2671 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2672 }
2673 }
2674 /* Control never gets here */
2675 }
2676
2677 /* If maximizing, find the longest possible run, then work backwards. */
2678
2679 else
2680 {
2681 pp = eptr;
2682
2683 #ifdef SUPPORT_UTF8
2684 /* UTF-8 mode */
2685 if (utf8)
2686 {
2687 for (i = min; i < max; i++)
2688 {
2689 int len = 1;
2690 if (eptr >= md->end_subject)
2691 {
2692 SCHECK_PARTIAL();
2693 break;
2694 }
2695 GETCHARLEN(c, eptr, len);
2696 if (c > 255)
2697 {
2698 if (op == OP_CLASS) break;
2699 }
2700 else
2701 {
2702 if ((data[c/8] & (1 << (c&7))) == 0) break;
2703 }
2704 eptr += len;
2705 }
2706 for (;;)
2707 {
2708 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2710 if (eptr-- == pp) break; /* Stop if tried at original pos */
2711 BACKCHAR(eptr);
2712 }
2713 }
2714 else
2715 #endif
2716 /* Not UTF-8 mode */
2717 {
2718 for (i = min; i < max; i++)
2719 {
2720 if (eptr >= md->end_subject)
2721 {
2722 SCHECK_PARTIAL();
2723 break;
2724 }
2725 c = *eptr;
2726 if ((data[c/8] & (1 << (c&7))) == 0) break;
2727 eptr++;
2728 }
2729 while (eptr >= pp)
2730 {
2731 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2733 eptr--;
2734 }
2735 }
2736
2737 MRRETURN(MATCH_NOMATCH);
2738 }
2739 }
2740 /* Control never gets here */
2741
2742
2743 /* Match an extended character class. This opcode is encountered only
2744 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2745 mode, because Unicode properties are supported in non-UTF-8 mode. */
2746
2747 #ifdef SUPPORT_UTF8
2748 case OP_XCLASS:
2749 {
2750 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2751 ecode += GET(ecode, 1); /* Advance past the item */
2752
2753 switch (*ecode)
2754 {
2755 case OP_CRSTAR:
2756 case OP_CRMINSTAR:
2757 case OP_CRPLUS:
2758 case OP_CRMINPLUS:
2759 case OP_CRQUERY:
2760 case OP_CRMINQUERY:
2761 c = *ecode++ - OP_CRSTAR;
2762 minimize = (c & 1) != 0;
2763 min = rep_min[c]; /* Pick up values from tables; */
2764 max = rep_max[c]; /* zero for max => infinity */
2765 if (max == 0) max = INT_MAX;
2766 break;
2767
2768 case OP_CRRANGE:
2769 case OP_CRMINRANGE:
2770 minimize = (*ecode == OP_CRMINRANGE);
2771 min = GET2(ecode, 1);
2772 max = GET2(ecode, 3);
2773 if (max == 0) max = INT_MAX;
2774 ecode += 5;
2775 break;
2776
2777 default: /* No repeat follows */
2778 min = max = 1;
2779 break;
2780 }
2781
2782 /* First, ensure the minimum number of matches are present. */
2783
2784 for (i = 1; i <= min; i++)
2785 {
2786 if (eptr >= md->end_subject)
2787 {
2788 SCHECK_PARTIAL();
2789 MRRETURN(MATCH_NOMATCH);
2790 }
2791 GETCHARINCTEST(c, eptr);
2792 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2793 }
2794
2795 /* If max == min we can continue with the main loop without the
2796 need to recurse. */
2797
2798 if (min == max) continue;
2799
2800 /* If minimizing, keep testing the rest of the expression and advancing
2801 the pointer while it matches the class. */
2802
2803 if (minimize)
2804 {
2805 for (fi = min;; fi++)
2806 {
2807 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2809 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2810 if (eptr >= md->end_subject)
2811 {
2812 SCHECK_PARTIAL();
2813 MRRETURN(MATCH_NOMATCH);
2814 }
2815 GETCHARINCTEST(c, eptr);
2816 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2817 }
2818 /* Control never gets here */
2819 }
2820
2821 /* If maximizing, find the longest possible run, then work backwards. */
2822
2823 else
2824 {
2825 pp = eptr;
2826 for (i = min; i < max; i++)
2827 {
2828 int len = 1;
2829 if (eptr >= md->end_subject)
2830 {
2831 SCHECK_PARTIAL();
2832 break;
2833 }
2834 GETCHARLENTEST(c, eptr, len);
2835 if (!_pcre_xclass(c, data)) break;
2836 eptr += len;
2837 }
2838 for(;;)
2839 {
2840 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2842 if (eptr-- == pp) break; /* Stop if tried at original pos */
2843 if (utf8) BACKCHAR(eptr);
2844 }
2845 MRRETURN(MATCH_NOMATCH);
2846 }
2847
2848 /* Control never gets here */
2849 }
2850 #endif /* End of XCLASS */
2851
2852 /* Match a single character, casefully */
2853
2854 case OP_CHAR:
2855 #ifdef SUPPORT_UTF8
2856 if (utf8)
2857 {
2858 length = 1;
2859 ecode++;
2860 GETCHARLEN(fc, ecode, length);
2861 if (length > md->end_subject - eptr)
2862 {
2863 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2864 MRRETURN(MATCH_NOMATCH);
2865 }
2866 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2867 }
2868 else
2869 #endif
2870
2871 /* Non-UTF-8 mode */
2872 {
2873 if (md->end_subject - eptr < 1)
2874 {
2875 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2876 MRRETURN(MATCH_NOMATCH);
2877 }
2878 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2879 ecode += 2;
2880 }
2881 break;
2882
2883 /* Match a single character, caselessly */
2884
2885 case OP_CHARI:
2886 #ifdef SUPPORT_UTF8
2887 if (utf8)
2888 {
2889 length = 1;
2890 ecode++;
2891 GETCHARLEN(fc, ecode, length);
2892
2893 if (length > md->end_subject - eptr)
2894 {
2895 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2896 MRRETURN(MATCH_NOMATCH);
2897 }
2898
2899 /* If the pattern character's value is < 128, we have only one byte, and
2900 can use the fast lookup table. */
2901
2902 if (fc < 128)
2903 {
2904 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2905 }
2906
2907 /* Otherwise we must pick up the subject character */
2908
2909 else
2910 {
2911 unsigned int dc;
2912 GETCHARINC(dc, eptr);
2913 ecode += length;
2914
2915 /* If we have Unicode property support, we can use it to test the other
2916 case of the character, if there is one. */
2917
2918 if (fc != dc)
2919 {
2920 #ifdef SUPPORT_UCP
2921 if (dc != UCD_OTHERCASE(fc))
2922 #endif
2923 MRRETURN(MATCH_NOMATCH);
2924 }
2925 }
2926 }
2927 else
2928 #endif /* SUPPORT_UTF8 */
2929
2930 /* Non-UTF-8 mode */
2931 {
2932 if (md->end_subject - eptr < 1)
2933 {
2934 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2935 MRRETURN(MATCH_NOMATCH);
2936 }
2937 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2938 ecode += 2;
2939 }
2940 break;
2941
2942 /* Match a single character repeatedly. */
2943
2944 case OP_EXACT:
2945 case OP_EXACTI:
2946 min = max = GET2(ecode, 1);
2947 ecode += 3;
2948 goto REPEATCHAR;
2949
2950 case OP_POSUPTO:
2951 case OP_POSUPTOI:
2952 possessive = TRUE;
2953 /* Fall through */
2954
2955 case OP_UPTO:
2956 case OP_UPTOI:
2957 case OP_MINUPTO:
2958 case OP_MINUPTOI:
2959 min = 0;
2960 max = GET2(ecode, 1);
2961 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2962 ecode += 3;
2963 goto REPEATCHAR;
2964
2965 case OP_POSSTAR:
2966 case OP_POSSTARI:
2967 possessive = TRUE;
2968 min = 0;
2969 max = INT_MAX;
2970 ecode++;
2971 goto REPEATCHAR;
2972
2973 case OP_POSPLUS:
2974 case OP_POSPLUSI:
2975 possessive = TRUE;
2976 min = 1;
2977 max = INT_MAX;
2978 ecode++;
2979 goto REPEATCHAR;
2980
2981 case OP_POSQUERY:
2982 case OP_POSQUERYI:
2983 possessive = TRUE;
2984 min = 0;
2985 max = 1;
2986 ecode++;
2987 goto REPEATCHAR;
2988
2989 case OP_STAR:
2990 case OP_STARI:
2991 case OP_MINSTAR:
2992 case OP_MINSTARI:
2993 case OP_PLUS:
2994 case OP_PLUSI:
2995 case OP_MINPLUS:
2996 case OP_MINPLUSI:
2997 case OP_QUERY:
2998 case OP_QUERYI:
2999 case OP_MINQUERY:
3000 case OP_MINQUERYI:
3001 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3002 minimize = (c & 1) != 0;
3003 min = rep_min[c]; /* Pick up values from tables; */
3004 max = rep_max[c]; /* zero for max => infinity */
3005 if (max == 0) max = INT_MAX;
3006
3007 /* Common code for all repeated single-character matches. */
3008
3009 REPEATCHAR:
3010 #ifdef SUPPORT_UTF8
3011 if (utf8)
3012 {
3013 length = 1;
3014 charptr = ecode;
3015 GETCHARLEN(fc, ecode, length);
3016 ecode += length;
3017
3018 /* Handle multibyte character matching specially here. There is
3019 support for caseless matching if UCP support is present. */
3020
3021 if (length > 1)
3022 {
3023 #ifdef SUPPORT_UCP
3024 unsigned int othercase;
3025 if (op >= OP_STARI && /* Caseless */
3026 (othercase = UCD_OTHERCASE(fc)) != fc)
3027 oclength = _pcre_ord2utf8(othercase, occhars);
3028 else oclength = 0;
3029 #endif /* SUPPORT_UCP */
3030
3031 for (i = 1; i <= min; i++)
3032 {
3033 if (eptr <= md->end_subject - length &&
3034 memcmp(eptr, charptr, length) == 0) eptr += length;
3035 #ifdef SUPPORT_UCP
3036 else if (oclength > 0 &&
3037 eptr <= md->end_subject - oclength &&
3038 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3039 #endif /* SUPPORT_UCP */
3040 else
3041 {
3042 CHECK_PARTIAL();
3043 MRRETURN(MATCH_NOMATCH);
3044 }
3045 }
3046
3047 if (min == max) continue;
3048
3049 if (minimize)
3050 {
3051 for (fi = min;; fi++)
3052 {
3053 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3055 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3056 if (eptr <= md->end_subject - length &&
3057 memcmp(eptr, charptr, length) == 0) eptr += length;
3058 #ifdef SUPPORT_UCP
3059 else if (oclength > 0 &&
3060 eptr <= md->end_subject - oclength &&
3061 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3062 #endif /* SUPPORT_UCP */
3063 else
3064 {
3065 CHECK_PARTIAL();
3066 MRRETURN(MATCH_NOMATCH);
3067 }
3068 }
3069 /* Control never gets here */
3070 }
3071
3072 else /* Maximize */
3073 {
3074 pp = eptr;
3075 for (i = min; i < max; i++)
3076 {
3077 if (eptr <= md->end_subject - length &&
3078 memcmp(eptr, charptr, length) == 0) eptr += length;
3079 #ifdef SUPPORT_UCP
3080 else if (oclength > 0 &&
3081 eptr <= md->end_subject - oclength &&
3082 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3083 #endif /* SUPPORT_UCP */
3084 else
3085 {
3086 CHECK_PARTIAL();
3087 break;
3088 }
3089 }
3090
3091 if (possessive) continue;
3092
3093 for(;;)
3094 {
3095 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3097 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3098 #ifdef SUPPORT_UCP
3099 eptr--;
3100 BACKCHAR(eptr);
3101 #else /* without SUPPORT_UCP */
3102 eptr -= length;
3103 #endif /* SUPPORT_UCP */
3104 }
3105 }
3106 /* Control never gets here */
3107 }
3108
3109 /* If the length of a UTF-8 character is 1, we fall through here, and
3110 obey the code as for non-UTF-8 characters below, though in this case the
3111 value of fc will always be < 128. */
3112 }
3113 else
3114 #endif /* SUPPORT_UTF8 */
3115
3116 /* When not in UTF-8 mode, load a single-byte character. */
3117
3118 fc = *ecode++;
3119
3120 /* The value of fc at this point is always less than 256, though we may or
3121 may not be in UTF-8 mode. The code is duplicated for the caseless and
3122 caseful cases, for speed, since matching characters is likely to be quite
3123 common. First, ensure the minimum number of matches are present. If min =
3124 max, continue at the same level without recursing. Otherwise, if
3125 minimizing, keep trying the rest of the expression and advancing one
3126 matching character if failing, up to the maximum. Alternatively, if
3127 maximizing, find the maximum number of characters and work backwards. */
3128
3129 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3130 max, eptr));
3131
3132 if (op >= OP_STARI) /* Caseless */
3133 {
3134 fc = md->lcc[fc];
3135 for (i = 1; i <= min; i++)
3136 {
3137 if (eptr >= md->end_subject)
3138 {
3139 SCHECK_PARTIAL();
3140 MRRETURN(MATCH_NOMATCH);
3141 }
3142 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3143 }
3144 if (min == max) continue;
3145 if (minimize)
3146 {
3147 for (fi = min;; fi++)
3148 {
3149 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3150 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3151 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3152 if (eptr >= md->end_subject)
3153 {
3154 SCHECK_PARTIAL();
3155 MRRETURN(MATCH_NOMATCH);
3156 }
3157 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3158 }
3159 /* Control never gets here */
3160 }
3161 else /* Maximize */
3162 {
3163 pp = eptr;
3164 for (i = min; i < max; i++)
3165 {
3166 if (eptr >= md->end_subject)
3167 {
3168 SCHECK_PARTIAL();
3169 break;
3170 }
3171 if (fc != md->lcc[*eptr]) break;
3172 eptr++;
3173 }
3174
3175 if (possessive) continue;
3176
3177 while (eptr >= pp)
3178 {
3179 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3180 eptr--;
3181 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3182 }
3183 MRRETURN(MATCH_NOMATCH);
3184 }
3185 /* Control never gets here */
3186 }
3187
3188 /* Caseful comparisons (includes all multi-byte characters) */
3189
3190 else
3191 {
3192 for (i = 1; i <= min; i++)
3193 {
3194 if (eptr >= md->end_subject)
3195 {
3196 SCHECK_PARTIAL();
3197 MRRETURN(MATCH_NOMATCH);
3198 }
3199 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3200 }
3201
3202 if (min == max) continue;
3203
3204 if (minimize)
3205 {
3206 for (fi = min;; fi++)
3207 {
3208 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3209 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3210 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3211 if (eptr >= md->end_subject)
3212 {
3213 SCHECK_PARTIAL();
3214 MRRETURN(MATCH_NOMATCH);
3215 }
3216 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3217 }
3218 /* Control never gets here */
3219 }
3220 else /* Maximize */
3221 {
3222 pp = eptr;
3223 for (i = min; i < max; i++)
3224 {
3225 if (eptr >= md->end_subject)
3226 {
3227 SCHECK_PARTIAL();
3228 break;
3229 }
3230 if (fc != *eptr) break;
3231 eptr++;
3232 }
3233 if (possessive) continue;
3234
3235 while (eptr >= pp)
3236 {
3237 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3238 eptr--;
3239 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3240 }
3241 MRRETURN(MATCH_NOMATCH);
3242 }
3243 }
3244 /* Control never gets here */
3245
3246 /* Match a negated single one-byte character. The character we are
3247 checking can be multibyte. */
3248
3249 case OP_NOT:
3250 case OP_NOTI:
3251 if (eptr >= md->end_subject)
3252 {
3253 SCHECK_PARTIAL();
3254 MRRETURN(MATCH_NOMATCH);
3255 }
3256 ecode++;
3257 GETCHARINCTEST(c, eptr);
3258 if (op == OP_NOTI) /* The caseless case */
3259 {
3260 #ifdef SUPPORT_UTF8
3261 if (c < 256)
3262 #endif
3263 c = md->lcc[c];
3264 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3265 }
3266 else /* Caseful */
3267 {
3268 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3269 }
3270 break;
3271
3272 /* Match a negated single one-byte character repeatedly. This is almost a
3273 repeat of the code for a repeated single character, but I haven't found a
3274 nice way of commoning these up that doesn't require a test of the
3275 positive/negative option for each character match. Maybe that wouldn't add
3276 very much to the time taken, but character matching *is* what this is all
3277 about... */
3278
3279 case OP_NOTEXACT:
3280 case OP_NOTEXACTI:
3281 min = max = GET2(ecode, 1);
3282 ecode += 3;
3283 goto REPEATNOTCHAR;
3284
3285 case OP_NOTUPTO:
3286 case OP_NOTUPTOI:
3287 case OP_NOTMINUPTO:
3288 case OP_NOTMINUPTOI:
3289 min = 0;
3290 max = GET2(ecode, 1);
3291 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3292 ecode += 3;
3293 goto REPEATNOTCHAR;
3294
3295 case OP_NOTPOSSTAR:
3296 case OP_NOTPOSSTARI:
3297 possessive = TRUE;
3298 min = 0;
3299 max = INT_MAX;
3300 ecode++;
3301 goto REPEATNOTCHAR;
3302
3303 case OP_NOTPOSPLUS:
3304 case OP_NOTPOSPLUSI:
3305 possessive = TRUE;
3306 min = 1;
3307 max = INT_MAX;
3308 ecode++;
3309 goto REPEATNOTCHAR;
3310
3311 case OP_NOTPOSQUERY:
3312 case OP_NOTPOSQUERYI:
3313 possessive = TRUE;
3314 min = 0;
3315 max = 1;
3316 ecode++;
3317 goto REPEATNOTCHAR;
3318
3319 case OP_NOTPOSUPTO:
3320 case OP_NOTPOSUPTOI:
3321 possessive = TRUE;
3322 min = 0;
3323 max = GET2(ecode, 1);
3324 ecode += 3;
3325 goto REPEATNOTCHAR;
3326
3327 case OP_NOTSTAR:
3328 case OP_NOTSTARI:
3329 case OP_NOTMINSTAR:
3330 case OP_NOTMINSTARI:
3331 case OP_NOTPLUS:
3332 case OP_NOTPLUSI:
3333 case OP_NOTMINPLUS:
3334 case OP_NOTMINPLUSI:
3335 case OP_NOTQUERY:
3336 case OP_NOTQUERYI:
3337 case OP_NOTMINQUERY:
3338 case OP_NOTMINQUERYI:
3339 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3340 minimize = (c & 1) != 0;
3341 min = rep_min[c]; /* Pick up values from tables; */
3342 max = rep_max[c]; /* zero for max => infinity */
3343 if (max == 0) max = INT_MAX;
3344
3345 /* Common code for all repeated single-byte matches. */
3346
3347 REPEATNOTCHAR:
3348 fc = *ecode++;
3349
3350 /* The code is duplicated for the caseless and caseful cases, for speed,
3351 since matching characters is likely to be quite common. First, ensure the
3352 minimum number of matches are present. If min = max, continue at the same
3353 level without recursing. Otherwise, if minimizing, keep trying the rest of
3354 the expression and advancing one matching character if failing, up to the
3355 maximum. Alternatively, if maximizing, find the maximum number of
3356 characters and work backwards. */
3357
3358 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3359 max, eptr));
3360
3361 if (op >= OP_NOTSTARI) /* Caseless */
3362 {
3363 fc = md->lcc[fc];
3364
3365 #ifdef SUPPORT_UTF8
3366 /* UTF-8 mode */
3367 if (utf8)
3368 {
3369 register unsigned int d;
3370 for (i = 1; i <= min; i++)
3371 {
3372 if (eptr >= md->end_subject)
3373 {
3374 SCHECK_PARTIAL();
3375 MRRETURN(MATCH_NOMATCH);
3376 }
3377 GETCHARINC(d, eptr);
3378 if (d < 256) d = md->lcc[d];
3379 if (fc == d) MRRETURN(MATCH_NOMATCH);
3380 }
3381 }
3382 else
3383 #endif
3384
3385 /* Not UTF-8 mode */
3386 {
3387 for (i = 1; i <= min; i++)
3388 {
3389 if (eptr >= md->end_subject)
3390 {
3391 SCHECK_PARTIAL();
3392 MRRETURN(MATCH_NOMATCH);
3393 }
3394 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3395 }
3396 }
3397
3398 if (min == max) continue;
3399
3400 if (minimize)
3401 {
3402 #ifdef SUPPORT_UTF8
3403 /* UTF-8 mode */
3404 if (utf8)
3405 {
3406 register unsigned int d;
3407 for (fi = min;; fi++)
3408 {
3409 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3410 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3411 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3412 if (eptr >= md->end_subject)
3413 {
3414 SCHECK_PARTIAL();
3415 MRRETURN(MATCH_NOMATCH);
3416 }
3417 GETCHARINC(d, eptr);
3418 if (d < 256) d = md->lcc[d];
3419 if (fc == d) MRRETURN(MATCH_NOMATCH);
3420 }
3421 }
3422 else
3423 #endif
3424 /* Not UTF-8 mode */
3425 {
3426 for (fi = min;; fi++)
3427 {
3428 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3429 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3430 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3431 if (eptr >= md->end_subject)
3432 {
3433 SCHECK_PARTIAL();
3434 MRRETURN(MATCH_NOMATCH);
3435 }
3436 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3437 }
3438 }
3439 /* Control never gets here */
3440 }
3441
3442 /* Maximize case */
3443
3444 else
3445 {
3446 pp = eptr;
3447
3448 #ifdef SUPPORT_UTF8
3449 /* UTF-8 mode */
3450 if (utf8)
3451 {
3452 register unsigned int d;
3453 for (i = min; i < max; i++)
3454 {
3455 int len = 1;
3456 if (eptr >= md->end_subject)
3457 {
3458 SCHECK_PARTIAL();
3459 break;
3460 }
3461 GETCHARLEN(d, eptr, len);
3462 if (d < 256) d = md->lcc[d];
3463 if (fc == d) break;
3464 eptr += len;
3465 }
3466 if (possessive) continue;
3467 for(;;)
3468 {
3469 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3470 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3471 if (eptr-- == pp) break; /* Stop if tried at original pos */
3472 BACKCHAR(eptr);
3473 }
3474 }
3475 else
3476 #endif
3477 /* Not UTF-8 mode */
3478 {
3479 for (i = min; i < max; i++)
3480 {
3481 if (eptr >= md->end_subject)
3482 {
3483 SCHECK_PARTIAL();
3484 break;
3485 }
3486 if (fc == md->lcc[*eptr]) break;
3487 eptr++;
3488 }
3489 if (possessive) continue;
3490 while (eptr >= pp)
3491 {
3492 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3493 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3494 eptr--;
3495 }
3496 }
3497
3498 MRRETURN(MATCH_NOMATCH);
3499 }
3500 /* Control never gets here */
3501 }
3502
3503 /* Caseful comparisons */
3504
3505 else
3506 {
3507 #ifdef SUPPORT_UTF8
3508 /* UTF-8 mode */
3509 if (utf8)
3510 {
3511 register unsigned int d;
3512 for (i = 1; i <= min; i++)
3513 {
3514 if (eptr >= md->end_subject)
3515 {
3516 SCHECK_PARTIAL();
3517 MRRETURN(MATCH_NOMATCH);
3518 }
3519 GETCHARINC(d, eptr);
3520 if (fc == d) MRRETURN(MATCH_NOMATCH);
3521 }
3522 }
3523 else
3524 #endif
3525 /* Not UTF-8 mode */
3526 {
3527 for (i = 1; i <= min; i++)
3528 {
3529 if (eptr >= md->end_subject)
3530 {
3531 SCHECK_PARTIAL();
3532 MRRETURN(MATCH_NOMATCH);
3533 }
3534 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3535 }
3536 }
3537
3538 if (min == max) continue;
3539
3540 if (minimize)
3541 {
3542 #ifdef SUPPORT_UTF8
3543 /* UTF-8 mode */
3544 if (utf8)
3545 {
3546 register unsigned int d;
3547 for (fi = min;; fi++)
3548 {
3549 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3550 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3551 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3552 if (eptr >= md->end_subject)
3553 {
3554 SCHECK_PARTIAL();
3555 MRRETURN(MATCH_NOMATCH);
3556 }
3557 GETCHARINC(d, eptr);
3558 if (fc == d) MRRETURN(MATCH_NOMATCH);
3559 }
3560 }
3561 else
3562 #endif
3563 /* Not UTF-8 mode */
3564 {
3565 for (fi = min;; fi++)
3566 {
3567 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3569 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3570 if (eptr >= md->end_subject)
3571 {
3572 SCHECK_PARTIAL();
3573 MRRETURN(MATCH_NOMATCH);
3574 }
3575 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3576 }
3577 }
3578 /* Control never gets here */
3579 }
3580
3581 /* Maximize case */
3582
3583 else
3584 {
3585 pp = eptr;
3586
3587 #ifdef SUPPORT_UTF8
3588 /* UTF-8 mode */
3589 if (utf8)
3590 {
3591 register unsigned int d;
3592 for (i = min; i < max; i++)
3593 {
3594 int len = 1;
3595 if (eptr >= md->end_subject)
3596 {
3597 SCHECK_PARTIAL();
3598 break;
3599 }
3600 GETCHARLEN(d, eptr, len);
3601 if (fc == d) break;
3602 eptr += len;
3603 }
3604 if (possessive) continue;
3605 for(;;)
3606 {
3607 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3608 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3609 if (eptr-- == pp) break; /* Stop if tried at original pos */
3610 BACKCHAR(eptr);
3611 }
3612 }
3613 else
3614 #endif
3615 /* Not UTF-8 mode */
3616 {
3617 for (i = min; i < max; i++)
3618 {
3619 if (eptr >= md->end_subject)
3620 {
3621 SCHECK_PARTIAL();
3622 break;
3623 }
3624 if (fc == *eptr) break;
3625 eptr++;
3626 }
3627 if (possessive) continue;
3628 while (eptr >= pp)
3629 {
3630 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3632 eptr--;
3633 }
3634 }
3635
3636 MRRETURN(MATCH_NOMATCH);
3637 }
3638 }
3639 /* Control never gets here */
3640
3641 /* Match a single character type repeatedly; several different opcodes
3642 share code. This is very similar to the code for single characters, but we
3643 repeat it in the interests of efficiency. */
3644
3645 case OP_TYPEEXACT:
3646 min = max = GET2(ecode, 1);
3647 minimize = TRUE;
3648 ecode += 3;
3649 goto REPEATTYPE;
3650
3651 case OP_TYPEUPTO:
3652 case OP_TYPEMINUPTO:
3653 min = 0;
3654 max = GET2(ecode, 1);
3655 minimize = *ecode == OP_TYPEMINUPTO;
3656 ecode += 3;
3657 goto REPEATTYPE;
3658
3659 case OP_TYPEPOSSTAR:
3660 possessive = TRUE;
3661 min = 0;
3662 max = INT_MAX;
3663 ecode++;
3664 goto REPEATTYPE;
3665
3666 case OP_TYPEPOSPLUS:
3667 possessive = TRUE;
3668 min = 1;
3669 max = INT_MAX;
3670 ecode++;
3671 goto REPEATTYPE;
3672
3673 case OP_TYPEPOSQUERY:
3674 possessive = TRUE;
3675 min = 0;
3676 max = 1;
3677 ecode++;
3678 goto REPEATTYPE;
3679
3680 case OP_TYPEPOSUPTO:
3681 possessive = TRUE;
3682 min = 0;
3683 max = GET2(ecode, 1);
3684 ecode += 3;
3685 goto REPEATTYPE;
3686
3687 case OP_TYPESTAR:
3688 case OP_TYPEMINSTAR:
3689 case OP_TYPEPLUS:
3690 case OP_TYPEMINPLUS:
3691 case OP_TYPEQUERY:
3692 case OP_TYPEMINQUERY:
3693 c = *ecode++ - OP_TYPESTAR;
3694 minimize = (c & 1) != 0;
3695 min = rep_min[c]; /* Pick up values from tables; */
3696 max = rep_max[c]; /* zero for max => infinity */
3697 if (max == 0) max = INT_MAX;
3698
3699 /* Common code for all repeated single character type matches. Note that
3700 in UTF-8 mode, '.' matches a character of any length, but for the other
3701 character types, the valid characters are all one-byte long. */
3702
3703 REPEATTYPE:
3704 ctype = *ecode++; /* Code for the character type */
3705
3706 #ifdef SUPPORT_UCP
3707 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3708 {
3709 prop_fail_result = ctype == OP_NOTPROP;
3710 prop_type = *ecode++;
3711 prop_value = *ecode++;
3712 }
3713 else prop_type = -1;
3714 #endif
3715
3716 /* First, ensure the minimum number of matches are present. Use inline
3717 code for maximizing the speed, and do the type test once at the start
3718 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3719 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3720 and single-bytes. */
3721
3722 if (min > 0)
3723 {
3724 #ifdef SUPPORT_UCP
3725 if (prop_type >= 0)
3726 {
3727 switch(prop_type)
3728 {
3729 case PT_ANY:
3730 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3731 for (i = 1; i <= min; i++)
3732 {
3733 if (eptr >= md->end_subject)
3734 {
3735 SCHECK_PARTIAL();
3736 MRRETURN(MATCH_NOMATCH);
3737 }
3738 GETCHARINCTEST(c, eptr);
3739 }
3740 break;
3741
3742 case PT_LAMP:
3743 for (i = 1; i <= min; i++)
3744 {
3745 if (eptr >= md->end_subject)
3746 {
3747 SCHECK_PARTIAL();
3748 MRRETURN(MATCH_NOMATCH);
3749 }
3750 GETCHARINCTEST(c, eptr);
3751 prop_chartype = UCD_CHARTYPE(c);
3752 if ((prop_chartype == ucp_Lu ||
3753 prop_chartype == ucp_Ll ||
3754 prop_chartype == ucp_Lt) == prop_fail_result)
3755 MRRETURN(MATCH_NOMATCH);
3756 }
3757 break;
3758
3759 case PT_GC:
3760 for (i = 1; i <= min; i++)
3761 {
3762 if (eptr >= md->end_subject)
3763 {
3764 SCHECK_PARTIAL();
3765 MRRETURN(MATCH_NOMATCH);
3766 }
3767 GETCHARINCTEST(c, eptr);
3768 prop_category = UCD_CATEGORY(c);
3769 if ((prop_category == prop_value) == prop_fail_result)
3770 MRRETURN(MATCH_NOMATCH);
3771 }
3772 break;
3773
3774 case PT_PC:
3775 for (i = 1; i <= min; i++)
3776 {
3777 if (eptr >= md->end_subject)
3778 {
3779 SCHECK_PARTIAL();
3780 MRRETURN(MATCH_NOMATCH);
3781 }
3782 GETCHARINCTEST(c, eptr);
3783 prop_chartype = UCD_CHARTYPE(c);
3784 if ((prop_chartype == prop_value) == prop_fail_result)
3785 MRRETURN(MATCH_NOMATCH);
3786 }
3787 break;
3788
3789 case PT_SC:
3790 for (i = 1; i <= min; i++)
3791 {
3792 if (eptr >= md->end_subject)
3793 {
3794 SCHECK_PARTIAL();
3795 MRRETURN(MATCH_NOMATCH);
3796 }
3797 GETCHARINCTEST(c, eptr);
3798 prop_script = UCD_SCRIPT(c);
3799 if ((prop_script == prop_value) == prop_fail_result)
3800 MRRETURN(MATCH_NOMATCH);
3801 }
3802 break;
3803
3804 case PT_ALNUM:
3805 for (i = 1; i <= min; i++)
3806 {
3807 if (eptr >= md->end_subject)
3808 {
3809 SCHECK_PARTIAL();
3810 MRRETURN(MATCH_NOMATCH);
3811 }
3812 GETCHARINCTEST(c, eptr);
3813 prop_category = UCD_CATEGORY(c);
3814 if ((prop_category == ucp_L || prop_category == ucp_N)
3815 == prop_fail_result)
3816 MRRETURN(MATCH_NOMATCH);
3817 }
3818 break;
3819
3820 case PT_SPACE: /* Perl space */
3821 for (i = 1; i <= min; i++)
3822 {
3823 if (eptr >= md->end_subject)
3824 {
3825 SCHECK_PARTIAL();
3826 MRRETURN(MATCH_NOMATCH);
3827 }
3828 GETCHARINCTEST(c, eptr);
3829 prop_category = UCD_CATEGORY(c);
3830 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3831 c == CHAR_FF || c == CHAR_CR)
3832 == prop_fail_result)
3833 MRRETURN(MATCH_NOMATCH);
3834 }
3835 break;
3836
3837 case PT_PXSPACE: /* POSIX space */
3838 for (i = 1; i <= min; i++)
3839 {
3840 if (eptr >= md->end_subject)
3841 {
3842 SCHECK_PARTIAL();
3843 MRRETURN(MATCH_NOMATCH);
3844 }
3845 GETCHARINCTEST(c, eptr);
3846 prop_category = UCD_CATEGORY(c);
3847 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3848 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3849 == prop_fail_result)
3850 MRRETURN(MATCH_NOMATCH);
3851 }
3852 break;
3853
3854 case PT_WORD:
3855 for (i = 1; i <= min; i++)
3856 {
3857 if (eptr >= md->end_subject)
3858 {
3859 SCHECK_PARTIAL();
3860 MRRETURN(MATCH_NOMATCH);
3861 }
3862 GETCHARINCTEST(c, eptr);
3863 prop_category = UCD_CATEGORY(c);
3864 if ((prop_category == ucp_L || prop_category == ucp_N ||
3865 c == CHAR_UNDERSCORE)
3866 == prop_fail_result)
3867 MRRETURN(MATCH_NOMATCH);
3868 }
3869 break;
3870
3871 /* This should not occur */
3872
3873 default:
3874 RRETURN(PCRE_ERROR_INTERNAL);
3875 }
3876 }
3877
3878 /* Match extended Unicode sequences. We will get here only if the
3879 support is in the binary; otherwise a compile-time error occurs. */
3880
3881 else if (ctype == OP_EXTUNI)
3882 {
3883 for (i = 1; i <= min; i++)
3884 {
3885 if (eptr >= md->end_subject)
3886 {
3887 SCHECK_PARTIAL();
3888 MRRETURN(MATCH_NOMATCH);
3889 }
3890 GETCHARINCTEST(c, eptr);
3891 prop_category = UCD_CATEGORY(c);
3892 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3893 while (eptr < md->end_subject)
3894 {
3895 int len = 1;
3896 if (!utf8) c = *eptr;
3897 else { GETCHARLEN(c, eptr, len); }
3898 prop_category = UCD_CATEGORY(c);
3899 if (prop_category != ucp_M) break;
3900 eptr += len;
3901 }
3902 }
3903 }
3904
3905 else
3906 #endif /* SUPPORT_UCP */
3907
3908 /* Handle all other cases when the coding is UTF-8 */
3909
3910 #ifdef SUPPORT_UTF8
3911 if (utf8) switch(ctype)
3912 {
3913 case OP_ANY:
3914 for (i = 1; i <= min; i++)
3915 {
3916 if (eptr >= md->end_subject)
3917 {
3918 SCHECK_PARTIAL();
3919 MRRETURN(MATCH_NOMATCH);
3920 }
3921 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3922 eptr++;
3923 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3924 }
3925 break;
3926
3927 case OP_ALLANY:
3928 for (i = 1; i <= min; i++)
3929 {
3930 if (eptr >= md->end_subject)
3931 {
3932 SCHECK_PARTIAL();
3933 MRRETURN(MATCH_NOMATCH);
3934 }
3935 eptr++;
3936 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3937 }
3938 break;
3939
3940 case OP_ANYBYTE:
3941 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3942 eptr += min;
3943 break;
3944
3945 case OP_ANYNL:
3946 for (i = 1; i <= min; i++)
3947 {
3948 if (eptr >= md->end_subject)
3949 {
3950 SCHECK_PARTIAL();
3951 MRRETURN(MATCH_NOMATCH);
3952 }
3953 GETCHARINC(c, eptr);
3954 switch(c)
3955 {
3956 default: MRRETURN(MATCH_NOMATCH);
3957
3958 case 0x000d:
3959 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3960 break;
3961
3962 case 0x000a:
3963 break;
3964
3965 case 0x000b:
3966 case 0x000c:
3967 case 0x0085:
3968 case 0x2028:
3969 case 0x2029:
3970 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3971 break;
3972 }
3973 }
3974 break;
3975
3976 case OP_NOT_HSPACE:
3977 for (i = 1; i <= min; i++)
3978 {
3979 if (eptr >= md->end_subject)
3980 {
3981 SCHECK_PARTIAL();
3982 MRRETURN(MATCH_NOMATCH);
3983 }
3984 GETCHARINC(c, eptr);
3985 switch(c)
3986 {
3987 default: break;
3988 case 0x09: /* HT */
3989 case 0x20: /* SPACE */
3990 case 0xa0: /* NBSP */
3991 case 0x1680: /* OGHAM SPACE MARK */
3992 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3993 case 0x2000: /* EN QUAD */
3994 case 0x2001: /* EM QUAD */
3995 case 0x2002: /* EN SPACE */
3996 case 0x2003: /* EM SPACE */
3997 case 0x2004: /* THREE-PER-EM SPACE */
3998 case 0x2005: /* FOUR-PER-EM SPACE */
3999 case 0x2006: /* SIX-PER-EM SPACE */
4000 case 0x2007: /* FIGURE SPACE */
4001 case 0x2008: /* PUNCTUATION SPACE */
4002 case 0x2009: /* THIN SPACE */
4003 case 0x200A: /* HAIR SPACE */
4004 case 0x202f: /* NARROW NO-BREAK SPACE */
4005 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4006 case 0x3000: /* IDEOGRAPHIC SPACE */
4007 MRRETURN(MATCH_NOMATCH);
4008 }
4009 }
4010 break;
4011
4012 case OP_HSPACE:
4013 for (i = 1; i <= min; i++)
4014 {
4015 if (eptr >= md->end_subject)
4016 {
4017 SCHECK_PARTIAL();
4018 MRRETURN(MATCH_NOMATCH);
4019 }
4020 GETCHARINC(c, eptr);
4021 switch(c)
4022 {
4023 default: MRRETURN(MATCH_NOMATCH);
4024 case 0x09: /* HT */
4025 case 0x20: /* SPACE */
4026 case 0xa0: /* NBSP */
4027 case 0x1680: /* OGHAM SPACE MARK */
4028 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4029 case 0x2000: /* EN QUAD */
4030 case 0x2001: /* EM QUAD */
4031 case 0x2002: /* EN SPACE */
4032 case 0x2003: /* EM SPACE */
4033 case 0x2004: /* THREE-PER-EM SPACE */
4034 case 0x2005: /* FOUR-PER-EM SPACE */
4035 case 0x2006: /* SIX-PER-EM SPACE */
4036 case 0x2007: /* FIGURE SPACE */
4037 case 0x2008: /* PUNCTUATION SPACE */
4038 case 0x2009: /* THIN SPACE */
4039 case 0x200A: /* HAIR SPACE */
4040 case 0x202f: /* NARROW NO-BREAK SPACE */
4041 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4042 case 0x3000: /* IDEOGRAPHIC SPACE */
4043 break;
4044 }
4045 }
4046 break;
4047
4048 case OP_NOT_VSPACE:
4049 for (i = 1; i <= min; i++)
4050 {
4051 if (eptr >= md->end_subject)
4052 {
4053 SCHECK_PARTIAL();
4054 MRRETURN(MATCH_NOMATCH);
4055 }
4056 GETCHARINC(c, eptr);
4057 switch(c)
4058 {
4059 default: break;
4060 case 0x0a: /* LF */
4061 case 0x0b: /* VT */
4062 case 0x0c: /* FF */
4063 case 0x0d: /* CR */
4064 case 0x85: /* NEL */
4065 case 0x2028: /* LINE SEPARATOR */
4066 case 0x2029: /* PARAGRAPH SEPARATOR */
4067 MRRETURN(MATCH_NOMATCH);
4068 }
4069 }
4070 break;
4071
4072 case OP_VSPACE:
4073 for (i = 1; i <= min; i++)
4074 {
4075 if (eptr >= md->end_subject)
4076 {
4077 SCHECK_PARTIAL();
4078 MRRETURN(MATCH_NOMATCH);
4079 }
4080 GETCHARINC(c, eptr);
4081 switch(c)
4082 {
4083 default: MRRETURN(MATCH_NOMATCH);
4084 case 0x0a: /* LF */
4085 case 0x0b: /* VT */
4086 case 0x0c: /* FF */
4087 case 0x0d: /* CR */
4088 case 0x85: /* NEL */
4089 case 0x2028: /* LINE SEPARATOR */
4090 case 0x2029: /* PARAGRAPH SEPARATOR */
4091 break;
4092 }
4093 }
4094 break;
4095
4096 case OP_NOT_DIGIT:
4097 for (i = 1; i <= min; i++)
4098 {
4099 if (eptr >= md->end_subject)
4100 {
4101 SCHECK_PARTIAL();
4102 MRRETURN(MATCH_NOMATCH);
4103 }
4104 GETCHARINC(c, eptr);
4105 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4106 MRRETURN(MATCH_NOMATCH);
4107 }
4108 break;
4109
4110 case OP_DIGIT:
4111 for (i = 1; i <= min; i++)
4112 {
4113 if (eptr >= md->end_subject)
4114 {
4115 SCHECK_PARTIAL();
4116 MRRETURN(MATCH_NOMATCH);
4117 }
4118 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4119 MRRETURN(MATCH_NOMATCH);
4120 /* No need to skip more bytes - we know it's a 1-byte character */
4121 }
4122 break;
4123
4124 case OP_NOT_WHITESPACE:
4125 for (i = 1; i <= min; i++)
4126 {
4127 if (eptr >= md->end_subject)
4128 {
4129 SCHECK_PARTIAL();
4130 MRRETURN(MATCH_NOMATCH);
4131 }
4132 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4133 MRRETURN(MATCH_NOMATCH);
4134 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4135 }
4136 break;
4137
4138 case OP_WHITESPACE:
4139 for (i = 1; i <= min; i++)
4140 {
4141 if (eptr >= md->end_subject)
4142 {
4143 SCHECK_PARTIAL();
4144 MRRETURN(MATCH_NOMATCH);
4145 }
4146 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4147 MRRETURN(MATCH_NOMATCH);
4148 /* No need to skip more bytes - we know it's a 1-byte character */
4149 }
4150 break;
4151
4152 case OP_NOT_WORDCHAR:
4153 for (i = 1; i <= min; i++)
4154 {
4155 if (eptr >= md->end_subject)
4156 {
4157 SCHECK_PARTIAL();
4158 MRRETURN(MATCH_NOMATCH);
4159 }
4160 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4161 MRRETURN(MATCH_NOMATCH);
4162 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4163 }
4164 break;
4165
4166 case OP_WORDCHAR:
4167 for (i = 1; i <= min; i++)
4168 {
4169 if (eptr >= md->end_subject)
4170 {
4171 SCHECK_PARTIAL();
4172 MRRETURN(MATCH_NOMATCH);
4173 }
4174 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4175 MRRETURN(MATCH_NOMATCH);
4176 /* No need to skip more bytes - we know it's a 1-byte character */
4177 }
4178 break;
4179
4180 default:
4181 RRETURN(PCRE_ERROR_INTERNAL);
4182 } /* End switch(ctype) */
4183
4184 else
4185 #endif /* SUPPORT_UTF8 */
4186
4187 /* Code for the non-UTF-8 case for minimum matching of operators other
4188 than OP_PROP and OP_NOTPROP. */
4189
4190 switch(ctype)
4191 {
4192 case OP_ANY:
4193 for (i = 1; i <= min; i++)
4194 {
4195 if (eptr >= md->end_subject)
4196 {
4197 SCHECK_PARTIAL();
4198 MRRETURN(MATCH_NOMATCH);
4199 }
4200 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4201 eptr++;
4202 }
4203 break;
4204
4205 case OP_ALLANY:
4206 if (eptr > md->end_subject - min)
4207 {
4208 SCHECK_PARTIAL();
4209 MRRETURN(MATCH_NOMATCH);
4210 }
4211 eptr += min;
4212 break;
4213
4214 case OP_ANYBYTE:
4215 if (eptr > md->end_subject - min)
4216 {
4217 SCHECK_PARTIAL();
4218 MRRETURN(MATCH_NOMATCH);
4219 }
4220 eptr += min;
4221 break;
4222
4223 case OP_ANYNL:
4224 for (i = 1; i <= min; i++)
4225 {
4226 if (eptr >= md->end_subject)
4227 {
4228 SCHECK_PARTIAL();
4229 MRRETURN(MATCH_NOMATCH);
4230 }
4231 switch(*eptr++)
4232 {
4233 default: MRRETURN(MATCH_NOMATCH);
4234
4235 case 0x000d:
4236 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4237 break;
4238
4239 case 0x000a:
4240 break;
4241
4242 case 0x000b:
4243 case 0x000c:
4244 case 0x0085:
4245 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4246 break;
4247 }
4248 }
4249 break;
4250
4251 case OP_NOT_HSPACE:
4252 for (i = 1; i <= min; i++)
4253 {
4254 if (eptr >= md->end_subject)
4255 {
4256 SCHECK_PARTIAL();
4257 MRRETURN(MATCH_NOMATCH);
4258 }
4259 switch(*eptr++)
4260 {
4261 default: break;
4262 case 0x09: /* HT */
4263 case 0x20: /* SPACE */
4264 case 0xa0: /* NBSP */
4265 MRRETURN(MATCH_NOMATCH);
4266 }
4267 }
4268 break;
4269
4270 case OP_HSPACE:
4271 for (i = 1; i <= min; i++)
4272 {
4273 if (eptr >= md->end_subject)
4274 {
4275 SCHECK_PARTIAL();
4276 MRRETURN(MATCH_NOMATCH);
4277 }
4278 switch(*eptr++)
4279 {
4280 default: MRRETURN(MATCH_NOMATCH);
4281 case 0x09: /* HT */
4282 case 0x20: /* SPACE */
4283 case 0xa0: /* NBSP */
4284 break;
4285 }
4286 }
4287 break;
4288
4289 case OP_NOT_VSPACE:
4290 for (i = 1; i <= min; i++)
4291 {
4292 if (eptr >= md->end_subject)
4293 {
4294 SCHECK_PARTIAL();
4295 MRRETURN(MATCH_NOMATCH);
4296 }
4297 switch(*eptr++)
4298 {
4299 default: break;
4300 case 0x0a: /* LF */
4301 case 0x0b: /* VT */
4302 case 0x0c: /* FF */
4303 case 0x0d: /* CR */
4304 case 0x85: /* NEL */
4305 MRRETURN(MATCH_NOMATCH);
4306 }
4307 }
4308 break;
4309
4310 case OP_VSPACE:
4311 for (i = 1; i <= min; i++)
4312 {
4313 if (eptr >= md->end_subject)
4314 {
4315 SCHECK_PARTIAL();
4316 MRRETURN(MATCH_NOMATCH);
4317 }
4318 switch(*eptr++)
4319 {
4320 default: MRRETURN(MATCH_NOMATCH);
4321 case 0x0a: /* LF */
4322 case 0x0b: /* VT */
4323 case 0x0c: /* FF */
4324 case 0x0d: /* CR */
4325 case 0x85: /* NEL */
4326 break;
4327 }
4328 }
4329 break;
4330
4331 case OP_NOT_DIGIT:
4332 for (i = 1; i <= min; i++)
4333 {
4334 if (eptr >= md->end_subject)
4335 {
4336 SCHECK_PARTIAL();
4337 MRRETURN(MATCH_NOMATCH);
4338 }
4339 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4340 }
4341 break;
4342
4343 case OP_DIGIT:
4344 for (i = 1; i <= min; i++)
4345 {
4346 if (eptr >= md->end_subject)
4347 {
4348 SCHECK_PARTIAL();
4349 MRRETURN(MATCH_NOMATCH);
4350 }
4351 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4352 }
4353 break;
4354
4355 case OP_NOT_WHITESPACE:
4356 for (i = 1; i <= min; i++)
4357 {
4358 if (eptr >= md->end_subject)
4359 {
4360 SCHECK_PARTIAL();
4361 MRRETURN(MATCH_NOMATCH);
4362 }
4363 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4364 }
4365 break;
4366
4367 case OP_WHITESPACE:
4368 for (i = 1; i <= min; i++)
4369 {
4370 if (eptr >= md->end_subject)
4371 {
4372 SCHECK_PARTIAL();
4373 MRRETURN(MATCH_NOMATCH);
4374 }
4375 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4376 }
4377 break;
4378
4379 case OP_NOT_WORDCHAR:
4380 for (i = 1; i <= min; i++)
4381 {
4382 if (eptr >= md->end_subject)
4383 {
4384 SCHECK_PARTIAL();
4385 MRRETURN(MATCH_NOMATCH);
4386 }
4387 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4388 MRRETURN(MATCH_NOMATCH);
4389 }
4390 break;
4391
4392 case OP_WORDCHAR:
4393 for (i = 1; i <= min; i++)
4394 {
4395 if (eptr >= md->end_subject)
4396 {
4397 SCHECK_PARTIAL();
4398 MRRETURN(MATCH_NOMATCH);
4399 }
4400 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4401 MRRETURN(MATCH_NOMATCH);
4402 }
4403 break;
4404
4405 default:
4406 RRETURN(PCRE_ERROR_INTERNAL);
4407 }
4408 }
4409
4410 /* If min = max, continue at the same level without recursing */
4411
4412 if (min == max) continue;
4413
4414 /* If minimizing, we have to test the rest of the pattern before each
4415 subsequent match. Again, separate the UTF-8 case for speed, and also
4416 separate the UCP cases. */
4417
4418 if (minimize)
4419 {
4420 #ifdef SUPPORT_UCP
4421 if (prop_type >= 0)
4422 {
4423 switch(prop_type)
4424 {
4425 case PT_ANY:
4426 for (fi = min;; fi++)
4427 {
4428 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4429 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4430 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4431 if (eptr >= md->end_subject)
4432 {
4433 SCHECK_PARTIAL();
4434 MRRETURN(MATCH_NOMATCH);
4435 }
4436 GETCHARINCTEST(c, eptr);
4437 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4438 }
4439 /* Control never gets here */
4440
4441 case PT_LAMP:
4442 for (fi = min;; fi++)
4443 {
4444 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4446 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4447 if (eptr >= md->end_subject)
4448 {
4449 SCHECK_PARTIAL();
4450 MRRETURN(MATCH_NOMATCH);
4451 }
4452 GETCHARINCTEST(c, eptr);
4453 prop_chartype = UCD_CHARTYPE(c);
4454 if ((prop_chartype == ucp_Lu ||
4455 prop_chartype == ucp_Ll ||
4456 prop_chartype == ucp_Lt) == prop_fail_result)
4457 MRRETURN(MATCH_NOMATCH);
4458 }
4459 /* Control never gets here */
4460
4461 case PT_GC:
4462 for (fi = min;; fi++)
4463 {
4464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4466 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4467 if (eptr >= md->end_subject)
4468 {
4469 SCHECK_PARTIAL();
4470 MRRETURN(MATCH_NOMATCH);
4471 }
4472 GETCHARINCTEST(c, eptr);
4473 prop_category = UCD_CATEGORY(c);
4474 if ((prop_category == prop_value) == prop_fail_result)
4475 MRRETURN(MATCH_NOMATCH);
4476 }
4477 /* Control never gets here */
4478
4479 case PT_PC:
4480 for (fi = min;; fi++)
4481 {
4482 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4484 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4485 if (eptr >= md->end_subject)
4486 {
4487 SCHECK_PARTIAL();
4488 MRRETURN(MATCH_NOMATCH);
4489 }
4490 GETCHARINCTEST(c, eptr);
4491 prop_chartype = UCD_CHARTYPE(c);
4492 if ((prop_chartype == prop_value) == prop_fail_result)
4493 MRRETURN(MATCH_NOMATCH);
4494 }
4495 /* Control never gets here */
4496
4497 case PT_SC:
4498 for (fi = min;; fi++)
4499 {
4500 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4501 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4502 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4503 if (eptr >= md->end_subject)
4504 {
4505 SCHECK_PARTIAL();
4506 MRRETURN(MATCH_NOMATCH);
4507 }
4508 GETCHARINCTEST(c, eptr);
4509 prop_script = UCD_SCRIPT(c);
4510 if ((prop_script == prop_value) == prop_fail_result)
4511 MRRETURN(MATCH_NOMATCH);
4512 }
4513 /* Control never gets here */
4514
4515 case PT_ALNUM:
4516 for (fi = min;; fi++)
4517 {
4518 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4519 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4520 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4521 if (eptr >= md->end_subject)
4522 {
4523 SCHECK_PARTIAL();
4524 MRRETURN(MATCH_NOMATCH);
4525 }
4526 GETCHARINCTEST(c, eptr);
4527 prop_category = UCD_CATEGORY(c);
4528 if ((prop_category == ucp_L || prop_category == ucp_N)
4529 == prop_fail_result)
4530 MRRETURN(MATCH_NOMATCH);
4531 }
4532 /* Control never gets here */
4533
4534 case PT_SPACE: /* Perl space */
4535 for (fi = min;; fi++)
4536 {
4537 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4539 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4540 if (eptr >= md->end_subject)
4541 {
4542 SCHECK_PARTIAL();
4543 MRRETURN(MATCH_NOMATCH);
4544 }
4545 GETCHARINCTEST(c, eptr);
4546 prop_category = UCD_CATEGORY(c);
4547 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4548 c == CHAR_FF || c == CHAR_CR)
4549 == prop_fail_result)
4550 MRRETURN(MATCH_NOMATCH);
4551 }
4552 /* Control never gets here */
4553
4554 case PT_PXSPACE: /* POSIX space */
4555 for (fi = min;; fi++)
4556 {
4557 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4559 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4560 if (eptr >= md->end_subject)
4561 {
4562 SCHECK_PARTIAL();
4563 MRRETURN(MATCH_NOMATCH);
4564 }
4565 GETCHARINCTEST(c, eptr);
4566 prop_category = UCD_CATEGORY(c);
4567 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4568 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4569 == prop_fail_result)
4570 MRRETURN(MATCH_NOMATCH);
4571 }
4572 /* Control never gets here */
4573
4574 case PT_WORD:
4575 for (fi = min;; fi++)
4576 {
4577 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4579 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4580 if (eptr >= md->end_subject)
4581 {
4582 SCHECK_PARTIAL();
4583 MRRETURN(MATCH_NOMATCH);
4584 }
4585 GETCHARINCTEST(c, eptr);
4586 prop_category = UCD_CATEGORY(c);
4587 if ((prop_category == ucp_L ||
4588 prop_category == ucp_N ||
4589 c == CHAR_UNDERSCORE)
4590 == prop_fail_result)
4591 MRRETURN(MATCH_NOMATCH);
4592 }
4593 /* Control never gets here */
4594
4595 /* This should never occur */
4596
4597 default:
4598 RRETURN(PCRE_ERROR_INTERNAL);
4599 }
4600 }
4601
4602 /* Match extended Unicode sequences. We will get here only if the
4603 support is in the binary; otherwise a compile-time error occurs. */
4604
4605 else if (ctype == OP_EXTUNI)
4606 {
4607 for (fi = min;; fi++)
4608 {
4609 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4610 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4611 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4612 if (eptr >= md->end_subject)
4613 {
4614 SCHECK_PARTIAL();
4615 MRRETURN(MATCH_NOMATCH);
4616 }
4617 GETCHARINCTEST(c, eptr);
4618 prop_category = UCD_CATEGORY(c);
4619 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4620 while (eptr < md->end_subject)
4621 {
4622 int len = 1;
4623 if (!utf8) c = *eptr;
4624 else { GETCHARLEN(c, eptr, len); }
4625 prop_category = UCD_CATEGORY(c);
4626 if (prop_category != ucp_M) break;
4627 eptr += len;
4628 }
4629 }
4630 }
4631
4632 else
4633 #endif /* SUPPORT_UCP */
4634
4635 #ifdef SUPPORT_UTF8
4636 /* UTF-8 mode */
4637 if (utf8)
4638 {
4639 for (fi = min;; fi++)
4640 {
4641 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4643 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4644 if (eptr >= md->end_subject)
4645 {
4646 SCHECK_PARTIAL();
4647 MRRETURN(MATCH_NOMATCH);
4648 }
4649 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4650 MRRETURN(MATCH_NOMATCH);
4651 GETCHARINC(c, eptr);
4652 switch(ctype)
4653 {
4654 case OP_ANY: /* This is the non-NL case */
4655 case OP_ALLANY:
4656 case OP_ANYBYTE:
4657 break;
4658
4659 case OP_ANYNL:
4660 switch(c)
4661 {
4662 default: MRRETURN(MATCH_NOMATCH);
4663 case 0x000d:
4664 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4665 break;
4666 case 0x000a:
4667 break;
4668
4669 case 0x000b:
4670 case 0x000c:
4671 case 0x0085:
4672 case 0x2028:
4673 case 0x2029:
4674 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4675 break;
4676 }
4677 break;
4678
4679 case OP_NOT_HSPACE:
4680 switch(c)
4681 {
4682 default: break;
4683 case 0x09: /* HT */
4684 case 0x20: /* SPACE */
4685 case 0xa0: /* NBSP */
4686 case 0x1680: /* OGHAM SPACE MARK */
4687 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4688 case 0x2000: /* EN QUAD */
4689 case 0x2001: /* EM QUAD */
4690 case 0x2002: /* EN SPACE */
4691 case 0x2003: /* EM SPACE */
4692 case 0x2004: /* THREE-PER-EM SPACE */
4693 case 0x2005: /* FOUR-PER-EM SPACE */
4694 case 0x2006: /* SIX-PER-EM SPACE */
4695 case 0x2007: /* FIGURE SPACE */
4696 case 0x2008: /* PUNCTUATION SPACE */
4697 case 0x2009: /* THIN SPACE */
4698 case 0x200A: /* HAIR SPACE */
4699 case 0x202f: /* NARROW NO-BREAK SPACE */
4700 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4701 case 0x3000: /* IDEOGRAPHIC SPACE */
4702 MRRETURN(MATCH_NOMATCH);
4703 }
4704 break;
4705
4706 case OP_HSPACE:
4707 switch(c)
4708 {
4709 default: MRRETURN(MATCH_NOMATCH);
4710 case 0x09: /* HT */
4711 case 0x20: /* SPACE */
4712 case 0xa0: /* NBSP */
4713 case 0x1680: /* OGHAM SPACE MARK */
4714 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4715 case 0x2000: /* EN QUAD */
4716 case 0x2001: /* EM QUAD */
4717 case 0x2002: /* EN SPACE */
4718 case 0x2003: /* EM SPACE */
4719 case 0x2004: /* THREE-PER-EM SPACE */
4720 case 0x2005: /* FOUR-PER-EM SPACE */
4721 case 0x2006: /* SIX-PER-EM SPACE */
4722 case 0x2007: /* FIGURE SPACE */
4723 case 0x2008: /* PUNCTUATION SPACE */
4724 case 0x2009: /* THIN SPACE */
4725 case 0x200A: /* HAIR SPACE */
4726 case 0x202f: /* NARROW NO-BREAK SPACE */
4727 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4728 case 0x3000: /* IDEOGRAPHIC SPACE */
4729 break;
4730 }
4731 break;
4732
4733 case OP_NOT_VSPACE:
4734 switch(c)
4735 {
4736 default: break;
4737 case 0x0a: /* LF */
4738 case 0x0b: /* VT */
4739 case 0x0c: /* FF */
4740 case 0x0d: /* CR */
4741 case 0x85: /* NEL */
4742 case 0x2028: /* LINE SEPARATOR */
4743 case 0x2029: /* PARAGRAPH SEPARATOR */
4744 MRRETURN(MATCH_NOMATCH);
4745 }
4746 break;
4747
4748 case OP_VSPACE:
4749 switch(c)
4750 {
4751 default: MRRETURN(MATCH_NOMATCH);
4752 case 0x0a: /* LF */
4753 case 0x0b: /* VT */
4754 case 0x0c: /* FF */
4755 case 0x0d: /* CR */
4756 case 0x85: /* NEL */
4757 case 0x2028: /* LINE SEPARATOR */
4758 case 0x2029: /* PARAGRAPH SEPARATOR */
4759 break;
4760 }
4761 break;
4762
4763 case OP_NOT_DIGIT:
4764 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4765 MRRETURN(MATCH_NOMATCH);
4766 break;
4767
4768 case OP_DIGIT:
4769 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4770 MRRETURN(MATCH_NOMATCH);
4771 break;
4772
4773 case OP_NOT_WHITESPACE:
4774 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4775 MRRETURN(MATCH_NOMATCH);
4776 break;
4777
4778 case OP_WHITESPACE:
4779 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4780 MRRETURN(MATCH_NOMATCH);
4781 break;
4782
4783 case OP_NOT_WORDCHAR:
4784 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4785 MRRETURN(MATCH_NOMATCH);
4786 break;
4787
4788 case OP_WORDCHAR:
4789 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4790 MRRETURN(MATCH_NOMATCH);
4791 break;
4792
4793 default:
4794 RRETURN(PCRE_ERROR_INTERNAL);
4795 }
4796 }
4797 }
4798 else
4799 #endif
4800 /* Not UTF-8 mode */
4801 {
4802 for (fi = min;; fi++)
4803 {
4804 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4805 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4806 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4807 if (eptr >= md->end_subject)
4808 {
4809 SCHECK_PARTIAL();
4810 MRRETURN(MATCH_NOMATCH);
4811 }
4812 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4813 MRRETURN(MATCH_NOMATCH);
4814 c = *eptr++;
4815 switch(ctype)
4816 {
4817 case OP_ANY: /* This is the non-NL case */
4818 case OP_ALLANY:
4819 case OP_ANYBYTE:
4820 break;
4821
4822 case OP_ANYNL:
4823 switch(c)
4824 {
4825 default: MRRETURN(MATCH_NOMATCH);
4826 case 0x000d:
4827 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4828 break;
4829
4830 case 0x000a:
4831 break;
4832
4833 case 0x000b:
4834 case 0x000c:
4835 case 0x0085:
4836 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4837 break;
4838 }
4839 break;
4840
4841 case OP_NOT_HSPACE:
4842 switch(c)
4843 {
4844 default: break;
4845 case 0x09: /* HT */
4846 case 0x20: /* SPACE */
4847 case 0xa0: /* NBSP */
4848 MRRETURN(MATCH_NOMATCH);
4849 }
4850 break;
4851
4852 case OP_HSPACE:
4853 switch(c)
4854 {
4855 default: MRRETURN(MATCH_NOMATCH);
4856 case 0x09: /* HT */
4857 case 0x20: /* SPACE */
4858 case 0xa0: /* NBSP */
4859 break;
4860 }
4861 break;
4862
4863 case OP_NOT_VSPACE:
4864 switch(c)
4865 {
4866 default: break;
4867 case 0x0a: /* LF */
4868 case 0x0b: /* VT */
4869 case 0x0c: /* FF */
4870 case 0x0d: /* CR */
4871 case 0x85: /* NEL */
4872 MRRETURN(MATCH_NOMATCH);
4873 }
4874 break;
4875
4876 case OP_VSPACE:
4877 switch(c)
4878 {
4879 default: MRRETURN(MATCH_NOMATCH);
4880 case 0x0a: /* LF */
4881 case 0x0b: /* VT */
4882 case 0x0c: /* FF */
4883 case 0x0d: /* CR */
4884 case 0x85: /* NEL */
4885 break;
4886 }
4887 break;
4888
4889 case OP_NOT_DIGIT:
4890 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4891 break;
4892
4893 case OP_DIGIT:
4894 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4895 break;
4896
4897 case OP_NOT_WHITESPACE:
4898 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4899 break;
4900
4901 case OP_WHITESPACE:
4902 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4903 break;
4904
4905 case OP_NOT_WORDCHAR:
4906 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4907 break;
4908
4909 case OP_WORDCHAR:
4910 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4911 break;
4912
4913 default:
4914 RRETURN(PCRE_ERROR_INTERNAL);
4915 }
4916 }
4917 }
4918 /* Control never gets here */
4919 }
4920
4921 /* If maximizing, it is worth using inline code for speed, doing the type
4922 test once at the start (i.e. keep it out of the loop). Again, keep the
4923 UTF-8 and UCP stuff separate. */
4924
4925 else
4926 {
4927 pp = eptr; /* Remember where we started */
4928
4929 #ifdef SUPPORT_UCP
4930 if (prop_type >= 0)
4931 {
4932 switch(prop_type)
4933 {
4934 case PT_ANY:
4935 for (i = min; i < max; i++)
4936 {
4937 int len = 1;
4938 if (eptr >= md->end_subject)
4939 {
4940 SCHECK_PARTIAL();
4941 break;
4942 }
4943 GETCHARLENTEST(c, eptr, len);
4944 if (prop_fail_result) break;
4945 eptr+= len;
4946 }
4947 break;
4948
4949 case PT_LAMP:
4950 for (i = min; i < max; i++)
4951 {
4952 int len = 1;
4953 if (eptr >= md->end_subject)
4954 {
4955 SCHECK_PARTIAL();
4956 break;
4957 }
4958 GETCHARLENTEST(c, eptr, len);
4959 prop_chartype = UCD_CHARTYPE(c);
4960 if ((prop_chartype == ucp_Lu ||
4961 prop_chartype == ucp_Ll ||
4962 prop_chartype == ucp_Lt) == prop_fail_result)
4963 break;
4964 eptr+= len;
4965 }
4966 break;
4967
4968 case PT_GC:
4969 for (i = min; i < max; i++)
4970 {
4971 int len = 1;
4972 if (eptr >= md->end_subject)
4973 {
4974 SCHECK_PARTIAL();
4975 break;
4976 }
4977 GETCHARLENTEST(c, eptr, len);
4978 prop_category = UCD_CATEGORY(c);
4979 if ((prop_category == prop_value) == prop_fail_result)
4980 break;
4981 eptr+= len;
4982 }
4983 break;
4984
4985 case PT_PC:
4986 for (i = min; i < max; i++)
4987 {
4988 int len = 1;
4989 if (eptr >= md->end_subject)
4990 {
4991 SCHECK_PARTIAL();
4992 break;
4993 }
4994 GETCHARLENTEST(c, eptr, len);
4995 prop_chartype = UCD_CHARTYPE(c);
4996 if ((prop_chartype == prop_value) == prop_fail_result)
4997 break;
4998 eptr+= len;
4999 }
5000 break;
5001
5002 case PT_SC:
5003 for (i = min; i < max; i++)
5004 {
5005 int len = 1;
5006 if (eptr >= md->end_subject)
5007 {
5008 SCHECK_PARTIAL();
5009 break;
5010 }
5011 GETCHARLENTEST(c, eptr, len);
5012 prop_script = UCD_SCRIPT(c);
5013 if ((prop_script == prop_value) == prop_fail_result)
5014 break;
5015 eptr+= len;
5016 }
5017 break;
5018
5019 case PT_ALNUM:
5020 for (i = min; i < max; i++)
5021 {
5022 int len = 1;
5023 if (eptr >= md->end_subject)
5024 {
5025 SCHECK_PARTIAL();
5026 break;
5027 }
5028 GETCHARLENTEST(c, eptr, len);
5029 prop_category = UCD_CATEGORY(c);
5030 if ((prop_category == ucp_L || prop_category == ucp_N)
5031 == prop_fail_result)
5032 break;
5033 eptr+= len;
5034 }
5035 break;
5036
5037 case PT_SPACE: /* Perl space */
5038 for (i = min; i < max; i++)
5039 {
5040 int len = 1;
5041 if (eptr >= md->end_subject)
5042 {
5043 SCHECK_PARTIAL();
5044 break;
5045 }
5046 GETCHARLENTEST(c, eptr, len);
5047 prop_category = UCD_CATEGORY(c);
5048 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5049 c == CHAR_FF || c == CHAR_CR)
5050 == prop_fail_result)
5051 break;
5052 eptr+= len;
5053 }
5054 break;
5055
5056 case PT_PXSPACE: /* POSIX space */
5057 for (i = min; i < max; i++)
5058 {
5059 int len = 1;
5060 if (eptr >= md->end_subject)
5061 {
5062 SCHECK_PARTIAL();
5063 break;
5064 }
5065 GETCHARLENTEST(c, eptr, len);
5066 prop_category = UCD_CATEGORY(c);
5067 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5068 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5069 == prop_fail_result)
5070 break;
5071 eptr+= len;
5072 }
5073 break;
5074
5075 case PT_WORD:
5076 for (i = min; i < max; i++)
5077 {
5078 int len = 1;
5079 if (eptr >= md->end_subject)
5080 {
5081 SCHECK_PARTIAL();
5082 break;
5083 }
5084 GETCHARLENTEST(c, eptr, len);
5085 prop_category = UCD_CATEGORY(c);
5086 if ((prop_category == ucp_L || prop_category == ucp_N ||
5087 c == CHAR_UNDERSCORE) == prop_fail_result)
5088 break;
5089 eptr+= len;
5090 }
5091 break;
5092
5093 default:
5094 RRETURN(PCRE_ERROR_INTERNAL);
5095 }
5096
5097 /* eptr is now past the end of the maximum run */
5098
5099 if (possessive) continue;
5100 for(;;)
5101 {
5102 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5103 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5104 if (eptr-- == pp) break; /* Stop if tried at original pos */
5105 if (utf8) BACKCHAR(eptr);
5106 }
5107 }
5108
5109 /* Match extended Unicode sequences. We will get here only if the
5110 support is in the binary; otherwise a compile-time error occurs. */
5111
5112 else if (ctype == OP_EXTUNI)
5113 {
5114 for (i = min; i < max; i++)
5115 {
5116 if (eptr >= md->end_subject)
5117 {
5118 SCHECK_PARTIAL();
5119 break;
5120 }
5121 GETCHARINCTEST(c, eptr);
5122 prop_category = UCD_CATEGORY(c);
5123 if (prop_category == ucp_M) break;
5124 while (eptr < md->end_subject)
5125 {
5126 int len = 1;
5127 if (!utf8) c = *eptr; else
5128 {
5129 GETCHARLEN(c, eptr, len);
5130 }
5131 prop_category = UCD_CATEGORY(c);
5132 if (prop_category != ucp_M) break;
5133 eptr += len;
5134 }
5135 }
5136
5137 /* eptr is now past the end of the maximum run */
5138
5139 if (possessive) continue;
5140
5141 for(;;)
5142 {
5143 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5144 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5145 if (eptr-- == pp) break; /* Stop if tried at original pos */
5146 for (;;) /* Move back over one extended */
5147 {
5148 int len = 1;
5149 if (!utf8) c = *eptr; else
5150 {
5151 BACKCHAR(eptr);
5152 GETCHARLEN(c, eptr, len);
5153 }
5154 prop_category = UCD_CATEGORY(c);
5155 if (prop_category != ucp_M) break;
5156 eptr--;
5157 }
5158 }
5159 }
5160
5161 else
5162 #endif /* SUPPORT_UCP */
5163
5164 #ifdef SUPPORT_UTF8
5165 /* UTF-8 mode */
5166
5167 if (utf8)
5168 {
5169 switch(ctype)
5170 {
5171 case OP_ANY:
5172 if (max < INT_MAX)
5173 {
5174 for (i = min; i < max; i++)
5175 {
5176 if (eptr >= md->end_subject)
5177 {
5178 SCHECK_PARTIAL();
5179 break;
5180 }
5181 if (IS_NEWLINE(eptr)) break;
5182 eptr++;
5183 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5184 }
5185 }
5186
5187 /* Handle unlimited UTF-8 repeat */
5188
5189 else
5190 {
5191 for (i = min; i < max; i++)
5192 {
5193 if (eptr >= md->end_subject)
5194 {
5195 SCHECK_PARTIAL();
5196 break;
5197 }
5198 if (IS_NEWLINE(eptr)) break;
5199 eptr++;
5200 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5201 }
5202 }
5203 break;
5204
5205 case OP_ALLANY:
5206 if (max < INT_MAX)
5207 {
5208 for (i = min; i < max; i++)
5209 {
5210 if (eptr >= md->end_subject)
5211 {
5212 SCHECK_PARTIAL();
5213 break;
5214 }
5215 eptr++;
5216 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5217 }
5218 }
5219 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5220 break;
5221
5222 /* The byte case is the same as non-UTF8 */
5223
5224 case OP_ANYBYTE:
5225 c = max - min;
5226 if (c > (unsigned int)(md->end_subject - eptr))
5227 {
5228 eptr = md->end_subject;
5229 SCHECK_PARTIAL();
5230 }
5231 else eptr += c;
5232 break;
5233
5234 case OP_ANYNL:
5235 for (i = min; i < max; i++)
5236 {
5237 int len = 1;
5238 if (eptr >= md->end_subject)
5239 {
5240 SCHECK_PARTIAL();
5241 break;
5242 }
5243 GETCHARLEN(c, eptr, len);
5244 if (c == 0x000d)
5245 {
5246 if (++eptr >= md->end_subject) break;
5247 if (*eptr == 0x000a) eptr++;
5248 }
5249 else
5250 {
5251 if (c != 0x000a &&
5252 (md->bsr_anycrlf ||
5253 (c != 0x000b && c != 0x000c &&
5254 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5255 break;
5256 eptr += len;
5257 }
5258 }
5259 break;
5260
5261 case OP_NOT_HSPACE:
5262 case OP_HSPACE:
5263 for (i = min; i < max; i++)
5264 {
5265 BOOL gotspace;
5266 int len = 1;
5267 if (eptr >= md->end_subject)
5268 {
5269 SCHECK_PARTIAL();
5270 break;
5271 }
5272 GETCHARLEN(c, eptr, len);
5273 switch(c)
5274 {
5275 default: gotspace = FALSE; break;
5276 case 0x09: /* HT */
5277 case 0x20: /* SPACE */
5278 case 0xa0: /* NBSP */
5279 case 0x1680: /* OGHAM SPACE MARK */
5280 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5281 case 0x2000: /* EN QUAD */
5282 case 0x2001: /* EM QUAD */
5283 case 0x2002: /* EN SPACE */
5284 case 0x2003: /* EM SPACE */
5285 case 0x2004: /* THREE-PER-EM SPACE */
5286 case 0x2005: /* FOUR-PER-EM SPACE */
5287 case 0x2006: /* SIX-PER-EM SPACE */
5288 case 0x2007: /* FIGURE SPACE */
5289 case 0x2008: /* PUNCTUATION SPACE */
5290 case 0x2009: /* THIN SPACE */
5291 case 0x200A: /* HAIR SPACE */
5292 case 0x202f: /* NARROW NO-BREAK SPACE */
5293 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5294 case 0x3000: /* IDEOGRAPHIC SPACE */
5295 gotspace = TRUE;
5296 break;
5297 }
5298 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5299 eptr += len;
5300 }
5301 break;
5302
5303 case OP_NOT_VSPACE:
5304 case OP_VSPACE:
5305 for (i = min; i < max; i++)
5306 {
5307 BOOL gotspace;
5308 int len = 1;
5309 if (eptr >= md->end_subject)
5310 {
5311 SCHECK_PARTIAL();
5312 break;
5313 }
5314 GETCHARLEN(c, eptr, len);
5315 switch(c)
5316 {
5317 default: gotspace = FALSE; break;
5318 case 0x0a: /* LF */
5319 case 0x0b: /* VT */
5320 case 0x0c: /* FF */
5321 case 0x0d: /* CR */
5322 case 0x85: /* NEL */
5323 case 0x2028: /* LINE SEPARATOR */
5324 case 0x2029: /* PARAGRAPH SEPARATOR */
5325 gotspace = TRUE;
5326 break;
5327 }
5328 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5329 eptr += len;
5330 }
5331 break;
5332
5333 case OP_NOT_DIGIT:
5334 for (i = min; i < max; i++)
5335 {
5336 int len = 1;
5337 if (eptr >= md->end_subject)
5338 {
5339 SCHECK_PARTIAL();
5340 break;
5341 }
5342 GETCHARLEN(c, eptr, len);
5343 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5344 eptr+= len;
5345 }
5346 break;
5347
5348 case OP_DIGIT:
5349 for (i = min; i < max; i++)
5350 {
5351 int len = 1;
5352 if (eptr >= md->end_subject)
5353 {
5354 SCHECK_PARTIAL();
5355 break;
5356 }
5357 GETCHARLEN(c, eptr, len);
5358 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5359 eptr+= len;
5360 }
5361 break;
5362
5363 case OP_NOT_WHITESPACE:
5364 for (i = min; i < max; i++)
5365 {
5366 int len = 1;
5367 if (eptr >= md->end_subject)
5368 {
5369 SCHECK_PARTIAL();
5370 break;
5371 }
5372 GETCHARLEN(c, eptr, len);
5373 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5374 eptr+= len;
5375 }
5376 break;
5377
5378 case OP_WHITESPACE:
5379 for (i = min; i < max; i++)
5380 {
5381 int len = 1;
5382 if (eptr >= md->end_subject)
5383 {
5384 SCHECK_PARTIAL();
5385 break;
5386 }
5387 GETCHARLEN(c, eptr, len);
5388 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5389 eptr+= len;
5390 }
5391 break;
5392
5393 case OP_NOT_WORDCHAR:
5394 for (i = min; i < max; i++)
5395 {
5396 int len = 1;
5397 if (eptr >= md->end_subject)
5398 {
5399 SCHECK_PARTIAL();
5400 break;
5401 }
5402 GETCHARLEN(c, eptr, len);
5403 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5404 eptr+= len;
5405 }
5406 break;
5407
5408 case OP_WORDCHAR:
5409 for (i = min; i < max; i++)
5410 {
5411 int len = 1;
5412 if (eptr >= md->end_subject)
5413 {
5414 SCHECK_PARTIAL();
5415 break;
5416 }
5417 GETCHARLEN(c, eptr, len);
5418 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5419 eptr+= len;
5420 }
5421 break;
5422
5423 default:
5424 RRETURN(PCRE_ERROR_INTERNAL);
5425 }
5426
5427 /* eptr is now past the end of the maximum run. If possessive, we are
5428 done (no backing up). Otherwise, match at this position; anything other
5429 than no match is immediately returned. For nomatch, back up one
5430 character, unless we are matching \R and the last thing matched was
5431 \r\n, in which case, back up two bytes. */
5432
5433 if (possessive) continue;
5434 for(;;)
5435 {
5436 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5437 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5438 if (eptr-- == pp) break; /* Stop if tried at original pos */
5439 BACKCHAR(eptr);
5440 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5441 eptr[-1] == '\r') eptr--;
5442 }
5443 }
5444 else
5445 #endif /* SUPPORT_UTF8 */
5446
5447 /* Not UTF-8 mode */
5448 {
5449 switch(ctype)
5450 {
5451 case OP_ANY:
5452 for (i = min; i < max; i++)
5453 {
5454 if (eptr >= md->end_subject)
5455 {
5456 SCHECK_PARTIAL();
5457 break;
5458 }
5459 if (IS_NEWLINE(eptr)) break;
5460 eptr++;
5461 }
5462 break;
5463
5464 case OP_ALLANY:
5465 case OP_ANYBYTE:
5466 c = max - min;
5467 if (c > (unsigned int)(md->end_subject - eptr))
5468 {
5469 eptr = md->end_subject;
5470 SCHECK_PARTIAL();
5471 }
5472 else eptr += c;
5473 break;
5474
5475 case OP_ANYNL:
5476 for (i = min; i < max; i++)
5477 {
5478 if (eptr >= md->end_subject)
5479 {
5480 SCHECK_PARTIAL();
5481 break;
5482 }
5483 c = *eptr;
5484 if (c == 0x000d)
5485 {
5486 if (++eptr >= md->end_subject) break;
5487 if (*eptr == 0x000a) eptr++;
5488 }
5489 else
5490 {
5491 if (c != 0x000a &&
5492 (md->bsr_anycrlf ||
5493 (c != 0x000b && c != 0x000c && c != 0x0085)))
5494 break;
5495 eptr++;
5496 }
5497 }
5498 break;
5499
5500 case OP_NOT_HSPACE:
5501 for (i = min; i < max; i++)
5502 {
5503 if (eptr >= md->end_subject)
5504 {
5505 SCHECK_PARTIAL();
5506 break;
5507 }
5508 c = *eptr;
5509 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5510 eptr++;
5511 }
5512 break;
5513
5514 case OP_HSPACE:
5515 for (i = min; i < max; i++)
5516 {
5517 if (eptr >= md->end_subject)
5518 {
5519 SCHECK_PARTIAL();
5520 break;
5521 }
5522 c = *eptr;
5523 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5524 eptr++;
5525 }
5526 break;
5527
5528 case OP_NOT_VSPACE:
5529 for (i = min; i < max; i++)
5530 {
5531 if (eptr >= md->end_subject)
5532 {
5533 SCHECK_PARTIAL();
5534 break;
5535 }
5536 c = *eptr;
5537 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5538 break;
5539 eptr++;
5540 }
5541 break;
5542
5543 case OP_VSPACE:
5544 for (i = min; i < max; i++)
5545 {
5546 if (eptr >= md->end_subject)
5547 {
5548 SCHECK_PARTIAL();
5549 break;
5550 }
5551 c = *eptr;
5552 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5553 break;
5554 eptr++;
5555 }
5556 break;
5557
5558 case OP_NOT_DIGIT:
5559 for (i = min; i < max; i++)
5560 {
5561 if (eptr >= md->end_subject)
5562 {
5563 SCHECK_PARTIAL();
5564 break;
5565 }
5566 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5567 eptr++;
5568 }
5569 break;
5570
5571 case OP_DIGIT:
5572 for (i = min; i < max; i++)
5573 {
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5580 eptr++;
5581 }
5582 break;
5583
5584 case OP_NOT_WHITESPACE:
5585 for (i = min; i < max; i++)
5586 {
5587 if (eptr >= md->end_subject)
5588 {
5589 SCHECK_PARTIAL();
5590 break;
5591 }
5592 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5593 eptr++;
5594 }
5595 break;
5596
5597 case OP_WHITESPACE:
5598 for (i = min; i < max; i++)
5599 {
5600 if (eptr >= md->end_subject)
5601 {
5602 SCHECK_PARTIAL();
5603 break;
5604 }
5605 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5606 eptr++;
5607 }
5608 break;
5609
5610 case OP_NOT_WORDCHAR:
5611 for (i = min; i < max; i++)
5612 {
5613 if (eptr >= md->end_subject)
5614 {
5615 SCHECK_PARTIAL();
5616 break;
5617 }
5618 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5619 eptr++;
5620 }
5621 break;
5622
5623 case OP_WORDCHAR:
5624 for (i = min; i < max; i++)
5625 {
5626 if (eptr >= md->end_subject)
5627 {
5628 SCHECK_PARTIAL();
5629 break;
5630 }
5631 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5632 eptr++;
5633 }
5634 break;
5635
5636 default:
5637 RRETURN(PCRE_ERROR_INTERNAL);
5638 }
5639
5640 /* eptr is now past the end of the maximum run. If possessive, we are
5641 done (no backing up). Otherwise, match at this position; anything other
5642 than no match is immediately returned. For nomatch, back up one
5643 character (byte), unless we are matching \R and the last thing matched
5644 was \r\n, in which case, back up two bytes. */
5645
5646 if (possessive) continue;
5647 while (eptr >= pp)
5648 {
5649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5651 eptr--;
5652 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5653 eptr[-1] == '\r') eptr--;
5654 }
5655 }
5656
5657 /* Get here if we can't make it match with any permitted repetitions */
5658
5659 MRRETURN(MATCH_NOMATCH);
5660 }
5661 /* Control never gets here */
5662
5663 /* There's been some horrible disaster. Arrival here can only mean there is
5664 something seriously wrong in the code above or the OP_xxx definitions. */
5665
5666 default:
5667 DPRINTF(("Unknown opcode %d\n", *ecode));
5668 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5669 }
5670
5671 /* Do not stick any code in here without much thought; it is assumed
5672 that "continue" in the code above comes out to here to repeat the main
5673 loop. */
5674
5675 } /* End of main loop */
5676 /* Control never reaches here */
5677
5678
5679 /* When compiling to use the heap rather than the stack for recursive calls to
5680 match(), the RRETURN() macro jumps here. The number that is saved in
5681 frame->Xwhere indicates which label we actually want to return to. */
5682
5683 #ifdef NO_RECURSE
5684 #define LBL(val) case val: goto L_RM##val;
5685 HEAP_RETURN:
5686 switch (frame->Xwhere)
5687 {
5688 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5689 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5690 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5691 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5692 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5693 #ifdef SUPPORT_UTF8
5694 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5695 LBL(32) LBL(34) LBL(42) LBL(46)
5696 #ifdef SUPPORT_UCP
5697 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5698 LBL(59) LBL(60) LBL(61) LBL(62)
5699 #endif /* SUPPORT_UCP */
5700 #endif /* SUPPORT_UTF8 */
5701 default:
5702 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5703 return PCRE_ERROR_INTERNAL;
5704 }
5705 #undef LBL
5706 #endif /* NO_RECURSE */
5707 }
5708
5709
5710 /***************************************************************************
5711 ****************************************************************************
5712 RECURSION IN THE match() FUNCTION
5713
5714 Undefine all the macros that were defined above to handle this. */
5715
5716 #ifdef NO_RECURSE /* these names are macros only when match() recursion uses the heap */
5717 #undef eptr
5718 #undef ecode
5719 #undef mstart
5720 #undef offset_top
5721 #undef eptrb
5722 #undef flags
5723
5724 #undef callpat
5725 #undef charptr
5726 #undef data
5727 #undef next
5728 #undef pp
5729 #undef prev
5730 #undef saved_eptr
5731
5732 #undef new_recursive
5733
5734 #undef cur_is_word
5735 #undef condition
5736 #undef prev_is_word
5737
5738 #undef ctype
5739 #undef length
5740 #undef max
5741 #undef min
5742 #undef number
5743 #undef offset
5744 #undef op
5745 #undef save_capture_last
5746 #undef save_offset1
5747 #undef save_offset2
5748 #undef save_offset3
5749 #undef stacksave
5750
5751 #undef newptrb
5752
5753 #endif /* NO_RECURSE */
5754
5755 /* fc and fi are macros with or without NO_RECURSE, so undefine them unconditionally */
5756
5757 #undef fc
5758 #undef fi
5759
5760 /***************************************************************************
5761 ***************************************************************************/
5762
5763
5764
5765 /*************************************************
5766 * Execute a Regular Expression *
5767 *************************************************/
5768
5769 /* This function applies a compiled re to a subject string and picks out
5770 portions of the string if it matches. Two elements in the vector are set for
5771 each substring: the offsets to the start and end of the substring.
5772
5773 Arguments:
5774 argument_re points to the compiled expression
5775 extra_data points to extra data or is NULL
5776 subject points to the subject string
5777 length length of subject string (may contain binary zeros)
5778 start_offset where to start in the subject string
5779 options option bits
5780 offsets points to a vector of ints to be filled in with offsets
5781 offsetcount the number of elements in the vector
5782
5783 Returns: > 0 => success; value is one more than the highest-numbered offset pair set
5784 = 0 => success, but offsets is not big enough
5785 -1 => failed to match
5786 < -1 => some kind of unexpected problem
5787 */
5788
5789 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5790 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5791 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5792 int offsetcount)
5793 {
5794 int rc, resetcount, ocount;
5795 int first_byte = -1;
5796 int req_byte = -1;
5797 int req_byte2 = -1;
5798 int newline;
5799 BOOL using_temporary_offsets = FALSE;
5800 BOOL anchored;
5801 BOOL startline;
5802 BOOL firstline;
5803 BOOL first_byte_caseless = FALSE;
5804 BOOL req_byte_caseless = FALSE;
5805 BOOL utf8;
5806 match_data match_block;
5807 match_data *md = &match_block;
5808 const uschar *tables;
5809 const uschar *start_bits = NULL;
5810 USPTR start_match = (USPTR)subject + start_offset;
5811 USPTR end_subject;
5812 USPTR start_partial = NULL;
5813 USPTR req_byte_ptr = start_match - 1;
5814
5815 pcre_study_data internal_study;
5816 const pcre_study_data *study;
5817
5818 real_pcre internal_re;
5819 const real_pcre *external_re = (const real_pcre *)argument_re;
5820 const real_pcre *re = external_re;
5821
5822 /* Plausibility checks */
5823
5824 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5825 if (re == NULL || subject == NULL ||
5826 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5827 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5828 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5829
5830 /* This information is for finding all the numbers associated with a given
5831 name, for condition testing. */
5832
5833 md->name_table = (uschar *)re + re->name_table_offset;
5834 md->name_count = re->name_count;
5835 md->name_entry_size = re->name_entry_size;
5836
5837 /* Fish out the optional data from the extra_data structure, first setting
5838 the default values. */
5839
5840 study = NULL;
5841 md->match_limit = MATCH_LIMIT;
5842 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5843 md->callout_data = NULL;
5844
5845 /* The table pointer is always in native byte order. */
5846
5847 tables = external_re->tables;
5848
5849 if (extra_data != NULL)
5850 {
5851 register unsigned int flags = extra_data->flags;
5852 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5853 study = (const pcre_study_data *)extra_data->study_data;
5854 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5855 md->match_limit = extra_data->match_limit;
5856 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5857 md->match_limit_recursion = extra_data->match_limit_recursion;
5858 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5859 md->callout_data = extra_data->callout_data;
5860 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5861 }
5862
5863 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5864 is a feature that makes it possible to save compiled regexes and re-use them
5865 in other programs later. */
5866
5867 if (tables == NULL) tables = _pcre_default_tables;
5868
5869 /* Check that the first field in the block is the magic number. If it is not,
5870 test for a regex that was compiled on a host of opposite endianness. If this is
5871 the case, flipped values are put in internal_re and internal_study if there was
5872 study data too. */
5873
5874 if (re->magic_number != MAGIC_NUMBER)
5875 {
5876 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5877 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5878 if (study != NULL) study = &internal_study;
5879 }
5880
5881 /* Set up other data */
5882
5883 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5884 startline = (re->flags & PCRE_STARTLINE) != 0;
5885 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5886
5887 /* The code starts after the real_pcre block and the capture name table. */
5888
5889 md->start_code = (const uschar *)external_re + re->name_table_offset +
5890 re->name_count * re->name_entry_size;
5891
5892 md->start_subject = (USPTR)subject;
5893 md->start_offset = start_offset;
5894 md->end_subject = md->start_subject + length;
5895 end_subject = md->end_subject;
5896
5897 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5898 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5899 md->use_ucp = (re->options & PCRE_UCP) != 0;
5900 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5901
5902 md->notbol = (options & PCRE_NOTBOL) != 0;
5903 md->noteol = (options & PCRE_NOTEOL) != 0;
5904 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5905 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5906 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5907 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5908 md->hitend = FALSE;
5909 md->mark = NULL; /* In case never set */
5910
5911 md->recursive = NULL; /* No recursion at top level */
5912
5913 md->lcc = tables + lcc_offset;
5914 md->ctypes = tables + ctypes_offset;
5915
5916 /* Handle different \R options. */
5917
5918 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5919 {
5920 case 0:
5921 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5922 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5923 else
5924 #ifdef BSR_ANYCRLF
5925 md->bsr_anycrlf = TRUE;
5926 #else
5927 md->bsr_anycrlf = FALSE;
5928 #endif
5929 break;
5930
5931 case PCRE_BSR_ANYCRLF:
5932 md->bsr_anycrlf = TRUE;
5933 break;
5934
5935 case PCRE_BSR_UNICODE:
5936 md->bsr_anycrlf = FALSE;
5937 break;
5938
5939 default: return PCRE_ERROR_BADNEWLINE;
5940 }
5941
5942 /* Handle different types of newline. The three bits give eight cases. If
5943 nothing is set at run time, whatever was used at compile time applies. */
5944
5945 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5946 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5947 {
5948 case 0: newline = NEWLINE; break; /* Compile-time default */
5949 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5950 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5951 case PCRE_NEWLINE_CR+
5952 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5953 case PCRE_NEWLINE_ANY: newline = -1; break;
5954 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5955 default: return PCRE_ERROR_BADNEWLINE;
5956 }
5957
5958 if (newline == -2)
5959 {
5960 md->nltype = NLTYPE_ANYCRLF;
5961 }
5962 else if (newline < 0)
5963 {
5964 md->nltype = NLTYPE_ANY;
5965 }
5966 else
5967 {
5968 md->nltype = NLTYPE_FIXED;
5969 if (newline > 255)
5970 {
5971 md->nllen = 2;
5972 md->nl[0] = (newline >> 8) & 255;
5973 md->nl[1] = newline & 255;
5974 }
5975 else
5976 {
5977 md->nllen = 1;
5978 md->nl[0] = newline;
5979 }
5980 }
5981
5982 /* Partial matching was originally supported only for a restricted set of
5983 regexes; from release 8.00 there are no restrictions, but the bits are still
5984 defined (though never set). So there's no harm in leaving this code. */
5985
5986 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5987 return PCRE_ERROR_BADPARTIAL;
5988
5989 /* Check a UTF-8 string if required. Pass back the character offset and error
5990 code for an invalid string if a results vector is available. */
5991
5992 #ifdef SUPPORT_UTF8
5993 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5994 {
5995 int erroroffset;
5996 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5997 if (errorcode != 0)
5998 {
5999 if (offsetcount >= 2)
6000 {
6001 offsets[0] = erroroffset;
6002 offsets[1] = errorcode;
6003 }
6004 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6005 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6006 }
6007
6008 /* Check that a start_offset points to the start of a UTF-8 character. */
6009
6010 if (start_offset > 0 && start_offset < length &&
6011 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6012 return PCRE_ERROR_BADUTF8_OFFSET;
6013 }
6014 #endif
6015
6016 /* If the expression has got more back references than the offsets supplied can
6017 hold, we get a temporary chunk of working store to use during the matching.
6018 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6019 of 3. */
6020
6021 ocount = offsetcount - (offsetcount % 3);
6022
6023 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6024 {
6025 ocount = re->top_backref * 3 + 3;
6026 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6027 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6028 using_temporary_offsets = TRUE;
6029 DPRINTF(("Got memory to hold back references\n"));
6030 }
6031 else md->offset_vector = offsets;
6032
6033 md->offset_end = ocount;
6034 md->offset_max = (2*ocount)/3;
6035 md->offset_overflow = FALSE;
6036 md->capture_last = -1;
6037
6038 /* Compute the minimum number of offsets that we need to reset each time. Doing
6039 this makes a huge difference to execution time when there aren't many brackets
6040 in the pattern. */
6041
6042 resetcount = 2 + re->top_bracket * 2;
6043 if (resetcount > offsetcount) resetcount = ocount;
6044
6045 /* Reset the working variable associated with each extraction. These should
6046 never be used unless previously set, but they get saved and restored, and so we
6047 initialize them to avoid reading uninitialized locations. */
6048
6049 if (md->offset_vector != NULL)
6050 {
6051 register int *iptr = md->offset_vector + ocount;
6052 register int *iend = iptr - resetcount/2 + 1;
6053 while (--iptr >= iend) *iptr = -1;
6054 }
6055
6056 /* Set up the first character to match, if available. The first_byte value is
6057 never set for an anchored regular expression, but the anchoring may be forced
6058 at run time, so we have to test for anchoring. The first char may be unset for
6059 an unanchored pattern, of course. If there's no first char and the pattern was
6060 studied, there may be a bitmap of possible first characters. */
6061
6062 if (!anchored)
6063 {
6064 if ((re->flags & PCRE_FIRSTSET) != 0)
6065 {
6066 first_byte = re->first_byte & 255;
6067 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6068 first_byte = md->lcc[first_byte];
6069 }
6070 else
6071 if (!startline && study != NULL &&
6072 (study->flags & PCRE_STUDY_MAPPED) != 0)
6073 start_bits = study->start_bits;
6074 }
6075
6076 /* For anchored or unanchored matches, there may be a "last known required
6077 character" set. */
6078
6079 if ((re->flags & PCRE_REQCHSET) != 0)
6080 {
6081 req_byte = re->req_byte & 255;
6082 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6083 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6084 }
6085
6086
6087 /* ==========================================================================*/
6088
6089 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6090 the loop runs just once. */
6091
6092 for(;;)
6093 {
6094 USPTR save_end_subject = end_subject;
6095 USPTR new_start_match;
6096
6097 /* Reset the maximum number of extractions we might see. */
6098
6099 if (md->offset_vector != NULL)
6100 {
6101 register int *iptr = md->offset_vector;
6102 register int *iend = iptr + resetcount;
6103 while (iptr < iend) *iptr++ = -1;
6104 }
6105
6106 /* If firstline is TRUE, the start of the match is constrained to the first
6107 line of a multiline string. That is, the match must be before or at the first
6108 newline. Implement this by temporarily adjusting end_subject so that we stop
6109 scanning at a newline. If the match fails at the newline, later code breaks
6110 this loop. */
6111
6112 if (firstline)
6113 {
6114 USPTR t = start_match;
6115 #ifdef SUPPORT_UTF8
6116 if (utf8)
6117 {
6118 while (t < md->end_subject && !IS_NEWLINE(t))
6119 {
6120 t++;
6121 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6122 }
6123 }
6124 else
6125 #endif
6126 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6127 end_subject = t;
6128 }
6129
6130 /* There are some optimizations that avoid running the match if a known
6131 starting point is not found, or if a known later character is not present.
6132 However, there is an option that disables these, for testing and for ensuring
6133 that all callouts do actually occur. The option can be set in the regex by
6134 (*NO_START_OPT) or passed in match-time options. */
6135
6136 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6137 {
6138 /* Advance to a unique first byte if there is one. */
6139
6140 if (first_byte >= 0)
6141 {
6142 if (first_byte_caseless)
6143 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6144 start_match++;
6145 else
6146 while (start_match < end_subject && *start_match != first_byte)
6147 start_match++;
6148 }
6149
6150 /* Or to just after a linebreak for a multiline match */
6151
6152 else if (startline)
6153 {
6154 if (start_match > md->start_subject + start_offset)
6155 {
6156 #ifdef SUPPORT_UTF8
6157 if (utf8)
6158 {
6159 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6160 {
6161 start_match++;
6162 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6163 start_match++;
6164 }
6165 }
6166 else
6167 #endif
6168 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6169 start_match++;
6170
6171 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6172 and we are now at a LF, advance the match position by one more character.
6173 */
6174
6175 if (start_match[-1] == CHAR_CR &&
6176 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6177 start_match < end_subject &&
6178 *start_match == CHAR_NL)
6179 start_match++;
6180 }
6181 }
6182
6183 /* Or to a non-unique first byte after study */
6184
6185 else if (start_bits != NULL)
6186 {
6187 while (start_match < end_subject)
6188 {
6189 register unsigned int c = *start_match;
6190 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6191 {
6192 start_match++;
6193 #ifdef SUPPORT_UTF8
6194 if (utf8)
6195 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6196 start_match++;
6197 #endif
6198 }
6199 else break;
6200 }
6201 }
6202 } /* Starting optimizations */
6203
6204 /* Restore fudged end_subject */
6205
6206 end_subject = save_end_subject;
6207
6208 /* The following two optimizations are disabled for partial matching or if
6209 disabling is explicitly requested. */
6210
6211 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6212 {
6213 /* If the pattern was studied, a minimum subject length may be set. This is
6214 a lower bound; no actual string of that length may actually match the
6215 pattern. Although the value is, strictly, in characters, we treat it as
6216 bytes to avoid spending too much time in this optimization. */
6217
6218 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6219 (pcre_uint32)(end_subject - start_match) < study->minlength)
6220 {
6221 rc = MATCH_NOMATCH;
6222 break;
6223 }
6224
6225 /* If req_byte is set, we know that that character must appear in the
6226 subject for the match to succeed. If the first character is set, req_byte
6227 must be later in the subject; otherwise the test starts at the match point.
6228 This optimization can save a huge amount of backtracking in patterns with
6229 nested unlimited repeats that aren't going to match. Writing separate code
6230 for cased/caseless versions makes it go faster, as does using an
6231 autoincrement and backing off on a match.
6232
6233 HOWEVER: when the subject string is very, very long, searching to its end
6234 can take a long time, and give bad performance on quite ordinary patterns.
6235 This showed up when somebody was matching something like /^\d+C/ on a
6236 32-megabyte string... so we don't do this when the string is sufficiently
6237 long. */
6238
6239 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6240 {
6241 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6242
6243 /* We don't need to repeat the search if we haven't yet reached the
6244 place we found it at last time. */
6245
6246 if (p > req_byte_ptr)
6247 {
6248 if (req_byte_caseless)
6249 {
6250 while (p < end_subject)
6251 {
6252 register int pp = *p++;
6253 if (pp == req_byte || pp == req_byte2) { p--; break; }
6254 }
6255 }
6256 else
6257 {
6258 while (p < end_subject)
6259 {
6260 if (*p++ == req_byte) { p--; break; }
6261 }
6262 }
6263
6264 /* If we can't find the required character, break the matching loop,
6265 forcing a match failure. */
6266
6267 if (p >= end_subject)
6268 {
6269 rc = MATCH_NOMATCH;
6270 break;
6271 }
6272
6273 /* If we have found the required character, save the point where we
6274 found it, so that we don't search again next time round the loop if
6275 the start hasn't passed this character yet. */
6276
6277 req_byte_ptr = p;
6278 }
6279 }
6280 }
6281
6282 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6283 printf(">>>> Match against: ");
6284 pchars(start_match, end_subject - start_match, TRUE, md);
6285 printf("\n");
6286 #endif
6287
6288 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6289 first starting point for which a partial match was found. */
6290
6291 md->start_match_ptr = start_match;
6292 md->start_used_ptr = start_match;
6293 md->match_call_count = 0;
6294 md->match_function_type = 0;
6295 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6296 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6297
6298 switch(rc)
6299 {
6300 /* SKIP passes back the next starting point explicitly, but if it is the
6301 same as the match we have just done, treat it as NOMATCH. */
6302
6303 case MATCH_SKIP:
6304 if (md->start_match_ptr != start_match)
6305 {
6306 new_start_match = md->start_match_ptr;
6307 break;
6308 }
6309 /* Fall through */
6310
6311 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6312 the SKIP's arg was not found. We also treat this as NOMATCH. */
6313
6314 case MATCH_SKIP_ARG:
6315 /* Fall through */
6316
6317 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6318 exactly like PRUNE. */
6319
6320 case MATCH_NOMATCH:
6321 case MATCH_PRUNE:
6322 case MATCH_THEN:
6323 new_start_match = start_match + 1;
6324 #ifdef SUPPORT_UTF8
6325 if (utf8)
6326 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6327 new_start_match++;
6328 #endif
6329 break;
6330
6331 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6332
6333 case MATCH_COMMIT:
6334 rc = MATCH_NOMATCH;
6335 goto ENDLOOP;
6336
6337 /* Any other return is either a match, or some kind of error. */
6338
6339 default:
6340 goto ENDLOOP;
6341 }
6342
6343 /* Control reaches here for the various types of "no match at this point"
6344 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6345
6346 rc = MATCH_NOMATCH;
6347
6348 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6349 newline in the subject (though it may continue over the newline). Therefore,
6350 if we have just failed to match, starting at a newline, do not continue. */
6351
6352 if (firstline && IS_NEWLINE(start_match)) break;
6353
6354 /* Advance to new matching position */
6355
6356 start_match = new_start_match;
6357
6358 /* Break the loop if the pattern is anchored or if we have passed the end of
6359 the subject. */
6360
6361 if (anchored || start_match > end_subject) break;
6362
6363 /* If we have just passed a CR and we are now at a LF, and the pattern does
6364 not contain any explicit matches for \r or \n, and the newline option is CRLF
6365 or ANY or ANYCRLF, advance the match position by one more character. */
6366
6367 if (start_match[-1] == CHAR_CR &&
6368 start_match < end_subject &&
6369 *start_match == CHAR_NL &&
6370 (re->flags & PCRE_HASCRORLF) == 0 &&
6371 (md->nltype == NLTYPE_ANY ||
6372 md->nltype == NLTYPE_ANYCRLF ||
6373 md->nllen == 2))
6374 start_match++;
6375
6376 md->mark = NULL; /* Reset for start of next match attempt */
6377 } /* End of for(;;) "bumpalong" loop */
6378
6379 /* ==========================================================================*/
6380
6381 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6382 conditions is true:
6383
6384 (1) The pattern is anchored or the match was failed by (*COMMIT);
6385
6386 (2) We are past the end of the subject;
6387
6388 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6389 this option requests that a match occur at or before the first newline in
6390 the subject.
6391
6392 When we have a match and the offset vector is big enough to deal with any
6393 backreferences, captured substring offsets will already be set up. In the case
6394 where we had to get some local store to hold offsets for backreference
6395 processing, copy those that we can. In this case there need not be overflow if
6396 certain parts of the pattern were not used, even though there are more
6397 capturing parentheses than vector slots. */
6398
6399 ENDLOOP:
6400
6401 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6402 {
6403 if (using_temporary_offsets)
6404 {
6405 if (offsetcount >= 4)
6406 {
6407 memcpy(offsets + 2, md->offset_vector + 2,
6408 (offsetcount - 2) * sizeof(int));
6409 DPRINTF(("Copied offsets from temporary memory\n"));
6410 }
6411 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6412 DPRINTF(("Freeing temporary memory\n"));
6413 (pcre_free)(md->offset_vector);
6414 }
6415
6416 /* Set the return code to the number of captured strings, or 0 if there are
6417 too many to fit into the vector. */
6418
6419 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6420
6421 /* If there is space, set up the whole thing as substring 0. The value of
6422 md->start_match_ptr might be modified if \K was encountered on the successful
6423 matching path. */
6424
6425 if (offsetcount < 2) rc = 0; else
6426 {
6427 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6428 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6429 }
6430
6431 DPRINTF((">>>> returning %d\n", rc));
6432 goto RETURN_MARK;
6433 }
6434
6435 /* Control gets here if there has been an error, or if the overall match
6436 attempt has failed at all permitted starting positions. */
6437
6438 if (using_temporary_offsets)
6439 {
6440 DPRINTF(("Freeing temporary memory\n"));
6441 (pcre_free)(md->offset_vector);
6442 }
6443
6444 /* For anything other than nomatch or partial match, just return the code. */
6445
6446 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6447 {
6448 DPRINTF((">>>> error: returning %d\n", rc));
6449 return rc;
6450 }
6451
6452 /* Handle partial matches - disable any mark data */
6453
6454 if (start_partial != NULL)
6455 {
6456 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6457 md->mark = NULL;
6458 if (offsetcount > 1)
6459 {
6460 offsets[0] = (int)(start_partial - (USPTR)subject);
6461 offsets[1] = (int)(end_subject - (USPTR)subject);
6462 }
6463 rc = PCRE_ERROR_PARTIAL;
6464 }
6465
6466 /* This is the classic nomatch case */
6467
6468 else
6469 {
6470 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6471 rc = PCRE_ERROR_NOMATCH;
6472 }
6473
6474 /* Return the MARK data if it has been requested. */
6475
6476 RETURN_MARK:
6477
6478 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6479 *(extra_data->mark) = (unsigned char *)(md->mark);
6480 return rc;
6481 }
6482
6483 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5