/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 604 - (show annotations)
Thu Jun 2 19:04:54 2011 UTC (8 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 194559 byte(s)
Error occurred while calculating annotation data.
Refactoring to reduce stack usage for possessively quantified subpatterns. Also 
fixed a number of bugs related to repeated subpatterns. Some further tidies 
consequent on the removal of OP_OPT are also in this patch.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. match() sets the field back to 0 once it has acted on MATCH_CBEGROUP. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1 /* Matched */
71 #define MATCH_NOMATCH 0 /* Failed to match */
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998) /* From OP_COMMIT when what follows fails */
78 #define MATCH_KETRPOS (-997) /* From OP_KETRPOS, ending a possessive group iteration */
79 #define MATCH_PRUNE (-996) /* From OP_PRUNE and OP_PRUNE_ARG */
80 #define MATCH_SKIP (-995) /* From OP_SKIP, or OP_MARK when a SKIP name matches; position is in md->start_match_ptr */
81 #define MATCH_SKIP_ARG (-994) /* From OP_SKIP_ARG; the skip name is in md->start_match_ptr */
82 #define MATCH_THEN (-993) /* From OP_THEN and OP_THEN_ARG; the branch start is in md->start_match_ptr */
83
84 /* This is a convenience macro for code that occurs many times. It stores the
current markptr in md->mark and then leaves match() through RRETURN. */
85
86 #define MRRETURN(ra) \
87 { \
88 md->mark = markptr; \
89 RRETURN(ra); \
90 }
91
92 /* Maximum number of ints of offset to save on the stack for recursive calls.
93 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94 because the offset vector is always a multiple of 3 long. */
95
96 #define REC_STACK_SAVE_MAX 30
97
98 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99
100 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102
103
104
105 #ifdef PCRE_DEBUG
106 /*************************************************
107 * Debugging function to print chars *
108 *************************************************/
109
110 /* Print a sequence of chars in printable format, stopping at the end of the
111 subject if the requested.
112
113 Arguments:
114 p points to characters
115 length number to print
116 is_subject TRUE if printing from within md->start_subject
117 md pointer to matching data block, if is_subject is TRUE
118
119 Returns: nothing
120 */
121
122 static void
123 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124 {
125 unsigned int c;
126 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127 while (length-- > 0)
128 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129 }
130 #endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever this list is changed, the code at HEAP_RETURN below must be
updated in sync. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,  RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49, RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57, RM58, RM59, RM60, RM61, RM62, RM63, RM64
};
280
281 /* These versions of the macros use the stack, as normal: RMATCH is a real
282 recursive call of match() with rdepth+1 whose result is left in rrc, and RRETURN
283 is a plain return. There are debugging versions (which also print the line
numbers involved) and production versions. Note that the "rw" argument of RMATCH
isn't actually used in these definitions. */
284
285 #ifndef NO_RECURSE
286 #define REGISTER register
287
288 #ifdef PCRE_DEBUG
289 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 { \
291 printf("match() called in line %d\n", __LINE__); \
292 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 printf("to line %d\n", __LINE__); \
294 }
295 #define RRETURN(ra) \
296 { \
297 printf("match() returned %d from line %d ", ra, __LINE__); \
298 return ra; \
299 }
300 #else
301 #define RMATCH(ra,rb,rc,rd,re,rw) \
302 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 #define RRETURN(ra) return ra
304 #endif
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. RMATCH
310 obtains a new heapframe with pcre_stack_malloc() (giving PCRE_ERROR_NOMEMORY if
311 that fails), records rw in the current frame's Xwhere, fills in the argument
fields of the new frame, links it to the current frame through Xprevframe, makes
it the current frame, and jumps to HEAP_RECURSE. The label L_<rw> that follows
the jump is presumably where the HEAP_RETURN code (not part of these macros)
resumes execution. RRETURN frees the current frame; if there is a previous frame
it puts the result in rrc and jumps to HEAP_RETURN, otherwise it does a real
return. Note that the "rd" argument of RMATCH isn't actually used in this
definition. It's the md argument of match(), which never changes. */
312
313 #define REGISTER
314
315 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 {\
317 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 frame->Xwhere = rw; \
320 newframe->Xeptr = ra;\
321 newframe->Xecode = rb;\
322 newframe->Xmstart = mstart;\
323 newframe->Xmarkptr = markptr;\
324 newframe->Xoffset_top = rc;\
325 newframe->Xeptrb = re;\
326 newframe->Xrdepth = frame->Xrdepth + 1;\
327 newframe->Xprevframe = frame;\
328 frame = newframe;\
329 DPRINTF(("restarting from line %d\n", __LINE__));\
330 goto HEAP_RECURSE;\
331 L_##rw:\
332 DPRINTF(("jumped back to line %d\n", __LINE__));\
333 }
334
335 #define RRETURN(ra)\
336 {\
337 heapframe *oldframe = frame;\
338 frame = oldframe->Xprevframe;\
339 (pcre_stack_free)(oldframe);\
340 if (frame != NULL)\
341 {\
342 rrc = ra;\
343 goto HEAP_RETURN;\
344 }\
345 return ra;\
346 }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
351 typedef struct heapframe {
352 struct heapframe *Xprevframe;
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr;
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth;
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word;
379 BOOL Xcondition;
380 BOOL Xprev_is_word;
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink;
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb;
408
409 /* Where to jump back to */
410
411 int Xwhere;
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past md->start_used_ptr (i.e.
434 something has been matched). For hard partial matching (md->partial > 1), we
435 then return PCRE_ERROR_PARTIAL immediately. The second one is used when we
436 already know we are past the end of the subject, so it omits the end test. */
437
438 #define CHECK_PARTIAL()\
439 if (md->partial != 0 && eptr >= md->end_subject && \
440 eptr > md->start_used_ptr) \
441 { \
442 md->hitend = TRUE; \
443 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 }
445
446 #define SCHECK_PARTIAL()\
447 if (md->partial != 0 && eptr > md->start_used_ptr) \
448 { \
449 md->hitend = TRUE; \
450 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to in order to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. If we fail to match, we need to restore this value and also the
812 values of the final offsets, in case they were set by a previous iteration
813 of the same bracket.
814
815 If there isn't enough space in the offset vector, treat this as if it were
816 a non-capturing bracket. Don't worry about setting the flag for the error
817 case here; that is handled in the code for KET. */
818
819 case OP_CBRA:
820 case OP_SCBRA:
821 number = GET2(ecode, 1+LINK_SIZE);
822 offset = number << 1;
823
824 #ifdef PCRE_DEBUG
825 printf("start bracket %d\n", number);
826 printf("subject=");
827 pchars(eptr, 16, TRUE, md);
828 printf("\n");
829 #endif
830
831 if (offset < md->offset_max)
832 {
833 save_offset1 = md->offset_vector[offset];
834 save_offset2 = md->offset_vector[offset+1];
835 save_offset3 = md->offset_vector[md->offset_end - number];
836 save_capture_last = md->capture_last;
837
838 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 md->offset_vector[md->offset_end - number] =
840 (int)(eptr - md->start_subject);
841
842 for (;;)
843 {
844 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846 eptrb, RM1);
847 if (rrc != MATCH_NOMATCH &&
848 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849 RRETURN(rrc);
850 md->capture_last = save_capture_last;
851 ecode += GET(ecode, 1);
852 if (*ecode != OP_ALT) break;
853 }
854
855 DPRINTF(("bracket %d failed\n", number));
856
857 md->offset_vector[offset] = save_offset1;
858 md->offset_vector[offset+1] = save_offset2;
859 md->offset_vector[md->offset_end - number] = save_offset3;
860
861 if (rrc != MATCH_THEN) md->mark = markptr;
862 RRETURN(MATCH_NOMATCH);
863 }
864
865 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866 as a non-capturing bracket. */
867
868 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870
871 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877 for all the alternatives. When we get to the final alternative within the
878 brackets, we would return the result of a recursive call to match()
879 whatever happened. We can reduce stack usage by turning this into a tail
880 recursion, except in the case of a possibly empty group.*/
881
882 case OP_BRA:
883 case OP_SBRA:
884 DPRINTF(("start non-capturing bracket\n"));
885 for (;;)
886 {
887 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
888 {
889 if (op >= OP_SBRA) /* Possibly empty group */
890 {
891 md->match_function_type = MATCH_CBEGROUP;
892 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
893 RM48);
894 if (rrc == MATCH_NOMATCH) md->mark = markptr;
895 RRETURN(rrc);
896 }
897 /* Not a possibly empty group; use tail recursion */
898 ecode += _pcre_OP_lengths[*ecode];
899 DPRINTF(("bracket 0 tail recursion\n"));
900 goto TAIL_RECURSE;
901 }
902
903 /* For non-final alternatives, continue the loop for a NOMATCH result;
904 otherwise return. */
905
906 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
907 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
908 RM2);
909 if (rrc != MATCH_NOMATCH &&
910 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
911 RRETURN(rrc);
912 ecode += GET(ecode, 1);
913 }
914 /* Control never reaches here. */
915
916 /* Handle possessive capturing brackets with an unlimited repeat. We come
917 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
918 handled similarly to the normal case above. However, the matching is
919 different. The end of these brackets will always be OP_KETRPOS, which
920 returns MATCH_KETRPOS without going further in the pattern. By this means
921 we can handle the group by iteration rather than recursion, thereby
922 reducing the amount of stack needed. */
923
924 case OP_CBRAPOS:
925 case OP_SCBRAPOS:
926 allow_zero = FALSE;
927
928 POSSESSIVE_CAPTURE:
929 number = GET2(ecode, 1+LINK_SIZE);
930 offset = number << 1;
931
932 #ifdef PCRE_DEBUG
933 printf("start possessive bracket %d\n", number);
934 printf("subject=");
935 pchars(eptr, 16, TRUE, md);
936 printf("\n");
937 #endif
938
939 if (offset < md->offset_max)
940 {
941 matched_once = FALSE;
942 code_offset = ecode - md->start_code;
943
944 save_offset1 = md->offset_vector[offset];
945 save_offset2 = md->offset_vector[offset+1];
946 save_offset3 = md->offset_vector[md->offset_end - number];
947 save_capture_last = md->capture_last;
948
949 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
950
951 /* Each time round the loop, save the current subject position for use
952 when the group matches. For MATCH_MATCH, the group has matched, so we
953 restart it with a new subject starting position, remembering that we had
954 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
955 usual. If we haven't matched any alternatives in any iteration, check to
956 see if a previous iteration matched. If so, the group has matched;
957 continue from afterwards. Otherwise it has failed; restore the previous
958 capture values before returning NOMATCH. */
959
960 for (;;)
961 {
962 md->offset_vector[md->offset_end - number] =
963 (int)(eptr - md->start_subject);
964 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
965 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
966 eptrb, RM63);
967 if (rrc == MATCH_KETRPOS)
968 {
969 offset_top = md->end_offset_top;
970 eptr = md->end_match_ptr;
971 ecode = md->start_code + code_offset;
972 save_capture_last = md->capture_last;
973 matched_once = TRUE;
974 continue;
975 }
976 if (rrc != MATCH_NOMATCH &&
977 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
978 RRETURN(rrc);
979 md->capture_last = save_capture_last;
980 ecode += GET(ecode, 1);
981 if (*ecode != OP_ALT) break;
982 }
983
984 if (!matched_once)
985 {
986 md->offset_vector[offset] = save_offset1;
987 md->offset_vector[offset+1] = save_offset2;
988 md->offset_vector[md->offset_end - number] = save_offset3;
989 }
990
991 if (rrc != MATCH_THEN) md->mark = markptr;
992 if (allow_zero || matched_once)
993 {
994 ecode += 1 + LINK_SIZE;
995 break;
996 }
997
998 RRETURN(MATCH_NOMATCH);
999 }
1000
1001 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1002 as a non-capturing bracket. */
1003
1004 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006
1007 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1008
1009 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1010 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1011
1012 /* Non-capturing possessive bracket with unlimited repeat. We come here
1013 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1014 without the capturing complication. It is written out separately for speed
1015 and cleanliness. */
1016
1017 case OP_BRAPOS:
1018 case OP_SBRAPOS:
1019 allow_zero = FALSE;
1020
1021 POSSESSIVE_NON_CAPTURE:
1022 matched_once = FALSE;
1023 code_offset = ecode - md->start_code;
1024
1025 for (;;)
1026 {
1027 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1028 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1029 eptrb, RM64);
1030 if (rrc == MATCH_KETRPOS)
1031 {
1032 eptr = md->end_match_ptr;
1033 ecode = md->start_code + code_offset;
1034 matched_once = TRUE;
1035 continue;
1036 }
1037 if (rrc != MATCH_NOMATCH &&
1038 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1039 RRETURN(rrc);
1040 ecode += GET(ecode, 1);
1041 if (*ecode != OP_ALT) break;
1042 }
1043
1044 if (matched_once || allow_zero)
1045 {
1046 ecode += 1 + LINK_SIZE;
1047 break;
1048 }
1049 RRETURN(MATCH_NOMATCH);
1050
1051 /* Control never reaches here. */
1052
1053 /* Conditional group: compilation checked that there are no more than
1054 two branches. If the condition is false, skipping the first branch takes us
1055 past the end if there is only one branch, but that's OK because that is
1056 exactly what going to the ket would do. As there is only one branch to be
1057 obeyed, we can use tail recursion to avoid using another stack frame. */
1058
1059 case OP_COND:
1060 case OP_SCOND:
1061 codelink = GET(ecode, 1);
1062
1063 /* Because of the way auto-callout works during compile, a callout item is
1064 inserted between OP_COND and an assertion condition. */
1065
1066 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1067 {
1068 if (pcre_callout != NULL)
1069 {
1070 pcre_callout_block cb;
1071 cb.version = 1; /* Version 1 of the callout block */
1072 cb.callout_number = ecode[LINK_SIZE+2];
1073 cb.offset_vector = md->offset_vector;
1074 cb.subject = (PCRE_SPTR)md->start_subject;
1075 cb.subject_length = (int)(md->end_subject - md->start_subject);
1076 cb.start_match = (int)(mstart - md->start_subject);
1077 cb.current_position = (int)(eptr - md->start_subject);
1078 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1079 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1080 cb.capture_top = offset_top/2;
1081 cb.capture_last = md->capture_last;
1082 cb.callout_data = md->callout_data;
1083 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1084 if (rrc < 0) RRETURN(rrc);
1085 }
1086 ecode += _pcre_OP_lengths[OP_CALLOUT];
1087 }
1088
1089 condcode = ecode[LINK_SIZE+1];
1090
1091 /* Now see what the actual condition is */
1092
1093 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1094 {
1095 if (md->recursive == NULL) /* Not recursing => FALSE */
1096 {
1097 condition = FALSE;
1098 ecode += GET(ecode, 1);
1099 }
1100 else
1101 {
1102 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1103 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1104
1105 /* If the test is for recursion into a specific subpattern, and it is
1106 false, but the test was set up by name, scan the table to see if the
1107 name refers to any other numbers, and test them. The condition is true
1108 if any one is set. */
1109
1110 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1111 {
1112 uschar *slotA = md->name_table;
1113 for (i = 0; i < md->name_count; i++)
1114 {
1115 if (GET2(slotA, 0) == recno) break;
1116 slotA += md->name_entry_size;
1117 }
1118
1119 /* Found a name for the number - there can be only one; duplicate
1120 names for different numbers are allowed, but not vice versa. First
1121 scan down for duplicates. */
1122
1123 if (i < md->name_count)
1124 {
1125 uschar *slotB = slotA;
1126 while (slotB > md->name_table)
1127 {
1128 slotB -= md->name_entry_size;
1129 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1130 {
1131 condition = GET2(slotB, 0) == md->recursive->group_num;
1132 if (condition) break;
1133 }
1134 else break;
1135 }
1136
1137 /* Scan up for duplicates */
1138
1139 if (!condition)
1140 {
1141 slotB = slotA;
1142 for (i++; i < md->name_count; i++)
1143 {
1144 slotB += md->name_entry_size;
1145 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1146 {
1147 condition = GET2(slotB, 0) == md->recursive->group_num;
1148 if (condition) break;
1149 }
1150 else break;
1151 }
1152 }
1153 }
1154 }
1155
1156 /* Choose branch according to the condition */
1157
1158 ecode += condition? 3 : GET(ecode, 1);
1159 }
1160 }
1161
1162 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1163 {
1164 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1165 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1166
1167 /* If the numbered capture is unset, but the reference was by name,
1168 scan the table to see if the name refers to any other numbers, and test
1169 them. The condition is true if any one is set. This is tediously similar
1170 to the code above, but not close enough to try to amalgamate. */
1171
1172 if (!condition && condcode == OP_NCREF)
1173 {
1174 int refno = offset >> 1;
1175 uschar *slotA = md->name_table;
1176
1177 for (i = 0; i < md->name_count; i++)
1178 {
1179 if (GET2(slotA, 0) == refno) break;
1180 slotA += md->name_entry_size;
1181 }
1182
1183 /* Found a name for the number - there can be only one; duplicate names
1184 for different numbers are allowed, but not vice versa. First scan down
1185 for duplicates. */
1186
1187 if (i < md->name_count)
1188 {
1189 uschar *slotB = slotA;
1190 while (slotB > md->name_table)
1191 {
1192 slotB -= md->name_entry_size;
1193 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1194 {
1195 offset = GET2(slotB, 0) << 1;
1196 condition = offset < offset_top &&
1197 md->offset_vector[offset] >= 0;
1198 if (condition) break;
1199 }
1200 else break;
1201 }
1202
1203 /* Scan up for duplicates */
1204
1205 if (!condition)
1206 {
1207 slotB = slotA;
1208 for (i++; i < md->name_count; i++)
1209 {
1210 slotB += md->name_entry_size;
1211 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1212 {
1213 offset = GET2(slotB, 0) << 1;
1214 condition = offset < offset_top &&
1215 md->offset_vector[offset] >= 0;
1216 if (condition) break;
1217 }
1218 else break;
1219 }
1220 }
1221 }
1222 }
1223
1224 /* Choose branch according to the condition */
1225
1226 ecode += condition? 3 : GET(ecode, 1);
1227 }
1228
1229 else if (condcode == OP_DEF) /* DEFINE - always false */
1230 {
1231 condition = FALSE;
1232 ecode += GET(ecode, 1);
1233 }
1234
1235 /* The condition is an assertion. Call match() to evaluate it - setting
1236 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1237 an assertion. */
1238
1239 else
1240 {
1241 md->match_function_type = MATCH_CONDASSERT;
1242 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1243 if (rrc == MATCH_MATCH)
1244 {
1245 condition = TRUE;
1246 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1247 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1248 }
1249 else if (rrc != MATCH_NOMATCH &&
1250 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1251 {
1252 RRETURN(rrc); /* Need braces because of following else */
1253 }
1254 else
1255 {
1256 condition = FALSE;
1257 ecode += codelink;
1258 }
1259 }
1260
1261 /* We are now at the branch that is to be obeyed. As there is only one,
1262 we can use tail recursion to avoid using another stack frame, except when
1263 we have an unlimited repeat of a possibly empty group. If the second
1264 alternative doesn't exist, we can just plough on. */
1265
1266 if (condition || *ecode == OP_ALT)
1267 {
1268 ecode += 1 + LINK_SIZE;
1269 if (op == OP_SCOND) /* Possibly empty group */
1270 {
1271 md->match_function_type = MATCH_CBEGROUP;
1272 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1273 RRETURN(rrc);
1274 }
1275 else goto TAIL_RECURSE;
1276 }
1277 else /* Condition false & no alternative */
1278 {
1279 ecode += 1 + LINK_SIZE;
1280 }
1281 break;
1282
1283
1284 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1285 to close any currently open capturing brackets. */
1286
1287 case OP_CLOSE:
1288 number = GET2(ecode, 1);
1289 offset = number << 1;
1290
1291 #ifdef PCRE_DEBUG
1292 printf("end bracket %d at *ACCEPT", number);
1293 printf("\n");
1294 #endif
1295
1296 md->capture_last = number;
1297 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1298 {
1299 md->offset_vector[offset] =
1300 md->offset_vector[md->offset_end - number];
1301 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1302 if (offset_top <= offset) offset_top = offset + 2;
1303 }
1304 ecode += 3;
1305 break;
1306
1307
1308 /* End of the pattern, either real or forced. If we are in a top-level
1309 recursion, we should restore the offsets appropriately and continue from
1310 after the call. */
1311
1312 case OP_ACCEPT:
1313 case OP_END:
1314 if (md->recursive != NULL && md->recursive->group_num == 0)
1315 {
1316 recursion_info *rec = md->recursive;
1317 DPRINTF(("End of pattern in a (?0) recursion\n"));
1318 md->recursive = rec->prevrec;
1319 memmove(md->offset_vector, rec->offset_save,
1320 rec->saved_max * sizeof(int));
1321 offset_top = rec->save_offset_top;
1322 ecode = rec->after_call;
1323 break;
1324 }
1325
1326 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1327 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1328 the subject. In both cases, backtracking will then try other alternatives,
1329 if any. */
1330
1331 if (eptr == mstart &&
1332 (md->notempty ||
1333 (md->notempty_atstart &&
1334 mstart == md->start_subject + md->start_offset)))
1335 MRRETURN(MATCH_NOMATCH);
1336
1337 /* Otherwise, we have a match. */
1338
1339 md->end_match_ptr = eptr; /* Record where we ended */
1340 md->end_offset_top = offset_top; /* and how many extracts were taken */
1341 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1342
1343 /* For some reason, the macros don't work properly if an expression is
1344 given as the argument to MRRETURN when the heap is in use. */
1345
1346 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1347 MRRETURN(rrc);
1348
1349 /* Assertion brackets. Check the alternative branches in turn - the
1350 matching won't pass the KET for an assertion. If any one branch matches,
1351 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1352 start of each branch to move the current point backwards, so the code at
1353 this level is identical to the lookahead case. When the assertion is part
1354 of a condition, we want to return immediately afterwards. The caller of
1355 this incarnation of the match() function will have set MATCH_CONDASSERT in
1356 md->match_function_type, and one of these opcodes will be the first opcode
1357 that is processed. We use a local variable that is preserved over calls to
1358 match() to remember this case. */
1359
1360 case OP_ASSERT:
1361 case OP_ASSERTBACK:
1362 if (md->match_function_type == MATCH_CONDASSERT)
1363 {
1364 condassert = TRUE;
1365 md->match_function_type = 0;
1366 }
1367 else condassert = FALSE;
1368
1369 do
1370 {
1371 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1372 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1373 {
1374 mstart = md->start_match_ptr; /* In case \K reset it */
1375 break;
1376 }
1377 if (rrc != MATCH_NOMATCH &&
1378 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1379 RRETURN(rrc);
1380 ecode += GET(ecode, 1);
1381 }
1382 while (*ecode == OP_ALT);
1383
1384 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1385
1386 /* If checking an assertion for a condition, return MATCH_MATCH. */
1387
1388 if (condassert) RRETURN(MATCH_MATCH);
1389
1390 /* Continue from after the assertion, updating the offsets high water
1391 mark, since extracts may have been taken during the assertion. */
1392
1393 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1394 ecode += 1 + LINK_SIZE;
1395 offset_top = md->end_offset_top;
1396 continue;
1397
1398 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1399 PRUNE, or COMMIT means we must assume failure without checking subsequent
1400 branches. */
1401
1402 case OP_ASSERT_NOT:
1403 case OP_ASSERTBACK_NOT:
1404 if (md->match_function_type == MATCH_CONDASSERT)
1405 {
1406 condassert = TRUE;
1407 md->match_function_type = 0;
1408 }
1409 else condassert = FALSE;
1410
1411 do
1412 {
1413 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1414 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1415 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1416 {
1417 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1418 break;
1419 }
1420 if (rrc != MATCH_NOMATCH &&
1421 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1422 RRETURN(rrc);
1423 ecode += GET(ecode,1);
1424 }
1425 while (*ecode == OP_ALT);
1426
1427 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1428
1429 ecode += 1 + LINK_SIZE;
1430 continue;
1431
1432 /* Move the subject pointer back. This occurs only at the start of
1433 each branch of a lookbehind assertion. If we are too close to the start to
1434 move back, this match function fails. When working with UTF-8 we move
1435 back a number of characters, not bytes. */
1436
1437 case OP_REVERSE:
1438 #ifdef SUPPORT_UTF8
1439 if (utf8)
1440 {
1441 i = GET(ecode, 1);
1442 while (i-- > 0)
1443 {
1444 eptr--;
1445 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1446 BACKCHAR(eptr);
1447 }
1448 }
1449 else
1450 #endif
1451
1452 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1453
1454 {
1455 eptr -= GET(ecode, 1);
1456 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1457 }
1458
1459 /* Save the earliest consulted character, then skip to next op code */
1460
1461 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1462 ecode += 1 + LINK_SIZE;
1463 break;
1464
1465 /* The callout item calls an external function, if one is provided, passing
1466 details of the match so far. This is mainly for debugging, though the
1467 function is able to force a failure. */
1468
1469 case OP_CALLOUT:
1470 if (pcre_callout != NULL)
1471 {
1472 pcre_callout_block cb;
1473 cb.version = 1; /* Version 1 of the callout block */
1474 cb.callout_number = ecode[1];
1475 cb.offset_vector = md->offset_vector;
1476 cb.subject = (PCRE_SPTR)md->start_subject;
1477 cb.subject_length = (int)(md->end_subject - md->start_subject);
1478 cb.start_match = (int)(mstart - md->start_subject);
1479 cb.current_position = (int)(eptr - md->start_subject);
1480 cb.pattern_position = GET(ecode, 2);
1481 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1482 cb.capture_top = offset_top/2;
1483 cb.capture_last = md->capture_last;
1484 cb.callout_data = md->callout_data;
1485 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1486 if (rrc < 0) RRETURN(rrc);
1487 }
1488 ecode += 2 + 2*LINK_SIZE;
1489 break;
1490
1491 /* Recursion either matches the current regex, or some subexpression. The
1492 offset data is the offset to the starting bracket from the start of the
1493 whole pattern. (This is so that it works from duplicated subpatterns.)
1494
1495 If there are any capturing brackets started but not finished, we have to
1496 save their starting points and reinstate them after the recursion. However,
1497 we don't know how many such there are (offset_top records the completed
1498 total) so we just have to save all the potential data. There may be up to
1499 65535 such values, which is too large to put on the stack, but using malloc
1500 for small numbers seems expensive. As a compromise, the stack is used when
1501 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1502 is used. If the malloc fails, the match is abandoned: the code below returns
1503 PCRE_ERROR_NOMEMORY instead of trying to carry on with only part of the
1504 offset data saved.
1505
1506 There are also other values that have to be saved. We use a chained
1507 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1508 for the original version of this logic. */
1509
1510 case OP_RECURSE:
1511 {
1512 callpat = md->start_code + GET(ecode, 1);
1513 new_recursive.group_num = (callpat == md->start_code)? 0 :
1514 GET2(callpat, 1 + LINK_SIZE);
1515
1516 /* Add to "recursing stack" */
1517
1518 new_recursive.prevrec = md->recursive;
1519 md->recursive = &new_recursive;
1520
1521 /* Find where to continue from afterwards */
1522
1523 ecode += 1 + LINK_SIZE;
1524 new_recursive.after_call = ecode;
1525
1526 /* Now save the offset data. */
1527
1528 new_recursive.saved_max = md->offset_end;
1529 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1530 new_recursive.offset_save = stacksave;
1531 else
1532 {
1533 new_recursive.offset_save =
1534 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1535 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1536 }
1537
1538 memcpy(new_recursive.offset_save, md->offset_vector,
1539 new_recursive.saved_max * sizeof(int));
1540 new_recursive.save_offset_top = offset_top;
1541
1542 /* OK, now we can do the recursion. For each top-level alternative we
1543 restore the offset and recursion data. */
1544
1545 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1546 cbegroup = (*callpat >= OP_SBRA);
1547 do
1548 {
1549 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1550 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1551 md, eptrb, RM6);
1552 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1553 {
1554 DPRINTF(("Recursion matched\n"));
1555 md->recursive = new_recursive.prevrec;
1556 if (new_recursive.offset_save != stacksave)
1557 (pcre_free)(new_recursive.offset_save);
1558 MRRETURN(MATCH_MATCH);
1559 }
1560 else if (rrc != MATCH_NOMATCH &&
1561 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1562 {
1563 DPRINTF(("Recursion gave error %d\n", rrc));
1564 if (new_recursive.offset_save != stacksave)
1565 (pcre_free)(new_recursive.offset_save);
1566 RRETURN(rrc);
1567 }
1568
1569 md->recursive = &new_recursive;
1570 memcpy(md->offset_vector, new_recursive.offset_save,
1571 new_recursive.saved_max * sizeof(int));
1572 callpat += GET(callpat, 1);
1573 }
1574 while (*callpat == OP_ALT);
1575
1576 DPRINTF(("Recursion didn't match\n"));
1577 md->recursive = new_recursive.prevrec;
1578 if (new_recursive.offset_save != stacksave)
1579 (pcre_free)(new_recursive.offset_save);
1580 MRRETURN(MATCH_NOMATCH);
1581 }
1582 /* Control never reaches here */
1583
1584 /* "Once" brackets are like assertion brackets except that after a match,
1585 the point in the subject string is not moved back. Thus there can never be
1586 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1587 Check the alternative branches in turn - the matching won't pass the KET
1588 for this kind of subpattern. If any one branch matches, we carry on as at
1589 the end of a normal bracket, leaving the subject pointer, but resetting
1590 the start-of-match value in case it was changed by \K. */
1591
1592 case OP_ONCE:
1593 prev = ecode;
1594 saved_eptr = eptr;
1595
1596 do
1597 {
1598 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1599 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1600 {
1601 mstart = md->start_match_ptr;
1602 break;
1603 }
1604 if (rrc != MATCH_NOMATCH &&
1605 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1606 RRETURN(rrc);
1607 ecode += GET(ecode,1);
1608 }
1609 while (*ecode == OP_ALT);
1610
1611 /* If we hit the end of the group (which could be repeated), fail */
1612
1613 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1614
1615 /* Continue as from after the assertion, updating the offsets high water
1616 mark, since extracts may have been taken. */
1617
1618 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1619
1620 offset_top = md->end_offset_top;
1621 eptr = md->end_match_ptr;
1622
1623 /* For a non-repeating ket, just continue at this level. This also
1624 happens for a repeating ket if no characters were matched in the group.
1625 This is the forcible breaking of infinite loops as implemented in Perl
1626 5.005. If there is an options reset, it will get obeyed in the normal
1627 course of events. */
1628
1629 if (*ecode == OP_KET || eptr == saved_eptr)
1630 {
1631 ecode += 1+LINK_SIZE;
1632 break;
1633 }
1634
1635 /* The repeating kets try the rest of the pattern or restart from the
1636 preceding bracket, in the appropriate order. The second "call" of match()
1637 uses tail recursion, to avoid using another stack frame. */
1638
1639 if (*ecode == OP_KETRMIN)
1640 {
1641 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1643 ecode = prev;
1644 goto TAIL_RECURSE;
1645 }
1646 else /* OP_KETRMAX */
1647 {
1648 md->match_function_type = MATCH_CBEGROUP;
1649 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1651 ecode += 1 + LINK_SIZE;
1652 goto TAIL_RECURSE;
1653 }
1654 /* Control never gets here */
1655
1656 /* An alternation is the end of a branch; scan along to find the end of the
1657 bracketed group and go to there. */
1658
1659 case OP_ALT:
1660 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1661 break;
1662
1663 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1664 indicating that it may occur zero times. It may repeat infinitely, or not
1665 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1666 with fixed upper repeat limits are compiled as a number of copies, with the
1667 optional ones preceded by BRAZERO or BRAMINZERO. */
1668
1669 case OP_BRAZERO:
1670 next = ecode + 1;
1671 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1672 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1673 do next += GET(next, 1); while (*next == OP_ALT);
1674 ecode = next + 1 + LINK_SIZE;
1675 break;
1676
1677 case OP_BRAMINZERO:
1678 next = ecode + 1;
1679 do next += GET(next, 1); while (*next == OP_ALT);
1680 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1682 ecode++;
1683 break;
1684
1685 case OP_SKIPZERO:
1686 next = ecode+1;
1687 do next += GET(next,1); while (*next == OP_ALT);
1688 ecode = next + 1 + LINK_SIZE;
1689 break;
1690
1691 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1692 here; just jump to the group, with allow_zero set TRUE. */
1693
1694 case OP_BRAPOSZERO:
1695 op = *(++ecode);
1696 allow_zero = TRUE;
1697 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1698 goto POSSESSIVE_NON_CAPTURE;
1699
1700 /* End of a group, repeated or non-repeating. */
1701
1702 case OP_KET:
1703 case OP_KETRMIN:
1704 case OP_KETRMAX:
1705 case OP_KETRPOS:
1706 prev = ecode - GET(ecode, 1);
1707
1708 /* If this was a group that remembered the subject start, in order to break
1709 infinite repeats of empty string matches, retrieve the subject start from
1710 the chain. Otherwise, set it NULL. */
1711
1712 if (*prev >= OP_SBRA)
1713 {
1714 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1715 eptrb = eptrb->epb_prev; /* Backup to previous group */
1716 }
1717 else saved_eptr = NULL;
1718
1719 /* If we are at the end of an assertion group or an atomic group, stop
1720 matching and return MATCH_MATCH, but record the current high water mark for
1721 use by positive assertions. We also need to record the match start in case
1722 it was changed by \K. */
1723
1724 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1725 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1726 *prev == OP_ONCE)
1727 {
1728 md->end_match_ptr = eptr; /* For ONCE */
1729 md->end_offset_top = offset_top;
1730 md->start_match_ptr = mstart;
1731 MRRETURN(MATCH_MATCH);
1732 }
1733
1734 /* For capturing groups we have to check the group number back at the start
1735 and if necessary complete handling an extraction by setting the offsets and
1736 bumping the high water mark. Note that whole-pattern recursion is coded as
1737 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1738 when the OP_END is reached. Other recursion is handled here. */
1739
1740 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1741 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1742 {
1743 number = GET2(prev, 1+LINK_SIZE);
1744 offset = number << 1;
1745
1746 #ifdef PCRE_DEBUG
1747 printf("end bracket %d", number);
1748 printf("\n");
1749 #endif
1750
1751 md->capture_last = number;
1752 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1753 {
1754 md->offset_vector[offset] =
1755 md->offset_vector[md->offset_end - number];
1756 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1757 if (offset_top <= offset) offset_top = offset + 2;
1758 }
1759
1760 /* Handle a recursively called group. Restore the offsets
1761 appropriately and continue from after the call. */
1762
1763 if (md->recursive != NULL && md->recursive->group_num == number)
1764 {
1765 recursion_info *rec = md->recursive;
1766 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1767 md->recursive = rec->prevrec;
1768 memcpy(md->offset_vector, rec->offset_save,
1769 rec->saved_max * sizeof(int));
1770 offset_top = rec->save_offset_top;
1771 ecode = rec->after_call;
1772 break;
1773 }
1774 }
1775
1776 /* For a non-repeating ket, just continue at this level. This also
1777 happens for a repeating ket if no characters were matched in the group.
1778 This is the forcible breaking of infinite loops as implemented in Perl
1779 5.005. If there is an options reset, it will get obeyed in the normal
1780 course of events. */
1781
1782 if (*ecode == OP_KET || eptr == saved_eptr)
1783 {
1784 ecode += 1 + LINK_SIZE;
1785 break;
1786 }
1787
1788 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1789 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1790 at a time from the outer level, thus saving stack. */
1791
1792 if (*ecode == OP_KETRPOS)
1793 {
1794 md->end_match_ptr = eptr;
1795 md->end_offset_top = offset_top;
1796 RRETURN(MATCH_KETRPOS);
1797 }
1798
1799 /* The normal repeating kets try the rest of the pattern or restart from
1800 the preceding bracket, in the appropriate order. In the second case, we can
1801 use tail recursion to avoid using another stack frame, unless we have an
1802 unlimited repeat of a group that can match an empty string. */
1803
1804 if (*ecode == OP_KETRMIN)
1805 {
1806 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1808 if (*prev >= OP_SBRA) /* Could match an empty string */
1809 {
1810 md->match_function_type = MATCH_CBEGROUP;
1811 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1812 RRETURN(rrc);
1813 }
1814 ecode = prev;
1815 goto TAIL_RECURSE;
1816 }
1817 else /* OP_KETRMAX */
1818 {
1819 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1820 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1821 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1822 ecode += 1 + LINK_SIZE;
1823 goto TAIL_RECURSE;
1824 }
1825 /* Control never gets here */
1826
1827 /* Not multiline mode: start of subject assertion, unless notbol. */
1828
1829 case OP_CIRC:
1830 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1831
1832 /* Start of subject assertion */
1833
1834 case OP_SOD:
1835 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1836 ecode++;
1837 break;
1838
1839 /* Multiline mode: start of subject unless notbol, or after any newline. */
1840
1841 case OP_CIRCM:
1842 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1843 if (eptr != md->start_subject &&
1844 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1845 MRRETURN(MATCH_NOMATCH);
1846 ecode++;
1847 break;
1848
1849 /* Start of match assertion */
1850
1851 case OP_SOM:
1852 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1853 ecode++;
1854 break;
1855
1856 /* Reset the start of match point */
1857
1858 case OP_SET_SOM:
1859 mstart = eptr;
1860 ecode++;
1861 break;
1862
1863 /* Multiline mode: assert before any newline, or before end of subject
1864 unless noteol is set. */
1865
1866 case OP_DOLLM:
1867 if (eptr < md->end_subject)
1868 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1869 else
1870 {
1871 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1872 SCHECK_PARTIAL();
1873 }
1874 ecode++;
1875 break;
1876
1877 /* Not multiline mode: assert before a terminating newline or before end of
1878 subject unless noteol is set. */
1879
1880 case OP_DOLL:
1881 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1882 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1883
1884 /* ... else fall through for endonly */
1885
1886 /* End of subject assertion (\z) */
1887
1888 case OP_EOD:
1889 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1890 SCHECK_PARTIAL();
1891 ecode++;
1892 break;
1893
1894 /* End of subject or ending \n assertion (\Z) */
1895
1896 case OP_EODN:
1897 ASSERT_NL_OR_EOS:
1898 if (eptr < md->end_subject &&
1899 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1900 MRRETURN(MATCH_NOMATCH);
1901
1902 /* Either at end of string or \n before end. */
1903
1904 SCHECK_PARTIAL();
1905 ecode++;
1906 break;
1907
1908 /* Word boundary assertions */
1909
1910 case OP_NOT_WORD_BOUNDARY:
1911 case OP_WORD_BOUNDARY:
1912 {
1913
1914 /* Find out if the previous and current characters are "word" characters.
1915 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1916 be "non-word" characters. Remember the earliest consulted character for
1917 partial matching. */
1918
1919 #ifdef SUPPORT_UTF8
1920 if (utf8)
1921 {
1922 /* Get status of previous character */
1923
1924 if (eptr == md->start_subject) prev_is_word = FALSE; else
1925 {
1926 USPTR lastptr = eptr - 1;
1927 while((*lastptr & 0xc0) == 0x80) lastptr--;
1928 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1929 GETCHAR(c, lastptr);
1930 #ifdef SUPPORT_UCP
1931 if (md->use_ucp)
1932 {
1933 if (c == '_') prev_is_word = TRUE; else
1934 {
1935 int cat = UCD_CATEGORY(c);
1936 prev_is_word = (cat == ucp_L || cat == ucp_N);
1937 }
1938 }
1939 else
1940 #endif
1941 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1942 }
1943
1944 /* Get status of next character */
1945
1946 if (eptr >= md->end_subject)
1947 {
1948 SCHECK_PARTIAL();
1949 cur_is_word = FALSE;
1950 }
1951 else
1952 {
1953 GETCHAR(c, eptr);
1954 #ifdef SUPPORT_UCP
1955 if (md->use_ucp)
1956 {
1957 if (c == '_') cur_is_word = TRUE; else
1958 {
1959 int cat = UCD_CATEGORY(c);
1960 cur_is_word = (cat == ucp_L || cat == ucp_N);
1961 }
1962 }
1963 else
1964 #endif
1965 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1966 }
1967 }
1968 else
1969 #endif
1970
1971 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1972 consistency with the behaviour of \w we do use it in this case. */
1973
1974 {
1975 /* Get status of previous character */
1976
1977 if (eptr == md->start_subject) prev_is_word = FALSE; else
1978 {
1979 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1980 #ifdef SUPPORT_UCP
1981 if (md->use_ucp)
1982 {
1983 c = eptr[-1];
1984 if (c == '_') prev_is_word = TRUE; else
1985 {
1986 int cat = UCD_CATEGORY(c);
1987 prev_is_word = (cat == ucp_L || cat == ucp_N);
1988 }
1989 }
1990 else
1991 #endif
1992 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1993 }
1994
1995 /* Get status of next character */
1996
1997 if (eptr >= md->end_subject)
1998 {
1999 SCHECK_PARTIAL();
2000 cur_is_word = FALSE;
2001 }
2002 else
2003 #ifdef SUPPORT_UCP
2004 if (md->use_ucp)
2005 {
2006 c = *eptr;
2007 if (c == '_') cur_is_word = TRUE; else
2008 {
2009 int cat = UCD_CATEGORY(c);
2010 cur_is_word = (cat == ucp_L || cat == ucp_N);
2011 }
2012 }
2013 else
2014 #endif
2015 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2016 }
2017
2018 /* Now see if the situation is what we want */
2019
2020 if ((*ecode++ == OP_WORD_BOUNDARY)?
2021 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2022 MRRETURN(MATCH_NOMATCH);
2023 }
2024 break;
2025
2026 /* Match a single character type; inline for speed */
2027
2028 case OP_ANY:
2029 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2030 /* Fall through */
2031
2032 case OP_ALLANY:
2033 if (eptr++ >= md->end_subject)
2034 {
2035 SCHECK_PARTIAL();
2036 MRRETURN(MATCH_NOMATCH);
2037 }
2038 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2039 ecode++;
2040 break;
2041
2042 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2043 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2044
2045 case OP_ANYBYTE:
2046 if (eptr++ >= md->end_subject)
2047 {
2048 SCHECK_PARTIAL();
2049 MRRETURN(MATCH_NOMATCH);
2050 }
2051 ecode++;
2052 break;
2053
2054 case OP_NOT_DIGIT:
2055 if (eptr >= md->end_subject)
2056 {
2057 SCHECK_PARTIAL();
2058 MRRETURN(MATCH_NOMATCH);
2059 }
2060 GETCHARINCTEST(c, eptr);
2061 if (
2062 #ifdef SUPPORT_UTF8
2063 c < 256 &&
2064 #endif
2065 (md->ctypes[c] & ctype_digit) != 0
2066 )
2067 MRRETURN(MATCH_NOMATCH);
2068 ecode++;
2069 break;
2070
2071 case OP_DIGIT:
2072 if (eptr >= md->end_subject)
2073 {
2074 SCHECK_PARTIAL();
2075 MRRETURN(MATCH_NOMATCH);
2076 }
2077 GETCHARINCTEST(c, eptr);
2078 if (
2079 #ifdef SUPPORT_UTF8
2080 c >= 256 ||
2081 #endif
2082 (md->ctypes[c] & ctype_digit) == 0
2083 )
2084 MRRETURN(MATCH_NOMATCH);
2085 ecode++;
2086 break;
2087
2088 case OP_NOT_WHITESPACE:
2089 if (eptr >= md->end_subject)
2090 {
2091 SCHECK_PARTIAL();
2092 MRRETURN(MATCH_NOMATCH);
2093 }
2094 GETCHARINCTEST(c, eptr);
2095 if (
2096 #ifdef SUPPORT_UTF8
2097 c < 256 &&
2098 #endif
2099 (md->ctypes[c] & ctype_space) != 0
2100 )
2101 MRRETURN(MATCH_NOMATCH);
2102 ecode++;
2103 break;
2104
2105 case OP_WHITESPACE:
2106 if (eptr >= md->end_subject)
2107 {
2108 SCHECK_PARTIAL();
2109 MRRETURN(MATCH_NOMATCH);
2110 }
2111 GETCHARINCTEST(c, eptr);
2112 if (
2113 #ifdef SUPPORT_UTF8
2114 c >= 256 ||
2115 #endif
2116 (md->ctypes[c] & ctype_space) == 0
2117 )
2118 MRRETURN(MATCH_NOMATCH);
2119 ecode++;
2120 break;
2121
2122 case OP_NOT_WORDCHAR:
2123 if (eptr >= md->end_subject)
2124 {
2125 SCHECK_PARTIAL();
2126 MRRETURN(MATCH_NOMATCH);
2127 }
2128 GETCHARINCTEST(c, eptr);
2129 if (
2130 #ifdef SUPPORT_UTF8
2131 c < 256 &&
2132 #endif
2133 (md->ctypes[c] & ctype_word) != 0
2134 )
2135 MRRETURN(MATCH_NOMATCH);
2136 ecode++;
2137 break;
2138
2139 case OP_WORDCHAR:
2140 if (eptr >= md->end_subject)
2141 {
2142 SCHECK_PARTIAL();
2143 MRRETURN(MATCH_NOMATCH);
2144 }
2145 GETCHARINCTEST(c, eptr);
2146 if (
2147 #ifdef SUPPORT_UTF8
2148 c >= 256 ||
2149 #endif
2150 (md->ctypes[c] & ctype_word) == 0
2151 )
2152 MRRETURN(MATCH_NOMATCH);
2153 ecode++;
2154 break;
2155
2156 case OP_ANYNL:
2157 if (eptr >= md->end_subject)
2158 {
2159 SCHECK_PARTIAL();
2160 MRRETURN(MATCH_NOMATCH);
2161 }
2162 GETCHARINCTEST(c, eptr);
2163 switch(c)
2164 {
2165 default: MRRETURN(MATCH_NOMATCH);
2166
2167 case 0x000d:
2168 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2169 break;
2170
2171 case 0x000a:
2172 break;
2173
2174 case 0x000b:
2175 case 0x000c:
2176 case 0x0085:
2177 case 0x2028:
2178 case 0x2029:
2179 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2180 break;
2181 }
2182 ecode++;
2183 break;
2184
2185 case OP_NOT_HSPACE:
2186 if (eptr >= md->end_subject)
2187 {
2188 SCHECK_PARTIAL();
2189 MRRETURN(MATCH_NOMATCH);
2190 }
2191 GETCHARINCTEST(c, eptr);
2192 switch(c)
2193 {
2194 default: break;
2195 case 0x09: /* HT */
2196 case 0x20: /* SPACE */
2197 case 0xa0: /* NBSP */
2198 case 0x1680: /* OGHAM SPACE MARK */
2199 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2200 case 0x2000: /* EN QUAD */
2201 case 0x2001: /* EM QUAD */
2202 case 0x2002: /* EN SPACE */
2203 case 0x2003: /* EM SPACE */
2204 case 0x2004: /* THREE-PER-EM SPACE */
2205 case 0x2005: /* FOUR-PER-EM SPACE */
2206 case 0x2006: /* SIX-PER-EM SPACE */
2207 case 0x2007: /* FIGURE SPACE */
2208 case 0x2008: /* PUNCTUATION SPACE */
2209 case 0x2009: /* THIN SPACE */
2210 case 0x200A: /* HAIR SPACE */
2211 case 0x202f: /* NARROW NO-BREAK SPACE */
2212 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2213 case 0x3000: /* IDEOGRAPHIC SPACE */
2214 MRRETURN(MATCH_NOMATCH);
2215 }
2216 ecode++;
2217 break;
2218
2219 case OP_HSPACE:
2220 if (eptr >= md->end_subject)
2221 {
2222 SCHECK_PARTIAL();
2223 MRRETURN(MATCH_NOMATCH);
2224 }
2225 GETCHARINCTEST(c, eptr);
2226 switch(c)
2227 {
2228 default: MRRETURN(MATCH_NOMATCH);
2229 case 0x09: /* HT */
2230 case 0x20: /* SPACE */
2231 case 0xa0: /* NBSP */
2232 case 0x1680: /* OGHAM SPACE MARK */
2233 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2234 case 0x2000: /* EN QUAD */
2235 case 0x2001: /* EM QUAD */
2236 case 0x2002: /* EN SPACE */
2237 case 0x2003: /* EM SPACE */
2238 case 0x2004: /* THREE-PER-EM SPACE */
2239 case 0x2005: /* FOUR-PER-EM SPACE */
2240 case 0x2006: /* SIX-PER-EM SPACE */
2241 case 0x2007: /* FIGURE SPACE */
2242 case 0x2008: /* PUNCTUATION SPACE */
2243 case 0x2009: /* THIN SPACE */
2244 case 0x200A: /* HAIR SPACE */
2245 case 0x202f: /* NARROW NO-BREAK SPACE */
2246 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2247 case 0x3000: /* IDEOGRAPHIC SPACE */
2248 break;
2249 }
2250 ecode++;
2251 break;
2252
2253 case OP_NOT_VSPACE:
2254 if (eptr >= md->end_subject)
2255 {
2256 SCHECK_PARTIAL();
2257 MRRETURN(MATCH_NOMATCH);
2258 }
2259 GETCHARINCTEST(c, eptr);
2260 switch(c)
2261 {
2262 default: break;
2263 case 0x0a: /* LF */
2264 case 0x0b: /* VT */
2265 case 0x0c: /* FF */
2266 case 0x0d: /* CR */
2267 case 0x85: /* NEL */
2268 case 0x2028: /* LINE SEPARATOR */
2269 case 0x2029: /* PARAGRAPH SEPARATOR */
2270 MRRETURN(MATCH_NOMATCH);
2271 }
2272 ecode++;
2273 break;
2274
2275 case OP_VSPACE:
2276 if (eptr >= md->end_subject)
2277 {
2278 SCHECK_PARTIAL();
2279 MRRETURN(MATCH_NOMATCH);
2280 }
2281 GETCHARINCTEST(c, eptr);
2282 switch(c)
2283 {
2284 default: MRRETURN(MATCH_NOMATCH);
2285 case 0x0a: /* LF */
2286 case 0x0b: /* VT */
2287 case 0x0c: /* FF */
2288 case 0x0d: /* CR */
2289 case 0x85: /* NEL */
2290 case 0x2028: /* LINE SEPARATOR */
2291 case 0x2029: /* PARAGRAPH SEPARATOR */
2292 break;
2293 }
2294 ecode++;
2295 break;
2296
2297 #ifdef SUPPORT_UCP
2298 /* Check the next character by Unicode property. We will get here only
2299 if the support is in the binary; otherwise a compile-time error occurs. */
2300
2301 case OP_PROP:
2302 case OP_NOTPROP:
2303 if (eptr >= md->end_subject)
2304 {
2305 SCHECK_PARTIAL();
2306 MRRETURN(MATCH_NOMATCH);
2307 }
2308 GETCHARINCTEST(c, eptr);
2309 {
2310 const ucd_record *prop = GET_UCD(c);
2311
2312 switch(ecode[1])
2313 {
2314 case PT_ANY:
2315 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2316 break;
2317
2318 case PT_LAMP:
2319 if ((prop->chartype == ucp_Lu ||
2320 prop->chartype == ucp_Ll ||
2321 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2322 MRRETURN(MATCH_NOMATCH);
2323 break;
2324
2325 case PT_GC:
2326 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2327 MRRETURN(MATCH_NOMATCH);
2328 break;
2329
2330 case PT_PC:
2331 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2332 MRRETURN(MATCH_NOMATCH);
2333 break;
2334
2335 case PT_SC:
2336 if ((ecode[2] != prop->script) == (op == OP_PROP))
2337 MRRETURN(MATCH_NOMATCH);
2338 break;
2339
2340 /* These are specials */
2341
2342 case PT_ALNUM:
2343 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2344 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2345 MRRETURN(MATCH_NOMATCH);
2346 break;
2347
2348 case PT_SPACE: /* Perl space */
2349 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2350 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2351 == (op == OP_NOTPROP))
2352 MRRETURN(MATCH_NOMATCH);
2353 break;
2354
2355 case PT_PXSPACE: /* POSIX space */
2356 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2357 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2358 c == CHAR_FF || c == CHAR_CR)
2359 == (op == OP_NOTPROP))
2360 MRRETURN(MATCH_NOMATCH);
2361 break;
2362
2363 case PT_WORD:
2364 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2365 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2366 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2367 MRRETURN(MATCH_NOMATCH);
2368 break;
2369
2370 /* This should never occur */
2371
2372 default:
2373 RRETURN(PCRE_ERROR_INTERNAL);
2374 }
2375
2376 ecode += 3;
2377 }
2378 break;
2379
2380 /* Match an extended Unicode sequence. We will get here only if the support
2381 is in the binary; otherwise a compile-time error occurs. */
2382
2383 case OP_EXTUNI:
2384 if (eptr >= md->end_subject)
2385 {
2386 SCHECK_PARTIAL();
2387 MRRETURN(MATCH_NOMATCH);
2388 }
2389 GETCHARINCTEST(c, eptr);
2390 {
2391 int category = UCD_CATEGORY(c);
2392 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2393 while (eptr < md->end_subject)
2394 {
2395 int len = 1;
2396 if (!utf8) c = *eptr; else
2397 {
2398 GETCHARLEN(c, eptr, len);
2399 }
2400 category = UCD_CATEGORY(c);
2401 if (category != ucp_M) break;
2402 eptr += len;
2403 }
2404 }
2405 ecode++;
2406 break;
2407 #endif
2408
2409
2410 /* Match a back reference, possibly repeatedly. Look past the end of the
2411 item to see if there is repeat information following. The code is similar
2412 to that for character classes, but repeated for efficiency. Then obey
2413 similar code to character type repeats - written out again for speed.
2414 However, if the referenced string is the empty string, always treat
2415 it as matched, any number of times (otherwise there could be infinite
2416 loops). */
2417
2418 case OP_REF:
2419 case OP_REFI:
2420 caseless = op == OP_REFI;
2421 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2422 ecode += 3;
2423
2424 /* If the reference is unset, there are two possibilities:
2425
2426 (a) In the default, Perl-compatible state, set the length negative;
2427 this ensures that every attempt at a match fails. We can't just fail
2428 here, because of the possibility of quantifiers with zero minima.
2429
2430 (b) If the JavaScript compatibility flag is set, set the length to zero
2431 so that the back reference matches an empty string.
2432
2433 Otherwise, set the length to the length of what was matched by the
2434 referenced subpattern. */
2435
2436 if (offset >= offset_top || md->offset_vector[offset] < 0)
2437 length = (md->jscript_compat)? 0 : -1;
2438 else
2439 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2440
2441 /* Set up for repetition, or handle the non-repeated case */
2442
2443 switch (*ecode)
2444 {
2445 case OP_CRSTAR:
2446 case OP_CRMINSTAR:
2447 case OP_CRPLUS:
2448 case OP_CRMINPLUS:
2449 case OP_CRQUERY:
2450 case OP_CRMINQUERY:
2451 c = *ecode++ - OP_CRSTAR;
2452 minimize = (c & 1) != 0;
2453 min = rep_min[c]; /* Pick up values from tables; */
2454 max = rep_max[c]; /* zero for max => infinity */
2455 if (max == 0) max = INT_MAX;
2456 break;
2457
2458 case OP_CRRANGE:
2459 case OP_CRMINRANGE:
2460 minimize = (*ecode == OP_CRMINRANGE);
2461 min = GET2(ecode, 1);
2462 max = GET2(ecode, 3);
2463 if (max == 0) max = INT_MAX;
2464 ecode += 5;
2465 break;
2466
2467 default: /* No repeat follows */
2468 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2469 {
2470 CHECK_PARTIAL();
2471 MRRETURN(MATCH_NOMATCH);
2472 }
2473 eptr += length;
2474 continue; /* With the main loop */
2475 }
2476
2477 /* Handle repeated back references. If the length of the reference is
2478 zero, just continue with the main loop. */
2479
2480 if (length == 0) continue;
2481
2482 /* First, ensure the minimum number of matches are present. We get back
2483 the length of the reference string explicitly rather than passing the
2484 address of eptr, so that eptr can be a register variable. */
2485
2486 for (i = 1; i <= min; i++)
2487 {
2488 int slength;
2489 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2490 {
2491 CHECK_PARTIAL();
2492 MRRETURN(MATCH_NOMATCH);
2493 }
2494 eptr += slength;
2495 }
2496
2497 /* If min = max, continue at the same level without recursion.
2498 They are not both allowed to be zero. */
2499
2500 if (min == max) continue;
2501
2502 /* If minimizing, keep trying and advancing the pointer */
2503
2504 if (minimize)
2505 {
2506 for (fi = min;; fi++)
2507 {
2508 int slength;
2509 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2512 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2513 {
2514 CHECK_PARTIAL();
2515 MRRETURN(MATCH_NOMATCH);
2516 }
2517 eptr += slength;
2518 }
2519 /* Control never gets here */
2520 }
2521
2522 /* If maximizing, find the longest string and work backwards */
2523
2524 else
2525 {
2526 pp = eptr;
2527 for (i = min; i < max; i++)
2528 {
2529 int slength;
2530 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2531 {
2532 CHECK_PARTIAL();
2533 break;
2534 }
2535 eptr += slength;
2536 }
2537 while (eptr >= pp)
2538 {
2539 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2540 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2541 eptr -= length;
2542 }
2543 MRRETURN(MATCH_NOMATCH);
2544 }
2545 /* Control never gets here */
2546
2547 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2548 used when all the characters in the class have values in the range 0-255,
2549 and either the matching is caseful, or the characters are in the range
2550 0-127 when UTF-8 processing is enabled. The only difference between
2551 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2552 encountered.
2553
2554 First, look past the end of the item to see if there is repeat information
2555 following. Then obey similar code to character type repeats - written out
2556 again for speed. */
2557
2558 case OP_NCLASS:
2559 case OP_CLASS:
2560 {
2561 data = ecode + 1; /* Save for matching */
2562 ecode += 33; /* Advance past the item */
2563
2564 switch (*ecode)
2565 {
2566 case OP_CRSTAR:
2567 case OP_CRMINSTAR:
2568 case OP_CRPLUS:
2569 case OP_CRMINPLUS:
2570 case OP_CRQUERY:
2571 case OP_CRMINQUERY:
2572 c = *ecode++ - OP_CRSTAR;
2573 minimize = (c & 1) != 0;
2574 min = rep_min[c]; /* Pick up values from tables; */
2575 max = rep_max[c]; /* zero for max => infinity */
2576 if (max == 0) max = INT_MAX;
2577 break;
2578
2579 case OP_CRRANGE:
2580 case OP_CRMINRANGE:
2581 minimize = (*ecode == OP_CRMINRANGE);
2582 min = GET2(ecode, 1);
2583 max = GET2(ecode, 3);
2584 if (max == 0) max = INT_MAX;
2585 ecode += 5;
2586 break;
2587
2588 default: /* No repeat follows */
2589 min = max = 1;
2590 break;
2591 }
2592
2593 /* First, ensure the minimum number of matches are present. */
2594
2595 #ifdef SUPPORT_UTF8
2596 /* UTF-8 mode */
2597 if (utf8)
2598 {
2599 for (i = 1; i <= min; i++)
2600 {
2601 if (eptr >= md->end_subject)
2602 {
2603 SCHECK_PARTIAL();
2604 MRRETURN(MATCH_NOMATCH);
2605 }
2606 GETCHARINC(c, eptr);
2607 if (c > 255)
2608 {
2609 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2610 }
2611 else
2612 {
2613 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2614 }
2615 }
2616 }
2617 else
2618 #endif
2619 /* Not UTF-8 mode */
2620 {
2621 for (i = 1; i <= min; i++)
2622 {
2623 if (eptr >= md->end_subject)
2624 {
2625 SCHECK_PARTIAL();
2626 MRRETURN(MATCH_NOMATCH);
2627 }
2628 c = *eptr++;
2629 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2630 }
2631 }
2632
2633 /* If max == min we can continue with the main loop without the
2634 need to recurse. */
2635
2636 if (min == max) continue;
2637
2638 /* If minimizing, keep testing the rest of the expression and advancing
2639 the pointer while it matches the class. */
2640
2641 if (minimize)
2642 {
2643 #ifdef SUPPORT_UTF8
2644 /* UTF-8 mode */
2645 if (utf8)
2646 {
2647 for (fi = min;; fi++)
2648 {
2649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2652 if (eptr >= md->end_subject)
2653 {
2654 SCHECK_PARTIAL();
2655 MRRETURN(MATCH_NOMATCH);
2656 }
2657 GETCHARINC(c, eptr);
2658 if (c > 255)
2659 {
2660 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2661 }
2662 else
2663 {
2664 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2665 }
2666 }
2667 }
2668 else
2669 #endif
2670 /* Not UTF-8 mode */
2671 {
2672 for (fi = min;; fi++)
2673 {
2674 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2676 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2677 if (eptr >= md->end_subject)
2678 {
2679 SCHECK_PARTIAL();
2680 MRRETURN(MATCH_NOMATCH);
2681 }
2682 c = *eptr++;
2683 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2684 }
2685 }
2686 /* Control never gets here */
2687 }
2688
2689 /* If maximizing, find the longest possible run, then work backwards. */
2690
2691 else
2692 {
2693 pp = eptr;
2694
2695 #ifdef SUPPORT_UTF8
2696 /* UTF-8 mode */
2697 if (utf8)
2698 {
2699 for (i = min; i < max; i++)
2700 {
2701 int len = 1;
2702 if (eptr >= md->end_subject)
2703 {
2704 SCHECK_PARTIAL();
2705 break;
2706 }
2707 GETCHARLEN(c, eptr, len);
2708 if (c > 255)
2709 {
2710 if (op == OP_CLASS) break;
2711 }
2712 else
2713 {
2714 if ((data[c/8] & (1 << (c&7))) == 0) break;
2715 }
2716 eptr += len;
2717 }
2718 for (;;)
2719 {
2720 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 if (eptr-- == pp) break; /* Stop if tried at original pos */
2723 BACKCHAR(eptr);
2724 }
2725 }
2726 else
2727 #endif
2728 /* Not UTF-8 mode */
2729 {
2730 for (i = min; i < max; i++)
2731 {
2732 if (eptr >= md->end_subject)
2733 {
2734 SCHECK_PARTIAL();
2735 break;
2736 }
2737 c = *eptr;
2738 if ((data[c/8] & (1 << (c&7))) == 0) break;
2739 eptr++;
2740 }
2741 while (eptr >= pp)
2742 {
2743 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2744 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2745 eptr--;
2746 }
2747 }
2748
2749 MRRETURN(MATCH_NOMATCH);
2750 }
2751 }
2752 /* Control never gets here */
2753
2754
2755 /* Match an extended character class. This opcode is encountered only
2756 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2757 mode, because Unicode properties are supported in non-UTF-8 mode. */
2758
2759 #ifdef SUPPORT_UTF8
2760 case OP_XCLASS:
2761 {
2762 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2763 ecode += GET(ecode, 1); /* Advance past the item */
2764
2765 switch (*ecode)
2766 {
2767 case OP_CRSTAR:
2768 case OP_CRMINSTAR:
2769 case OP_CRPLUS:
2770 case OP_CRMINPLUS:
2771 case OP_CRQUERY:
2772 case OP_CRMINQUERY:
2773 c = *ecode++ - OP_CRSTAR;
2774 minimize = (c & 1) != 0;
2775 min = rep_min[c]; /* Pick up values from tables; */
2776 max = rep_max[c]; /* zero for max => infinity */
2777 if (max == 0) max = INT_MAX;
2778 break;
2779
2780 case OP_CRRANGE:
2781 case OP_CRMINRANGE:
2782 minimize = (*ecode == OP_CRMINRANGE);
2783 min = GET2(ecode, 1);
2784 max = GET2(ecode, 3);
2785 if (max == 0) max = INT_MAX;
2786 ecode += 5;
2787 break;
2788
2789 default: /* No repeat follows */
2790 min = max = 1;
2791 break;
2792 }
2793
2794 /* First, ensure the minimum number of matches are present. */
2795
2796 for (i = 1; i <= min; i++)
2797 {
2798 if (eptr >= md->end_subject)
2799 {
2800 SCHECK_PARTIAL();
2801 MRRETURN(MATCH_NOMATCH);
2802 }
2803 GETCHARINCTEST(c, eptr);
2804 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2805 }
2806
2807 /* If max == min we can continue with the main loop without the
2808 need to recurse. */
2809
2810 if (min == max) continue;
2811
2812 /* If minimizing, keep testing the rest of the expression and advancing
2813 the pointer while it matches the class. */
2814
2815 if (minimize)
2816 {
2817 for (fi = min;; fi++)
2818 {
2819 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2822 if (eptr >= md->end_subject)
2823 {
2824 SCHECK_PARTIAL();
2825 MRRETURN(MATCH_NOMATCH);
2826 }
2827 GETCHARINCTEST(c, eptr);
2828 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2829 }
2830 /* Control never gets here */
2831 }
2832
2833 /* If maximizing, find the longest possible run, then work backwards. */
2834
2835 else
2836 {
2837 pp = eptr;
2838 for (i = min; i < max; i++)
2839 {
2840 int len = 1;
2841 if (eptr >= md->end_subject)
2842 {
2843 SCHECK_PARTIAL();
2844 break;
2845 }
2846 GETCHARLENTEST(c, eptr, len);
2847 if (!_pcre_xclass(c, data)) break;
2848 eptr += len;
2849 }
2850 for(;;)
2851 {
2852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2854 if (eptr-- == pp) break; /* Stop if tried at original pos */
2855 if (utf8) BACKCHAR(eptr);
2856 }
2857 MRRETURN(MATCH_NOMATCH);
2858 }
2859
2860 /* Control never gets here */
2861 }
2862 #endif /* End of XCLASS */
2863
2864 /* Match a single character, casefully */
2865
2866 case OP_CHAR:
2867 #ifdef SUPPORT_UTF8
2868 if (utf8)
2869 {
2870 length = 1;
2871 ecode++;
2872 GETCHARLEN(fc, ecode, length);
2873 if (length > md->end_subject - eptr)
2874 {
2875 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2876 MRRETURN(MATCH_NOMATCH);
2877 }
2878 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2879 }
2880 else
2881 #endif
2882
2883 /* Non-UTF-8 mode */
2884 {
2885 if (md->end_subject - eptr < 1)
2886 {
2887 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2888 MRRETURN(MATCH_NOMATCH);
2889 }
2890 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2891 ecode += 2;
2892 }
2893 break;
2894
2895 /* Match a single character, caselessly */
2896
2897 case OP_CHARI:
2898 #ifdef SUPPORT_UTF8
2899 if (utf8)
2900 {
2901 length = 1;
2902 ecode++;
2903 GETCHARLEN(fc, ecode, length);
2904
2905 if (length > md->end_subject - eptr)
2906 {
2907 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2908 MRRETURN(MATCH_NOMATCH);
2909 }
2910
2911 /* If the pattern character's value is < 128, we have only one byte, and
2912 can use the fast lookup table. */
2913
2914 if (fc < 128)
2915 {
2916 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2917 }
2918
2919 /* Otherwise we must pick up the subject character */
2920
2921 else
2922 {
2923 unsigned int dc;
2924 GETCHARINC(dc, eptr);
2925 ecode += length;
2926
2927 /* If we have Unicode property support, we can use it to test the other
2928 case of the character, if there is one. */
2929
2930 if (fc != dc)
2931 {
2932 #ifdef SUPPORT_UCP
2933 if (dc != UCD_OTHERCASE(fc))
2934 #endif
2935 MRRETURN(MATCH_NOMATCH);
2936 }
2937 }
2938 }
2939 else
2940 #endif /* SUPPORT_UTF8 */
2941
2942 /* Non-UTF-8 mode */
2943 {
2944 if (md->end_subject - eptr < 1)
2945 {
2946 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2947 MRRETURN(MATCH_NOMATCH);
2948 }
2949 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2950 ecode += 2;
2951 }
2952 break;
2953
2954 /* Match a single character repeatedly. */
2955
2956 case OP_EXACT:
2957 case OP_EXACTI:
2958 min = max = GET2(ecode, 1);
2959 ecode += 3;
2960 goto REPEATCHAR;
2961
2962 case OP_POSUPTO:
2963 case OP_POSUPTOI:
2964 possessive = TRUE;
2965 /* Fall through */
2966
2967 case OP_UPTO:
2968 case OP_UPTOI:
2969 case OP_MINUPTO:
2970 case OP_MINUPTOI:
2971 min = 0;
2972 max = GET2(ecode, 1);
2973 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2974 ecode += 3;
2975 goto REPEATCHAR;
2976
2977 case OP_POSSTAR:
2978 case OP_POSSTARI:
2979 possessive = TRUE;
2980 min = 0;
2981 max = INT_MAX;
2982 ecode++;
2983 goto REPEATCHAR;
2984
2985 case OP_POSPLUS:
2986 case OP_POSPLUSI:
2987 possessive = TRUE;
2988 min = 1;
2989 max = INT_MAX;
2990 ecode++;
2991 goto REPEATCHAR;
2992
2993 case OP_POSQUERY:
2994 case OP_POSQUERYI:
2995 possessive = TRUE;
2996 min = 0;
2997 max = 1;
2998 ecode++;
2999 goto REPEATCHAR;
3000
3001 case OP_STAR:
3002 case OP_STARI:
3003 case OP_MINSTAR:
3004 case OP_MINSTARI:
3005 case OP_PLUS:
3006 case OP_PLUSI:
3007 case OP_MINPLUS:
3008 case OP_MINPLUSI:
3009 case OP_QUERY:
3010 case OP_QUERYI:
3011 case OP_MINQUERY:
3012 case OP_MINQUERYI:
3013 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3014 minimize = (c & 1) != 0;
3015 min = rep_min[c]; /* Pick up values from tables; */
3016 max = rep_max[c]; /* zero for max => infinity */
3017 if (max == 0) max = INT_MAX;
3018
3019 /* Common code for all repeated single-character matches. */
3020
3021 REPEATCHAR:
3022 #ifdef SUPPORT_UTF8
3023 if (utf8)
3024 {
3025 length = 1;
3026 charptr = ecode;
3027 GETCHARLEN(fc, ecode, length);
3028 ecode += length;
3029
3030 /* Handle multibyte character matching specially here. There is
3031 support for caseless matching if UCP support is present. */
3032
3033 if (length > 1)
3034 {
3035 #ifdef SUPPORT_UCP
3036 unsigned int othercase;
3037 if (op >= OP_STARI && /* Caseless */
3038 (othercase = UCD_OTHERCASE(fc)) != fc)
3039 oclength = _pcre_ord2utf8(othercase, occhars);
3040 else oclength = 0;
3041 #endif /* SUPPORT_UCP */
3042
3043 for (i = 1; i <= min; i++)
3044 {
3045 if (eptr <= md->end_subject - length &&
3046 memcmp(eptr, charptr, length) == 0) eptr += length;
3047 #ifdef SUPPORT_UCP
3048 else if (oclength > 0 &&
3049 eptr <= md->end_subject - oclength &&
3050 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3051 #endif /* SUPPORT_UCP */
3052 else
3053 {
3054 CHECK_PARTIAL();
3055 MRRETURN(MATCH_NOMATCH);
3056 }
3057 }
3058
3059 if (min == max) continue;
3060
3061 if (minimize)
3062 {
3063 for (fi = min;; fi++)
3064 {
3065 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3067 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3068 if (eptr <= md->end_subject - length &&
3069 memcmp(eptr, charptr, length) == 0) eptr += length;
3070 #ifdef SUPPORT_UCP
3071 else if (oclength > 0 &&
3072 eptr <= md->end_subject - oclength &&
3073 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3074 #endif /* SUPPORT_UCP */
3075 else
3076 {
3077 CHECK_PARTIAL();
3078 MRRETURN(MATCH_NOMATCH);
3079 }
3080 }
3081 /* Control never gets here */
3082 }
3083
3084 else /* Maximize */
3085 {
3086 pp = eptr;
3087 for (i = min; i < max; i++)
3088 {
3089 if (eptr <= md->end_subject - length &&
3090 memcmp(eptr, charptr, length) == 0) eptr += length;
3091 #ifdef SUPPORT_UCP
3092 else if (oclength > 0 &&
3093 eptr <= md->end_subject - oclength &&
3094 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3095 #endif /* SUPPORT_UCP */
3096 else
3097 {
3098 CHECK_PARTIAL();
3099 break;
3100 }
3101 }
3102
3103 if (possessive) continue;
3104
3105 for(;;)
3106 {
3107 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3109 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3110 #ifdef SUPPORT_UCP
3111 eptr--;
3112 BACKCHAR(eptr);
3113 #else /* without SUPPORT_UCP */
3114 eptr -= length;
3115 #endif /* SUPPORT_UCP */
3116 }
3117 }
3118 /* Control never gets here */
3119 }
3120
3121 /* If the length of a UTF-8 character is 1, we fall through here, and
3122 obey the code as for non-UTF-8 characters below, though in this case the
3123 value of fc will always be < 128. */
3124 }
3125 else
3126 #endif /* SUPPORT_UTF8 */
3127
3128 /* When not in UTF-8 mode, load a single-byte character. */
3129
3130 fc = *ecode++;
3131
3132 /* The value of fc at this point is always less than 256, though we may or
3133 may not be in UTF-8 mode. The code is duplicated for the caseless and
3134 caseful cases, for speed, since matching characters is likely to be quite
3135 common. First, ensure the minimum number of matches are present. If min =
3136 max, continue at the same level without recursing. Otherwise, if
3137 minimizing, keep trying the rest of the expression and advancing one
3138 matching character if failing, up to the maximum. Alternatively, if
3139 maximizing, find the maximum number of characters and work backwards. */
3140
3141 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3142 max, eptr));
3143
3144 if (op >= OP_STARI) /* Caseless */
3145 {
3146 fc = md->lcc[fc];
3147 for (i = 1; i <= min; i++)
3148 {
3149 if (eptr >= md->end_subject)
3150 {
3151 SCHECK_PARTIAL();
3152 MRRETURN(MATCH_NOMATCH);
3153 }
3154 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3155 }
3156 if (min == max) continue;
3157 if (minimize)
3158 {
3159 for (fi = min;; fi++)
3160 {
3161 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3163 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3164 if (eptr >= md->end_subject)
3165 {
3166 SCHECK_PARTIAL();
3167 MRRETURN(MATCH_NOMATCH);
3168 }
3169 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3170 }
3171 /* Control never gets here */
3172 }
3173 else /* Maximize */
3174 {
3175 pp = eptr;
3176 for (i = min; i < max; i++)
3177 {
3178 if (eptr >= md->end_subject)
3179 {
3180 SCHECK_PARTIAL();
3181 break;
3182 }
3183 if (fc != md->lcc[*eptr]) break;
3184 eptr++;
3185 }
3186
3187 if (possessive) continue;
3188
3189 while (eptr >= pp)
3190 {
3191 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3192 eptr--;
3193 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3194 }
3195 MRRETURN(MATCH_NOMATCH);
3196 }
3197 /* Control never gets here */
3198 }
3199
3200 /* Caseful comparisons (fc is always a single byte here; multi-byte characters were handled above) */
3201
3202 else
3203 {
3204 for (i = 1; i <= min; i++)
3205 {
3206 if (eptr >= md->end_subject)
3207 {
3208 SCHECK_PARTIAL();
3209 MRRETURN(MATCH_NOMATCH);
3210 }
3211 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3212 }
3213
3214 if (min == max) continue;
3215
3216 if (minimize)
3217 {
3218 for (fi = min;; fi++)
3219 {
3220 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3222 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3223 if (eptr >= md->end_subject)
3224 {
3225 SCHECK_PARTIAL();
3226 MRRETURN(MATCH_NOMATCH);
3227 }
3228 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3229 }
3230 /* Control never gets here */
3231 }
3232 else /* Maximize */
3233 {
3234 pp = eptr;
3235 for (i = min; i < max; i++)
3236 {
3237 if (eptr >= md->end_subject)
3238 {
3239 SCHECK_PARTIAL();
3240 break;
3241 }
3242 if (fc != *eptr) break;
3243 eptr++;
3244 }
3245 if (possessive) continue;
3246
3247 while (eptr >= pp)
3248 {
3249 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3250 eptr--;
3251 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3252 }
3253 MRRETURN(MATCH_NOMATCH);
3254 }
3255 }
3256 /* Control never gets here */
3257
3258 /* Match a negated single one-byte character. The subject character we are
3259 checking it against can be multibyte. */
3260
3261 case OP_NOT:
3262 case OP_NOTI:
3263 if (eptr >= md->end_subject)
3264 {
3265 SCHECK_PARTIAL();
3266 MRRETURN(MATCH_NOMATCH);
3267 }
3268 ecode++;
3269 GETCHARINCTEST(c, eptr);
3270 if (op == OP_NOTI) /* The caseless case */
3271 {
3272 #ifdef SUPPORT_UTF8
3273 if (c < 256)
3274 #endif
3275 c = md->lcc[c];
3276 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3277 }
3278 else /* Caseful */
3279 {
3280 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3281 }
3282 break;
3283
3284 /* Match a negated single one-byte character repeatedly. This is almost a
3285 repeat of the code for a repeated single character, but I haven't found a
3286 nice way of commoning these up that doesn't require a test of the
3287 positive/negative option for each character match. Maybe that wouldn't add
3288 very much to the time taken, but character matching *is* what this is all
3289 about... */
3290
3291 case OP_NOTEXACT:
3292 case OP_NOTEXACTI:
3293 min = max = GET2(ecode, 1);
3294 ecode += 3;
3295 goto REPEATNOTCHAR;
3296
3297 case OP_NOTUPTO:
3298 case OP_NOTUPTOI:
3299 case OP_NOTMINUPTO:
3300 case OP_NOTMINUPTOI:
3301 min = 0;
3302 max = GET2(ecode, 1);
3303 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3304 ecode += 3;
3305 goto REPEATNOTCHAR;
3306
3307 case OP_NOTPOSSTAR:
3308 case OP_NOTPOSSTARI:
3309 possessive = TRUE;
3310 min = 0;
3311 max = INT_MAX;
3312 ecode++;
3313 goto REPEATNOTCHAR;
3314
3315 case OP_NOTPOSPLUS:
3316 case OP_NOTPOSPLUSI:
3317 possessive = TRUE;
3318 min = 1;
3319 max = INT_MAX;
3320 ecode++;
3321 goto REPEATNOTCHAR;
3322
3323 case OP_NOTPOSQUERY:
3324 case OP_NOTPOSQUERYI:
3325 possessive = TRUE;
3326 min = 0;
3327 max = 1;
3328 ecode++;
3329 goto REPEATNOTCHAR;
3330
3331 case OP_NOTPOSUPTO:
3332 case OP_NOTPOSUPTOI:
3333 possessive = TRUE;
3334 min = 0;
3335 max = GET2(ecode, 1);
3336 ecode += 3;
3337 goto REPEATNOTCHAR;
3338
3339 case OP_NOTSTAR:
3340 case OP_NOTSTARI:
3341 case OP_NOTMINSTAR:
3342 case OP_NOTMINSTARI:
3343 case OP_NOTPLUS:
3344 case OP_NOTPLUSI:
3345 case OP_NOTMINPLUS:
3346 case OP_NOTMINPLUSI:
3347 case OP_NOTQUERY:
3348 case OP_NOTQUERYI:
3349 case OP_NOTMINQUERY:
3350 case OP_NOTMINQUERYI:
3351 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3352 minimize = (c & 1) != 0;
3353 min = rep_min[c]; /* Pick up values from tables; */
3354 max = rep_max[c]; /* zero for max => infinity */
3355 if (max == 0) max = INT_MAX;
3356
3357 /* Common code for all repeated negated single-byte matches. */
3358
3359 REPEATNOTCHAR:
3360 fc = *ecode++;
3361
3362 /* The code is duplicated for the caseless and caseful cases, for speed,
3363 since matching characters is likely to be quite common. First, ensure the
3364 minimum number of matches are present. If min = max, continue at the same
3365 level without recursing. Otherwise, if minimizing, keep trying the rest of
3366 the expression and advancing one matching character if failing, up to the
3367 maximum. Alternatively, if maximizing, find the maximum number of
3368 characters and work backwards. */
3369
3370 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3371 max, eptr));
3372
3373 if (op >= OP_NOTSTARI) /* Caseless */
3374 {
3375 fc = md->lcc[fc];
3376
3377 #ifdef SUPPORT_UTF8
3378 /* UTF-8 mode */
3379 if (utf8)
3380 {
3381 register unsigned int d;
3382 for (i = 1; i <= min; i++)
3383 {
3384 if (eptr >= md->end_subject)
3385 {
3386 SCHECK_PARTIAL();
3387 MRRETURN(MATCH_NOMATCH);
3388 }
3389 GETCHARINC(d, eptr);
3390 if (d < 256) d = md->lcc[d];
3391 if (fc == d) MRRETURN(MATCH_NOMATCH);
3392 }
3393 }
3394 else
3395 #endif
3396
3397 /* Not UTF-8 mode */
3398 {
3399 for (i = 1; i <= min; i++)
3400 {
3401 if (eptr >= md->end_subject)
3402 {
3403 SCHECK_PARTIAL();
3404 MRRETURN(MATCH_NOMATCH);
3405 }
3406 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3407 }
3408 }
3409
3410 if (min == max) continue;
3411
3412 if (minimize)
3413 {
3414 #ifdef SUPPORT_UTF8
3415 /* UTF-8 mode */
3416 if (utf8)
3417 {
3418 register unsigned int d;
3419 for (fi = min;; fi++)
3420 {
3421 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3423 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3424 if (eptr >= md->end_subject)
3425 {
3426 SCHECK_PARTIAL();
3427 MRRETURN(MATCH_NOMATCH);
3428 }
3429 GETCHARINC(d, eptr);
3430 if (d < 256) d = md->lcc[d];
3431 if (fc == d) MRRETURN(MATCH_NOMATCH);
3432 }
3433 }
3434 else
3435 #endif
3436 /* Not UTF-8 mode */
3437 {
3438 for (fi = min;; fi++)
3439 {
3440 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3441 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3443 if (eptr >= md->end_subject)
3444 {
3445 SCHECK_PARTIAL();
3446 MRRETURN(MATCH_NOMATCH);
3447 }
3448 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3449 }
3450 }
3451 /* Control never gets here */
3452 }
3453
3454 /* Maximize case */
3455
3456 else
3457 {
3458 pp = eptr;
3459
3460 #ifdef SUPPORT_UTF8
3461 /* UTF-8 mode */
3462 if (utf8)
3463 {
3464 register unsigned int d;
3465 for (i = min; i < max; i++)
3466 {
3467 int len = 1;
3468 if (eptr >= md->end_subject)
3469 {
3470 SCHECK_PARTIAL();
3471 break;
3472 }
3473 GETCHARLEN(d, eptr, len);
3474 if (d < 256) d = md->lcc[d];
3475 if (fc == d) break;
3476 eptr += len;
3477 }
3478 if (possessive) continue;
3479 for(;;)
3480 {
3481 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3483 if (eptr-- == pp) break; /* Stop if tried at original pos */
3484 BACKCHAR(eptr);
3485 }
3486 }
3487 else
3488 #endif
3489 /* Not UTF-8 mode */
3490 {
3491 for (i = min; i < max; i++)
3492 {
3493 if (eptr >= md->end_subject)
3494 {
3495 SCHECK_PARTIAL();
3496 break;
3497 }
3498 if (fc == md->lcc[*eptr]) break;
3499 eptr++;
3500 }
3501 if (possessive) continue;
3502 while (eptr >= pp)
3503 {
3504 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3505 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3506 eptr--;
3507 }
3508 }
3509
3510 MRRETURN(MATCH_NOMATCH);
3511 }
3512 /* Control never gets here */
3513 }
3514
3515 /* Caseful comparisons */
3516
3517 else
3518 {
3519 #ifdef SUPPORT_UTF8
3520 /* UTF-8 mode */
3521 if (utf8)
3522 {
3523 register unsigned int d;
3524 for (i = 1; i <= min; i++)
3525 {
3526 if (eptr >= md->end_subject)
3527 {
3528 SCHECK_PARTIAL();
3529 MRRETURN(MATCH_NOMATCH);
3530 }
3531 GETCHARINC(d, eptr);
3532 if (fc == d) MRRETURN(MATCH_NOMATCH);
3533 }
3534 }
3535 else
3536 #endif
3537 /* Not UTF-8 mode */
3538 {
3539 for (i = 1; i <= min; i++)
3540 {
3541 if (eptr >= md->end_subject)
3542 {
3543 SCHECK_PARTIAL();
3544 MRRETURN(MATCH_NOMATCH);
3545 }
3546 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3547 }
3548 }
3549
3550 if (min == max) continue;
3551
3552 if (minimize)
3553 {
3554 #ifdef SUPPORT_UTF8
3555 /* UTF-8 mode */
3556 if (utf8)
3557 {
3558 register unsigned int d;
3559 for (fi = min;; fi++)
3560 {
3561 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3562 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3563 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3564 if (eptr >= md->end_subject)
3565 {
3566 SCHECK_PARTIAL();
3567 MRRETURN(MATCH_NOMATCH);
3568 }
3569 GETCHARINC(d, eptr);
3570 if (fc == d) MRRETURN(MATCH_NOMATCH);
3571 }
3572 }
3573 else
3574 #endif
3575 /* Not UTF-8 mode */
3576 {
3577 for (fi = min;; fi++)
3578 {
3579 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3580 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3581 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3582 if (eptr >= md->end_subject)
3583 {
3584 SCHECK_PARTIAL();
3585 MRRETURN(MATCH_NOMATCH);
3586 }
3587 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3588 }
3589 }
3590 /* Control never gets here */
3591 }
3592
3593 /* Maximize case */
3594
3595 else
3596 {
3597 pp = eptr;
3598
3599 #ifdef SUPPORT_UTF8
3600 /* UTF-8 mode */
3601 if (utf8)
3602 {
3603 register unsigned int d;
3604 for (i = min; i < max; i++)
3605 {
3606 int len = 1;
3607 if (eptr >= md->end_subject)
3608 {
3609 SCHECK_PARTIAL();
3610 break;
3611 }
3612 GETCHARLEN(d, eptr, len);
3613 if (fc == d) break;
3614 eptr += len;
3615 }
3616 if (possessive) continue;
3617 for(;;)
3618 {
3619 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3620 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3621 if (eptr-- == pp) break; /* Stop if tried at original pos */
3622 BACKCHAR(eptr);
3623 }
3624 }
3625 else
3626 #endif
3627 /* Not UTF-8 mode */
3628 {
3629 for (i = min; i < max; i++)
3630 {
3631 if (eptr >= md->end_subject)
3632 {
3633 SCHECK_PARTIAL();
3634 break;
3635 }
3636 if (fc == *eptr) break;
3637 eptr++;
3638 }
3639 if (possessive) continue;
3640 while (eptr >= pp)
3641 {
3642 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3643 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3644 eptr--;
3645 }
3646 }
3647
3648 MRRETURN(MATCH_NOMATCH);
3649 }
3650 }
3651 /* Control never gets here */
3652
3653 /* Match a single character type repeatedly; several different opcodes
3654 share code. This is very similar to the code for single characters, but we
3655 repeat it in the interests of efficiency. */
3656
3657 case OP_TYPEEXACT:
3658 min = max = GET2(ecode, 1);
3659 minimize = TRUE;
3660 ecode += 3;
3661 goto REPEATTYPE;
3662
3663 case OP_TYPEUPTO:
3664 case OP_TYPEMINUPTO:
3665 min = 0;
3666 max = GET2(ecode, 1);
3667 minimize = *ecode == OP_TYPEMINUPTO;
3668 ecode += 3;
3669 goto REPEATTYPE;
3670
3671 case OP_TYPEPOSSTAR:
3672 possessive = TRUE;
3673 min = 0;
3674 max = INT_MAX;
3675 ecode++;
3676 goto REPEATTYPE;
3677
3678 case OP_TYPEPOSPLUS:
3679 possessive = TRUE;
3680 min = 1;
3681 max = INT_MAX;
3682 ecode++;
3683 goto REPEATTYPE;
3684
3685 case OP_TYPEPOSQUERY:
3686 possessive = TRUE;
3687 min = 0;
3688 max = 1;
3689 ecode++;
3690 goto REPEATTYPE;
3691
3692 case OP_TYPEPOSUPTO:
3693 possessive = TRUE;
3694 min = 0;
3695 max = GET2(ecode, 1);
3696 ecode += 3;
3697 goto REPEATTYPE;
3698
3699 case OP_TYPESTAR:
3700 case OP_TYPEMINSTAR:
3701 case OP_TYPEPLUS:
3702 case OP_TYPEMINPLUS:
3703 case OP_TYPEQUERY:
3704 case OP_TYPEMINQUERY:
3705 c = *ecode++ - OP_TYPESTAR;
3706 minimize = (c & 1) != 0;
3707 min = rep_min[c]; /* Pick up values from tables; */
3708 max = rep_max[c]; /* zero for max => infinity */
3709 if (max == 0) max = INT_MAX;
3710
3711 /* Common code for all repeated single character type matches. Note that
3712 in UTF-8 mode, '.' matches a character of any length, as can several other
3713 types (e.g. \R, \h, \v); \d, \s, and \w match only one-byte characters. */
3714
3715 REPEATTYPE:
3716 ctype = *ecode++; /* Code for the character type */
3717
3718 #ifdef SUPPORT_UCP
3719 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3720 {
3721 prop_fail_result = ctype == OP_NOTPROP;
3722 prop_type = *ecode++;
3723 prop_value = *ecode++;
3724 }
3725 else prop_type = -1;
3726 #endif
3727
3728 /* First, ensure the minimum number of matches are present. Use inline
3729 code for maximizing the speed, and do the type test once at the start
3730 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3731 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3732 and single-bytes. */
3733
3734 if (min > 0)
3735 {
3736 #ifdef SUPPORT_UCP
3737 if (prop_type >= 0)
3738 {
3739 switch(prop_type)
3740 {
3741 case PT_ANY:
3742 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3743 for (i = 1; i <= min; i++)
3744 {
3745 if (eptr >= md->end_subject)
3746 {
3747 SCHECK_PARTIAL();
3748 MRRETURN(MATCH_NOMATCH);
3749 }
3750 GETCHARINCTEST(c, eptr);
3751 }
3752 break;
3753
3754 case PT_LAMP:
3755 for (i = 1; i <= min; i++)
3756 {
3757 if (eptr >= md->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 MRRETURN(MATCH_NOMATCH);
3761 }
3762 GETCHARINCTEST(c, eptr);
3763 prop_chartype = UCD_CHARTYPE(c);
3764 if ((prop_chartype == ucp_Lu ||
3765 prop_chartype == ucp_Ll ||
3766 prop_chartype == ucp_Lt) == prop_fail_result)
3767 MRRETURN(MATCH_NOMATCH);
3768 }
3769 break;
3770
3771 case PT_GC:
3772 for (i = 1; i <= min; i++)
3773 {
3774 if (eptr >= md->end_subject)
3775 {
3776 SCHECK_PARTIAL();
3777 MRRETURN(MATCH_NOMATCH);
3778 }
3779 GETCHARINCTEST(c, eptr);
3780 prop_category = UCD_CATEGORY(c);
3781 if ((prop_category == prop_value) == prop_fail_result)
3782 MRRETURN(MATCH_NOMATCH);
3783 }
3784 break;
3785
3786 case PT_PC:
3787 for (i = 1; i <= min; i++)
3788 {
3789 if (eptr >= md->end_subject)
3790 {
3791 SCHECK_PARTIAL();
3792 MRRETURN(MATCH_NOMATCH);
3793 }
3794 GETCHARINCTEST(c, eptr);
3795 prop_chartype = UCD_CHARTYPE(c);
3796 if ((prop_chartype == prop_value) == prop_fail_result)
3797 MRRETURN(MATCH_NOMATCH);
3798 }
3799 break;
3800
3801 case PT_SC:
3802 for (i = 1; i <= min; i++)
3803 {
3804 if (eptr >= md->end_subject)
3805 {
3806 SCHECK_PARTIAL();
3807 MRRETURN(MATCH_NOMATCH);
3808 }
3809 GETCHARINCTEST(c, eptr);
3810 prop_script = UCD_SCRIPT(c);
3811 if ((prop_script == prop_value) == prop_fail_result)
3812 MRRETURN(MATCH_NOMATCH);
3813 }
3814 break;
3815
3816 case PT_ALNUM:
3817 for (i = 1; i <= min; i++)
3818 {
3819 if (eptr >= md->end_subject)
3820 {
3821 SCHECK_PARTIAL();
3822 MRRETURN(MATCH_NOMATCH);
3823 }
3824 GETCHARINCTEST(c, eptr);
3825 prop_category = UCD_CATEGORY(c);
3826 if ((prop_category == ucp_L || prop_category == ucp_N)
3827 == prop_fail_result)
3828 MRRETURN(MATCH_NOMATCH);
3829 }
3830 break;
3831
3832 case PT_SPACE: /* Perl space */
3833 for (i = 1; i <= min; i++)
3834 {
3835 if (eptr >= md->end_subject)
3836 {
3837 SCHECK_PARTIAL();
3838 MRRETURN(MATCH_NOMATCH);
3839 }
3840 GETCHARINCTEST(c, eptr);
3841 prop_category = UCD_CATEGORY(c);
3842 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3843 c == CHAR_FF || c == CHAR_CR)
3844 == prop_fail_result)
3845 MRRETURN(MATCH_NOMATCH);
3846 }
3847 break;
3848
3849 case PT_PXSPACE: /* POSIX space */
3850 for (i = 1; i <= min; i++)
3851 {
3852 if (eptr >= md->end_subject)
3853 {
3854 SCHECK_PARTIAL();
3855 MRRETURN(MATCH_NOMATCH);
3856 }
3857 GETCHARINCTEST(c, eptr);
3858 prop_category = UCD_CATEGORY(c);
3859 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3860 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3861 == prop_fail_result)
3862 MRRETURN(MATCH_NOMATCH);
3863 }
3864 break;
3865
3866 case PT_WORD:
3867 for (i = 1; i <= min; i++)
3868 {
3869 if (eptr >= md->end_subject)
3870 {
3871 SCHECK_PARTIAL();
3872 MRRETURN(MATCH_NOMATCH);
3873 }
3874 GETCHARINCTEST(c, eptr);
3875 prop_category = UCD_CATEGORY(c);
3876 if ((prop_category == ucp_L || prop_category == ucp_N ||
3877 c == CHAR_UNDERSCORE)
3878 == prop_fail_result)
3879 MRRETURN(MATCH_NOMATCH);
3880 }
3881 break;
3882
3883 /* This should not occur */
3884
3885 default:
3886 RRETURN(PCRE_ERROR_INTERNAL);
3887 }
3888 }
3889
3890 /* Match extended Unicode sequences. We will get here only if the
3891 support is in the binary; otherwise a compile-time error occurs. */
3892
3893 else if (ctype == OP_EXTUNI)
3894 {
3895 for (i = 1; i <= min; i++)
3896 {
3897 if (eptr >= md->end_subject)
3898 {
3899 SCHECK_PARTIAL();
3900 MRRETURN(MATCH_NOMATCH);
3901 }
3902 GETCHARINCTEST(c, eptr);
3903 prop_category = UCD_CATEGORY(c);
3904 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3905 while (eptr < md->end_subject)
3906 {
3907 int len = 1;
3908 if (!utf8) c = *eptr;
3909 else { GETCHARLEN(c, eptr, len); }
3910 prop_category = UCD_CATEGORY(c);
3911 if (prop_category != ucp_M) break;
3912 eptr += len;
3913 }
3914 }
3915 }
3916
3917 else
3918 #endif /* SUPPORT_UCP */
3919
3920 /* Handle all other cases when the coding is UTF-8 */
3921
3922 #ifdef SUPPORT_UTF8
3923 if (utf8) switch(ctype)
3924 {
3925 case OP_ANY:
3926 for (i = 1; i <= min; i++)
3927 {
3928 if (eptr >= md->end_subject)
3929 {
3930 SCHECK_PARTIAL();
3931 MRRETURN(MATCH_NOMATCH);
3932 }
3933 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3934 eptr++;
3935 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3936 }
3937 break;
3938
3939 case OP_ALLANY:
3940 for (i = 1; i <= min; i++)
3941 {
3942 if (eptr >= md->end_subject)
3943 {
3944 SCHECK_PARTIAL();
3945 MRRETURN(MATCH_NOMATCH);
3946 }
3947 eptr++;
3948 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3949 }
3950 break;
3951
3952 case OP_ANYBYTE:
3953 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3954 eptr += min;
3955 break;
3956
3957 case OP_ANYNL:
3958 for (i = 1; i <= min; i++)
3959 {
3960 if (eptr >= md->end_subject)
3961 {
3962 SCHECK_PARTIAL();
3963 MRRETURN(MATCH_NOMATCH);
3964 }
3965 GETCHARINC(c, eptr);
3966 switch(c)
3967 {
3968 default: MRRETURN(MATCH_NOMATCH);
3969
3970 case 0x000d:
3971 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3972 break;
3973
3974 case 0x000a:
3975 break;
3976
3977 case 0x000b:
3978 case 0x000c:
3979 case 0x0085:
3980 case 0x2028:
3981 case 0x2029:
3982 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3983 break;
3984 }
3985 }
3986 break;
3987
3988 case OP_NOT_HSPACE:
3989 for (i = 1; i <= min; i++)
3990 {
3991 if (eptr >= md->end_subject)
3992 {
3993 SCHECK_PARTIAL();
3994 MRRETURN(MATCH_NOMATCH);
3995 }
3996 GETCHARINC(c, eptr);
3997 switch(c)
3998 {
3999 default: break;
4000 case 0x09: /* HT */
4001 case 0x20: /* SPACE */
4002 case 0xa0: /* NBSP */
4003 case 0x1680: /* OGHAM SPACE MARK */
4004 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4005 case 0x2000: /* EN QUAD */
4006 case 0x2001: /* EM QUAD */
4007 case 0x2002: /* EN SPACE */
4008 case 0x2003: /* EM SPACE */
4009 case 0x2004: /* THREE-PER-EM SPACE */
4010 case 0x2005: /* FOUR-PER-EM SPACE */
4011 case 0x2006: /* SIX-PER-EM SPACE */
4012 case 0x2007: /* FIGURE SPACE */
4013 case 0x2008: /* PUNCTUATION SPACE */
4014 case 0x2009: /* THIN SPACE */
4015 case 0x200A: /* HAIR SPACE */
4016 case 0x202f: /* NARROW NO-BREAK SPACE */
4017 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4018 case 0x3000: /* IDEOGRAPHIC SPACE */
4019 MRRETURN(MATCH_NOMATCH);
4020 }
4021 }
4022 break;
4023
4024 case OP_HSPACE:
4025 for (i = 1; i <= min; i++)
4026 {
4027 if (eptr >= md->end_subject)
4028 {
4029 SCHECK_PARTIAL();
4030 MRRETURN(MATCH_NOMATCH);
4031 }
4032 GETCHARINC(c, eptr);
4033 switch(c)
4034 {
4035 default: MRRETURN(MATCH_NOMATCH);
4036 case 0x09: /* HT */
4037 case 0x20: /* SPACE */
4038 case 0xa0: /* NBSP */
4039 case 0x1680: /* OGHAM SPACE MARK */
4040 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4041 case 0x2000: /* EN QUAD */
4042 case 0x2001: /* EM QUAD */
4043 case 0x2002: /* EN SPACE */
4044 case 0x2003: /* EM SPACE */
4045 case 0x2004: /* THREE-PER-EM SPACE */
4046 case 0x2005: /* FOUR-PER-EM SPACE */
4047 case 0x2006: /* SIX-PER-EM SPACE */
4048 case 0x2007: /* FIGURE SPACE */
4049 case 0x2008: /* PUNCTUATION SPACE */
4050 case 0x2009: /* THIN SPACE */
4051 case 0x200A: /* HAIR SPACE */
4052 case 0x202f: /* NARROW NO-BREAK SPACE */
4053 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4054 case 0x3000: /* IDEOGRAPHIC SPACE */
4055 break;
4056 }
4057 }
4058 break;
4059
4060 case OP_NOT_VSPACE:
4061 for (i = 1; i <= min; i++)
4062 {
4063 if (eptr >= md->end_subject)
4064 {
4065 SCHECK_PARTIAL();
4066 MRRETURN(MATCH_NOMATCH);
4067 }
4068 GETCHARINC(c, eptr);
4069 switch(c)
4070 {
4071 default: break;
4072 case 0x0a: /* LF */
4073 case 0x0b: /* VT */
4074 case 0x0c: /* FF */
4075 case 0x0d: /* CR */
4076 case 0x85: /* NEL */
4077 case 0x2028: /* LINE SEPARATOR */
4078 case 0x2029: /* PARAGRAPH SEPARATOR */
4079 MRRETURN(MATCH_NOMATCH);
4080 }
4081 }
4082 break;
4083
4084 case OP_VSPACE:
4085 for (i = 1; i <= min; i++)
4086 {
4087 if (eptr >= md->end_subject)
4088 {
4089 SCHECK_PARTIAL();
4090 MRRETURN(MATCH_NOMATCH);
4091 }
4092 GETCHARINC(c, eptr);
4093 switch(c)
4094 {
4095 default: MRRETURN(MATCH_NOMATCH);
4096 case 0x0a: /* LF */
4097 case 0x0b: /* VT */
4098 case 0x0c: /* FF */
4099 case 0x0d: /* CR */
4100 case 0x85: /* NEL */
4101 case 0x2028: /* LINE SEPARATOR */
4102 case 0x2029: /* PARAGRAPH SEPARATOR */
4103 break;
4104 }
4105 }
4106 break;
4107
4108 case OP_NOT_DIGIT:
4109 for (i = 1; i <= min; i++)
4110 {
4111 if (eptr >= md->end_subject)
4112 {
4113 SCHECK_PARTIAL();
4114 MRRETURN(MATCH_NOMATCH);
4115 }
4116 GETCHARINC(c, eptr);
4117 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4118 MRRETURN(MATCH_NOMATCH);
4119 }
4120 break;
4121
4122 case OP_DIGIT:
4123 for (i = 1; i <= min; i++)
4124 {
4125 if (eptr >= md->end_subject)
4126 {
4127 SCHECK_PARTIAL();
4128 MRRETURN(MATCH_NOMATCH);
4129 }
4130 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4131 MRRETURN(MATCH_NOMATCH);
4132 /* No need to skip more bytes - we know it's a 1-byte character */
4133 }
4134 break;
4135
4136 case OP_NOT_WHITESPACE:
4137 for (i = 1; i <= min; i++)
4138 {
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 MRRETURN(MATCH_NOMATCH);
4143 }
4144 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4145 MRRETURN(MATCH_NOMATCH);
4146 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4147 }
4148 break;
4149
4150 case OP_WHITESPACE:
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 MRRETURN(MATCH_NOMATCH);
4157 }
4158 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4159 MRRETURN(MATCH_NOMATCH);
4160 /* No need to skip more bytes - we know it's a 1-byte character */
4161 }
4162 break;
4163
4164 case OP_NOT_WORDCHAR:
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 MRRETURN(MATCH_NOMATCH);
4171 }
4172 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4173 MRRETURN(MATCH_NOMATCH);
4174 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4175 }
4176 break;
4177
4178 case OP_WORDCHAR:
4179 for (i = 1; i <= min; i++)
4180 {
4181 if (eptr >= md->end_subject)
4182 {
4183 SCHECK_PARTIAL();
4184 MRRETURN(MATCH_NOMATCH);
4185 }
4186 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4187 MRRETURN(MATCH_NOMATCH);
4188 /* No need to skip more bytes - we know it's a 1-byte character */
4189 }
4190 break;
4191
4192 default:
4193 RRETURN(PCRE_ERROR_INTERNAL);
4194 } /* End switch(ctype) */
4195
4196 else
4197 #endif /* SUPPORT_UTF8 */
4198
4199 /* Code for the non-UTF-8 case for minimum matching of operators other
4200 than OP_PROP and OP_NOTPROP. */
4201
4202 switch(ctype)
4203 {
4204 case OP_ANY:
4205 for (i = 1; i <= min; i++)
4206 {
4207 if (eptr >= md->end_subject)
4208 {
4209 SCHECK_PARTIAL();
4210 MRRETURN(MATCH_NOMATCH);
4211 }
4212 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4213 eptr++;
4214 }
4215 break;
4216
4217 case OP_ALLANY:
4218 if (eptr > md->end_subject - min)
4219 {
4220 SCHECK_PARTIAL();
4221 MRRETURN(MATCH_NOMATCH);
4222 }
4223 eptr += min;
4224 break;
4225
4226 case OP_ANYBYTE:
4227 if (eptr > md->end_subject - min)
4228 {
4229 SCHECK_PARTIAL();
4230 MRRETURN(MATCH_NOMATCH);
4231 }
4232 eptr += min;
4233 break;
4234
4235 case OP_ANYNL:
4236 for (i = 1; i <= min; i++)
4237 {
4238 if (eptr >= md->end_subject)
4239 {
4240 SCHECK_PARTIAL();
4241 MRRETURN(MATCH_NOMATCH);
4242 }
4243 switch(*eptr++)
4244 {
4245 default: MRRETURN(MATCH_NOMATCH);
4246
4247 case 0x000d:
4248 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4249 break;
4250
4251 case 0x000a:
4252 break;
4253
4254 case 0x000b:
4255 case 0x000c:
4256 case 0x0085:
4257 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4258 break;
4259 }
4260 }
4261 break;
4262
4263 case OP_NOT_HSPACE:
4264 for (i = 1; i <= min; i++)
4265 {
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 MRRETURN(MATCH_NOMATCH);
4270 }
4271 switch(*eptr++)
4272 {
4273 default: break;
4274 case 0x09: /* HT */
4275 case 0x20: /* SPACE */
4276 case 0xa0: /* NBSP */
4277 MRRETURN(MATCH_NOMATCH);
4278 }
4279 }
4280 break;
4281
4282 case OP_HSPACE:
4283 for (i = 1; i <= min; i++)
4284 {
4285 if (eptr >= md->end_subject)
4286 {
4287 SCHECK_PARTIAL();
4288 MRRETURN(MATCH_NOMATCH);
4289 }
4290 switch(*eptr++)
4291 {
4292 default: MRRETURN(MATCH_NOMATCH);
4293 case 0x09: /* HT */
4294 case 0x20: /* SPACE */
4295 case 0xa0: /* NBSP */
4296 break;
4297 }
4298 }
4299 break;
4300
4301 case OP_NOT_VSPACE:
4302 for (i = 1; i <= min; i++)
4303 {
4304 if (eptr >= md->end_subject)
4305 {
4306 SCHECK_PARTIAL();
4307 MRRETURN(MATCH_NOMATCH);
4308 }
4309 switch(*eptr++)
4310 {
4311 default: break;
4312 case 0x0a: /* LF */
4313 case 0x0b: /* VT */
4314 case 0x0c: /* FF */
4315 case 0x0d: /* CR */
4316 case 0x85: /* NEL */
4317 MRRETURN(MATCH_NOMATCH);
4318 }
4319 }
4320 break;
4321
4322 case OP_VSPACE:
4323 for (i = 1; i <= min; i++)
4324 {
4325 if (eptr >= md->end_subject)
4326 {
4327 SCHECK_PARTIAL();
4328 MRRETURN(MATCH_NOMATCH);
4329 }
4330 switch(*eptr++)
4331 {
4332 default: MRRETURN(MATCH_NOMATCH);
4333 case 0x0a: /* LF */
4334 case 0x0b: /* VT */
4335 case 0x0c: /* FF */
4336 case 0x0d: /* CR */
4337 case 0x85: /* NEL */
4338 break;
4339 }
4340 }
4341 break;
4342
4343 case OP_NOT_DIGIT:
4344 for (i = 1; i <= min; i++)
4345 {
4346 if (eptr >= md->end_subject)
4347 {
4348 SCHECK_PARTIAL();
4349 MRRETURN(MATCH_NOMATCH);
4350 }
4351 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4352 }
4353 break;
4354
4355 case OP_DIGIT:
4356 for (i = 1; i <= min; i++)
4357 {
4358 if (eptr >= md->end_subject)
4359 {
4360 SCHECK_PARTIAL();
4361 MRRETURN(MATCH_NOMATCH);
4362 }
4363 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4364 }
4365 break;
4366
4367 case OP_NOT_WHITESPACE:
4368 for (i = 1; i <= min; i++)
4369 {
4370 if (eptr >= md->end_subject)
4371 {
4372 SCHECK_PARTIAL();
4373 MRRETURN(MATCH_NOMATCH);
4374 }
4375 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4376 }
4377 break;
4378
4379 case OP_WHITESPACE:
4380 for (i = 1; i <= min; i++)
4381 {
4382 if (eptr >= md->end_subject)
4383 {
4384 SCHECK_PARTIAL();
4385 MRRETURN(MATCH_NOMATCH);
4386 }
4387 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4388 }
4389 break;
4390
4391 case OP_NOT_WORDCHAR:
4392 for (i = 1; i <= min; i++)
4393 {
4394 if (eptr >= md->end_subject)
4395 {
4396 SCHECK_PARTIAL();
4397 MRRETURN(MATCH_NOMATCH);
4398 }
4399 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4400 MRRETURN(MATCH_NOMATCH);
4401 }
4402 break;
4403
4404 case OP_WORDCHAR:
4405 for (i = 1; i <= min; i++)
4406 {
4407 if (eptr >= md->end_subject)
4408 {
4409 SCHECK_PARTIAL();
4410 MRRETURN(MATCH_NOMATCH);
4411 }
4412 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4413 MRRETURN(MATCH_NOMATCH);
4414 }
4415 break;
4416
4417 default:
4418 RRETURN(PCRE_ERROR_INTERNAL);
4419 }
4420 }
4421
4422 /* If min = max, continue at the same level without recursing */
4423
4424 if (min == max) continue;
4425
4426 /* If minimizing, we have to test the rest of the pattern before each
4427 subsequent match. Again, separate the UTF-8 case for speed, and also
4428 separate the UCP cases. */
4429
4430 if (minimize)
4431 {
4432 #ifdef SUPPORT_UCP
4433 if (prop_type >= 0)
4434 {
4435 switch(prop_type)
4436 {
4437 case PT_ANY:
4438 for (fi = min;; fi++)
4439 {
4440 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4441 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4442 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4443 if (eptr >= md->end_subject)
4444 {
4445 SCHECK_PARTIAL();
4446 MRRETURN(MATCH_NOMATCH);
4447 }
4448 GETCHARINCTEST(c, eptr);
4449 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4450 }
4451 /* Control never gets here */
4452
4453 case PT_LAMP:
4454 for (fi = min;; fi++)
4455 {
4456 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4457 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4458 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4459 if (eptr >= md->end_subject)
4460 {
4461 SCHECK_PARTIAL();
4462 MRRETURN(MATCH_NOMATCH);
4463 }
4464 GETCHARINCTEST(c, eptr);
4465 prop_chartype = UCD_CHARTYPE(c);
4466 if ((prop_chartype == ucp_Lu ||
4467 prop_chartype == ucp_Ll ||
4468 prop_chartype == ucp_Lt) == prop_fail_result)
4469 MRRETURN(MATCH_NOMATCH);
4470 }
4471 /* Control never gets here */
4472
4473 case PT_GC:
4474 for (fi = min;; fi++)
4475 {
4476 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4477 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4478 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4479 if (eptr >= md->end_subject)
4480 {
4481 SCHECK_PARTIAL();
4482 MRRETURN(MATCH_NOMATCH);
4483 }
4484 GETCHARINCTEST(c, eptr);
4485 prop_category = UCD_CATEGORY(c);
4486 if ((prop_category == prop_value) == prop_fail_result)
4487 MRRETURN(MATCH_NOMATCH);
4488 }
4489 /* Control never gets here */
4490
4491 case PT_PC:
4492 for (fi = min;; fi++)
4493 {
4494 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4496 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4497 if (eptr >= md->end_subject)
4498 {
4499 SCHECK_PARTIAL();
4500 MRRETURN(MATCH_NOMATCH);
4501 }
4502 GETCHARINCTEST(c, eptr);
4503 prop_chartype = UCD_CHARTYPE(c);
4504 if ((prop_chartype == prop_value) == prop_fail_result)
4505 MRRETURN(MATCH_NOMATCH);
4506 }
4507 /* Control never gets here */
4508
4509 case PT_SC:
4510 for (fi = min;; fi++)
4511 {
4512 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4513 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4514 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4515 if (eptr >= md->end_subject)
4516 {
4517 SCHECK_PARTIAL();
4518 MRRETURN(MATCH_NOMATCH);
4519 }
4520 GETCHARINCTEST(c, eptr);
4521 prop_script = UCD_SCRIPT(c);
4522 if ((prop_script == prop_value) == prop_fail_result)
4523 MRRETURN(MATCH_NOMATCH);
4524 }
4525 /* Control never gets here */
4526
4527 case PT_ALNUM:
4528 for (fi = min;; fi++)
4529 {
4530 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4531 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4532 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4533 if (eptr >= md->end_subject)
4534 {
4535 SCHECK_PARTIAL();
4536 MRRETURN(MATCH_NOMATCH);
4537 }
4538 GETCHARINCTEST(c, eptr);
4539 prop_category = UCD_CATEGORY(c);
4540 if ((prop_category == ucp_L || prop_category == ucp_N)
4541 == prop_fail_result)
4542 MRRETURN(MATCH_NOMATCH);
4543 }
4544 /* Control never gets here */
4545
4546 case PT_SPACE: /* Perl space */
4547 for (fi = min;; fi++)
4548 {
4549 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4550 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4551 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4552 if (eptr >= md->end_subject)
4553 {
4554 SCHECK_PARTIAL();
4555 MRRETURN(MATCH_NOMATCH);
4556 }
4557 GETCHARINCTEST(c, eptr);
4558 prop_category = UCD_CATEGORY(c);
4559 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4560 c == CHAR_FF || c == CHAR_CR)
4561 == prop_fail_result)
4562 MRRETURN(MATCH_NOMATCH);
4563 }
4564 /* Control never gets here */
4565
4566 case PT_PXSPACE: /* POSIX space */
4567 for (fi = min;; fi++)
4568 {
4569 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4571 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 MRRETURN(MATCH_NOMATCH);
4576 }
4577 GETCHARINCTEST(c, eptr);
4578 prop_category = UCD_CATEGORY(c);
4579 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4580 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4581 == prop_fail_result)
4582 MRRETURN(MATCH_NOMATCH);
4583 }
4584 /* Control never gets here */
4585
4586 case PT_WORD:
4587 for (fi = min;; fi++)
4588 {
4589 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4590 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4591 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4592 if (eptr >= md->end_subject)
4593 {
4594 SCHECK_PARTIAL();
4595 MRRETURN(MATCH_NOMATCH);
4596 }
4597 GETCHARINCTEST(c, eptr);
4598 prop_category = UCD_CATEGORY(c);
4599 if ((prop_category == ucp_L ||
4600 prop_category == ucp_N ||
4601 c == CHAR_UNDERSCORE)
4602 == prop_fail_result)
4603 MRRETURN(MATCH_NOMATCH);
4604 }
4605 /* Control never gets here */
4606
4607 /* This should never occur */
4608
4609 default:
4610 RRETURN(PCRE_ERROR_INTERNAL);
4611 }
4612 }
4613
4614 /* Match extended Unicode sequences. We will get here only if the
4615 support is in the binary; otherwise a compile-time error occurs. */
4616
4617 else if (ctype == OP_EXTUNI)
4618 {
4619 for (fi = min;; fi++)
4620 {
4621 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4623 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4624 if (eptr >= md->end_subject)
4625 {
4626 SCHECK_PARTIAL();
4627 MRRETURN(MATCH_NOMATCH);
4628 }
4629 GETCHARINCTEST(c, eptr);
4630 prop_category = UCD_CATEGORY(c);
4631 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4632 while (eptr < md->end_subject)
4633 {
4634 int len = 1;
4635 if (!utf8) c = *eptr;
4636 else { GETCHARLEN(c, eptr, len); }
4637 prop_category = UCD_CATEGORY(c);
4638 if (prop_category != ucp_M) break;
4639 eptr += len;
4640 }
4641 }
4642 }
4643
4644 else
4645 #endif /* SUPPORT_UCP */
4646
4647 #ifdef SUPPORT_UTF8
4648 /* UTF-8 mode */
4649 if (utf8)
4650 {
4651 for (fi = min;; fi++)
4652 {
4653 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4655 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4656 if (eptr >= md->end_subject)
4657 {
4658 SCHECK_PARTIAL();
4659 MRRETURN(MATCH_NOMATCH);
4660 }
4661 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4662 MRRETURN(MATCH_NOMATCH);
4663 GETCHARINC(c, eptr);
4664 switch(ctype)
4665 {
4666 case OP_ANY: /* This is the non-NL case */
4667 case OP_ALLANY:
4668 case OP_ANYBYTE:
4669 break;
4670
4671 case OP_ANYNL:
4672 switch(c)
4673 {
4674 default: MRRETURN(MATCH_NOMATCH);
4675 case 0x000d:
4676 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4677 break;
4678 case 0x000a:
4679 break;
4680
4681 case 0x000b:
4682 case 0x000c:
4683 case 0x0085:
4684 case 0x2028:
4685 case 0x2029:
4686 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4687 break;
4688 }
4689 break;
4690
4691 case OP_NOT_HSPACE:
4692 switch(c)
4693 {
4694 default: break;
4695 case 0x09: /* HT */
4696 case 0x20: /* SPACE */
4697 case 0xa0: /* NBSP */
4698 case 0x1680: /* OGHAM SPACE MARK */
4699 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4700 case 0x2000: /* EN QUAD */
4701 case 0x2001: /* EM QUAD */
4702 case 0x2002: /* EN SPACE */
4703 case 0x2003: /* EM SPACE */
4704 case 0x2004: /* THREE-PER-EM SPACE */
4705 case 0x2005: /* FOUR-PER-EM SPACE */
4706 case 0x2006: /* SIX-PER-EM SPACE */
4707 case 0x2007: /* FIGURE SPACE */
4708 case 0x2008: /* PUNCTUATION SPACE */
4709 case 0x2009: /* THIN SPACE */
4710 case 0x200A: /* HAIR SPACE */
4711 case 0x202f: /* NARROW NO-BREAK SPACE */
4712 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4713 case 0x3000: /* IDEOGRAPHIC SPACE */
4714 MRRETURN(MATCH_NOMATCH);
4715 }
4716 break;
4717
4718 case OP_HSPACE:
4719 switch(c)
4720 {
4721 default: MRRETURN(MATCH_NOMATCH);
4722 case 0x09: /* HT */
4723 case 0x20: /* SPACE */
4724 case 0xa0: /* NBSP */
4725 case 0x1680: /* OGHAM SPACE MARK */
4726 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4727 case 0x2000: /* EN QUAD */
4728 case 0x2001: /* EM QUAD */
4729 case 0x2002: /* EN SPACE */
4730 case 0x2003: /* EM SPACE */
4731 case 0x2004: /* THREE-PER-EM SPACE */
4732 case 0x2005: /* FOUR-PER-EM SPACE */
4733 case 0x2006: /* SIX-PER-EM SPACE */
4734 case 0x2007: /* FIGURE SPACE */
4735 case 0x2008: /* PUNCTUATION SPACE */
4736 case 0x2009: /* THIN SPACE */
4737 case 0x200A: /* HAIR SPACE */
4738 case 0x202f: /* NARROW NO-BREAK SPACE */
4739 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4740 case 0x3000: /* IDEOGRAPHIC SPACE */
4741 break;
4742 }
4743 break;
4744
4745 case OP_NOT_VSPACE:
4746 switch(c)
4747 {
4748 default: break;
4749 case 0x0a: /* LF */
4750 case 0x0b: /* VT */
4751 case 0x0c: /* FF */
4752 case 0x0d: /* CR */
4753 case 0x85: /* NEL */
4754 case 0x2028: /* LINE SEPARATOR */
4755 case 0x2029: /* PARAGRAPH SEPARATOR */
4756 MRRETURN(MATCH_NOMATCH);
4757 }
4758 break;
4759
4760 case OP_VSPACE:
4761 switch(c)
4762 {
4763 default: MRRETURN(MATCH_NOMATCH);
4764 case 0x0a: /* LF */
4765 case 0x0b: /* VT */
4766 case 0x0c: /* FF */
4767 case 0x0d: /* CR */
4768 case 0x85: /* NEL */
4769 case 0x2028: /* LINE SEPARATOR */
4770 case 0x2029: /* PARAGRAPH SEPARATOR */
4771 break;
4772 }
4773 break;
4774
4775 case OP_NOT_DIGIT:
4776 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4777 MRRETURN(MATCH_NOMATCH);
4778 break;
4779
4780 case OP_DIGIT:
4781 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4782 MRRETURN(MATCH_NOMATCH);
4783 break;
4784
4785 case OP_NOT_WHITESPACE:
4786 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4787 MRRETURN(MATCH_NOMATCH);
4788 break;
4789
4790 case OP_WHITESPACE:
4791 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4792 MRRETURN(MATCH_NOMATCH);
4793 break;
4794
4795 case OP_NOT_WORDCHAR:
4796 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4797 MRRETURN(MATCH_NOMATCH);
4798 break;
4799
4800 case OP_WORDCHAR:
4801 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4802 MRRETURN(MATCH_NOMATCH);
4803 break;
4804
4805 default:
4806 RRETURN(PCRE_ERROR_INTERNAL);
4807 }
4808 }
4809 }
4810 else
4811 #endif
4812 /* Not UTF-8 mode */
4813 {
4814 for (fi = min;; fi++)
4815 {
4816 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4817 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4818 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4819 if (eptr >= md->end_subject)
4820 {
4821 SCHECK_PARTIAL();
4822 MRRETURN(MATCH_NOMATCH);
4823 }
4824 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4825 MRRETURN(MATCH_NOMATCH);
4826 c = *eptr++;
4827 switch(ctype)
4828 {
4829 case OP_ANY: /* This is the non-NL case */
4830 case OP_ALLANY:
4831 case OP_ANYBYTE:
4832 break;
4833
4834 case OP_ANYNL:
4835 switch(c)
4836 {
4837 default: MRRETURN(MATCH_NOMATCH);
4838 case 0x000d:
4839 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4840 break;
4841
4842 case 0x000a:
4843 break;
4844
4845 case 0x000b:
4846 case 0x000c:
4847 case 0x0085:
4848 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4849 break;
4850 }
4851 break;
4852
4853 case OP_NOT_HSPACE:
4854 switch(c)
4855 {
4856 default: break;
4857 case 0x09: /* HT */
4858 case 0x20: /* SPACE */
4859 case 0xa0: /* NBSP */
4860 MRRETURN(MATCH_NOMATCH);
4861 }
4862 break;
4863
4864 case OP_HSPACE:
4865 switch(c)
4866 {
4867 default: MRRETURN(MATCH_NOMATCH);
4868 case 0x09: /* HT */
4869 case 0x20: /* SPACE */
4870 case 0xa0: /* NBSP */
4871 break;
4872 }
4873 break;
4874
4875 case OP_NOT_VSPACE:
4876 switch(c)
4877 {
4878 default: break;
4879 case 0x0a: /* LF */
4880 case 0x0b: /* VT */
4881 case 0x0c: /* FF */
4882 case 0x0d: /* CR */
4883 case 0x85: /* NEL */
4884 MRRETURN(MATCH_NOMATCH);
4885 }
4886 break;
4887
4888 case OP_VSPACE:
4889 switch(c)
4890 {
4891 default: MRRETURN(MATCH_NOMATCH);
4892 case 0x0a: /* LF */
4893 case 0x0b: /* VT */
4894 case 0x0c: /* FF */
4895 case 0x0d: /* CR */
4896 case 0x85: /* NEL */
4897 break;
4898 }
4899 break;
4900
4901 case OP_NOT_DIGIT:
4902 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4903 break;
4904
4905 case OP_DIGIT:
4906 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4907 break;
4908
4909 case OP_NOT_WHITESPACE:
4910 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4911 break;
4912
4913 case OP_WHITESPACE:
4914 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4915 break;
4916
4917 case OP_NOT_WORDCHAR:
4918 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4919 break;
4920
4921 case OP_WORDCHAR:
4922 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4923 break;
4924
4925 default:
4926 RRETURN(PCRE_ERROR_INTERNAL);
4927 }
4928 }
4929 }
4930 /* Control never gets here */
4931 }
4932
4933 /* If maximizing, it is worth using inline code for speed, doing the type
4934 test once at the start (i.e. keep it out of the loop). Again, keep the
4935 UTF-8 and UCP stuff separate. */
4936
4937 else
4938 {
4939 pp = eptr; /* Remember where we started */
4940
4941 #ifdef SUPPORT_UCP
4942 if (prop_type >= 0)
4943 {
4944 switch(prop_type)
4945 {
4946 case PT_ANY:
4947 for (i = min; i < max; i++)
4948 {
4949 int len = 1;
4950 if (eptr >= md->end_subject)
4951 {
4952 SCHECK_PARTIAL();
4953 break;
4954 }
4955 GETCHARLENTEST(c, eptr, len);
4956 if (prop_fail_result) break;
4957 eptr+= len;
4958 }
4959 break;
4960
4961 case PT_LAMP:
4962 for (i = min; i < max; i++)
4963 {
4964 int len = 1;
4965 if (eptr >= md->end_subject)
4966 {
4967 SCHECK_PARTIAL();
4968 break;
4969 }
4970 GETCHARLENTEST(c, eptr, len);
4971 prop_chartype = UCD_CHARTYPE(c);
4972 if ((prop_chartype == ucp_Lu ||
4973 prop_chartype == ucp_Ll ||
4974 prop_chartype == ucp_Lt) == prop_fail_result)
4975 break;
4976 eptr+= len;
4977 }
4978 break;
4979
4980 case PT_GC:
4981 for (i = min; i < max; i++)
4982 {
4983 int len = 1;
4984 if (eptr >= md->end_subject)
4985 {
4986 SCHECK_PARTIAL();
4987 break;
4988 }
4989 GETCHARLENTEST(c, eptr, len);
4990 prop_category = UCD_CATEGORY(c);
4991 if ((prop_category == prop_value) == prop_fail_result)
4992 break;
4993 eptr+= len;
4994 }
4995 break;
4996
4997 case PT_PC:
4998 for (i = min; i < max; i++)
4999 {
5000 int len = 1;
5001 if (eptr >= md->end_subject)
5002 {
5003 SCHECK_PARTIAL();
5004 break;
5005 }
5006 GETCHARLENTEST(c, eptr, len);
5007 prop_chartype = UCD_CHARTYPE(c);
5008 if ((prop_chartype == prop_value) == prop_fail_result)
5009 break;
5010 eptr+= len;
5011 }
5012 break;
5013
5014 case PT_SC:
5015 for (i = min; i < max; i++)
5016 {
5017 int len = 1;
5018 if (eptr >= md->end_subject)
5019 {
5020 SCHECK_PARTIAL();
5021 break;
5022 }
5023 GETCHARLENTEST(c, eptr, len);
5024 prop_script = UCD_SCRIPT(c);
5025 if ((prop_script == prop_value) == prop_fail_result)
5026 break;
5027 eptr+= len;
5028 }
5029 break;
5030
5031 case PT_ALNUM:
5032 for (i = min; i < max; i++)
5033 {
5034 int len = 1;
5035 if (eptr >= md->end_subject)
5036 {
5037 SCHECK_PARTIAL();
5038 break;
5039 }
5040 GETCHARLENTEST(c, eptr, len);
5041 prop_category = UCD_CATEGORY(c);
5042 if ((prop_category == ucp_L || prop_category == ucp_N)
5043 == prop_fail_result)
5044 break;
5045 eptr+= len;
5046 }
5047 break;
5048
5049 case PT_SPACE: /* Perl space */
5050 for (i = min; i < max; i++)
5051 {
5052 int len = 1;
5053 if (eptr >= md->end_subject)
5054 {
5055 SCHECK_PARTIAL();
5056 break;
5057 }
5058 GETCHARLENTEST(c, eptr, len);
5059 prop_category = UCD_CATEGORY(c);
5060 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5061 c == CHAR_FF || c == CHAR_CR)
5062 == prop_fail_result)
5063 break;
5064 eptr+= len;
5065 }
5066 break;
5067
5068 case PT_PXSPACE: /* POSIX space */
5069 for (i = min; i < max; i++)
5070 {
5071 int len = 1;
5072 if (eptr >= md->end_subject)
5073 {
5074 SCHECK_PARTIAL();
5075 break;
5076 }
5077 GETCHARLENTEST(c, eptr, len);
5078 prop_category = UCD_CATEGORY(c);
5079 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5080 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5081 == prop_fail_result)
5082 break;
5083 eptr+= len;
5084 }
5085 break;
5086
5087 case PT_WORD:
5088 for (i = min; i < max; i++)
5089 {
5090 int len = 1;
5091 if (eptr >= md->end_subject)
5092 {
5093 SCHECK_PARTIAL();
5094 break;
5095 }
5096 GETCHARLENTEST(c, eptr, len);
5097 prop_category = UCD_CATEGORY(c);
5098 if ((prop_category == ucp_L || prop_category == ucp_N ||
5099 c == CHAR_UNDERSCORE) == prop_fail_result)
5100 break;
5101 eptr+= len;
5102 }
5103 break;
5104
5105 default:
5106 RRETURN(PCRE_ERROR_INTERNAL);
5107 }
5108
5109 /* eptr is now past the end of the maximum run */
5110
5111 if (possessive) continue;
5112 for(;;)
5113 {
5114 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5115 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5116 if (eptr-- == pp) break; /* Stop if tried at original pos */
5117 if (utf8) BACKCHAR(eptr);
5118 }
5119 }
5120
5121 /* Match extended Unicode sequences. We will get here only if the
5122 support is in the binary; otherwise a compile-time error occurs. */
5123
5124 else if (ctype == OP_EXTUNI)
5125 {
5126 for (i = min; i < max; i++)
5127 {
5128 if (eptr >= md->end_subject)
5129 {
5130 SCHECK_PARTIAL();
5131 break;
5132 }
5133 GETCHARINCTEST(c, eptr);
5134 prop_category = UCD_CATEGORY(c);
5135 if (prop_category == ucp_M) break;
5136 while (eptr < md->end_subject)
5137 {
5138 int len = 1;
5139 if (!utf8) c = *eptr; else
5140 {
5141 GETCHARLEN(c, eptr, len);
5142 }
5143 prop_category = UCD_CATEGORY(c);
5144 if (prop_category != ucp_M) break;
5145 eptr += len;
5146 }
5147 }
5148
5149 /* eptr is now past the end of the maximum run */
5150
5151 if (possessive) continue;
5152
5153 for(;;)
5154 {
5155 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5156 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5157 if (eptr-- == pp) break; /* Stop if tried at original pos */
5158 for (;;) /* Move back over one extended */
5159 {
5160 int len = 1;
5161 if (!utf8) c = *eptr; else
5162 {
5163 BACKCHAR(eptr);
5164 GETCHARLEN(c, eptr, len);
5165 }
5166 prop_category = UCD_CATEGORY(c);
5167 if (prop_category != ucp_M) break;
5168 eptr--;
5169 }
5170 }
5171 }
5172
5173 else
5174 #endif /* SUPPORT_UCP */
5175
5176 #ifdef SUPPORT_UTF8
5177 /* UTF-8 mode */
5178
5179 if (utf8)
5180 {
5181 switch(ctype)
5182 {
5183 case OP_ANY:
5184 if (max < INT_MAX)
5185 {
5186 for (i = min; i < max; i++)
5187 {
5188 if (eptr >= md->end_subject)
5189 {
5190 SCHECK_PARTIAL();
5191 break;
5192 }
5193 if (IS_NEWLINE(eptr)) break;
5194 eptr++;
5195 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5196 }
5197 }
5198
5199 /* Handle unlimited UTF-8 repeat */
5200
5201 else
5202 {
5203 for (i = min; i < max; i++)
5204 {
5205 if (eptr >= md->end_subject)
5206 {
5207 SCHECK_PARTIAL();
5208 break;
5209 }
5210 if (IS_NEWLINE(eptr)) break;
5211 eptr++;
5212 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5213 }
5214 }
5215 break;
5216
5217 case OP_ALLANY:
5218 if (max < INT_MAX)
5219 {
5220 for (i = min; i < max; i++)
5221 {
5222 if (eptr >= md->end_subject)
5223 {
5224 SCHECK_PARTIAL();
5225 break;
5226 }
5227 eptr++;
5228 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5229 }
5230 }
5231 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5232 break;
5233
5234 /* The byte case is the same as non-UTF8 */
5235
5236 case OP_ANYBYTE:
5237 c = max - min;
5238 if (c > (unsigned int)(md->end_subject - eptr))
5239 {
5240 eptr = md->end_subject;
5241 SCHECK_PARTIAL();
5242 }
5243 else eptr += c;
5244 break;
5245
5246 case OP_ANYNL:
5247 for (i = min; i < max; i++)
5248 {
5249 int len = 1;
5250 if (eptr >= md->end_subject)
5251 {
5252 SCHECK_PARTIAL();
5253 break;
5254 }
5255 GETCHARLEN(c, eptr, len);
5256 if (c == 0x000d)
5257 {
5258 if (++eptr >= md->end_subject) break;
5259 if (*eptr == 0x000a) eptr++;
5260 }
5261 else
5262 {
5263 if (c != 0x000a &&
5264 (md->bsr_anycrlf ||
5265 (c != 0x000b && c != 0x000c &&
5266 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5267 break;
5268 eptr += len;
5269 }
5270 }
5271 break;
5272
5273 case OP_NOT_HSPACE:
5274 case OP_HSPACE:
5275 for (i = min; i < max; i++)
5276 {
5277 BOOL gotspace;
5278 int len = 1;
5279 if (eptr >= md->end_subject)
5280 {
5281 SCHECK_PARTIAL();
5282 break;
5283 }
5284 GETCHARLEN(c, eptr, len);
5285 switch(c)
5286 {
5287 default: gotspace = FALSE; break;
5288 case 0x09: /* HT */
5289 case 0x20: /* SPACE */
5290 case 0xa0: /* NBSP */
5291 case 0x1680: /* OGHAM SPACE MARK */
5292 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5293 case 0x2000: /* EN QUAD */
5294 case 0x2001: /* EM QUAD */
5295 case 0x2002: /* EN SPACE */
5296 case 0x2003: /* EM SPACE */
5297 case 0x2004: /* THREE-PER-EM SPACE */
5298 case 0x2005: /* FOUR-PER-EM SPACE */
5299 case 0x2006: /* SIX-PER-EM SPACE */
5300 case 0x2007: /* FIGURE SPACE */
5301 case 0x2008: /* PUNCTUATION SPACE */
5302 case 0x2009: /* THIN SPACE */
5303 case 0x200A: /* HAIR SPACE */
5304 case 0x202f: /* NARROW NO-BREAK SPACE */
5305 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5306 case 0x3000: /* IDEOGRAPHIC SPACE */
5307 gotspace = TRUE;
5308 break;
5309 }
5310 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5311 eptr += len;
5312 }
5313 break;
5314
5315 case OP_NOT_VSPACE:
5316 case OP_VSPACE:
5317 for (i = min; i < max; i++)
5318 {
5319 BOOL gotspace;
5320 int len = 1;
5321 if (eptr >= md->end_subject)
5322 {
5323 SCHECK_PARTIAL();
5324 break;
5325 }
5326 GETCHARLEN(c, eptr, len);
5327 switch(c)
5328 {
5329 default: gotspace = FALSE; break;
5330 case 0x0a: /* LF */
5331 case 0x0b: /* VT */
5332 case 0x0c: /* FF */
5333 case 0x0d: /* CR */
5334 case 0x85: /* NEL */
5335 case 0x2028: /* LINE SEPARATOR */
5336 case 0x2029: /* PARAGRAPH SEPARATOR */
5337 gotspace = TRUE;
5338 break;
5339 }
5340 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5341 eptr += len;
5342 }
5343 break;
5344
5345 case OP_NOT_DIGIT:
5346 for (i = min; i < max; i++)
5347 {
5348 int len = 1;
5349 if (eptr >= md->end_subject)
5350 {
5351 SCHECK_PARTIAL();
5352 break;
5353 }
5354 GETCHARLEN(c, eptr, len);
5355 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5356 eptr+= len;
5357 }
5358 break;
5359
5360 case OP_DIGIT:
5361 for (i = min; i < max; i++)
5362 {
5363 int len = 1;
5364 if (eptr >= md->end_subject)
5365 {
5366 SCHECK_PARTIAL();
5367 break;
5368 }
5369 GETCHARLEN(c, eptr, len);
5370 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5371 eptr+= len;
5372 }
5373 break;
5374
5375 case OP_NOT_WHITESPACE:
5376 for (i = min; i < max; i++)
5377 {
5378 int len = 1;
5379 if (eptr >= md->end_subject)
5380 {
5381 SCHECK_PARTIAL();
5382 break;
5383 }
5384 GETCHARLEN(c, eptr, len);
5385 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5386 eptr+= len;
5387 }
5388 break;
5389
5390 case OP_WHITESPACE:
5391 for (i = min; i < max; i++)
5392 {
5393 int len = 1;
5394 if (eptr >= md->end_subject)
5395 {
5396 SCHECK_PARTIAL();
5397 break;
5398 }
5399 GETCHARLEN(c, eptr, len);
5400 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5401 eptr+= len;
5402 }
5403 break;
5404
5405 case OP_NOT_WORDCHAR:
5406 for (i = min; i < max; i++)
5407 {
5408 int len = 1;
5409 if (eptr >= md->end_subject)
5410 {
5411 SCHECK_PARTIAL();
5412 break;
5413 }
5414 GETCHARLEN(c, eptr, len);
5415 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5416 eptr+= len;
5417 }
5418 break;
5419
5420 case OP_WORDCHAR:
5421 for (i = min; i < max; i++)
5422 {
5423 int len = 1;
5424 if (eptr >= md->end_subject)
5425 {
5426 SCHECK_PARTIAL();
5427 break;
5428 }
5429 GETCHARLEN(c, eptr, len);
5430 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5431 eptr+= len;
5432 }
5433 break;
5434
5435 default:
5436 RRETURN(PCRE_ERROR_INTERNAL);
5437 }
5438
5439 /* eptr is now past the end of the maximum run. If possessive, we are
5440 done (no backing up). Otherwise, match at this position; anything other
5441 than no match is immediately returned. For nomatch, back up one
5442 character, unless we are matching \R and the last thing matched was
5443 \r\n, in which case, back up two bytes. */
5444
5445 if (possessive) continue;
5446 for(;;)
5447 {
5448 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5449 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5450 if (eptr-- == pp) break; /* Stop if tried at original pos */
5451 BACKCHAR(eptr);
5452 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5453 eptr[-1] == '\r') eptr--;
5454 }
5455 }
5456 else
5457 #endif /* SUPPORT_UTF8 */
5458
5459 /* Not UTF-8 mode */
5460 {
5461 switch(ctype)
5462 {
5463 case OP_ANY:
5464 for (i = min; i < max; i++)
5465 {
5466 if (eptr >= md->end_subject)
5467 {
5468 SCHECK_PARTIAL();
5469 break;
5470 }
5471 if (IS_NEWLINE(eptr)) break;
5472 eptr++;
5473 }
5474 break;
5475
5476 case OP_ALLANY:
5477 case OP_ANYBYTE:
5478 c = max - min;
5479 if (c > (unsigned int)(md->end_subject - eptr))
5480 {
5481 eptr = md->end_subject;
5482 SCHECK_PARTIAL();
5483 }
5484 else eptr += c;
5485 break;
5486
5487 case OP_ANYNL:
5488 for (i = min; i < max; i++)
5489 {
5490 if (eptr >= md->end_subject)
5491 {
5492 SCHECK_PARTIAL();
5493 break;
5494 }
5495 c = *eptr;
5496 if (c == 0x000d)
5497 {
5498 if (++eptr >= md->end_subject) break;
5499 if (*eptr == 0x000a) eptr++;
5500 }
5501 else
5502 {
5503 if (c != 0x000a &&
5504 (md->bsr_anycrlf ||
5505 (c != 0x000b && c != 0x000c && c != 0x0085)))
5506 break;
5507 eptr++;
5508 }
5509 }
5510 break;
5511
5512 case OP_NOT_HSPACE:
5513 for (i = min; i < max; i++)
5514 {
5515 if (eptr >= md->end_subject)
5516 {
5517 SCHECK_PARTIAL();
5518 break;
5519 }
5520 c = *eptr;
5521 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5522 eptr++;
5523 }
5524 break;
5525
5526 case OP_HSPACE:
5527 for (i = min; i < max; i++)
5528 {
5529 if (eptr >= md->end_subject)
5530 {
5531 SCHECK_PARTIAL();
5532 break;
5533 }
5534 c = *eptr;
5535 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5536 eptr++;
5537 }
5538 break;
5539
5540 case OP_NOT_VSPACE:
5541 for (i = min; i < max; i++)
5542 {
5543 if (eptr >= md->end_subject)
5544 {
5545 SCHECK_PARTIAL();
5546 break;
5547 }
5548 c = *eptr;
5549 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5550 break;
5551 eptr++;
5552 }
5553 break;
5554
5555 case OP_VSPACE:
5556 for (i = min; i < max; i++)
5557 {
5558 if (eptr >= md->end_subject)
5559 {
5560 SCHECK_PARTIAL();
5561 break;
5562 }
5563 c = *eptr;
5564 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5565 break;
5566 eptr++;
5567 }
5568 break;
5569
5570 case OP_NOT_DIGIT:
5571 for (i = min; i < max; i++)
5572 {
5573 if (eptr >= md->end_subject)
5574 {
5575 SCHECK_PARTIAL();
5576 break;
5577 }
5578 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5579 eptr++;
5580 }
5581 break;
5582
5583 case OP_DIGIT:
5584 for (i = min; i < max; i++)
5585 {
5586 if (eptr >= md->end_subject)
5587 {
5588 SCHECK_PARTIAL();
5589 break;
5590 }
5591 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5592 eptr++;
5593 }
5594 break;
5595
5596 case OP_NOT_WHITESPACE:
5597 for (i = min; i < max; i++)
5598 {
5599 if (eptr >= md->end_subject)
5600 {
5601 SCHECK_PARTIAL();
5602 break;
5603 }
5604 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5605 eptr++;
5606 }
5607 break;
5608
5609 case OP_WHITESPACE:
5610 for (i = min; i < max; i++)
5611 {
5612 if (eptr >= md->end_subject)
5613 {
5614 SCHECK_PARTIAL();
5615 break;
5616 }
5617 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5618 eptr++;
5619 }
5620 break;
5621
5622 case OP_NOT_WORDCHAR:
5623 for (i = min; i < max; i++)
5624 {
5625 if (eptr >= md->end_subject)
5626 {
5627 SCHECK_PARTIAL();
5628 break;
5629 }
5630 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5631 eptr++;
5632 }
5633 break;
5634
5635 case OP_WORDCHAR:
5636 for (i = min; i < max; i++)
5637 {
5638 if (eptr >= md->end_subject)
5639 {
5640 SCHECK_PARTIAL();
5641 break;
5642 }
5643 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5644 eptr++;
5645 }
5646 break;
5647
5648 default:
5649 RRETURN(PCRE_ERROR_INTERNAL);
5650 }
5651
5652 /* eptr is now past the end of the maximum run. If possessive, we are
5653 done (no backing up). Otherwise, match at this position; anything other
5654 than no match is immediately returned. For nomatch, back up one
5655 character (byte), unless we are matching \R and the last thing matched
5656 was \r\n, in which case, back up two bytes. */
5657
5658 if (possessive) continue;
5659 while (eptr >= pp)
5660 {
5661 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5663 eptr--;
5664 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5665 eptr[-1] == '\r') eptr--;
5666 }
5667 }
5668
5669 /* Get here if we can't make it match with any permitted repetitions */
5670
5671 MRRETURN(MATCH_NOMATCH);
5672 }
5673 /* Control never gets here */
5674
5675 /* There's been some horrible disaster. Arrival here can only mean there is
5676 something seriously wrong in the code above or the OP_xxx definitions. */
5677
5678 default:
5679 DPRINTF(("Unknown opcode %d\n", *ecode));
5680 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5681 }
5682
5683 /* Do not stick any code in here without much thought; it is assumed
5684 that "continue" in the code above comes out to here to repeat the main
5685 loop. */
5686
5687 } /* End of main loop */
5688 /* Control never reaches here */
5689
5690
5691 /* When compiling to use the heap rather than the stack for recursive calls to
5692 match(), the RRETURN() macro jumps here. The number that is saved in
5693 frame->Xwhere indicates which label we actually want to return to. */
5694
5695 #ifdef NO_RECURSE
5696 #define LBL(val) case val: goto L_RM##val;
5697 HEAP_RETURN:
5698 switch (frame->Xwhere)
5699 {
5700 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5701 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5702 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5703 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5704 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5705 #ifdef SUPPORT_UTF8
5706 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5707 LBL(32) LBL(34) LBL(42) LBL(46)
5708 #ifdef SUPPORT_UCP
5709 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5710 LBL(59) LBL(60) LBL(61) LBL(62)
5711 #endif /* SUPPORT_UCP */
5712 #endif /* SUPPORT_UTF8 */
5713 default:
5714 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5715 return PCRE_ERROR_INTERNAL;
5716 }
5717 #undef LBL
5718 #endif /* NO_RECURSE */
5719 }
5720
5721
5722 /***************************************************************************
5723 ****************************************************************************
5724 RECURSION IN THE match() FUNCTION
5725
5726 Undefine all the macros that were defined above to handle this. */
5727
5728 #ifdef NO_RECURSE /* Set when match() uses the heap, not the stack, for recursion */
5729 #undef eptr
5730 #undef ecode
5731 #undef mstart
5732 #undef offset_top
5733 #undef eptrb
5734 #undef flags
5735
5736 #undef callpat
5737 #undef charptr
5738 #undef data
5739 #undef next
5740 #undef pp
5741 #undef prev
5742 #undef saved_eptr
5743
5744 #undef new_recursive
5745
5746 #undef cur_is_word
5747 #undef condition
5748 #undef prev_is_word
5749
5750 #undef ctype
5751 #undef length
5752 #undef max
5753 #undef min
5754 #undef number
5755 #undef offset
5756 #undef op
5757 #undef save_capture_last
5758 #undef save_offset1
5759 #undef save_offset2
5760 #undef save_offset3
5761 #undef stacksave
5762
5763 #undef newptrb
5764
5765 #endif /* NO_RECURSE */
5766
5767 /* fc and fi are macros with or without NO_RECURSE, so always undefine them */
5768
5769 #undef fc
5770 #undef fi
5771
5772 /***************************************************************************
5773 ***************************************************************************/
5774
5775
5776
5777 /*************************************************
5778 * Execute a Regular Expression *
5779 *************************************************/
5780
5781 /* This function applies a compiled re to a subject string and picks out
5782 portions of the string if it matches. Two elements in the vector are set for
5783 each substring: the offsets to the start and end of the substring.
5784
5785 Arguments:
5786 argument_re points to the compiled expression
5787 extra_data points to extra data or is NULL
5788 subject points to the subject string
5789 length length of subject string (may contain binary zeros)
5790 start_offset where to start in the subject string
5791 options option bits
5792 offsets points to a vector of ints to be filled in with offsets
5793 offsetcount the number of elements in the vector
5794
5795 Returns: > 0 => success; value is the number of elements filled in
5796 = 0 => success, but offsets is not big enough
5797 -1 => failed to match
5798 < -1 => some kind of unexpected problem
5799 */
5800
5801 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5802 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5803 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5804 int offsetcount)
5805 {
5806 int rc, resetcount, ocount;
5807 int first_byte = -1;
5808 int req_byte = -1;
5809 int req_byte2 = -1;
5810 int newline;
5811 BOOL using_temporary_offsets = FALSE;
5812 BOOL anchored;
5813 BOOL startline;
5814 BOOL firstline;
5815 BOOL first_byte_caseless = FALSE;
5816 BOOL req_byte_caseless = FALSE;
5817 BOOL utf8;
5818 match_data match_block;
5819 match_data *md = &match_block;
5820 const uschar *tables;
5821 const uschar *start_bits = NULL;
5822 USPTR start_match = (USPTR)subject + start_offset;
5823 USPTR end_subject;
5824 USPTR start_partial = NULL;
5825 USPTR req_byte_ptr = start_match - 1;
5826
5827 pcre_study_data internal_study;
5828 const pcre_study_data *study;
5829
5830 real_pcre internal_re;
5831 const real_pcre *external_re = (const real_pcre *)argument_re;
5832 const real_pcre *re = external_re;
5833
5834 /* Plausibility checks */
5835
5836 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5837 if (re == NULL || subject == NULL ||
5838 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5839 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5840 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5841
5842 /* This information is for finding all the numbers associated with a given
5843 name, for condition testing. */
5844
5845 md->name_table = (uschar *)re + re->name_table_offset;
5846 md->name_count = re->name_count;
5847 md->name_entry_size = re->name_entry_size;
5848
5849 /* Fish out the optional data from the extra_data structure, first setting
5850 the default values. */
5851
5852 study = NULL;
5853 md->match_limit = MATCH_LIMIT;
5854 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5855 md->callout_data = NULL;
5856
5857 /* The table pointer is always in native byte order. */
5858
5859 tables = external_re->tables;
5860
5861 if (extra_data != NULL)
5862 {
5863 register unsigned int flags = extra_data->flags;
5864 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5865 study = (const pcre_study_data *)extra_data->study_data;
5866 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5867 md->match_limit = extra_data->match_limit;
5868 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5869 md->match_limit_recursion = extra_data->match_limit_recursion;
5870 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5871 md->callout_data = extra_data->callout_data;
5872 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5873 }
5874
5875 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5876 is a feature that makes it possible to save compiled regex and re-use them
5877 in other programs later. */
5878
5879 if (tables == NULL) tables = _pcre_default_tables;
5880
5881 /* Check that the first field in the block is the magic number. If it is not,
5882 test for a regex that was compiled on a host of opposite endianness. If this is
5883 the case, flipped values are put in internal_re and internal_study if there was
5884 study data too. */
5885
5886 if (re->magic_number != MAGIC_NUMBER)
5887 {
5888 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5889 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5890 if (study != NULL) study = &internal_study;
5891 }
5892
5893 /* Set up other data */
5894
5895 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5896 startline = (re->flags & PCRE_STARTLINE) != 0;
5897 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5898
5899 /* The code starts after the real_pcre block and the capture name table. */
5900
5901 md->start_code = (const uschar *)external_re + re->name_table_offset +
5902 re->name_count * re->name_entry_size;
5903
5904 md->start_subject = (USPTR)subject;
5905 md->start_offset = start_offset;
5906 md->end_subject = md->start_subject + length;
5907 end_subject = md->end_subject;
5908
5909 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5910 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5911 md->use_ucp = (re->options & PCRE_UCP) != 0;
5912 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5913
5914 md->notbol = (options & PCRE_NOTBOL) != 0;
5915 md->noteol = (options & PCRE_NOTEOL) != 0;
5916 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5917 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5918 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5919 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5920 md->hitend = FALSE;
5921 md->mark = NULL; /* In case never set */
5922
5923 md->recursive = NULL; /* No recursion at top level */
5924
5925 md->lcc = tables + lcc_offset;
5926 md->ctypes = tables + ctypes_offset;
5927
5928 /* Handle different \R options. */
5929
5930 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5931 {
5932 case 0:
5933 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5934 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5935 else
5936 #ifdef BSR_ANYCRLF
5937 md->bsr_anycrlf = TRUE;
5938 #else
5939 md->bsr_anycrlf = FALSE;
5940 #endif
5941 break;
5942
5943 case PCRE_BSR_ANYCRLF:
5944 md->bsr_anycrlf = TRUE;
5945 break;
5946
5947 case PCRE_BSR_UNICODE:
5948 md->bsr_anycrlf = FALSE;
5949 break;
5950
5951 default: return PCRE_ERROR_BADNEWLINE;
5952 }
5953
5954 /* Handle different types of newline. The three bits give eight cases. If
5955 nothing is set at run time, whatever was used at compile time applies. */
5956
5957 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5958 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5959 {
5960 case 0: newline = NEWLINE; break; /* Compile-time default */
5961 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5962 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5963 case PCRE_NEWLINE_CR+
5964 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5965 case PCRE_NEWLINE_ANY: newline = -1; break;
5966 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5967 default: return PCRE_ERROR_BADNEWLINE;
5968 }
5969
5970 if (newline == -2)
5971 {
5972 md->nltype = NLTYPE_ANYCRLF;
5973 }
5974 else if (newline < 0)
5975 {
5976 md->nltype = NLTYPE_ANY;
5977 }
5978 else
5979 {
5980 md->nltype = NLTYPE_FIXED;
5981 if (newline > 255)
5982 {
5983 md->nllen = 2;
5984 md->nl[0] = (newline >> 8) & 255;
5985 md->nl[1] = newline & 255;
5986 }
5987 else
5988 {
5989 md->nllen = 1;
5990 md->nl[0] = newline;
5991 }
5992 }
5993
5994 /* Partial matching was originally supported only for a restricted set of
5995 regexes; from release 8.00 there are no restrictions, but the bits are still
5996 defined (though never set). So there's no harm in leaving this code. */
5997
5998 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5999 return PCRE_ERROR_BADPARTIAL;
6000
6001 /* Check a UTF-8 string if required. Pass back the character offset and error
6002 code if a results vector is available. */
6003
6004 #ifdef SUPPORT_UTF8
6005 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6006 {
6007 int errorcode;
6008 int tb = _pcre_valid_utf8((USPTR)subject, length, &errorcode);
6009 if (tb >= 0)
6010 {
6011 if (offsetcount >= 2)
6012 {
6013 offsets[0] = tb;
6014 offsets[1] = errorcode;
6015 }
6016 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6017 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6018 }
6019 if (start_offset > 0 && start_offset < length)
6020 {
6021 tb = ((USPTR)subject)[start_offset] & 0xc0;
6022 if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
6023 }
6024 }
6025 #endif
6026
6027 /* If the expression has got more back references than the offsets supplied can
6028 hold, we get a temporary chunk of working store to use during the matching.
6029 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6030 of 3. */
6031
6032 ocount = offsetcount - (offsetcount % 3);
6033
6034 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6035 {
6036 ocount = re->top_backref * 3 + 3;
6037 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6038 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6039 using_temporary_offsets = TRUE;
6040 DPRINTF(("Got memory to hold back references\n"));
6041 }
6042 else md->offset_vector = offsets;
6043
6044 md->offset_end = ocount;
6045 md->offset_max = (2*ocount)/3;
6046 md->offset_overflow = FALSE;
6047 md->capture_last = -1;
6048
6049 /* Compute the minimum number of offsets that we need to reset each time. Doing
6050 this makes a huge difference to execution time when there aren't many brackets
6051 in the pattern. */
6052
6053 resetcount = 2 + re->top_bracket * 2;
6054 if (resetcount > offsetcount) resetcount = ocount;
6055
6056 /* Reset the working variables associated with each extraction. These should
6057 never be used unless previously set, but they get saved and restored, and so we
6058 initialize them to avoid reading uninitialized locations. */
6059
6060 if (md->offset_vector != NULL)
6061 {
6062 register int *iptr = md->offset_vector + ocount;
6063 register int *iend = iptr - resetcount/2 + 1;
6064 while (--iptr >= iend) *iptr = -1;
6065 }
6066
6067 /* Set up the first character to match, if available. The first_byte value is
6068 never set for an anchored regular expression, but the anchoring may be forced
6069 at run time, so we have to test for anchoring. The first char may be unset for
6070 an unanchored pattern, of course. If there's no first char and the pattern was
6071 studied, there may be a bitmap of possible first characters. */
6072
6073 if (!anchored)
6074 {
6075 if ((re->flags & PCRE_FIRSTSET) != 0)
6076 {
6077 first_byte = re->first_byte & 255;
6078 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6079 first_byte = md->lcc[first_byte];
6080 }
6081 else
6082 if (!startline && study != NULL &&
6083 (study->flags & PCRE_STUDY_MAPPED) != 0)
6084 start_bits = study->start_bits;
6085 }
6086
6087 /* For anchored or unanchored matches, there may be a "last known required
6088 character" set. */
6089
6090 if ((re->flags & PCRE_REQCHSET) != 0)
6091 {
6092 req_byte = re->req_byte & 255;
6093 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6094 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6095 }
6096
6097
6098 /* ==========================================================================*/
6099
6100 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6101 the loop runs just once. */
6102
6103 for(;;)
6104 {
6105 USPTR save_end_subject = end_subject;
6106 USPTR new_start_match;
6107
6108 /* Reset the maximum number of extractions we might see. */
6109
6110 if (md->offset_vector != NULL)
6111 {
6112 register int *iptr = md->offset_vector;
6113 register int *iend = iptr + resetcount;
6114 while (iptr < iend) *iptr++ = -1;
6115 }
6116
6117 /* If firstline is TRUE, the start of the match is constrained to the first
6118 line of a multiline string. That is, the match must be before or at the first
6119 newline. Implement this by temporarily adjusting end_subject so that we stop
6120 scanning at a newline. If the match fails at the newline, later code breaks
6121 this loop. */
6122
6123 if (firstline)
6124 {
6125 USPTR t = start_match;
6126 #ifdef SUPPORT_UTF8
6127 if (utf8)
6128 {
6129 while (t < md->end_subject && !IS_NEWLINE(t))
6130 {
6131 t++;
6132 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6133 }
6134 }
6135 else
6136 #endif
6137 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6138 end_subject = t;
6139 }
6140
6141 /* There are some optimizations that avoid running the match if a known
6142 starting point is not found, or if a known later character is not present.
6143 However, there is an option that disables these, for testing and for ensuring
6144 that all callouts do actually occur. The option can be set in the regex by
6145 (*NO_START_OPT) or passed in match-time options. */
6146
6147 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6148 {
6149 /* Advance to a unique first byte if there is one. */
6150
6151 if (first_byte >= 0)
6152 {
6153 if (first_byte_caseless)
6154 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6155 start_match++;
6156 else
6157 while (start_match < end_subject && *start_match != first_byte)
6158 start_match++;
6159 }
6160
6161 /* Or to just after a linebreak for a multiline match */
6162
6163 else if (startline)
6164 {
6165 if (start_match > md->start_subject + start_offset)
6166 {
6167 #ifdef SUPPORT_UTF8
6168 if (utf8)
6169 {
6170 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6171 {
6172 start_match++;
6173 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6174 start_match++;
6175 }
6176 }
6177 else
6178 #endif
6179 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6180 start_match++;
6181
6182 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6183 and we are now at a LF, advance the match position by one more character.
6184 */
6185
6186 if (start_match[-1] == CHAR_CR &&
6187 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6188 start_match < end_subject &&
6189 *start_match == CHAR_NL)
6190 start_match++;
6191 }
6192 }
6193
6194 /* Or to a non-unique first byte after study */
6195
6196 else if (start_bits != NULL)
6197 {
6198 while (start_match < end_subject)
6199 {
6200 register unsigned int c = *start_match;
6201 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6202 {
6203 start_match++;
6204 #ifdef SUPPORT_UTF8
6205 if (utf8)
6206 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6207 start_match++;
6208 #endif
6209 }
6210 else break;
6211 }
6212 }
6213 } /* Starting optimizations */
6214
6215 /* Restore fudged end_subject */
6216
6217 end_subject = save_end_subject;
6218
6219 /* The following two optimizations are disabled for partial matching or if
6220 disabling is explicitly requested. */
6221
6222 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6223 {
6224 /* If the pattern was studied, a minimum subject length may be set. This is
6225 a lower bound; it is possible that no string of that length actually matches
6226 the pattern. Although the value is, strictly, in characters, we treat it as
6227 bytes to avoid spending too much time in this optimization. */
6228
6229 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6230 (pcre_uint32)(end_subject - start_match) < study->minlength)
6231 {
6232 rc = MATCH_NOMATCH;
6233 break;
6234 }
6235
6236 /* If req_byte is set, we know that that character must appear in the
6237 subject for the match to succeed. If the first character is set, req_byte
6238 must be later in the subject; otherwise the test starts at the match point.
6239 This optimization can save a huge amount of backtracking in patterns with
6240 nested unlimited repeats that aren't going to match. Writing separate code
6241 for cased/caseless versions makes it go faster, as does using an
6242 autoincrement and backing off on a match.
6243
6244 HOWEVER: when the subject string is very, very long, searching to its end
6245 can take a long time, and give bad performance on quite ordinary patterns.
6246 This showed up when somebody was matching something like /^\d+C/ on a
6247 32-megabyte string... so we don't do this when the string is sufficiently
6248 long. */
6249
6250 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6251 {
6252 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6253
6254 /* We don't need to repeat the search if we haven't yet reached the
6255 place we found it at last time. */
6256
6257 if (p > req_byte_ptr)
6258 {
6259 if (req_byte_caseless)
6260 {
6261 while (p < end_subject)
6262 {
6263 register int pp = *p++;
6264 if (pp == req_byte || pp == req_byte2) { p--; break; }
6265 }
6266 }
6267 else
6268 {
6269 while (p < end_subject)
6270 {
6271 if (*p++ == req_byte) { p--; break; }
6272 }
6273 }
6274
6275 /* If we can't find the required character, break the matching loop,
6276 forcing a match failure. */
6277
6278 if (p >= end_subject)
6279 {
6280 rc = MATCH_NOMATCH;
6281 break;
6282 }
6283
6284 /* If we have found the required character, save the point where we
6285 found it, so that we don't search again next time round the loop if
6286 the start hasn't passed this character yet. */
6287
6288 req_byte_ptr = p;
6289 }
6290 }
6291 }
6292
6293 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6294 printf(">>>> Match against: ");
6295 pchars(start_match, end_subject - start_match, TRUE, md);
6296 printf("\n");
6297 #endif
6298
6299 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6300 first starting point for which a partial match was found. */
6301
6302 md->start_match_ptr = start_match;
6303 md->start_used_ptr = start_match;
6304 md->match_call_count = 0;
6305 md->match_function_type = 0;
6306 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6307 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6308
6309 switch(rc)
6310 {
6311 /* SKIP passes back the next starting point explicitly, but if it is the
6312 same as the match we have just done, treat it as NOMATCH. */
6313
6314 case MATCH_SKIP:
6315 if (md->start_match_ptr != start_match)
6316 {
6317 new_start_match = md->start_match_ptr;
6318 break;
6319 }
6320 /* Fall through */
6321
6322 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6323 the SKIP's arg was not found. We also treat this as NOMATCH. */
6324
6325 case MATCH_SKIP_ARG:
6326 /* Fall through */
6327
6328 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6329 exactly like PRUNE. */
6330
6331 case MATCH_NOMATCH:
6332 case MATCH_PRUNE:
6333 case MATCH_THEN:
6334 new_start_match = start_match + 1;
6335 #ifdef SUPPORT_UTF8
6336 if (utf8)
6337 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6338 new_start_match++;
6339 #endif
6340 break;
6341
6342 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6343
6344 case MATCH_COMMIT:
6345 rc = MATCH_NOMATCH;
6346 goto ENDLOOP;
6347
6348 /* Any other return is either a match, or some kind of error. */
6349
6350 default:
6351 goto ENDLOOP;
6352 }
6353
6354 /* Control reaches here for the various types of "no match at this point"
6355 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6356
6357 rc = MATCH_NOMATCH;
6358
6359 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6360 newline in the subject (though it may continue over the newline). Therefore,
6361 if we have just failed to match, starting at a newline, do not continue. */
6362
6363 if (firstline && IS_NEWLINE(start_match)) break;
6364
6365 /* Advance to new matching position */
6366
6367 start_match = new_start_match;
6368
6369 /* Break the loop if the pattern is anchored or if we have passed the end of
6370 the subject. */
6371
6372 if (anchored || start_match > end_subject) break;
6373
6374 /* If we have just passed a CR and we are now at a LF, and the pattern does
6375 not contain any explicit matches for \r or \n, and the newline option is CRLF
6376 or ANY or ANYCRLF, advance the match position by one more character. */
6377
6378 if (start_match[-1] == CHAR_CR &&
6379 start_match < end_subject &&
6380 *start_match == CHAR_NL &&
6381 (re->flags & PCRE_HASCRORLF) == 0 &&
6382 (md->nltype == NLTYPE_ANY ||
6383 md->nltype == NLTYPE_ANYCRLF ||
6384 md->nllen == 2))
6385 start_match++;
6386
6387 md->mark = NULL; /* Reset for start of next match attempt */
6388 } /* End of for(;;) "bumpalong" loop */
6389
6390 /* ==========================================================================*/
6391
6392 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6393 conditions is true:
6394
6395 (1) The pattern is anchored or the match was forced to fail by (*COMMIT);
6396
6397 (2) We are past the end of the subject;
6398
6399 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6400 this option requests that a match occur at or before the first newline in
6401 the subject.
6402
6403 When we have a match and the offset vector is big enough to deal with any
6404 backreferences, captured substring offsets will already be set up. In the case
6405 where we had to get some local store to hold offsets for backreference
6406 processing, copy those that we can. In this case there need not be overflow if
6407 certain parts of the pattern were not used, even though there are more
6408 capturing parentheses than vector slots. */
6409
6410 ENDLOOP:
6411
6412 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6413 {
6414 if (using_temporary_offsets)
6415 {
6416 if (offsetcount >= 4)
6417 {
6418 memcpy(offsets + 2, md->offset_vector + 2,
6419 (offsetcount - 2) * sizeof(int));
6420 DPRINTF(("Copied offsets from temporary memory\n"));
6421 }
6422 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6423 DPRINTF(("Freeing temporary memory\n"));
6424 (pcre_free)(md->offset_vector);
6425 }
6426
6427 /* Set the return code to the number of captured strings, or 0 if there are
6428 too many to fit into the vector. */
6429
6430 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6431
6432 /* If there is space, set up the whole thing as substring 0. The value of
6433 md->start_match_ptr might be modified if \K was encountered on the successful
6434 matching path. */
6435
6436 if (offsetcount < 2) rc = 0; else
6437 {
6438 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6439 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6440 }
6441
6442 DPRINTF((">>>> returning %d\n", rc));
6443 goto RETURN_MARK;
6444 }
6445
6446 /* Control gets here if there has been an error, or if the overall match
6447 attempt has failed at all permitted starting positions. */
6448
6449 if (using_temporary_offsets)
6450 {
6451 DPRINTF(("Freeing temporary memory\n"));
6452 (pcre_free)(md->offset_vector);
6453 }
6454
6455 /* For anything other than nomatch or partial match, just return the code. */
6456
6457 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6458 {
6459 DPRINTF((">>>> error: returning %d\n", rc));
6460 return rc;
6461 }
6462
6463 /* Handle partial matches - disable any mark data */
6464
6465 if (start_partial != NULL)
6466 {
6467 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6468 md->mark = NULL;
6469 if (offsetcount > 1)
6470 {
6471 offsets[0] = (int)(start_partial - (USPTR)subject);
6472 offsets[1] = (int)(end_subject - (USPTR)subject);
6473 }
6474 rc = PCRE_ERROR_PARTIAL;
6475 }
6476
6477 /* This is the classic nomatch case */
6478
6479 else
6480 {
6481 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6482 rc = PCRE_ERROR_NOMATCH;
6483 }
6484
6485 /* Return the MARK data if it has been requested. */
6486
6487 RETURN_MARK:
6488
6489 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6490 *(extra_data->mark) = (unsigned char *)(md->mark);
6491 return rc;
6492 }
6493
6494 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5