/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 617 - (show annotations)
Tue Jul 12 11:00:10 2011 UTC (8 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 195108 byte(s)
Error occurred while calculating annotation data.
Small code and comment tidy
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Special call types, stored in md->match_function_type just before a call to
match(). They travel in the match data block rather than as an extra argument
so that no additional stack variable is needed. */

enum {
  MATCH_CONDASSERT = 1,   /* Called to check a condition assertion */
  MATCH_CBEGROUP   = 2    /* Could-be-empty unlimited repeat group */
};

/* Non-error results of match(). Error results are the externally defined
PCRE_ERROR_xxx codes, all of which are negative. */

enum {
  MATCH_NOMATCH = 0,
  MATCH_MATCH   = 1
};

/* Internal-only results of match(). They are made sufficiently negative to
stay clear of the external error codes. */

enum {
  MATCH_ACCEPT   = -999,
  MATCH_COMMIT   = -998,
  MATCH_KETRPOS  = -997,
  MATCH_PRUNE    = -996,
  MATCH_SKIP     = -995,
  MATCH_SKIP_ARG = -994,
  MATCH_THEN     = -993
};
83
/* Convenience macro for a sequence that occurs many times: record the current
mark pointer in the match data block, then return via RRETURN. It expands to a
brace-enclosed block, so it relies on markptr, md and RRETURN being in scope
at the point of use. */

#define MRRETURN(ra) \
  { \
  md->mark = markptr; \
  RRETURN(ra); \
  }
91
/* Upper bound on the number of ints of offset data that are saved on the stack
for recursive calls; a bigger offset vector is saved using malloc instead. Keep
this a multiple of 3, because the offset vector length is always a multiple
of 3. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum counts for the common repeats. In the maxima table a
value of 0 stands for "no upper limit".
NOTE(review): the six slots presumably follow the order of the single-character
repeat opcodes (*, *?, +, +?, ?, ??) - confirm against the code that indexes
these tables. */

static const char rep_min[] = {
  0, 0,      /* minimum 0 */
  1, 1,      /* minimum 1 */
  0, 0       /* minimum 0 */
};

static const char rep_max[] = {
  0, 0,      /* unlimited */
  0, 0,      /* unlimited */
  1, 1       /* at most 1 */
};
102
103
104
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of characters to stdout in printable form: printable characters
are output as themselves and everything else as a \xhh escape. When the run is
taken from the subject, it is cut short at the end of the subject.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block; only used if is_subject is TRUE

Returns:      nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int remaining = length;

/* Never run past the end of the subject string. */

if (is_subject && remaining > md->end_subject - p)
  remaining = md->end_subject - p;

for (; remaining > 0; remaining--)
  {
  unsigned int ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
/* Identifying numbers for the individual RMATCH calls, starting at 1 and
running consecutively. Whenever this list is changed, the code at HEAP_RETURN
below must be updated to match. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54,
  RM55, RM56, RM57, RM58, RM59, RM60, RM61, RM62, RM63
};
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH stores the result of a genuine recursive call to match() in rrc, and
RRETURN is a plain return. The md argument (rd) is passed straight through;
the call number (rw) is ignored. The PCRE_DEBUG variants additionally trace
each call and return on stdout. */

#ifdef PCRE_DEBUG

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#else  /* PCRE_DEBUG not defined */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)

#define RRETURN(ra) return ra

#endif  /* PCRE_DEBUG */
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. Note that
310 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311 argument of match(), which never changes. */
312
#define REGISTER

/* Heap version of RMATCH: obtain a new frame, fill in its copies of the
match() arguments, remember in the current frame which call site (rw) to
resume at, make the new frame current and jump to HEAP_RECURSE. The label
L_<rw> is where control comes back to when the "call" returns. All argument
expressions are evaluated while the old frame is still current. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  newframe->Xprevframe = frame;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  frame->Xwhere = rw;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
334
/* Heap version of RRETURN: discard the current frame and either resume the
previous one (by storing the result in rrc and jumping to HEAP_RETURN) or, if
there is no previous frame, return from match() for real.

match() also invokes RRETURN(PCRE_ERROR_NOMEMORY) when the allocation of the
top-level frame itself has failed, at which point frame is NULL. The unlink
and free are therefore guarded so that a NULL frame is neither dereferenced
nor handed to pcre_stack_free; in that case the macro simply returns. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
351 typedef struct heapframe {
352 struct heapframe *Xprevframe;
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr;
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth;
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word;
379 BOOL Xcondition;
380 BOOL Xprev_is_word;
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink;
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb;
408
409 /* Where to jump back to */
410
411 int Xwhere;
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
/* These macros pack up the tests used for partial matching, which appear
several times in the code. The "hit end" flag is set if the pointer is at the
end of the subject and also past the start of the subject (i.e. something has
been matched). For hard partial matching (md->partial > 1), the macro then
returns PCRE_ERROR_PARTIAL immediately. SCHECK_PARTIAL is the variant for use
when it is already known that the pointer is past the end of the subject, so
it omits the end-of-subject test. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. A failure to match might occur after the group has succeeded,
812 if something later on doesn't match. For this reason, we need to restore
813 the working value and also the values of the final offsets, in case they
814 were set by a previous iteration of the same bracket.
815
816 If there isn't enough space in the offset vector, treat this as if it were
817 a non-capturing bracket. Don't worry about setting the flag for the error
818 case here; that is handled in the code for KET. */
819
820 case OP_CBRA:
821 case OP_SCBRA:
822 number = GET2(ecode, 1+LINK_SIZE);
823 offset = number << 1;
824
825 #ifdef PCRE_DEBUG
826 printf("start bracket %d\n", number);
827 printf("subject=");
828 pchars(eptr, 16, TRUE, md);
829 printf("\n");
830 #endif
831
832 if (offset < md->offset_max)
833 {
834 save_offset1 = md->offset_vector[offset];
835 save_offset2 = md->offset_vector[offset+1];
836 save_offset3 = md->offset_vector[md->offset_end - number];
837 save_capture_last = md->capture_last;
838
839 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
840 md->offset_vector[md->offset_end - number] =
841 (int)(eptr - md->start_subject);
842
843 for (;;)
844 {
845 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
846 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
847 eptrb, RM1);
848 if (rrc != MATCH_NOMATCH &&
849 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
850 RRETURN(rrc);
851 md->capture_last = save_capture_last;
852 ecode += GET(ecode, 1);
853 if (*ecode != OP_ALT) break;
854 }
855
856 DPRINTF(("bracket %d failed\n", number));
857
858 md->offset_vector[offset] = save_offset1;
859 md->offset_vector[offset+1] = save_offset2;
860 md->offset_vector[md->offset_end - number] = save_offset3;
861
862 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
863 RRETURN(MATCH_NOMATCH);
864 }
865
866 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
867 as a non-capturing bracket. */
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
873
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
876
877 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
878 for all the alternatives. When we get to the final alternative within the
879 brackets, we used to return the result of a recursive call to match()
880 whatever happened so it was possible to reduce stack usage by turning this
881 into a tail recursion, except in the case of a possibly empty group.
882 However, now that there is the possibility of (*THEN) occurring in the final
883 alternative, this optimization is no longer possible. */
884
885 case OP_BRA:
886 case OP_SBRA:
887 DPRINTF(("start non-capturing bracket\n"));
888 for (;;)
889 {
890 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
891 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
892 RM2);
893 if (rrc != MATCH_NOMATCH &&
894 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
895 RRETURN(rrc);
896 ecode += GET(ecode, 1);
897 if (*ecode != OP_ALT) break;
898 }
899
900 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
901 RRETURN(MATCH_NOMATCH);
902
903 /* Handle possessive capturing brackets with an unlimited repeat. We come
904 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
905 handled similarly to the normal case above. However, the matching is
906 different. The end of these brackets will always be OP_KETRPOS, which
907 returns MATCH_KETRPOS without going further in the pattern. By this means
908 we can handle the group by iteration rather than recursion, thereby
909 reducing the amount of stack needed. */
910
911 case OP_CBRAPOS:
912 case OP_SCBRAPOS:
913 allow_zero = FALSE;
914
915 POSSESSIVE_CAPTURE:
916 number = GET2(ecode, 1+LINK_SIZE);
917 offset = number << 1;
918
919 #ifdef PCRE_DEBUG
920 printf("start possessive bracket %d\n", number);
921 printf("subject=");
922 pchars(eptr, 16, TRUE, md);
923 printf("\n");
924 #endif
925
926 if (offset < md->offset_max)
927 {
928 matched_once = FALSE;
929 code_offset = ecode - md->start_code;
930
931 save_offset1 = md->offset_vector[offset];
932 save_offset2 = md->offset_vector[offset+1];
933 save_offset3 = md->offset_vector[md->offset_end - number];
934 save_capture_last = md->capture_last;
935
936 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
937
938 /* Each time round the loop, save the current subject position for use
939 when the group matches. For MATCH_MATCH, the group has matched, so we
940 restart it with a new subject starting position, remembering that we had
941 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
942 usual. If we haven't matched any alternatives in any iteration, check to
943 see if a previous iteration matched. If so, the group has matched;
944 continue from afterwards. Otherwise it has failed; restore the previous
945 capture values before returning NOMATCH. */
946
947 for (;;)
948 {
949 md->offset_vector[md->offset_end - number] =
950 (int)(eptr - md->start_subject);
951 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
952 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
953 eptrb, RM63);
954 if (rrc == MATCH_KETRPOS)
955 {
956 offset_top = md->end_offset_top;
957 eptr = md->end_match_ptr;
958 ecode = md->start_code + code_offset;
959 save_capture_last = md->capture_last;
960 matched_once = TRUE;
961 continue;
962 }
963 if (rrc != MATCH_NOMATCH &&
964 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
965 RRETURN(rrc);
966 md->capture_last = save_capture_last;
967 ecode += GET(ecode, 1);
968 if (*ecode != OP_ALT) break;
969 }
970
971 if (!matched_once)
972 {
973 md->offset_vector[offset] = save_offset1;
974 md->offset_vector[offset+1] = save_offset2;
975 md->offset_vector[md->offset_end - number] = save_offset3;
976 }
977
978 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
979 if (allow_zero || matched_once)
980 {
981 ecode += 1 + LINK_SIZE;
982 break;
983 }
984
985 RRETURN(MATCH_NOMATCH);
986 }
987
988 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
989 as a non-capturing bracket. */
990
991 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
992 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
993
994 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
995
996 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
997 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
998
999 /* Non-capturing possessive bracket with unlimited repeat. We come here
1000 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1001 without the capturing complication. It is written out separately for speed
1002 and cleanliness. */
1003
1004 case OP_BRAPOS:
1005 case OP_SBRAPOS:
1006 allow_zero = FALSE;
1007
1008 POSSESSIVE_NON_CAPTURE:
1009 matched_once = FALSE;
1010 code_offset = ecode - md->start_code;
1011
1012 for (;;)
1013 {
1014 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1015 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1016 eptrb, RM48);
1017 if (rrc == MATCH_KETRPOS)
1018 {
1019 offset_top = md->end_offset_top;
1020 eptr = md->end_match_ptr;
1021 ecode = md->start_code + code_offset;
1022 matched_once = TRUE;
1023 continue;
1024 }
1025 if (rrc != MATCH_NOMATCH &&
1026 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1027 RRETURN(rrc);
1028 ecode += GET(ecode, 1);
1029 if (*ecode != OP_ALT) break;
1030 }
1031
1032 if (matched_once || allow_zero)
1033 {
1034 ecode += 1 + LINK_SIZE;
1035 break;
1036 }
1037 RRETURN(MATCH_NOMATCH);
1038
1039 /* Control never reaches here. */
1040
1041 /* Conditional group: compilation checked that there are no more than
1042 two branches. If the condition is false, skipping the first branch takes us
1043 past the end if there is only one branch, but that's OK because that is
1044 exactly what going to the ket would do. */
1045
1046 case OP_COND:
1047 case OP_SCOND:
1048 codelink = GET(ecode, 1);
1049
1050 /* Because of the way auto-callout works during compile, a callout item is
1051 inserted between OP_COND and an assertion condition. */
1052
1053 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1054 {
1055 if (pcre_callout != NULL)
1056 {
1057 pcre_callout_block cb;
1058 cb.version = 1; /* Version 1 of the callout block */
1059 cb.callout_number = ecode[LINK_SIZE+2];
1060 cb.offset_vector = md->offset_vector;
1061 cb.subject = (PCRE_SPTR)md->start_subject;
1062 cb.subject_length = (int)(md->end_subject - md->start_subject);
1063 cb.start_match = (int)(mstart - md->start_subject);
1064 cb.current_position = (int)(eptr - md->start_subject);
1065 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1066 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1067 cb.capture_top = offset_top/2;
1068 cb.capture_last = md->capture_last;
1069 cb.callout_data = md->callout_data;
1070 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1071 if (rrc < 0) RRETURN(rrc);
1072 }
1073 ecode += _pcre_OP_lengths[OP_CALLOUT];
1074 }
1075
1076 condcode = ecode[LINK_SIZE+1];
1077
1078 /* Now see what the actual condition is */
1079
1080 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1081 {
1082 if (md->recursive == NULL) /* Not recursing => FALSE */
1083 {
1084 condition = FALSE;
1085 ecode += GET(ecode, 1);
1086 }
1087 else
1088 {
1089 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1090 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1091
1092 /* If the test is for recursion into a specific subpattern, and it is
1093 false, but the test was set up by name, scan the table to see if the
1094 name refers to any other numbers, and test them. The condition is true
1095 if any one is set. */
1096
1097 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1098 {
1099 uschar *slotA = md->name_table;
1100 for (i = 0; i < md->name_count; i++)
1101 {
1102 if (GET2(slotA, 0) == recno) break;
1103 slotA += md->name_entry_size;
1104 }
1105
1106 /* Found a name for the number - there can be only one; duplicate
1107 names for different numbers are allowed, but not vice versa. First
1108 scan down for duplicates. */
1109
1110 if (i < md->name_count)
1111 {
1112 uschar *slotB = slotA;
1113 while (slotB > md->name_table)
1114 {
1115 slotB -= md->name_entry_size;
1116 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1117 {
1118 condition = GET2(slotB, 0) == md->recursive->group_num;
1119 if (condition) break;
1120 }
1121 else break;
1122 }
1123
1124 /* Scan up for duplicates */
1125
1126 if (!condition)
1127 {
1128 slotB = slotA;
1129 for (i++; i < md->name_count; i++)
1130 {
1131 slotB += md->name_entry_size;
1132 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1133 {
1134 condition = GET2(slotB, 0) == md->recursive->group_num;
1135 if (condition) break;
1136 }
1137 else break;
1138 }
1139 }
1140 }
1141 }
1142
1143 /* Choose branch according to the condition */
1144
1145 ecode += condition? 3 : GET(ecode, 1);
1146 }
1147 }
1148
1149 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1150 {
1151 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1152 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1153
1154 /* If the numbered capture is unset, but the reference was by name,
1155 scan the table to see if the name refers to any other numbers, and test
1156 them. The condition is true if any one is set. This is tediously similar
1157 to the code above, but not close enough to try to amalgamate. */
1158
1159 if (!condition && condcode == OP_NCREF)
1160 {
1161 int refno = offset >> 1;
1162 uschar *slotA = md->name_table;
1163
1164 for (i = 0; i < md->name_count; i++)
1165 {
1166 if (GET2(slotA, 0) == refno) break;
1167 slotA += md->name_entry_size;
1168 }
1169
1170 /* Found a name for the number - there can be only one; duplicate names
1171 for different numbers are allowed, but not vice versa. First scan down
1172 for duplicates. */
1173
1174 if (i < md->name_count)
1175 {
1176 uschar *slotB = slotA;
1177 while (slotB > md->name_table)
1178 {
1179 slotB -= md->name_entry_size;
1180 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1181 {
1182 offset = GET2(slotB, 0) << 1;
1183 condition = offset < offset_top &&
1184 md->offset_vector[offset] >= 0;
1185 if (condition) break;
1186 }
1187 else break;
1188 }
1189
1190 /* Scan up for duplicates */
1191
1192 if (!condition)
1193 {
1194 slotB = slotA;
1195 for (i++; i < md->name_count; i++)
1196 {
1197 slotB += md->name_entry_size;
1198 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1199 {
1200 offset = GET2(slotB, 0) << 1;
1201 condition = offset < offset_top &&
1202 md->offset_vector[offset] >= 0;
1203 if (condition) break;
1204 }
1205 else break;
1206 }
1207 }
1208 }
1209 }
1210
1211 /* Choose branch according to the condition */
1212
1213 ecode += condition? 3 : GET(ecode, 1);
1214 }
1215
1216 else if (condcode == OP_DEF) /* DEFINE - always false */
1217 {
1218 condition = FALSE;
1219 ecode += GET(ecode, 1);
1220 }
1221
1222 /* The condition is an assertion. Call match() to evaluate it - setting
1223 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1224 an assertion. */
1225
1226 else
1227 {
1228 md->match_function_type = MATCH_CONDASSERT;
1229 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1230 if (rrc == MATCH_MATCH)
1231 {
1232 condition = TRUE;
1233 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1234 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1235 }
1236 else if (rrc != MATCH_NOMATCH &&
1237 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1238 {
1239 RRETURN(rrc); /* Need braces because of following else */
1240 }
1241 else
1242 {
1243 condition = FALSE;
1244 ecode += codelink;
1245 }
1246 }
1247
1248 /* We are now at the branch that is to be obeyed. As there is only one,
1249 we used to use tail recursion to avoid using another stack frame, except
1250 when there was unlimited repeat of a possibly empty group. However, that
1251 strategy no longer works because of the possibility of (*THEN) being
1252 encountered in the branch. A recursive call to match() is always required,
1253 unless the second alternative doesn't exist, in which case we can just
1254 plough on. */
1255
1256 if (condition || *ecode == OP_ALT)
1257 {
1258 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1259 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1260 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1261 rrc = MATCH_NOMATCH;
1262 RRETURN(rrc);
1263 }
1264 else /* Condition false & no alternative */
1265 {
1266 ecode += 1 + LINK_SIZE;
1267 }
1268 break;
1269
1270
1271 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1272 to close any currently open capturing brackets. */
1273
1274 case OP_CLOSE:
1275 number = GET2(ecode, 1);
1276 offset = number << 1;
1277
1278 #ifdef PCRE_DEBUG
1279 printf("end bracket %d at *ACCEPT", number);
1280 printf("\n");
1281 #endif
1282
1283 md->capture_last = number;
1284 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1285 {
1286 md->offset_vector[offset] =
1287 md->offset_vector[md->offset_end - number];
1288 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1289 if (offset_top <= offset) offset_top = offset + 2;
1290 }
1291 ecode += 3;
1292 break;
1293
1294
1295 /* End of the pattern, either real or forced. If we are in a recursion, we
1296 should restore the offsets appropriately, and if it's a top-level
1297 recursion, continue from after the call. */
1298
1299 case OP_ACCEPT:
1300 case OP_ASSERT_ACCEPT:
1301 case OP_END:
1302 if (md->recursive != NULL)
1303 {
1304 recursion_info *rec = md->recursive;
1305 md->recursive = rec->prevrec;
1306 memmove(md->offset_vector, rec->offset_save,
1307 rec->saved_max * sizeof(int));
1308 offset_top = rec->save_offset_top;
1309 if (rec->group_num == 0)
1310 {
1311 ecode = rec->after_call;
1312 break;
1313 }
1314 }
1315
1316 /* Otherwise, if we have matched an empty string, fail if not in an
1317 assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1318 is set and we have matched at the start of the subject. In both cases,
1319 backtracking will then try other alternatives, if any. */
1320
1321 else if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1322 (md->notempty ||
1323 (md->notempty_atstart &&
1324 mstart == md->start_subject + md->start_offset)))
1325 MRRETURN(MATCH_NOMATCH);
1326
1327 /* Otherwise, we have a match. */
1328
1329 md->end_match_ptr = eptr; /* Record where we ended */
1330 md->end_offset_top = offset_top; /* and how many extracts were taken */
1331 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1332
1333 /* For some reason, the macros don't work properly if an expression is
1334 given as the argument to MRRETURN when the heap is in use. */
1335
1336 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1337 MRRETURN(rrc);
1338
1339 /* Assertion brackets. Check the alternative branches in turn - the
1340 matching won't pass the KET for an assertion. If any one branch matches,
1341 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1342 start of each branch to move the current point backwards, so the code at
1343 this level is identical to the lookahead case. When the assertion is part
1344 of a condition, we want to return immediately afterwards. The caller of
1345 this incarnation of the match() function will have set MATCH_CONDASSERT in
1346 md->match_function_type, and one of these opcodes will be the first opcode
1347 that is processed. We use a local variable that is preserved over calls to
1348 match() to remember this case. */
1349
1350 case OP_ASSERT:
1351 case OP_ASSERTBACK:
1352 if (md->match_function_type == MATCH_CONDASSERT)
1353 {
1354 condassert = TRUE;
1355 md->match_function_type = 0;
1356 }
1357 else condassert = FALSE;
1358
1359 do
1360 {
1361 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1362 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1363 {
1364 mstart = md->start_match_ptr; /* In case \K reset it */
1365 break;
1366 }
1367 if (rrc != MATCH_NOMATCH &&
1368 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1369 RRETURN(rrc);
1370 ecode += GET(ecode, 1);
1371 }
1372 while (*ecode == OP_ALT);
1373
1374 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1375
1376 /* If checking an assertion for a condition, return MATCH_MATCH. */
1377
1378 if (condassert) RRETURN(MATCH_MATCH);
1379
1380 /* Continue from after the assertion, updating the offsets high water
1381 mark, since extracts may have been taken during the assertion. */
1382
1383 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1384 ecode += 1 + LINK_SIZE;
1385 offset_top = md->end_offset_top;
1386 continue;
1387
1388 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1389 PRUNE, or COMMIT means we must assume failure without checking subsequent
1390 branches. */
1391
1392 case OP_ASSERT_NOT:
1393 case OP_ASSERTBACK_NOT:
1394 if (md->match_function_type == MATCH_CONDASSERT)
1395 {
1396 condassert = TRUE;
1397 md->match_function_type = 0;
1398 }
1399 else condassert = FALSE;
1400
1401 do
1402 {
1403 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1404 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1405 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1406 {
1407 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1408 break;
1409 }
1410 if (rrc != MATCH_NOMATCH &&
1411 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1412 RRETURN(rrc);
1413 ecode += GET(ecode,1);
1414 }
1415 while (*ecode == OP_ALT);
1416
1417 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1418
1419 ecode += 1 + LINK_SIZE;
1420 continue;
1421
1422 /* Move the subject pointer back. This occurs only at the start of
1423 each branch of a lookbehind assertion. If we are too close to the start to
1424 move back, this match function fails. When working with UTF-8 we move
1425 back a number of characters, not bytes. */
1426
1427 case OP_REVERSE:
1428 #ifdef SUPPORT_UTF8
1429 if (utf8)
1430 {
1431 i = GET(ecode, 1);
1432 while (i-- > 0)
1433 {
1434 eptr--;
1435 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1436 BACKCHAR(eptr);
1437 }
1438 }
1439 else
1440 #endif
1441
1442 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1443
1444 {
1445 eptr -= GET(ecode, 1);
1446 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1447 }
1448
1449 /* Save the earliest consulted character, then skip to next op code */
1450
1451 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1452 ecode += 1 + LINK_SIZE;
1453 break;
1454
1455 /* The callout item calls an external function, if one is provided, passing
1456 details of the match so far. This is mainly for debugging, though the
1457 function is able to force a failure. */
1458
1459 case OP_CALLOUT:
1460 if (pcre_callout != NULL)
1461 {
1462 pcre_callout_block cb;
1463 cb.version = 1; /* Version 1 of the callout block */
1464 cb.callout_number = ecode[1];
1465 cb.offset_vector = md->offset_vector;
1466 cb.subject = (PCRE_SPTR)md->start_subject;
1467 cb.subject_length = (int)(md->end_subject - md->start_subject);
1468 cb.start_match = (int)(mstart - md->start_subject);
1469 cb.current_position = (int)(eptr - md->start_subject);
1470 cb.pattern_position = GET(ecode, 2);
1471 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1472 cb.capture_top = offset_top/2;
1473 cb.capture_last = md->capture_last;
1474 cb.callout_data = md->callout_data;
1475 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1476 if (rrc < 0) RRETURN(rrc);
1477 }
1478 ecode += 2 + 2*LINK_SIZE;
1479 break;
1480
1481 /* Recursion either matches the current regex, or some subexpression. The
1482 offset data is the offset to the starting bracket from the start of the
1483 whole pattern. (This is so that it works from duplicated subpatterns.)
1484
1485 If there are any capturing brackets started but not finished, we have to
1486 save their starting points and reinstate them after the recursion. However,
1487 we don't know how many such there are (offset_top records the completed
1488 total) so we just have to save all the potential data. There may be up to
1489 65535 such values, which is too large to put on the stack, but using malloc
1490 for small numbers seems expensive. As a compromise, the stack is used when
1491 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1492 is used.
1493
1494 There are also other values that have to be saved. We use a chained
1495 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1496 for the original version of this logic. */
1497
1498 case OP_RECURSE:
1499 {
1500 callpat = md->start_code + GET(ecode, 1);
1501 new_recursive.group_num = (callpat == md->start_code)? 0 :
1502 GET2(callpat, 1 + LINK_SIZE);
1503
1504 /* Add to "recursing stack" */
1505
1506 new_recursive.prevrec = md->recursive;
1507 md->recursive = &new_recursive;
1508
1509 /* Find where to continue from afterwards */
1510
1511 ecode += 1 + LINK_SIZE;
1512 new_recursive.after_call = ecode;
1513
1514 /* Now save the offset data. */
1515
1516 new_recursive.saved_max = md->offset_end;
1517 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1518 new_recursive.offset_save = stacksave;
1519 else
1520 {
1521 new_recursive.offset_save =
1522 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1523 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1524 }
1525
1526 memcpy(new_recursive.offset_save, md->offset_vector,
1527 new_recursive.saved_max * sizeof(int));
1528 new_recursive.save_offset_top = offset_top;
1529
1530 /* OK, now we can do the recursion. For each top-level alternative we
1531 restore the offset and recursion data. */
1532
1533 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1534 cbegroup = (*callpat >= OP_SBRA);
1535 do
1536 {
1537 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1538 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1539 md, eptrb, RM6);
1540 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1541 {
1542 DPRINTF(("Recursion matched\n"));
1543 md->recursive = new_recursive.prevrec;
1544 if (new_recursive.offset_save != stacksave)
1545 (pcre_free)(new_recursive.offset_save);
1546 MRRETURN(MATCH_MATCH);
1547 }
1548 else if (rrc != MATCH_NOMATCH &&
1549 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1550 {
1551 DPRINTF(("Recursion gave error %d\n", rrc));
1552 if (new_recursive.offset_save != stacksave)
1553 (pcre_free)(new_recursive.offset_save);
1554 RRETURN(rrc);
1555 }
1556
1557 md->recursive = &new_recursive;
1558 memcpy(md->offset_vector, new_recursive.offset_save,
1559 new_recursive.saved_max * sizeof(int));
1560 callpat += GET(callpat, 1);
1561 }
1562 while (*callpat == OP_ALT);
1563
1564 DPRINTF(("Recursion didn't match\n"));
1565 md->recursive = new_recursive.prevrec;
1566 if (new_recursive.offset_save != stacksave)
1567 (pcre_free)(new_recursive.offset_save);
1568 MRRETURN(MATCH_NOMATCH);
1569 }
1570 /* Control never reaches here */
1571
1572 /* "Once" brackets are like assertion brackets except that after a match,
1573 the point in the subject string is not moved back. Thus there can never be
1574 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1575 Check the alternative branches in turn - the matching won't pass the KET
1576 for this kind of subpattern. If any one branch matches, we carry on as at
1577 the end of a normal bracket, leaving the subject pointer, but resetting
1578 the start-of-match value in case it was changed by \K. */
1579
1580 case OP_ONCE:
1581 prev = ecode;
1582 saved_eptr = eptr;
1583
1584 do
1585 {
1586 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1587 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1588 {
1589 mstart = md->start_match_ptr;
1590 break;
1591 }
1592 if (rrc != MATCH_NOMATCH &&
1593 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1594 RRETURN(rrc);
1595 ecode += GET(ecode,1);
1596 }
1597 while (*ecode == OP_ALT);
1598
1599 /* If hit the end of the group (which could be repeated), fail */
1600
1601 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1602
1603 /* Continue after the group, updating the offsets high water mark, since
1604 extracts may have been taken. */
1605
1606 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1607
1608 offset_top = md->end_offset_top;
1609 eptr = md->end_match_ptr;
1610
1611 /* For a non-repeating ket, just continue at this level. This also
1612 happens for a repeating ket if no characters were matched in the group.
1613 This is the forcible breaking of infinite loops as implemented in Perl
1614 5.005. */
1615
1616 if (*ecode == OP_KET || eptr == saved_eptr)
1617 {
1618 ecode += 1+LINK_SIZE;
1619 break;
1620 }
1621
1622 /* The repeating kets try the rest of the pattern or restart from the
1623 preceding bracket, in the appropriate order. The second "call" of match()
1624 uses tail recursion, to avoid using another stack frame. */
1625
1626 if (*ecode == OP_KETRMIN)
1627 {
1628 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1630 ecode = prev;
1631 }
1632 else /* OP_KETRMAX */
1633 {
1634 md->match_function_type = MATCH_CBEGROUP;
1635 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1637 ecode += 1 + LINK_SIZE;
1638 }
1639 goto TAIL_RECURSE;
1640
1641 /* Control never gets here */
1642
1643 /* An alternation is the end of a branch; scan along to find the end of the
1644 bracketed group and go to there. */
1645
1646 case OP_ALT:
1647 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1648 break;
1649
1650 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1651 indicating that it may occur zero times. It may repeat infinitely, or not
1652 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1653 with fixed upper repeat limits are compiled as a number of copies, with the
1654 optional ones preceded by BRAZERO or BRAMINZERO. */
1655
1656 case OP_BRAZERO:
1657 next = ecode + 1;
1658 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1660 do next += GET(next, 1); while (*next == OP_ALT);
1661 ecode = next + 1 + LINK_SIZE;
1662 break;
1663
1664 case OP_BRAMINZERO:
1665 next = ecode + 1;
1666 do next += GET(next, 1); while (*next == OP_ALT);
1667 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1669 ecode++;
1670 break;
1671
1672 case OP_SKIPZERO:
1673 next = ecode+1;
1674 do next += GET(next,1); while (*next == OP_ALT);
1675 ecode = next + 1 + LINK_SIZE;
1676 break;
1677
1678 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1679 here; just jump to the group, with allow_zero set TRUE. */
1680
1681 case OP_BRAPOSZERO:
1682 op = *(++ecode);
1683 allow_zero = TRUE;
1684 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1685 goto POSSESSIVE_NON_CAPTURE;
1686
1687 /* End of a group, repeated or non-repeating. */
1688
1689 case OP_KET:
1690 case OP_KETRMIN:
1691 case OP_KETRMAX:
1692 case OP_KETRPOS:
1693 prev = ecode - GET(ecode, 1);
1694
1695 /* If this was a group that remembered the subject start, in order to break
1696 infinite repeats of empty string matches, retrieve the subject start from
1697 the chain. Otherwise, set it NULL. */
1698
1699 if (*prev >= OP_SBRA)
1700 {
1701 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1702 eptrb = eptrb->epb_prev; /* Backup to previous group */
1703 }
1704 else saved_eptr = NULL;
1705
1706 /* If we are at the end of an assertion group or an atomic group, stop
1707 matching and return MATCH_MATCH, but record the current high water mark for
1708 use by positive assertions. We also need to record the match start in case
1709 it was changed by \K. */
1710
1711 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1712 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1713 *prev == OP_ONCE)
1714 {
1715 md->end_match_ptr = eptr; /* For ONCE */
1716 md->end_offset_top = offset_top;
1717 md->start_match_ptr = mstart;
1718 MRRETURN(MATCH_MATCH);
1719 }
1720
1721 /* For capturing groups we have to check the group number back at the start
1722 and if necessary complete handling an extraction by setting the offsets and
1723 bumping the high water mark. Note that whole-pattern recursion is coded as
1724 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1725 when the OP_END is reached. Other recursion is handled here. */
1726
1727 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1728 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1729 {
1730 number = GET2(prev, 1+LINK_SIZE);
1731 offset = number << 1;
1732
1733 #ifdef PCRE_DEBUG
1734 printf("end bracket %d", number);
1735 printf("\n");
1736 #endif
1737
1738 md->capture_last = number;
1739 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1740 {
1741 /* If offset is greater than offset_top, it means that we are
1742 "skipping" a capturing group, and that group's offsets must be marked
1743 unset. In earlier versions of PCRE, all the offsets were unset at the
1744 start of matching, but this doesn't work because atomic groups and
1745 assertions can cause a value to be set that should later be unset.
1746 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1747 part of the atomic group, but this is not on the final matching path,
1748 so must be unset when 2 is set. (If there is no group 2, there is no
1749 problem, because offset_top will then be 2, indicating no capture.) */
1750
1751 if (offset > offset_top)
1752 {
1753 register int *iptr = md->offset_vector + offset_top;
1754 register int *iend = md->offset_vector + offset;
1755 while (iptr < iend) *iptr++ = -1;
1756 }
1757
1758 /* Now make the extraction */
1759
1760 md->offset_vector[offset] =
1761 md->offset_vector[md->offset_end - number];
1762 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1763 if (offset_top <= offset) offset_top = offset + 2;
1764 }
1765
1766 /* Handle a recursively called group. Restore the offsets
1767 appropriately and continue from after the call. */
1768
1769 if (md->recursive != NULL && md->recursive->group_num == number)
1770 {
1771 recursion_info *rec = md->recursive;
1772 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1773 md->recursive = rec->prevrec;
1774 memcpy(md->offset_vector, rec->offset_save,
1775 rec->saved_max * sizeof(int));
1776 offset_top = rec->save_offset_top;
1777 ecode = rec->after_call;
1778 break;
1779 }
1780 }
1781
1782 /* For a non-repeating ket, just continue at this level. This also
1783 happens for a repeating ket if no characters were matched in the group.
1784 This is the forcible breaking of infinite loops as implemented in Perl
1785 5.005. If there is an options reset, it will get obeyed in the normal
1786 course of events. */
1787
1788 if (*ecode == OP_KET || eptr == saved_eptr)
1789 {
1790 ecode += 1 + LINK_SIZE;
1791 break;
1792 }
1793
1794 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1795 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1796 at a time from the outer level, thus saving stack. */
1797
1798 if (*ecode == OP_KETRPOS)
1799 {
1800 md->end_match_ptr = eptr;
1801 md->end_offset_top = offset_top;
1802 RRETURN(MATCH_KETRPOS);
1803 }
1804
1805 /* The normal repeating kets try the rest of the pattern or restart from
1806 the preceding bracket, in the appropriate order. In the second case, we can
1807 use tail recursion to avoid using another stack frame, unless we have an
1808 unlimited repeat of a group that can match an empty string. */
1809
1810 if (*ecode == OP_KETRMIN)
1811 {
1812 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1814 if (*prev >= OP_SBRA) /* Could match an empty string */
1815 {
1816 md->match_function_type = MATCH_CBEGROUP;
1817 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1818 RRETURN(rrc);
1819 }
1820 ecode = prev;
1821 goto TAIL_RECURSE;
1822 }
1823 else /* OP_KETRMAX */
1824 {
1825 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1826 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1828 ecode += 1 + LINK_SIZE;
1829 goto TAIL_RECURSE;
1830 }
1831 /* Control never gets here */
1832
1833 /* Not multiline mode: start of subject assertion, unless notbol. */
1834
1835 case OP_CIRC:
1836 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1837
1838 /* Start of subject assertion */
1839
1840 case OP_SOD:
1841 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1842 ecode++;
1843 break;
1844
1845 /* Multiline mode: start of subject unless notbol, or after any newline. */
1846
1847 case OP_CIRCM:
1848 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1849 if (eptr != md->start_subject &&
1850 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1851 MRRETURN(MATCH_NOMATCH);
1852 ecode++;
1853 break;
1854
1855 /* Start of match assertion */
1856
1857 case OP_SOM:
1858 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1859 ecode++;
1860 break;
1861
1862 /* Reset the start of match point */
1863
1864 case OP_SET_SOM:
1865 mstart = eptr;
1866 ecode++;
1867 break;
1868
1869 /* Multiline mode: assert before any newline, or before end of subject
1870 unless noteol is set. */
1871
1872 case OP_DOLLM:
1873 if (eptr < md->end_subject)
1874 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1875 else
1876 {
1877 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1878 SCHECK_PARTIAL();
1879 }
1880 ecode++;
1881 break;
1882
1883 /* Not multiline mode: assert before a terminating newline or before end of
1884 subject unless noteol is set. */
1885
1886 case OP_DOLL:
1887 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1888 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1889
1890 /* ... else fall through for endonly */
1891
1892 /* End of subject assertion (\z) */
1893
1894 case OP_EOD:
1895 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1896 SCHECK_PARTIAL();
1897 ecode++;
1898 break;
1899
1900 /* End of subject or ending \n assertion (\Z) */
1901
1902 case OP_EODN:
1903 ASSERT_NL_OR_EOS:
1904 if (eptr < md->end_subject &&
1905 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1906 MRRETURN(MATCH_NOMATCH);
1907
1908 /* Either at end of string or \n before end. */
1909
1910 SCHECK_PARTIAL();
1911 ecode++;
1912 break;
1913
1914 /* Word boundary assertions */
1915
1916 case OP_NOT_WORD_BOUNDARY:
1917 case OP_WORD_BOUNDARY:
1918 {
1919
1920 /* Find out if the previous and current characters are "word" characters.
1921 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1922 be "non-word" characters. Remember the earliest consulted character for
1923 partial matching. */
1924
1925 #ifdef SUPPORT_UTF8
1926 if (utf8)
1927 {
1928 /* Get status of previous character */
1929
1930 if (eptr == md->start_subject) prev_is_word = FALSE; else
1931 {
1932 USPTR lastptr = eptr - 1;
1933 while((*lastptr & 0xc0) == 0x80) lastptr--;
1934 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1935 GETCHAR(c, lastptr);
1936 #ifdef SUPPORT_UCP
1937 if (md->use_ucp)
1938 {
1939 if (c == '_') prev_is_word = TRUE; else
1940 {
1941 int cat = UCD_CATEGORY(c);
1942 prev_is_word = (cat == ucp_L || cat == ucp_N);
1943 }
1944 }
1945 else
1946 #endif
1947 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1948 }
1949
1950 /* Get status of next character */
1951
1952 if (eptr >= md->end_subject)
1953 {
1954 SCHECK_PARTIAL();
1955 cur_is_word = FALSE;
1956 }
1957 else
1958 {
1959 GETCHAR(c, eptr);
1960 #ifdef SUPPORT_UCP
1961 if (md->use_ucp)
1962 {
1963 if (c == '_') cur_is_word = TRUE; else
1964 {
1965 int cat = UCD_CATEGORY(c);
1966 cur_is_word = (cat == ucp_L || cat == ucp_N);
1967 }
1968 }
1969 else
1970 #endif
1971 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1972 }
1973 }
1974 else
1975 #endif
1976
1977 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1978 consistency with the behaviour of \w we do use it in this case. */
1979
1980 {
1981 /* Get status of previous character */
1982
1983 if (eptr == md->start_subject) prev_is_word = FALSE; else
1984 {
1985 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1986 #ifdef SUPPORT_UCP
1987 if (md->use_ucp)
1988 {
1989 c = eptr[-1];
1990 if (c == '_') prev_is_word = TRUE; else
1991 {
1992 int cat = UCD_CATEGORY(c);
1993 prev_is_word = (cat == ucp_L || cat == ucp_N);
1994 }
1995 }
1996 else
1997 #endif
1998 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1999 }
2000
2001 /* Get status of next character */
2002
2003 if (eptr >= md->end_subject)
2004 {
2005 SCHECK_PARTIAL();
2006 cur_is_word = FALSE;
2007 }
2008 else
2009 #ifdef SUPPORT_UCP
2010 if (md->use_ucp)
2011 {
2012 c = *eptr;
2013 if (c == '_') cur_is_word = TRUE; else
2014 {
2015 int cat = UCD_CATEGORY(c);
2016 cur_is_word = (cat == ucp_L || cat == ucp_N);
2017 }
2018 }
2019 else
2020 #endif
2021 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2022 }
2023
2024 /* Now see if the situation is what we want */
2025
2026 if ((*ecode++ == OP_WORD_BOUNDARY)?
2027 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2028 MRRETURN(MATCH_NOMATCH);
2029 }
2030 break;
2031
2032 /* Match a single character type; inline for speed */
2033
2034 case OP_ANY:
2035 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2036 /* Fall through */
2037
2038 case OP_ALLANY:
2039 if (eptr++ >= md->end_subject)
2040 {
2041 SCHECK_PARTIAL();
2042 MRRETURN(MATCH_NOMATCH);
2043 }
2044 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2045 ecode++;
2046 break;
2047
2048 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2049 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2050
2051 case OP_ANYBYTE:
2052 if (eptr++ >= md->end_subject)
2053 {
2054 SCHECK_PARTIAL();
2055 MRRETURN(MATCH_NOMATCH);
2056 }
2057 ecode++;
2058 break;
2059
2060 case OP_NOT_DIGIT:
2061 if (eptr >= md->end_subject)
2062 {
2063 SCHECK_PARTIAL();
2064 MRRETURN(MATCH_NOMATCH);
2065 }
2066 GETCHARINCTEST(c, eptr);
2067 if (
2068 #ifdef SUPPORT_UTF8
2069 c < 256 &&
2070 #endif
2071 (md->ctypes[c] & ctype_digit) != 0
2072 )
2073 MRRETURN(MATCH_NOMATCH);
2074 ecode++;
2075 break;
2076
2077 case OP_DIGIT:
2078 if (eptr >= md->end_subject)
2079 {
2080 SCHECK_PARTIAL();
2081 MRRETURN(MATCH_NOMATCH);
2082 }
2083 GETCHARINCTEST(c, eptr);
2084 if (
2085 #ifdef SUPPORT_UTF8
2086 c >= 256 ||
2087 #endif
2088 (md->ctypes[c] & ctype_digit) == 0
2089 )
2090 MRRETURN(MATCH_NOMATCH);
2091 ecode++;
2092 break;
2093
2094 case OP_NOT_WHITESPACE:
2095 if (eptr >= md->end_subject)
2096 {
2097 SCHECK_PARTIAL();
2098 MRRETURN(MATCH_NOMATCH);
2099 }
2100 GETCHARINCTEST(c, eptr);
2101 if (
2102 #ifdef SUPPORT_UTF8
2103 c < 256 &&
2104 #endif
2105 (md->ctypes[c] & ctype_space) != 0
2106 )
2107 MRRETURN(MATCH_NOMATCH);
2108 ecode++;
2109 break;
2110
2111 case OP_WHITESPACE:
2112 if (eptr >= md->end_subject)
2113 {
2114 SCHECK_PARTIAL();
2115 MRRETURN(MATCH_NOMATCH);
2116 }
2117 GETCHARINCTEST(c, eptr);
2118 if (
2119 #ifdef SUPPORT_UTF8
2120 c >= 256 ||
2121 #endif
2122 (md->ctypes[c] & ctype_space) == 0
2123 )
2124 MRRETURN(MATCH_NOMATCH);
2125 ecode++;
2126 break;
2127
2128 case OP_NOT_WORDCHAR:
2129 if (eptr >= md->end_subject)
2130 {
2131 SCHECK_PARTIAL();
2132 MRRETURN(MATCH_NOMATCH);
2133 }
2134 GETCHARINCTEST(c, eptr);
2135 if (
2136 #ifdef SUPPORT_UTF8
2137 c < 256 &&
2138 #endif
2139 (md->ctypes[c] & ctype_word) != 0
2140 )
2141 MRRETURN(MATCH_NOMATCH);
2142 ecode++;
2143 break;
2144
2145 case OP_WORDCHAR:
2146 if (eptr >= md->end_subject)
2147 {
2148 SCHECK_PARTIAL();
2149 MRRETURN(MATCH_NOMATCH);
2150 }
2151 GETCHARINCTEST(c, eptr);
2152 if (
2153 #ifdef SUPPORT_UTF8
2154 c >= 256 ||
2155 #endif
2156 (md->ctypes[c] & ctype_word) == 0
2157 )
2158 MRRETURN(MATCH_NOMATCH);
2159 ecode++;
2160 break;
2161
2162 case OP_ANYNL:
2163 if (eptr >= md->end_subject)
2164 {
2165 SCHECK_PARTIAL();
2166 MRRETURN(MATCH_NOMATCH);
2167 }
2168 GETCHARINCTEST(c, eptr);
2169 switch(c)
2170 {
2171 default: MRRETURN(MATCH_NOMATCH);
2172
2173 case 0x000d:
2174 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2175 break;
2176
2177 case 0x000a:
2178 break;
2179
2180 case 0x000b:
2181 case 0x000c:
2182 case 0x0085:
2183 case 0x2028:
2184 case 0x2029:
2185 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2186 break;
2187 }
2188 ecode++;
2189 break;
2190
2191 case OP_NOT_HSPACE:
2192 if (eptr >= md->end_subject)
2193 {
2194 SCHECK_PARTIAL();
2195 MRRETURN(MATCH_NOMATCH);
2196 }
2197 GETCHARINCTEST(c, eptr);
2198 switch(c)
2199 {
2200 default: break;
2201 case 0x09: /* HT */
2202 case 0x20: /* SPACE */
2203 case 0xa0: /* NBSP */
2204 case 0x1680: /* OGHAM SPACE MARK */
2205 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2206 case 0x2000: /* EN QUAD */
2207 case 0x2001: /* EM QUAD */
2208 case 0x2002: /* EN SPACE */
2209 case 0x2003: /* EM SPACE */
2210 case 0x2004: /* THREE-PER-EM SPACE */
2211 case 0x2005: /* FOUR-PER-EM SPACE */
2212 case 0x2006: /* SIX-PER-EM SPACE */
2213 case 0x2007: /* FIGURE SPACE */
2214 case 0x2008: /* PUNCTUATION SPACE */
2215 case 0x2009: /* THIN SPACE */
2216 case 0x200A: /* HAIR SPACE */
2217 case 0x202f: /* NARROW NO-BREAK SPACE */
2218 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2219 case 0x3000: /* IDEOGRAPHIC SPACE */
2220 MRRETURN(MATCH_NOMATCH);
2221 }
2222 ecode++;
2223 break;
2224
2225 case OP_HSPACE:
2226 if (eptr >= md->end_subject)
2227 {
2228 SCHECK_PARTIAL();
2229 MRRETURN(MATCH_NOMATCH);
2230 }
2231 GETCHARINCTEST(c, eptr);
2232 switch(c)
2233 {
2234 default: MRRETURN(MATCH_NOMATCH);
2235 case 0x09: /* HT */
2236 case 0x20: /* SPACE */
2237 case 0xa0: /* NBSP */
2238 case 0x1680: /* OGHAM SPACE MARK */
2239 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2240 case 0x2000: /* EN QUAD */
2241 case 0x2001: /* EM QUAD */
2242 case 0x2002: /* EN SPACE */
2243 case 0x2003: /* EM SPACE */
2244 case 0x2004: /* THREE-PER-EM SPACE */
2245 case 0x2005: /* FOUR-PER-EM SPACE */
2246 case 0x2006: /* SIX-PER-EM SPACE */
2247 case 0x2007: /* FIGURE SPACE */
2248 case 0x2008: /* PUNCTUATION SPACE */
2249 case 0x2009: /* THIN SPACE */
2250 case 0x200A: /* HAIR SPACE */
2251 case 0x202f: /* NARROW NO-BREAK SPACE */
2252 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2253 case 0x3000: /* IDEOGRAPHIC SPACE */
2254 break;
2255 }
2256 ecode++;
2257 break;
2258
2259 case OP_NOT_VSPACE:
2260 if (eptr >= md->end_subject)
2261 {
2262 SCHECK_PARTIAL();
2263 MRRETURN(MATCH_NOMATCH);
2264 }
2265 GETCHARINCTEST(c, eptr);
2266 switch(c)
2267 {
2268 default: break;
2269 case 0x0a: /* LF */
2270 case 0x0b: /* VT */
2271 case 0x0c: /* FF */
2272 case 0x0d: /* CR */
2273 case 0x85: /* NEL */
2274 case 0x2028: /* LINE SEPARATOR */
2275 case 0x2029: /* PARAGRAPH SEPARATOR */
2276 MRRETURN(MATCH_NOMATCH);
2277 }
2278 ecode++;
2279 break;
2280
2281 case OP_VSPACE:
2282 if (eptr >= md->end_subject)
2283 {
2284 SCHECK_PARTIAL();
2285 MRRETURN(MATCH_NOMATCH);
2286 }
2287 GETCHARINCTEST(c, eptr);
2288 switch(c)
2289 {
2290 default: MRRETURN(MATCH_NOMATCH);
2291 case 0x0a: /* LF */
2292 case 0x0b: /* VT */
2293 case 0x0c: /* FF */
2294 case 0x0d: /* CR */
2295 case 0x85: /* NEL */
2296 case 0x2028: /* LINE SEPARATOR */
2297 case 0x2029: /* PARAGRAPH SEPARATOR */
2298 break;
2299 }
2300 ecode++;
2301 break;
2302
2303 #ifdef SUPPORT_UCP
2304 /* Check the next character by Unicode property. We will get here only
2305 if the support is in the binary; otherwise a compile-time error occurs. */
2306
2307 case OP_PROP:
2308 case OP_NOTPROP:
2309 if (eptr >= md->end_subject)
2310 {
2311 SCHECK_PARTIAL();
2312 MRRETURN(MATCH_NOMATCH);
2313 }
2314 GETCHARINCTEST(c, eptr);
2315 {
2316 const ucd_record *prop = GET_UCD(c);
2317
2318 switch(ecode[1])
2319 {
2320 case PT_ANY:
2321 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2322 break;
2323
2324 case PT_LAMP:
2325 if ((prop->chartype == ucp_Lu ||
2326 prop->chartype == ucp_Ll ||
2327 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2328 MRRETURN(MATCH_NOMATCH);
2329 break;
2330
2331 case PT_GC:
2332 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2333 MRRETURN(MATCH_NOMATCH);
2334 break;
2335
2336 case PT_PC:
2337 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2338 MRRETURN(MATCH_NOMATCH);
2339 break;
2340
2341 case PT_SC:
2342 if ((ecode[2] != prop->script) == (op == OP_PROP))
2343 MRRETURN(MATCH_NOMATCH);
2344 break;
2345
2346 /* These are specials */
2347
2348 case PT_ALNUM:
2349 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2350 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2351 MRRETURN(MATCH_NOMATCH);
2352 break;
2353
2354 case PT_SPACE: /* Perl space */
2355 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2356 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2357 == (op == OP_NOTPROP))
2358 MRRETURN(MATCH_NOMATCH);
2359 break;
2360
2361 case PT_PXSPACE: /* POSIX space */
2362 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2363 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2364 c == CHAR_FF || c == CHAR_CR)
2365 == (op == OP_NOTPROP))
2366 MRRETURN(MATCH_NOMATCH);
2367 break;
2368
2369 case PT_WORD:
2370 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2371 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2372 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2373 MRRETURN(MATCH_NOMATCH);
2374 break;
2375
2376 /* This should never occur */
2377
2378 default:
2379 RRETURN(PCRE_ERROR_INTERNAL);
2380 }
2381
2382 ecode += 3;
2383 }
2384 break;
2385
2386 /* Match an extended Unicode sequence. We will get here only if the support
2387 is in the binary; otherwise a compile-time error occurs. */
2388
2389 case OP_EXTUNI:
2390 if (eptr >= md->end_subject)
2391 {
2392 SCHECK_PARTIAL();
2393 MRRETURN(MATCH_NOMATCH);
2394 }
2395 GETCHARINCTEST(c, eptr);
2396 {
2397 int category = UCD_CATEGORY(c);
2398 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2399 while (eptr < md->end_subject)
2400 {
2401 int len = 1;
2402 if (!utf8) c = *eptr; else
2403 {
2404 GETCHARLEN(c, eptr, len);
2405 }
2406 category = UCD_CATEGORY(c);
2407 if (category != ucp_M) break;
2408 eptr += len;
2409 }
2410 }
2411 ecode++;
2412 break;
2413 #endif
2414
2415
2416 /* Match a back reference, possibly repeatedly. Look past the end of the
2417 item to see if there is repeat information following. The code is similar
2418 to that for character classes, but repeated for efficiency. Then obey
2419 similar code to character type repeats - written out again for speed.
2420 However, if the referenced string is the empty string, always treat
2421 it as matched, any number of times (otherwise there could be infinite
2422 loops). */
2423
2424 case OP_REF:
2425 case OP_REFI:
2426 caseless = op == OP_REFI;
2427 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2428 ecode += 3;
2429
2430 /* If the reference is unset, there are two possibilities:
2431
2432 (a) In the default, Perl-compatible state, set the length negative;
2433 this ensures that every attempt at a match fails. We can't just fail
2434 here, because of the possibility of quantifiers with zero minima.
2435
2436 (b) If the JavaScript compatibility flag is set, set the length to zero
2437 so that the back reference matches an empty string.
2438
2439 Otherwise, set the length to the length of what was matched by the
2440 referenced subpattern. */
2441
2442 if (offset >= offset_top || md->offset_vector[offset] < 0)
2443 length = (md->jscript_compat)? 0 : -1;
2444 else
2445 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2446
2447 /* Set up for repetition, or handle the non-repeated case */
2448
2449 switch (*ecode)
2450 {
2451 case OP_CRSTAR:
2452 case OP_CRMINSTAR:
2453 case OP_CRPLUS:
2454 case OP_CRMINPLUS:
2455 case OP_CRQUERY:
2456 case OP_CRMINQUERY:
2457 c = *ecode++ - OP_CRSTAR;
2458 minimize = (c & 1) != 0;
2459 min = rep_min[c]; /* Pick up values from tables; */
2460 max = rep_max[c]; /* zero for max => infinity */
2461 if (max == 0) max = INT_MAX;
2462 break;
2463
2464 case OP_CRRANGE:
2465 case OP_CRMINRANGE:
2466 minimize = (*ecode == OP_CRMINRANGE);
2467 min = GET2(ecode, 1);
2468 max = GET2(ecode, 3);
2469 if (max == 0) max = INT_MAX;
2470 ecode += 5;
2471 break;
2472
2473 default: /* No repeat follows */
2474 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2475 {
2476 CHECK_PARTIAL();
2477 MRRETURN(MATCH_NOMATCH);
2478 }
2479 eptr += length;
2480 continue; /* With the main loop */
2481 }
2482
2483 /* Handle repeated back references. If the length of the reference is
2484 zero, just continue with the main loop. */
2485
2486 if (length == 0) continue;
2487
2488 /* First, ensure the minimum number of matches are present. We get back
2489 the length of the reference string explicitly rather than passing the
2490 address of eptr, so that eptr can be a register variable. */
2491
2492 for (i = 1; i <= min; i++)
2493 {
2494 int slength;
2495 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2496 {
2497 CHECK_PARTIAL();
2498 MRRETURN(MATCH_NOMATCH);
2499 }
2500 eptr += slength;
2501 }
2502
2503 /* If min = max, continue at the same level without recursion.
2504 They are not both allowed to be zero. */
2505
2506 if (min == max) continue;
2507
2508 /* If minimizing, keep trying and advancing the pointer */
2509
2510 if (minimize)
2511 {
2512 for (fi = min;; fi++)
2513 {
2514 int slength;
2515 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2517 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2518 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2519 {
2520 CHECK_PARTIAL();
2521 MRRETURN(MATCH_NOMATCH);
2522 }
2523 eptr += slength;
2524 }
2525 /* Control never gets here */
2526 }
2527
2528 /* If maximizing, find the longest string and work backwards */
2529
2530 else
2531 {
2532 pp = eptr;
2533 for (i = min; i < max; i++)
2534 {
2535 int slength;
2536 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2537 {
2538 CHECK_PARTIAL();
2539 break;
2540 }
2541 eptr += slength;
2542 }
2543 while (eptr >= pp)
2544 {
2545 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2546 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2547 eptr -= length;
2548 }
2549 MRRETURN(MATCH_NOMATCH);
2550 }
2551 /* Control never gets here */
2552
2553 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2554 used when all the characters in the class have values in the range 0-255,
2555 and either the matching is caseful, or the characters are in the range
2556 0-127 when UTF-8 processing is enabled. The only difference between
2557 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2558 encountered.
2559
2560 First, look past the end of the item to see if there is repeat information
2561 following. Then obey similar code to character type repeats - written out
2562 again for speed. */
2563
2564 case OP_NCLASS:
2565 case OP_CLASS:
2566 {
2567 data = ecode + 1; /* Save for matching */
2568 ecode += 33; /* Advance past the item */
2569
2570 switch (*ecode)
2571 {
2572 case OP_CRSTAR:
2573 case OP_CRMINSTAR:
2574 case OP_CRPLUS:
2575 case OP_CRMINPLUS:
2576 case OP_CRQUERY:
2577 case OP_CRMINQUERY:
2578 c = *ecode++ - OP_CRSTAR;
2579 minimize = (c & 1) != 0;
2580 min = rep_min[c]; /* Pick up values from tables; */
2581 max = rep_max[c]; /* zero for max => infinity */
2582 if (max == 0) max = INT_MAX;
2583 break;
2584
2585 case OP_CRRANGE:
2586 case OP_CRMINRANGE:
2587 minimize = (*ecode == OP_CRMINRANGE);
2588 min = GET2(ecode, 1);
2589 max = GET2(ecode, 3);
2590 if (max == 0) max = INT_MAX;
2591 ecode += 5;
2592 break;
2593
2594 default: /* No repeat follows */
2595 min = max = 1;
2596 break;
2597 }
2598
2599 /* First, ensure the minimum number of matches are present. */
2600
2601 #ifdef SUPPORT_UTF8
2602 /* UTF-8 mode */
2603 if (utf8)
2604 {
2605 for (i = 1; i <= min; i++)
2606 {
2607 if (eptr >= md->end_subject)
2608 {
2609 SCHECK_PARTIAL();
2610 MRRETURN(MATCH_NOMATCH);
2611 }
2612 GETCHARINC(c, eptr);
2613 if (c > 255)
2614 {
2615 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2616 }
2617 else
2618 {
2619 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2620 }
2621 }
2622 }
2623 else
2624 #endif
2625 /* Not UTF-8 mode */
2626 {
2627 for (i = 1; i <= min; i++)
2628 {
2629 if (eptr >= md->end_subject)
2630 {
2631 SCHECK_PARTIAL();
2632 MRRETURN(MATCH_NOMATCH);
2633 }
2634 c = *eptr++;
2635 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2636 }
2637 }
2638
2639 /* If max == min we can continue with the main loop without the
2640 need to recurse. */
2641
2642 if (min == max) continue;
2643
2644 /* If minimizing, keep testing the rest of the expression and advancing
2645 the pointer while it matches the class. */
2646
2647 if (minimize)
2648 {
2649 #ifdef SUPPORT_UTF8
2650 /* UTF-8 mode */
2651 if (utf8)
2652 {
2653 for (fi = min;; fi++)
2654 {
2655 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2656 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2657 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2658 if (eptr >= md->end_subject)
2659 {
2660 SCHECK_PARTIAL();
2661 MRRETURN(MATCH_NOMATCH);
2662 }
2663 GETCHARINC(c, eptr);
2664 if (c > 255)
2665 {
2666 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2667 }
2668 else
2669 {
2670 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2671 }
2672 }
2673 }
2674 else
2675 #endif
2676 /* Not UTF-8 mode */
2677 {
2678 for (fi = min;; fi++)
2679 {
2680 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2682 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2683 if (eptr >= md->end_subject)
2684 {
2685 SCHECK_PARTIAL();
2686 MRRETURN(MATCH_NOMATCH);
2687 }
2688 c = *eptr++;
2689 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2690 }
2691 }
2692 /* Control never gets here */
2693 }
2694
2695 /* If maximizing, find the longest possible run, then work backwards. */
2696
2697 else
2698 {
2699 pp = eptr;
2700
2701 #ifdef SUPPORT_UTF8
2702 /* UTF-8 mode */
2703 if (utf8)
2704 {
2705 for (i = min; i < max; i++)
2706 {
2707 int len = 1;
2708 if (eptr >= md->end_subject)
2709 {
2710 SCHECK_PARTIAL();
2711 break;
2712 }
2713 GETCHARLEN(c, eptr, len);
2714 if (c > 255)
2715 {
2716 if (op == OP_CLASS) break;
2717 }
2718 else
2719 {
2720 if ((data[c/8] & (1 << (c&7))) == 0) break;
2721 }
2722 eptr += len;
2723 }
2724 for (;;)
2725 {
2726 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2727 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2728 if (eptr-- == pp) break; /* Stop if tried at original pos */
2729 BACKCHAR(eptr);
2730 }
2731 }
2732 else
2733 #endif
2734 /* Not UTF-8 mode */
2735 {
2736 for (i = min; i < max; i++)
2737 {
2738 if (eptr >= md->end_subject)
2739 {
2740 SCHECK_PARTIAL();
2741 break;
2742 }
2743 c = *eptr;
2744 if ((data[c/8] & (1 << (c&7))) == 0) break;
2745 eptr++;
2746 }
2747 while (eptr >= pp)
2748 {
2749 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2750 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2751 eptr--;
2752 }
2753 }
2754
2755 MRRETURN(MATCH_NOMATCH);
2756 }
2757 }
2758 /* Control never gets here */
2759
2760
2761 /* Match an extended character class. This opcode is encountered only
2762 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2763 mode, because Unicode properties are supported in non-UTF-8 mode. */
2764
2765 #ifdef SUPPORT_UTF8
2766 case OP_XCLASS:
2767 {
2768 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2769 ecode += GET(ecode, 1); /* Advance past the item */
2770
2771 switch (*ecode)
2772 {
2773 case OP_CRSTAR:
2774 case OP_CRMINSTAR:
2775 case OP_CRPLUS:
2776 case OP_CRMINPLUS:
2777 case OP_CRQUERY:
2778 case OP_CRMINQUERY:
2779 c = *ecode++ - OP_CRSTAR;
2780 minimize = (c & 1) != 0;
2781 min = rep_min[c]; /* Pick up values from tables; */
2782 max = rep_max[c]; /* zero for max => infinity */
2783 if (max == 0) max = INT_MAX;
2784 break;
2785
2786 case OP_CRRANGE:
2787 case OP_CRMINRANGE:
2788 minimize = (*ecode == OP_CRMINRANGE);
2789 min = GET2(ecode, 1);
2790 max = GET2(ecode, 3);
2791 if (max == 0) max = INT_MAX;
2792 ecode += 5;
2793 break;
2794
2795 default: /* No repeat follows */
2796 min = max = 1;
2797 break;
2798 }
2799
2800 /* First, ensure the minimum number of matches are present. */
2801
2802 for (i = 1; i <= min; i++)
2803 {
2804 if (eptr >= md->end_subject)
2805 {
2806 SCHECK_PARTIAL();
2807 MRRETURN(MATCH_NOMATCH);
2808 }
2809 GETCHARINCTEST(c, eptr);
2810 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2811 }
2812
2813 /* If max == min we can continue with the main loop without the
2814 need to recurse. */
2815
2816 if (min == max) continue;
2817
2818 /* If minimizing, keep testing the rest of the expression and advancing
2819 the pointer while it matches the class. */
2820
2821 if (minimize)
2822 {
2823 for (fi = min;; fi++)
2824 {
2825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2827 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2828 if (eptr >= md->end_subject)
2829 {
2830 SCHECK_PARTIAL();
2831 MRRETURN(MATCH_NOMATCH);
2832 }
2833 GETCHARINCTEST(c, eptr);
2834 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2835 }
2836 /* Control never gets here */
2837 }
2838
2839 /* If maximizing, find the longest possible run, then work backwards. */
2840
2841 else
2842 {
2843 pp = eptr;
2844 for (i = min; i < max; i++)
2845 {
2846 int len = 1;
2847 if (eptr >= md->end_subject)
2848 {
2849 SCHECK_PARTIAL();
2850 break;
2851 }
2852 GETCHARLENTEST(c, eptr, len);
2853 if (!_pcre_xclass(c, data)) break;
2854 eptr += len;
2855 }
2856 for(;;)
2857 {
2858 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2860 if (eptr-- == pp) break; /* Stop if tried at original pos */
2861 if (utf8) BACKCHAR(eptr);
2862 }
2863 MRRETURN(MATCH_NOMATCH);
2864 }
2865
2866 /* Control never gets here */
2867 }
2868 #endif /* End of XCLASS */
2869
2870 /* Match a single character, casefully */
2871
2872 case OP_CHAR:
2873 #ifdef SUPPORT_UTF8
2874 if (utf8)
2875 {
2876 length = 1;
2877 ecode++;
2878 GETCHARLEN(fc, ecode, length);
2879 if (length > md->end_subject - eptr)
2880 {
2881 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2882 MRRETURN(MATCH_NOMATCH);
2883 }
2884 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2885 }
2886 else
2887 #endif
2888
2889 /* Non-UTF-8 mode */
2890 {
2891 if (md->end_subject - eptr < 1)
2892 {
2893 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2894 MRRETURN(MATCH_NOMATCH);
2895 }
2896 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2897 ecode += 2;
2898 }
2899 break;
2900
2901 /* Match a single character, caselessly */
2902
2903 case OP_CHARI:
2904 #ifdef SUPPORT_UTF8
2905 if (utf8)
2906 {
2907 length = 1;
2908 ecode++;
2909 GETCHARLEN(fc, ecode, length);
2910
2911 if (length > md->end_subject - eptr)
2912 {
2913 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2914 MRRETURN(MATCH_NOMATCH);
2915 }
2916
2917 /* If the pattern character's value is < 128, we have only one byte, and
2918 can use the fast lookup table. */
2919
2920 if (fc < 128)
2921 {
2922 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2923 }
2924
2925 /* Otherwise we must pick up the subject character */
2926
2927 else
2928 {
2929 unsigned int dc;
2930 GETCHARINC(dc, eptr);
2931 ecode += length;
2932
2933 /* If we have Unicode property support, we can use it to test the other
2934 case of the character, if there is one. */
2935
2936 if (fc != dc)
2937 {
2938 #ifdef SUPPORT_UCP
2939 if (dc != UCD_OTHERCASE(fc))
2940 #endif
2941 MRRETURN(MATCH_NOMATCH);
2942 }
2943 }
2944 }
2945 else
2946 #endif /* SUPPORT_UTF8 */
2947
2948 /* Non-UTF-8 mode */
2949 {
2950 if (md->end_subject - eptr < 1)
2951 {
2952 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2953 MRRETURN(MATCH_NOMATCH);
2954 }
2955 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2956 ecode += 2;
2957 }
2958 break;
2959
2960 /* Match a single character repeatedly. */
2961
2962 case OP_EXACT:
2963 case OP_EXACTI:
2964 min = max = GET2(ecode, 1);
2965 ecode += 3;
2966 goto REPEATCHAR;
2967
2968 case OP_POSUPTO:
2969 case OP_POSUPTOI:
2970 possessive = TRUE;
2971 /* Fall through */
2972
2973 case OP_UPTO:
2974 case OP_UPTOI:
2975 case OP_MINUPTO:
2976 case OP_MINUPTOI:
2977 min = 0;
2978 max = GET2(ecode, 1);
2979 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2980 ecode += 3;
2981 goto REPEATCHAR;
2982
2983 case OP_POSSTAR:
2984 case OP_POSSTARI:
2985 possessive = TRUE;
2986 min = 0;
2987 max = INT_MAX;
2988 ecode++;
2989 goto REPEATCHAR;
2990
2991 case OP_POSPLUS:
2992 case OP_POSPLUSI:
2993 possessive = TRUE;
2994 min = 1;
2995 max = INT_MAX;
2996 ecode++;
2997 goto REPEATCHAR;
2998
2999 case OP_POSQUERY:
3000 case OP_POSQUERYI:
3001 possessive = TRUE;
3002 min = 0;
3003 max = 1;
3004 ecode++;
3005 goto REPEATCHAR;
3006
3007 case OP_STAR:
3008 case OP_STARI:
3009 case OP_MINSTAR:
3010 case OP_MINSTARI:
3011 case OP_PLUS:
3012 case OP_PLUSI:
3013 case OP_MINPLUS:
3014 case OP_MINPLUSI:
3015 case OP_QUERY:
3016 case OP_QUERYI:
3017 case OP_MINQUERY:
3018 case OP_MINQUERYI:
3019 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3020 minimize = (c & 1) != 0;
3021 min = rep_min[c]; /* Pick up values from tables; */
3022 max = rep_max[c]; /* zero for max => infinity */
3023 if (max == 0) max = INT_MAX;
3024
3025 /* Common code for all repeated single-character matches. */
3026
3027 REPEATCHAR:
3028 #ifdef SUPPORT_UTF8
3029 if (utf8)
3030 {
3031 length = 1;
3032 charptr = ecode;
3033 GETCHARLEN(fc, ecode, length);
3034 ecode += length;
3035
3036 /* Handle multibyte character matching specially here. There is
3037 support for caseless matching if UCP support is present. */
3038
3039 if (length > 1)
3040 {
3041 #ifdef SUPPORT_UCP
3042 unsigned int othercase;
3043 if (op >= OP_STARI && /* Caseless */
3044 (othercase = UCD_OTHERCASE(fc)) != fc)
3045 oclength = _pcre_ord2utf8(othercase, occhars);
3046 else oclength = 0;
3047 #endif /* SUPPORT_UCP */
3048
3049 for (i = 1; i <= min; i++)
3050 {
3051 if (eptr <= md->end_subject - length &&
3052 memcmp(eptr, charptr, length) == 0) eptr += length;
3053 #ifdef SUPPORT_UCP
3054 else if (oclength > 0 &&
3055 eptr <= md->end_subject - oclength &&
3056 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3057 #endif /* SUPPORT_UCP */
3058 else
3059 {
3060 CHECK_PARTIAL();
3061 MRRETURN(MATCH_NOMATCH);
3062 }
3063 }
3064
3065 if (min == max) continue;
3066
3067 if (minimize)
3068 {
3069 for (fi = min;; fi++)
3070 {
3071 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3072 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3073 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3074 if (eptr <= md->end_subject - length &&
3075 memcmp(eptr, charptr, length) == 0) eptr += length;
3076 #ifdef SUPPORT_UCP
3077 else if (oclength > 0 &&
3078 eptr <= md->end_subject - oclength &&
3079 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3080 #endif /* SUPPORT_UCP */
3081 else
3082 {
3083 CHECK_PARTIAL();
3084 MRRETURN(MATCH_NOMATCH);
3085 }
3086 }
3087 /* Control never gets here */
3088 }
3089
3090 else /* Maximize */
3091 {
3092 pp = eptr;
3093 for (i = min; i < max; i++)
3094 {
3095 if (eptr <= md->end_subject - length &&
3096 memcmp(eptr, charptr, length) == 0) eptr += length;
3097 #ifdef SUPPORT_UCP
3098 else if (oclength > 0 &&
3099 eptr <= md->end_subject - oclength &&
3100 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3101 #endif /* SUPPORT_UCP */
3102 else
3103 {
3104 CHECK_PARTIAL();
3105 break;
3106 }
3107 }
3108
3109 if (possessive) continue;
3110
3111 for(;;)
3112 {
3113 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3114 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3115 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3116 #ifdef SUPPORT_UCP
3117 eptr--;
3118 BACKCHAR(eptr);
3119 #else /* without SUPPORT_UCP */
3120 eptr -= length;
3121 #endif /* SUPPORT_UCP */
3122 }
3123 }
3124 /* Control never gets here */
3125 }
3126
3127 /* If the length of a UTF-8 character is 1, we fall through here, and
3128 obey the code as for non-UTF-8 characters below, though in this case the
3129 value of fc will always be < 128. */
3130 }
3131 else
3132 #endif /* SUPPORT_UTF8 */
3133
3134 /* When not in UTF-8 mode, load a single-byte character. */
3135
3136 fc = *ecode++;
3137
3138 /* The value of fc at this point is always less than 256, though we may or
3139 may not be in UTF-8 mode. The code is duplicated for the caseless and
3140 caseful cases, for speed, since matching characters is likely to be quite
3141 common. First, ensure the minimum number of matches are present. If min =
3142 max, continue at the same level without recursing. Otherwise, if
3143 minimizing, keep trying the rest of the expression and advancing one
3144 matching character if failing, up to the maximum. Alternatively, if
3145 maximizing, find the maximum number of characters and work backwards. */
3146
3147 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3148 max, eptr));
3149
3150 if (op >= OP_STARI) /* Caseless */
3151 {
3152 fc = md->lcc[fc];
3153 for (i = 1; i <= min; i++)
3154 {
3155 if (eptr >= md->end_subject)
3156 {
3157 SCHECK_PARTIAL();
3158 MRRETURN(MATCH_NOMATCH);
3159 }
3160 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3161 }
3162 if (min == max) continue;
3163 if (minimize)
3164 {
3165 for (fi = min;; fi++)
3166 {
3167 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3168 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3169 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3170 if (eptr >= md->end_subject)
3171 {
3172 SCHECK_PARTIAL();
3173 MRRETURN(MATCH_NOMATCH);
3174 }
3175 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3176 }
3177 /* Control never gets here */
3178 }
3179 else /* Maximize */
3180 {
3181 pp = eptr;
3182 for (i = min; i < max; i++)
3183 {
3184 if (eptr >= md->end_subject)
3185 {
3186 SCHECK_PARTIAL();
3187 break;
3188 }
3189 if (fc != md->lcc[*eptr]) break;
3190 eptr++;
3191 }
3192
3193 if (possessive) continue;
3194
3195 while (eptr >= pp)
3196 {
3197 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3198 eptr--;
3199 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3200 }
3201 MRRETURN(MATCH_NOMATCH);
3202 }
3203 /* Control never gets here */
3204 }
3205
3206 /* Caseful comparisons (includes all multi-byte characters) */
3207
3208 else
3209 {
3210 for (i = 1; i <= min; i++)
3211 {
3212 if (eptr >= md->end_subject)
3213 {
3214 SCHECK_PARTIAL();
3215 MRRETURN(MATCH_NOMATCH);
3216 }
3217 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3218 }
3219
3220 if (min == max) continue;
3221
3222 if (minimize)
3223 {
3224 for (fi = min;; fi++)
3225 {
3226 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3228 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3229 if (eptr >= md->end_subject)
3230 {
3231 SCHECK_PARTIAL();
3232 MRRETURN(MATCH_NOMATCH);
3233 }
3234 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3235 }
3236 /* Control never gets here */
3237 }
3238 else /* Maximize */
3239 {
3240 pp = eptr;
3241 for (i = min; i < max; i++)
3242 {
3243 if (eptr >= md->end_subject)
3244 {
3245 SCHECK_PARTIAL();
3246 break;
3247 }
3248 if (fc != *eptr) break;
3249 eptr++;
3250 }
3251 if (possessive) continue;
3252
3253 while (eptr >= pp)
3254 {
3255 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3256 eptr--;
3257 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3258 }
3259 MRRETURN(MATCH_NOMATCH);
3260 }
3261 }
3262 /* Control never gets here */
3263
3264 /* Match a negated single one-byte character. The pattern character is
3265 always one byte, but the subject character tested against it can be multibyte. */
3266
3267 case OP_NOT:
3268 case OP_NOTI:
3269 if (eptr >= md->end_subject)
3270 {
3271 SCHECK_PARTIAL();
3272 MRRETURN(MATCH_NOMATCH);
3273 }
3274 ecode++;
3275 GETCHARINCTEST(c, eptr);
3276 if (op == OP_NOTI) /* The caseless case */
3277 {
3278 #ifdef SUPPORT_UTF8
3279 if (c < 256)
3280 #endif
3281 c = md->lcc[c];
3282 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3283 }
3284 else /* Caseful */
3285 {
3286 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3287 }
3288 break;
3289
3290 /* Match a negated single one-byte character repeatedly. This is almost a
3291 repeat of the code for a repeated single character, but I haven't found a
3292 nice way of commoning these up that doesn't require a test of the
3293 positive/negative option for each character match. Maybe that wouldn't add
3294 very much to the time taken, but character matching *is* what this is all
3295 about... */
3296
3297 case OP_NOTEXACT:
3298 case OP_NOTEXACTI:
3299 min = max = GET2(ecode, 1);
3300 ecode += 3;
3301 goto REPEATNOTCHAR;
3302
3303 case OP_NOTUPTO:
3304 case OP_NOTUPTOI:
3305 case OP_NOTMINUPTO:
3306 case OP_NOTMINUPTOI:
3307 min = 0;
3308 max = GET2(ecode, 1);
3309 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3310 ecode += 3;
3311 goto REPEATNOTCHAR;
3312
3313 case OP_NOTPOSSTAR:
3314 case OP_NOTPOSSTARI:
3315 possessive = TRUE;
3316 min = 0;
3317 max = INT_MAX;
3318 ecode++;
3319 goto REPEATNOTCHAR;
3320
3321 case OP_NOTPOSPLUS:
3322 case OP_NOTPOSPLUSI:
3323 possessive = TRUE;
3324 min = 1;
3325 max = INT_MAX;
3326 ecode++;
3327 goto REPEATNOTCHAR;
3328
3329 case OP_NOTPOSQUERY:
3330 case OP_NOTPOSQUERYI:
3331 possessive = TRUE;
3332 min = 0;
3333 max = 1;
3334 ecode++;
3335 goto REPEATNOTCHAR;
3336
3337 case OP_NOTPOSUPTO:
3338 case OP_NOTPOSUPTOI:
3339 possessive = TRUE;
3340 min = 0;
3341 max = GET2(ecode, 1);
3342 ecode += 3;
3343 goto REPEATNOTCHAR;
3344
3345 case OP_NOTSTAR:
3346 case OP_NOTSTARI:
3347 case OP_NOTMINSTAR:
3348 case OP_NOTMINSTARI:
3349 case OP_NOTPLUS:
3350 case OP_NOTPLUSI:
3351 case OP_NOTMINPLUS:
3352 case OP_NOTMINPLUSI:
3353 case OP_NOTQUERY:
3354 case OP_NOTQUERYI:
3355 case OP_NOTMINQUERY:
3356 case OP_NOTMINQUERYI:
3357 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3358 minimize = (c & 1) != 0;
3359 min = rep_min[c]; /* Pick up values from tables; */
3360 max = rep_max[c]; /* zero for max => infinity */
3361 if (max == 0) max = INT_MAX;
3362
3363 /* Common code for all repeated negated single-byte character matches. */
3364
3365 REPEATNOTCHAR:
3366 fc = *ecode++;
3367
3368 /* The code is duplicated for the caseless and caseful cases, for speed,
3369 since matching characters is likely to be quite common. First, ensure the
3370 minimum number of matches are present. If min = max, continue at the same
3371 level without recursing. Otherwise, if minimizing, keep trying the rest of
3372 the expression and advancing one matching character if failing, up to the
3373 maximum. Alternatively, if maximizing, find the maximum number of
3374 characters and work backwards. */
3375
3376 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3377 max, eptr));
3378
3379 if (op >= OP_NOTSTARI) /* Caseless */
3380 {
3381 fc = md->lcc[fc];
3382
3383 #ifdef SUPPORT_UTF8
3384 /* UTF-8 mode */
3385 if (utf8)
3386 {
3387 register unsigned int d;
3388 for (i = 1; i <= min; i++)
3389 {
3390 if (eptr >= md->end_subject)
3391 {
3392 SCHECK_PARTIAL();
3393 MRRETURN(MATCH_NOMATCH);
3394 }
3395 GETCHARINC(d, eptr);
3396 if (d < 256) d = md->lcc[d];
3397 if (fc == d) MRRETURN(MATCH_NOMATCH);
3398 }
3399 }
3400 else
3401 #endif
3402
3403 /* Not UTF-8 mode */
3404 {
3405 for (i = 1; i <= min; i++)
3406 {
3407 if (eptr >= md->end_subject)
3408 {
3409 SCHECK_PARTIAL();
3410 MRRETURN(MATCH_NOMATCH);
3411 }
3412 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3413 }
3414 }
3415
3416 if (min == max) continue;
3417
3418 if (minimize)
3419 {
3420 #ifdef SUPPORT_UTF8
3421 /* UTF-8 mode */
3422 if (utf8)
3423 {
3424 register unsigned int d;
3425 for (fi = min;; fi++)
3426 {
3427 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3430 if (eptr >= md->end_subject)
3431 {
3432 SCHECK_PARTIAL();
3433 MRRETURN(MATCH_NOMATCH);
3434 }
3435 GETCHARINC(d, eptr);
3436 if (d < 256) d = md->lcc[d];
3437 if (fc == d) MRRETURN(MATCH_NOMATCH);
3438 }
3439 }
3440 else
3441 #endif
3442 /* Not UTF-8 mode */
3443 {
3444 for (fi = min;; fi++)
3445 {
3446 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3448 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3449 if (eptr >= md->end_subject)
3450 {
3451 SCHECK_PARTIAL();
3452 MRRETURN(MATCH_NOMATCH);
3453 }
3454 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3455 }
3456 }
3457 /* Control never gets here */
3458 }
3459
3460 /* Maximize case */
3461
3462 else
3463 {
3464 pp = eptr;
3465
3466 #ifdef SUPPORT_UTF8
3467 /* UTF-8 mode */
3468 if (utf8)
3469 {
3470 register unsigned int d;
3471 for (i = min; i < max; i++)
3472 {
3473 int len = 1;
3474 if (eptr >= md->end_subject)
3475 {
3476 SCHECK_PARTIAL();
3477 break;
3478 }
3479 GETCHARLEN(d, eptr, len);
3480 if (d < 256) d = md->lcc[d];
3481 if (fc == d) break;
3482 eptr += len;
3483 }
3484 if (possessive) continue;
3485 for(;;)
3486 {
3487 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3488 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3489 if (eptr-- == pp) break; /* Stop if tried at original pos */
3490 BACKCHAR(eptr);
3491 }
3492 }
3493 else
3494 #endif
3495 /* Not UTF-8 mode */
3496 {
3497 for (i = min; i < max; i++)
3498 {
3499 if (eptr >= md->end_subject)
3500 {
3501 SCHECK_PARTIAL();
3502 break;
3503 }
3504 if (fc == md->lcc[*eptr]) break;
3505 eptr++;
3506 }
3507 if (possessive) continue;
3508 while (eptr >= pp)
3509 {
3510 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3511 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3512 eptr--;
3513 }
3514 }
3515
3516 MRRETURN(MATCH_NOMATCH);
3517 }
3518 /* Control never gets here */
3519 }
3520
3521 /* Caseful comparisons */
3522
3523 else
3524 {
3525 #ifdef SUPPORT_UTF8
3526 /* UTF-8 mode */
3527 if (utf8)
3528 {
3529 register unsigned int d;
3530 for (i = 1; i <= min; i++)
3531 {
3532 if (eptr >= md->end_subject)
3533 {
3534 SCHECK_PARTIAL();
3535 MRRETURN(MATCH_NOMATCH);
3536 }
3537 GETCHARINC(d, eptr);
3538 if (fc == d) MRRETURN(MATCH_NOMATCH);
3539 }
3540 }
3541 else
3542 #endif
3543 /* Not UTF-8 mode */
3544 {
3545 for (i = 1; i <= min; i++)
3546 {
3547 if (eptr >= md->end_subject)
3548 {
3549 SCHECK_PARTIAL();
3550 MRRETURN(MATCH_NOMATCH);
3551 }
3552 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3553 }
3554 }
3555
3556 if (min == max) continue;
3557
3558 if (minimize)
3559 {
3560 #ifdef SUPPORT_UTF8
3561 /* UTF-8 mode */
3562 if (utf8)
3563 {
3564 register unsigned int d;
3565 for (fi = min;; fi++)
3566 {
3567 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3569 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3570 if (eptr >= md->end_subject)
3571 {
3572 SCHECK_PARTIAL();
3573 MRRETURN(MATCH_NOMATCH);
3574 }
3575 GETCHARINC(d, eptr);
3576 if (fc == d) MRRETURN(MATCH_NOMATCH);
3577 }
3578 }
3579 else
3580 #endif
3581 /* Not UTF-8 mode */
3582 {
3583 for (fi = min;; fi++)
3584 {
3585 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3586 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3588 if (eptr >= md->end_subject)
3589 {
3590 SCHECK_PARTIAL();
3591 MRRETURN(MATCH_NOMATCH);
3592 }
3593 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3594 }
3595 }
3596 /* Control never gets here */
3597 }
3598
3599 /* Maximize case */
3600
3601 else
3602 {
3603 pp = eptr;
3604
3605 #ifdef SUPPORT_UTF8
3606 /* UTF-8 mode */
3607 if (utf8)
3608 {
3609 register unsigned int d;
3610 for (i = min; i < max; i++)
3611 {
3612 int len = 1;
3613 if (eptr >= md->end_subject)
3614 {
3615 SCHECK_PARTIAL();
3616 break;
3617 }
3618 GETCHARLEN(d, eptr, len);
3619 if (fc == d) break;
3620 eptr += len;
3621 }
3622 if (possessive) continue;
3623 for(;;)
3624 {
3625 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3627 if (eptr-- == pp) break; /* Stop if tried at original pos */
3628 BACKCHAR(eptr);
3629 }
3630 }
3631 else
3632 #endif
3633 /* Not UTF-8 mode */
3634 {
3635 for (i = min; i < max; i++)
3636 {
3637 if (eptr >= md->end_subject)
3638 {
3639 SCHECK_PARTIAL();
3640 break;
3641 }
3642 if (fc == *eptr) break;
3643 eptr++;
3644 }
3645 if (possessive) continue;
3646 while (eptr >= pp)
3647 {
3648 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3650 eptr--;
3651 }
3652 }
3653
3654 MRRETURN(MATCH_NOMATCH);
3655 }
3656 }
3657 /* Control never gets here */
3658
3659 /* Match a single character type repeatedly; several different opcodes
3660 share code. This is very similar to the code for single characters, but we
3661 repeat it in the interests of efficiency. */
3662
3663 case OP_TYPEEXACT:
3664 min = max = GET2(ecode, 1);
3665 minimize = TRUE;
3666 ecode += 3;
3667 goto REPEATTYPE;
3668
3669 case OP_TYPEUPTO:
3670 case OP_TYPEMINUPTO:
3671 min = 0;
3672 max = GET2(ecode, 1);
3673 minimize = *ecode == OP_TYPEMINUPTO;
3674 ecode += 3;
3675 goto REPEATTYPE;
3676
3677 case OP_TYPEPOSSTAR:
3678 possessive = TRUE;
3679 min = 0;
3680 max = INT_MAX;
3681 ecode++;
3682 goto REPEATTYPE;
3683
3684 case OP_TYPEPOSPLUS:
3685 possessive = TRUE;
3686 min = 1;
3687 max = INT_MAX;
3688 ecode++;
3689 goto REPEATTYPE;
3690
3691 case OP_TYPEPOSQUERY:
3692 possessive = TRUE;
3693 min = 0;
3694 max = 1;
3695 ecode++;
3696 goto REPEATTYPE;
3697
3698 case OP_TYPEPOSUPTO:
3699 possessive = TRUE;
3700 min = 0;
3701 max = GET2(ecode, 1);
3702 ecode += 3;
3703 goto REPEATTYPE;
3704
3705 case OP_TYPESTAR:
3706 case OP_TYPEMINSTAR:
3707 case OP_TYPEPLUS:
3708 case OP_TYPEMINPLUS:
3709 case OP_TYPEQUERY:
3710 case OP_TYPEMINQUERY:
3711 c = *ecode++ - OP_TYPESTAR;
3712 minimize = (c & 1) != 0;
3713 min = rep_min[c]; /* Pick up values from tables; */
3714 max = rep_max[c]; /* zero for max => infinity */
3715 if (max == 0) max = INT_MAX;
3716
3717 /* Common code for all repeated single character type matches. Note that
3718 in UTF-8 mode, '.' matches a character of any length, but for the other
3719 character types, the valid characters are all one-byte long. */
3720
3721 REPEATTYPE:
3722 ctype = *ecode++; /* Code for the character type */
3723
3724 #ifdef SUPPORT_UCP
3725 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3726 {
3727 prop_fail_result = ctype == OP_NOTPROP;
3728 prop_type = *ecode++;
3729 prop_value = *ecode++;
3730 }
3731 else prop_type = -1;
3732 #endif
3733
3734 /* First, ensure the minimum number of matches are present. Use inline
3735 code for maximizing the speed, and do the type test once at the start
3736 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3737 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3738 and single-bytes. */
3739
3740 if (min > 0)
3741 {
3742 #ifdef SUPPORT_UCP
3743 if (prop_type >= 0)
3744 {
3745 switch(prop_type)
3746 {
3747 case PT_ANY:
3748 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3749 for (i = 1; i <= min; i++)
3750 {
3751 if (eptr >= md->end_subject)
3752 {
3753 SCHECK_PARTIAL();
3754 MRRETURN(MATCH_NOMATCH);
3755 }
3756 GETCHARINCTEST(c, eptr);
3757 }
3758 break;
3759
3760 case PT_LAMP:
3761 for (i = 1; i <= min; i++)
3762 {
3763 if (eptr >= md->end_subject)
3764 {
3765 SCHECK_PARTIAL();
3766 MRRETURN(MATCH_NOMATCH);
3767 }
3768 GETCHARINCTEST(c, eptr);
3769 prop_chartype = UCD_CHARTYPE(c);
3770 if ((prop_chartype == ucp_Lu ||
3771 prop_chartype == ucp_Ll ||
3772 prop_chartype == ucp_Lt) == prop_fail_result)
3773 MRRETURN(MATCH_NOMATCH);
3774 }
3775 break;
3776
3777 case PT_GC:
3778 for (i = 1; i <= min; i++)
3779 {
3780 if (eptr >= md->end_subject)
3781 {
3782 SCHECK_PARTIAL();
3783 MRRETURN(MATCH_NOMATCH);
3784 }
3785 GETCHARINCTEST(c, eptr);
3786 prop_category = UCD_CATEGORY(c);
3787 if ((prop_category == prop_value) == prop_fail_result)
3788 MRRETURN(MATCH_NOMATCH);
3789 }
3790 break;
3791
3792 case PT_PC:
3793 for (i = 1; i <= min; i++)
3794 {
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 MRRETURN(MATCH_NOMATCH);
3799 }
3800 GETCHARINCTEST(c, eptr);
3801 prop_chartype = UCD_CHARTYPE(c);
3802 if ((prop_chartype == prop_value) == prop_fail_result)
3803 MRRETURN(MATCH_NOMATCH);
3804 }
3805 break;
3806
3807 case PT_SC:
3808 for (i = 1; i <= min; i++)
3809 {
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 MRRETURN(MATCH_NOMATCH);
3814 }
3815 GETCHARINCTEST(c, eptr);
3816 prop_script = UCD_SCRIPT(c);
3817 if ((prop_script == prop_value) == prop_fail_result)
3818 MRRETURN(MATCH_NOMATCH);
3819 }
3820 break;
3821
3822 case PT_ALNUM:
3823 for (i = 1; i <= min; i++)
3824 {
3825 if (eptr >= md->end_subject)
3826 {
3827 SCHECK_PARTIAL();
3828 MRRETURN(MATCH_NOMATCH);
3829 }
3830 GETCHARINCTEST(c, eptr);
3831 prop_category = UCD_CATEGORY(c);
3832 if ((prop_category == ucp_L || prop_category == ucp_N)
3833 == prop_fail_result)
3834 MRRETURN(MATCH_NOMATCH);
3835 }
3836 break;
3837
3838 case PT_SPACE: /* Perl space */
3839 for (i = 1; i <= min; i++)
3840 {
3841 if (eptr >= md->end_subject)
3842 {
3843 SCHECK_PARTIAL();
3844 MRRETURN(MATCH_NOMATCH);
3845 }
3846 GETCHARINCTEST(c, eptr);
3847 prop_category = UCD_CATEGORY(c);
3848 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3849 c == CHAR_FF || c == CHAR_CR)
3850 == prop_fail_result)
3851 MRRETURN(MATCH_NOMATCH);
3852 }
3853 break;
3854
3855 case PT_PXSPACE: /* POSIX space */
3856 for (i = 1; i <= min; i++)
3857 {
3858 if (eptr >= md->end_subject)
3859 {
3860 SCHECK_PARTIAL();
3861 MRRETURN(MATCH_NOMATCH);
3862 }
3863 GETCHARINCTEST(c, eptr);
3864 prop_category = UCD_CATEGORY(c);
3865 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3866 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3867 == prop_fail_result)
3868 MRRETURN(MATCH_NOMATCH);
3869 }
3870 break;
3871
3872 case PT_WORD:
3873 for (i = 1; i <= min; i++)
3874 {
3875 if (eptr >= md->end_subject)
3876 {
3877 SCHECK_PARTIAL();
3878 MRRETURN(MATCH_NOMATCH);
3879 }
3880 GETCHARINCTEST(c, eptr);
3881 prop_category = UCD_CATEGORY(c);
3882 if ((prop_category == ucp_L || prop_category == ucp_N ||
3883 c == CHAR_UNDERSCORE)
3884 == prop_fail_result)
3885 MRRETURN(MATCH_NOMATCH);
3886 }
3887 break;
3888
3889 /* This should not occur */
3890
3891 default:
3892 RRETURN(PCRE_ERROR_INTERNAL);
3893 }
3894 }
3895
3896 /* Match extended Unicode sequences. We will get here only if the
3897 support is in the binary; otherwise a compile-time error occurs. */
3898
3899 else if (ctype == OP_EXTUNI)
3900 {
3901 for (i = 1; i <= min; i++)
3902 {
3903 if (eptr >= md->end_subject)
3904 {
3905 SCHECK_PARTIAL();
3906 MRRETURN(MATCH_NOMATCH);
3907 }
3908 GETCHARINCTEST(c, eptr);
3909 prop_category = UCD_CATEGORY(c);
3910 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3911 while (eptr < md->end_subject)
3912 {
3913 int len = 1;
3914 if (!utf8) c = *eptr;
3915 else { GETCHARLEN(c, eptr, len); }
3916 prop_category = UCD_CATEGORY(c);
3917 if (prop_category != ucp_M) break;
3918 eptr += len;
3919 }
3920 }
3921 }
3922
3923 else
3924 #endif /* SUPPORT_UCP */
3925
3926 /* Handle all other cases when the coding is UTF-8 */
3927
3928 #ifdef SUPPORT_UTF8
3929 if (utf8) switch(ctype)
3930 {
3931 case OP_ANY:
3932 for (i = 1; i <= min; i++)
3933 {
3934 if (eptr >= md->end_subject)
3935 {
3936 SCHECK_PARTIAL();
3937 MRRETURN(MATCH_NOMATCH);
3938 }
3939 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3940 eptr++;
3941 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3942 }
3943 break;
3944
3945 case OP_ALLANY:
3946 for (i = 1; i <= min; i++)
3947 {
3948 if (eptr >= md->end_subject)
3949 {
3950 SCHECK_PARTIAL();
3951 MRRETURN(MATCH_NOMATCH);
3952 }
3953 eptr++;
3954 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3955 }
3956 break;
3957
3958 case OP_ANYBYTE:
3959 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3960 eptr += min;
3961 break;
3962
3963 case OP_ANYNL:
3964 for (i = 1; i <= min; i++)
3965 {
3966 if (eptr >= md->end_subject)
3967 {
3968 SCHECK_PARTIAL();
3969 MRRETURN(MATCH_NOMATCH);
3970 }
3971 GETCHARINC(c, eptr);
3972 switch(c)
3973 {
3974 default: MRRETURN(MATCH_NOMATCH);
3975
3976 case 0x000d:
3977 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3978 break;
3979
3980 case 0x000a:
3981 break;
3982
3983 case 0x000b:
3984 case 0x000c:
3985 case 0x0085:
3986 case 0x2028:
3987 case 0x2029:
3988 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3989 break;
3990 }
3991 }
3992 break;
3993
3994 case OP_NOT_HSPACE:
3995 for (i = 1; i <= min; i++)
3996 {
3997 if (eptr >= md->end_subject)
3998 {
3999 SCHECK_PARTIAL();
4000 MRRETURN(MATCH_NOMATCH);
4001 }
4002 GETCHARINC(c, eptr);
4003 switch(c)
4004 {
4005 default: break;
4006 case 0x09: /* HT */
4007 case 0x20: /* SPACE */
4008 case 0xa0: /* NBSP */
4009 case 0x1680: /* OGHAM SPACE MARK */
4010 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4011 case 0x2000: /* EN QUAD */
4012 case 0x2001: /* EM QUAD */
4013 case 0x2002: /* EN SPACE */
4014 case 0x2003: /* EM SPACE */
4015 case 0x2004: /* THREE-PER-EM SPACE */
4016 case 0x2005: /* FOUR-PER-EM SPACE */
4017 case 0x2006: /* SIX-PER-EM SPACE */
4018 case 0x2007: /* FIGURE SPACE */
4019 case 0x2008: /* PUNCTUATION SPACE */
4020 case 0x2009: /* THIN SPACE */
4021 case 0x200A: /* HAIR SPACE */
4022 case 0x202f: /* NARROW NO-BREAK SPACE */
4023 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4024 case 0x3000: /* IDEOGRAPHIC SPACE */
4025 MRRETURN(MATCH_NOMATCH);
4026 }
4027 }
4028 break;
4029
4030 case OP_HSPACE:
4031 for (i = 1; i <= min; i++)
4032 {
4033 if (eptr >= md->end_subject)
4034 {
4035 SCHECK_PARTIAL();
4036 MRRETURN(MATCH_NOMATCH);
4037 }
4038 GETCHARINC(c, eptr);
4039 switch(c)
4040 {
4041 default: MRRETURN(MATCH_NOMATCH);
4042 case 0x09: /* HT */
4043 case 0x20: /* SPACE */
4044 case 0xa0: /* NBSP */
4045 case 0x1680: /* OGHAM SPACE MARK */
4046 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4047 case 0x2000: /* EN QUAD */
4048 case 0x2001: /* EM QUAD */
4049 case 0x2002: /* EN SPACE */
4050 case 0x2003: /* EM SPACE */
4051 case 0x2004: /* THREE-PER-EM SPACE */
4052 case 0x2005: /* FOUR-PER-EM SPACE */
4053 case 0x2006: /* SIX-PER-EM SPACE */
4054 case 0x2007: /* FIGURE SPACE */
4055 case 0x2008: /* PUNCTUATION SPACE */
4056 case 0x2009: /* THIN SPACE */
4057 case 0x200A: /* HAIR SPACE */
4058 case 0x202f: /* NARROW NO-BREAK SPACE */
4059 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4060 case 0x3000: /* IDEOGRAPHIC SPACE */
4061 break;
4062 }
4063 }
4064 break;
4065
4066 case OP_NOT_VSPACE:
4067 for (i = 1; i <= min; i++)
4068 {
4069 if (eptr >= md->end_subject)
4070 {
4071 SCHECK_PARTIAL();
4072 MRRETURN(MATCH_NOMATCH);
4073 }
4074 GETCHARINC(c, eptr);
4075 switch(c)
4076 {
4077 default: break;
4078 case 0x0a: /* LF */
4079 case 0x0b: /* VT */
4080 case 0x0c: /* FF */
4081 case 0x0d: /* CR */
4082 case 0x85: /* NEL */
4083 case 0x2028: /* LINE SEPARATOR */
4084 case 0x2029: /* PARAGRAPH SEPARATOR */
4085 MRRETURN(MATCH_NOMATCH);
4086 }
4087 }
4088 break;
4089
4090 case OP_VSPACE:
4091 for (i = 1; i <= min; i++)
4092 {
4093 if (eptr >= md->end_subject)
4094 {
4095 SCHECK_PARTIAL();
4096 MRRETURN(MATCH_NOMATCH);
4097 }
4098 GETCHARINC(c, eptr);
4099 switch(c)
4100 {
4101 default: MRRETURN(MATCH_NOMATCH);
4102 case 0x0a: /* LF */
4103 case 0x0b: /* VT */
4104 case 0x0c: /* FF */
4105 case 0x0d: /* CR */
4106 case 0x85: /* NEL */
4107 case 0x2028: /* LINE SEPARATOR */
4108 case 0x2029: /* PARAGRAPH SEPARATOR */
4109 break;
4110 }
4111 }
4112 break;
4113
4114 case OP_NOT_DIGIT:
4115 for (i = 1; i <= min; i++)
4116 {
4117 if (eptr >= md->end_subject)
4118 {
4119 SCHECK_PARTIAL();
4120 MRRETURN(MATCH_NOMATCH);
4121 }
4122 GETCHARINC(c, eptr);
4123 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4124 MRRETURN(MATCH_NOMATCH);
4125 }
4126 break;
4127
4128 case OP_DIGIT:
4129 for (i = 1; i <= min; i++)
4130 {
4131 if (eptr >= md->end_subject)
4132 {
4133 SCHECK_PARTIAL();
4134 MRRETURN(MATCH_NOMATCH);
4135 }
4136 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4137 MRRETURN(MATCH_NOMATCH);
4138 /* No need to skip more bytes - we know it's a 1-byte character */
4139 }
4140 break;
4141
4142 case OP_NOT_WHITESPACE:
4143 for (i = 1; i <= min; i++)
4144 {
4145 if (eptr >= md->end_subject)
4146 {
4147 SCHECK_PARTIAL();
4148 MRRETURN(MATCH_NOMATCH);
4149 }
4150 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4151 MRRETURN(MATCH_NOMATCH);
4152 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4153 }
4154 break;
4155
4156 case OP_WHITESPACE:
4157 for (i = 1; i <= min; i++)
4158 {
4159 if (eptr >= md->end_subject)
4160 {
4161 SCHECK_PARTIAL();
4162 MRRETURN(MATCH_NOMATCH);
4163 }
4164 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4165 MRRETURN(MATCH_NOMATCH);
4166 /* No need to skip more bytes - we know it's a 1-byte character */
4167 }
4168 break;
4169
4170 case OP_NOT_WORDCHAR:
4171 for (i = 1; i <= min; i++)
4172 {
4173 if (eptr >= md->end_subject)
4174 {
4175 SCHECK_PARTIAL();
4176 MRRETURN(MATCH_NOMATCH);
4177 }
4178 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4179 MRRETURN(MATCH_NOMATCH);
4180 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4181 }
4182 break;
4183
4184 case OP_WORDCHAR:
4185 for (i = 1; i <= min; i++)
4186 {
4187 if (eptr >= md->end_subject)
4188 {
4189 SCHECK_PARTIAL();
4190 MRRETURN(MATCH_NOMATCH);
4191 }
4192 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4193 MRRETURN(MATCH_NOMATCH);
4194 /* No need to skip more bytes - we know it's a 1-byte character */
4195 }
4196 break;
4197
4198 default:
4199 RRETURN(PCRE_ERROR_INTERNAL);
4200 } /* End switch(ctype) */
4201
4202 else
4203 #endif /* SUPPORT_UTF8 */
4204
4205 /* Code for the non-UTF-8 case for minimum matching of operators other
4206 than OP_PROP and OP_NOTPROP. */
4207
4208 switch(ctype)
4209 {
4210 case OP_ANY:
4211 for (i = 1; i <= min; i++)
4212 {
4213 if (eptr >= md->end_subject)
4214 {
4215 SCHECK_PARTIAL();
4216 MRRETURN(MATCH_NOMATCH);
4217 }
4218 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4219 eptr++;
4220 }
4221 break;
4222
4223 case OP_ALLANY:
4224 if (eptr > md->end_subject - min)
4225 {
4226 SCHECK_PARTIAL();
4227 MRRETURN(MATCH_NOMATCH);
4228 }
4229 eptr += min;
4230 break;
4231
4232 case OP_ANYBYTE:
4233 if (eptr > md->end_subject - min)
4234 {
4235 SCHECK_PARTIAL();
4236 MRRETURN(MATCH_NOMATCH);
4237 }
4238 eptr += min;
4239 break;
4240
4241 case OP_ANYNL:
4242 for (i = 1; i <= min; i++)
4243 {
4244 if (eptr >= md->end_subject)
4245 {
4246 SCHECK_PARTIAL();
4247 MRRETURN(MATCH_NOMATCH);
4248 }
4249 switch(*eptr++)
4250 {
4251 default: MRRETURN(MATCH_NOMATCH);
4252
4253 case 0x000d:
4254 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4255 break;
4256
4257 case 0x000a:
4258 break;
4259
4260 case 0x000b:
4261 case 0x000c:
4262 case 0x0085:
4263 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4264 break;
4265 }
4266 }
4267 break;
4268
4269 case OP_NOT_HSPACE:
4270 for (i = 1; i <= min; i++)
4271 {
4272 if (eptr >= md->end_subject)
4273 {
4274 SCHECK_PARTIAL();
4275 MRRETURN(MATCH_NOMATCH);
4276 }
4277 switch(*eptr++)
4278 {
4279 default: break;
4280 case 0x09: /* HT */
4281 case 0x20: /* SPACE */
4282 case 0xa0: /* NBSP */
4283 MRRETURN(MATCH_NOMATCH);
4284 }
4285 }
4286 break;
4287
4288 case OP_HSPACE:
4289 for (i = 1; i <= min; i++)
4290 {
4291 if (eptr >= md->end_subject)
4292 {
4293 SCHECK_PARTIAL();
4294 MRRETURN(MATCH_NOMATCH);
4295 }
4296 switch(*eptr++)
4297 {
4298 default: MRRETURN(MATCH_NOMATCH);
4299 case 0x09: /* HT */
4300 case 0x20: /* SPACE */
4301 case 0xa0: /* NBSP */
4302 break;
4303 }
4304 }
4305 break;
4306
4307 case OP_NOT_VSPACE:
4308 for (i = 1; i <= min; i++)
4309 {
4310 if (eptr >= md->end_subject)
4311 {
4312 SCHECK_PARTIAL();
4313 MRRETURN(MATCH_NOMATCH);
4314 }
4315 switch(*eptr++)
4316 {
4317 default: break;
4318 case 0x0a: /* LF */
4319 case 0x0b: /* VT */
4320 case 0x0c: /* FF */
4321 case 0x0d: /* CR */
4322 case 0x85: /* NEL */
4323 MRRETURN(MATCH_NOMATCH);
4324 }
4325 }
4326 break;
4327
4328 case OP_VSPACE:
4329 for (i = 1; i <= min; i++)
4330 {
4331 if (eptr >= md->end_subject)
4332 {
4333 SCHECK_PARTIAL();
4334 MRRETURN(MATCH_NOMATCH);
4335 }
4336 switch(*eptr++)
4337 {
4338 default: MRRETURN(MATCH_NOMATCH);
4339 case 0x0a: /* LF */
4340 case 0x0b: /* VT */
4341 case 0x0c: /* FF */
4342 case 0x0d: /* CR */
4343 case 0x85: /* NEL */
4344 break;
4345 }
4346 }
4347 break;
4348
4349 case OP_NOT_DIGIT:
4350 for (i = 1; i <= min; i++)
4351 {
4352 if (eptr >= md->end_subject)
4353 {
4354 SCHECK_PARTIAL();
4355 MRRETURN(MATCH_NOMATCH);
4356 }
4357 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4358 }
4359 break;
4360
4361 case OP_DIGIT:
4362 for (i = 1; i <= min; i++)
4363 {
4364 if (eptr >= md->end_subject)
4365 {
4366 SCHECK_PARTIAL();
4367 MRRETURN(MATCH_NOMATCH);
4368 }
4369 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4370 }
4371 break;
4372
4373 case OP_NOT_WHITESPACE:
4374 for (i = 1; i <= min; i++)
4375 {
4376 if (eptr >= md->end_subject)
4377 {
4378 SCHECK_PARTIAL();
4379 MRRETURN(MATCH_NOMATCH);
4380 }
4381 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4382 }
4383 break;
4384
4385 case OP_WHITESPACE:
4386 for (i = 1; i <= min; i++)
4387 {
4388 if (eptr >= md->end_subject)
4389 {
4390 SCHECK_PARTIAL();
4391 MRRETURN(MATCH_NOMATCH);
4392 }
4393 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4394 }
4395 break;
4396
4397 case OP_NOT_WORDCHAR:
4398 for (i = 1; i <= min; i++)
4399 {
4400 if (eptr >= md->end_subject)
4401 {
4402 SCHECK_PARTIAL();
4403 MRRETURN(MATCH_NOMATCH);
4404 }
4405 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4406 MRRETURN(MATCH_NOMATCH);
4407 }
4408 break;
4409
4410 case OP_WORDCHAR:
4411 for (i = 1; i <= min; i++)
4412 {
4413 if (eptr >= md->end_subject)
4414 {
4415 SCHECK_PARTIAL();
4416 MRRETURN(MATCH_NOMATCH);
4417 }
4418 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4419 MRRETURN(MATCH_NOMATCH);
4420 }
4421 break;
4422
4423 default:
4424 RRETURN(PCRE_ERROR_INTERNAL);
4425 }
4426 }
4427
4428 /* If min = max, continue at the same level without recursing */
4429
4430 if (min == max) continue;
4431
4432 /* If minimizing, we have to test the rest of the pattern before each
4433 subsequent match. Again, separate the UTF-8 case for speed, and also
4434 separate the UCP cases. */
4435
4436 if (minimize)
4437 {
4438 #ifdef SUPPORT_UCP
4439 if (prop_type >= 0)
4440 {
4441 switch(prop_type)
4442 {
4443 case PT_ANY:
4444 for (fi = min;; fi++)
4445 {
4446 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4448 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4449 if (eptr >= md->end_subject)
4450 {
4451 SCHECK_PARTIAL();
4452 MRRETURN(MATCH_NOMATCH);
4453 }
4454 GETCHARINCTEST(c, eptr);
4455 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4456 }
4457 /* Control never gets here */
4458
4459 case PT_LAMP:
4460 for (fi = min;; fi++)
4461 {
4462 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4463 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4464 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 MRRETURN(MATCH_NOMATCH);
4469 }
4470 GETCHARINCTEST(c, eptr);
4471 prop_chartype = UCD_CHARTYPE(c);
4472 if ((prop_chartype == ucp_Lu ||
4473 prop_chartype == ucp_Ll ||
4474 prop_chartype == ucp_Lt) == prop_fail_result)
4475 MRRETURN(MATCH_NOMATCH);
4476 }
4477 /* Control never gets here */
4478
4479 case PT_GC:
4480 for (fi = min;; fi++)
4481 {
4482 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4484 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4485 if (eptr >= md->end_subject)
4486 {
4487 SCHECK_PARTIAL();
4488 MRRETURN(MATCH_NOMATCH);
4489 }
4490 GETCHARINCTEST(c, eptr);
4491 prop_category = UCD_CATEGORY(c);
4492 if ((prop_category == prop_value) == prop_fail_result)
4493 MRRETURN(MATCH_NOMATCH);
4494 }
4495 /* Control never gets here */
4496
4497 case PT_PC:
4498 for (fi = min;; fi++)
4499 {
4500 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4501 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4502 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4503 if (eptr >= md->end_subject)
4504 {
4505 SCHECK_PARTIAL();
4506 MRRETURN(MATCH_NOMATCH);
4507 }
4508 GETCHARINCTEST(c, eptr);
4509 prop_chartype = UCD_CHARTYPE(c);
4510 if ((prop_chartype == prop_value) == prop_fail_result)
4511 MRRETURN(MATCH_NOMATCH);
4512 }
4513 /* Control never gets here */
4514
4515 case PT_SC:
4516 for (fi = min;; fi++)
4517 {
4518 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4519 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4520 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4521 if (eptr >= md->end_subject)
4522 {
4523 SCHECK_PARTIAL();
4524 MRRETURN(MATCH_NOMATCH);
4525 }
4526 GETCHARINCTEST(c, eptr);
4527 prop_script = UCD_SCRIPT(c);
4528 if ((prop_script == prop_value) == prop_fail_result)
4529 MRRETURN(MATCH_NOMATCH);
4530 }
4531 /* Control never gets here */
4532
4533 case PT_ALNUM:
4534 for (fi = min;; fi++)
4535 {
4536 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4538 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4539 if (eptr >= md->end_subject)
4540 {
4541 SCHECK_PARTIAL();
4542 MRRETURN(MATCH_NOMATCH);
4543 }
4544 GETCHARINCTEST(c, eptr);
4545 prop_category = UCD_CATEGORY(c);
4546 if ((prop_category == ucp_L || prop_category == ucp_N)
4547 == prop_fail_result)
4548 MRRETURN(MATCH_NOMATCH);
4549 }
4550 /* Control never gets here */
4551
4552 case PT_SPACE: /* Perl space */
4553 for (fi = min;; fi++)
4554 {
4555 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4556 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4557 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4558 if (eptr >= md->end_subject)
4559 {
4560 SCHECK_PARTIAL();
4561 MRRETURN(MATCH_NOMATCH);
4562 }
4563 GETCHARINCTEST(c, eptr);
4564 prop_category = UCD_CATEGORY(c);
4565 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4566 c == CHAR_FF || c == CHAR_CR)
4567 == prop_fail_result)
4568 MRRETURN(MATCH_NOMATCH);
4569 }
4570 /* Control never gets here */
4571
4572 case PT_PXSPACE: /* POSIX space */
4573 for (fi = min;; fi++)
4574 {
4575 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4577 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4578 if (eptr >= md->end_subject)
4579 {
4580 SCHECK_PARTIAL();
4581 MRRETURN(MATCH_NOMATCH);
4582 }
4583 GETCHARINCTEST(c, eptr);
4584 prop_category = UCD_CATEGORY(c);
4585 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4586 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4587 == prop_fail_result)
4588 MRRETURN(MATCH_NOMATCH);
4589 }
4590 /* Control never gets here */
4591
4592 case PT_WORD:
4593 for (fi = min;; fi++)
4594 {
4595 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4596 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4597 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4598 if (eptr >= md->end_subject)
4599 {
4600 SCHECK_PARTIAL();
4601 MRRETURN(MATCH_NOMATCH);
4602 }
4603 GETCHARINCTEST(c, eptr);
4604 prop_category = UCD_CATEGORY(c);
4605 if ((prop_category == ucp_L ||
4606 prop_category == ucp_N ||
4607 c == CHAR_UNDERSCORE)
4608 == prop_fail_result)
4609 MRRETURN(MATCH_NOMATCH);
4610 }
4611 /* Control never gets here */
4612
4613 /* This should never occur */
4614
4615 default:
4616 RRETURN(PCRE_ERROR_INTERNAL);
4617 }
4618 }
4619
4620 /* Match extended Unicode sequences. We will get here only if the
4621 support is in the binary; otherwise a compile-time error occurs. */
4622
4623 else if (ctype == OP_EXTUNI)
4624 {
4625 for (fi = min;; fi++)
4626 {
4627 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4628 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4629 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4630 if (eptr >= md->end_subject)
4631 {
4632 SCHECK_PARTIAL();
4633 MRRETURN(MATCH_NOMATCH);
4634 }
4635 GETCHARINCTEST(c, eptr);
4636 prop_category = UCD_CATEGORY(c);
4637 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4638 while (eptr < md->end_subject)
4639 {
4640 int len = 1;
4641 if (!utf8) c = *eptr;
4642 else { GETCHARLEN(c, eptr, len); }
4643 prop_category = UCD_CATEGORY(c);
4644 if (prop_category != ucp_M) break;
4645 eptr += len;
4646 }
4647 }
4648 }
4649
4650 else
4651 #endif /* SUPPORT_UCP */
4652
4653 #ifdef SUPPORT_UTF8
4654 /* UTF-8 mode */
4655 if (utf8)
4656 {
4657 for (fi = min;; fi++)
4658 {
4659 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4660 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4661 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4662 if (eptr >= md->end_subject)
4663 {
4664 SCHECK_PARTIAL();
4665 MRRETURN(MATCH_NOMATCH);
4666 }
4667 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4668 MRRETURN(MATCH_NOMATCH);
4669 GETCHARINC(c, eptr);
4670 switch(ctype)
4671 {
4672 case OP_ANY: /* This is the non-NL case */
4673 case OP_ALLANY:
4674 case OP_ANYBYTE:
4675 break;
4676
4677 case OP_ANYNL:
4678 switch(c)
4679 {
4680 default: MRRETURN(MATCH_NOMATCH);
4681 case 0x000d:
4682 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4683 break;
4684 case 0x000a:
4685 break;
4686
4687 case 0x000b:
4688 case 0x000c:
4689 case 0x0085:
4690 case 0x2028:
4691 case 0x2029:
4692 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4693 break;
4694 }
4695 break;
4696
4697 case OP_NOT_HSPACE:
4698 switch(c)
4699 {
4700 default: break;
4701 case 0x09: /* HT */
4702 case 0x20: /* SPACE */
4703 case 0xa0: /* NBSP */
4704 case 0x1680: /* OGHAM SPACE MARK */
4705 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4706 case 0x2000: /* EN QUAD */
4707 case 0x2001: /* EM QUAD */
4708 case 0x2002: /* EN SPACE */
4709 case 0x2003: /* EM SPACE */
4710 case 0x2004: /* THREE-PER-EM SPACE */
4711 case 0x2005: /* FOUR-PER-EM SPACE */
4712 case 0x2006: /* SIX-PER-EM SPACE */
4713 case 0x2007: /* FIGURE SPACE */
4714 case 0x2008: /* PUNCTUATION SPACE */
4715 case 0x2009: /* THIN SPACE */
4716 case 0x200A: /* HAIR SPACE */
4717 case 0x202f: /* NARROW NO-BREAK SPACE */
4718 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4719 case 0x3000: /* IDEOGRAPHIC SPACE */
4720 MRRETURN(MATCH_NOMATCH);
4721 }
4722 break;
4723
4724 case OP_HSPACE:
4725 switch(c)
4726 {
4727 default: MRRETURN(MATCH_NOMATCH);
4728 case 0x09: /* HT */
4729 case 0x20: /* SPACE */
4730 case 0xa0: /* NBSP */
4731 case 0x1680: /* OGHAM SPACE MARK */
4732 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4733 case 0x2000: /* EN QUAD */
4734 case 0x2001: /* EM QUAD */
4735 case 0x2002: /* EN SPACE */
4736 case 0x2003: /* EM SPACE */
4737 case 0x2004: /* THREE-PER-EM SPACE */
4738 case 0x2005: /* FOUR-PER-EM SPACE */
4739 case 0x2006: /* SIX-PER-EM SPACE */
4740 case 0x2007: /* FIGURE SPACE */
4741 case 0x2008: /* PUNCTUATION SPACE */
4742 case 0x2009: /* THIN SPACE */
4743 case 0x200A: /* HAIR SPACE */
4744 case 0x202f: /* NARROW NO-BREAK SPACE */
4745 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4746 case 0x3000: /* IDEOGRAPHIC SPACE */
4747 break;
4748 }
4749 break;
4750
4751 case OP_NOT_VSPACE:
4752 switch(c)
4753 {
4754 default: break;
4755 case 0x0a: /* LF */
4756 case 0x0b: /* VT */
4757 case 0x0c: /* FF */
4758 case 0x0d: /* CR */
4759 case 0x85: /* NEL */
4760 case 0x2028: /* LINE SEPARATOR */
4761 case 0x2029: /* PARAGRAPH SEPARATOR */
4762 MRRETURN(MATCH_NOMATCH);
4763 }
4764 break;
4765
4766 case OP_VSPACE:
4767 switch(c)
4768 {
4769 default: MRRETURN(MATCH_NOMATCH);
4770 case 0x0a: /* LF */
4771 case 0x0b: /* VT */
4772 case 0x0c: /* FF */
4773 case 0x0d: /* CR */
4774 case 0x85: /* NEL */
4775 case 0x2028: /* LINE SEPARATOR */
4776 case 0x2029: /* PARAGRAPH SEPARATOR */
4777 break;
4778 }
4779 break;
4780
4781 case OP_NOT_DIGIT:
4782 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4783 MRRETURN(MATCH_NOMATCH);
4784 break;
4785
4786 case OP_DIGIT:
4787 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4788 MRRETURN(MATCH_NOMATCH);
4789 break;
4790
4791 case OP_NOT_WHITESPACE:
4792 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4793 MRRETURN(MATCH_NOMATCH);
4794 break;
4795
4796 case OP_WHITESPACE:
4797 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4798 MRRETURN(MATCH_NOMATCH);
4799 break;
4800
4801 case OP_NOT_WORDCHAR:
4802 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4803 MRRETURN(MATCH_NOMATCH);
4804 break;
4805
4806 case OP_WORDCHAR:
4807 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4808 MRRETURN(MATCH_NOMATCH);
4809 break;
4810
4811 default:
4812 RRETURN(PCRE_ERROR_INTERNAL);
4813 }
4814 }
4815 }
4816 else
4817 #endif
4818 /* Not UTF-8 mode */
4819 {
4820 for (fi = min;; fi++)
4821 {
4822 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4824 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4825 if (eptr >= md->end_subject)
4826 {
4827 SCHECK_PARTIAL();
4828 MRRETURN(MATCH_NOMATCH);
4829 }
4830 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4831 MRRETURN(MATCH_NOMATCH);
4832 c = *eptr++;
4833 switch(ctype)
4834 {
4835 case OP_ANY: /* This is the non-NL case */
4836 case OP_ALLANY:
4837 case OP_ANYBYTE:
4838 break;
4839
4840 case OP_ANYNL:
4841 switch(c)
4842 {
4843 default: MRRETURN(MATCH_NOMATCH);
4844 case 0x000d:
4845 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4846 break;
4847
4848 case 0x000a:
4849 break;
4850
4851 case 0x000b:
4852 case 0x000c:
4853 case 0x0085:
4854 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4855 break;
4856 }
4857 break;
4858
4859 case OP_NOT_HSPACE:
4860 switch(c)
4861 {
4862 default: break;
4863 case 0x09: /* HT */
4864 case 0x20: /* SPACE */
4865 case 0xa0: /* NBSP */
4866 MRRETURN(MATCH_NOMATCH);
4867 }
4868 break;
4869
4870 case OP_HSPACE:
4871 switch(c)
4872 {
4873 default: MRRETURN(MATCH_NOMATCH);
4874 case 0x09: /* HT */
4875 case 0x20: /* SPACE */
4876 case 0xa0: /* NBSP */
4877 break;
4878 }
4879 break;
4880
4881 case OP_NOT_VSPACE:
4882 switch(c)
4883 {
4884 default: break;
4885 case 0x0a: /* LF */
4886 case 0x0b: /* VT */
4887 case 0x0c: /* FF */
4888 case 0x0d: /* CR */
4889 case 0x85: /* NEL */
4890 MRRETURN(MATCH_NOMATCH);
4891 }
4892 break;
4893
4894 case OP_VSPACE:
4895 switch(c)
4896 {
4897 default: MRRETURN(MATCH_NOMATCH);
4898 case 0x0a: /* LF */
4899 case 0x0b: /* VT */
4900 case 0x0c: /* FF */
4901 case 0x0d: /* CR */
4902 case 0x85: /* NEL */
4903 break;
4904 }
4905 break;
4906
4907 case OP_NOT_DIGIT:
4908 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4909 break;
4910
4911 case OP_DIGIT:
4912 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4913 break;
4914
4915 case OP_NOT_WHITESPACE:
4916 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4917 break;
4918
4919 case OP_WHITESPACE:
4920 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4921 break;
4922
4923 case OP_NOT_WORDCHAR:
4924 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4925 break;
4926
4927 case OP_WORDCHAR:
4928 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4929 break;
4930
4931 default:
4932 RRETURN(PCRE_ERROR_INTERNAL);
4933 }
4934 }
4935 }
4936 /* Control never gets here */
4937 }
4938
4939 /* If maximizing, it is worth using inline code for speed, doing the type
4940 test once at the start (i.e. keep it out of the loop). Again, keep the
4941 UTF-8 and UCP stuff separate. */
4942
4943 else
4944 {
4945 pp = eptr; /* Remember where we started */
4946
4947 #ifdef SUPPORT_UCP
4948 if (prop_type >= 0)
4949 {
4950 switch(prop_type)
4951 {
4952 case PT_ANY:
4953 for (i = min; i < max; i++)
4954 {
4955 int len = 1;
4956 if (eptr >= md->end_subject)
4957 {
4958 SCHECK_PARTIAL();
4959 break;
4960 }
4961 GETCHARLENTEST(c, eptr, len);
4962 if (prop_fail_result) break;
4963 eptr+= len;
4964 }
4965 break;
4966
4967 case PT_LAMP:
4968 for (i = min; i < max; i++)
4969 {
4970 int len = 1;
4971 if (eptr >= md->end_subject)
4972 {
4973 SCHECK_PARTIAL();
4974 break;
4975 }
4976 GETCHARLENTEST(c, eptr, len);
4977 prop_chartype = UCD_CHARTYPE(c);
4978 if ((prop_chartype == ucp_Lu ||
4979 prop_chartype == ucp_Ll ||
4980 prop_chartype == ucp_Lt) == prop_fail_result)
4981 break;
4982 eptr+= len;
4983 }
4984 break;
4985
4986 case PT_GC:
4987 for (i = min; i < max; i++)
4988 {
4989 int len = 1;
4990 if (eptr >= md->end_subject)
4991 {
4992 SCHECK_PARTIAL();
4993 break;
4994 }
4995 GETCHARLENTEST(c, eptr, len);
4996 prop_category = UCD_CATEGORY(c);
4997 if ((prop_category == prop_value) == prop_fail_result)
4998 break;
4999 eptr+= len;
5000 }
5001 break;
5002
5003 case PT_PC:
5004 for (i = min; i < max; i++)
5005 {
5006 int len = 1;
5007 if (eptr >= md->end_subject)
5008 {
5009 SCHECK_PARTIAL();
5010 break;
5011 }
5012 GETCHARLENTEST(c, eptr, len);
5013 prop_chartype = UCD_CHARTYPE(c);
5014 if ((prop_chartype == prop_value) == prop_fail_result)
5015 break;
5016 eptr+= len;
5017 }
5018 break;
5019
5020 case PT_SC:
5021 for (i = min; i < max; i++)
5022 {
5023 int len = 1;
5024 if (eptr >= md->end_subject)
5025 {
5026 SCHECK_PARTIAL();
5027 break;
5028 }
5029 GETCHARLENTEST(c, eptr, len);
5030 prop_script = UCD_SCRIPT(c);
5031 if ((prop_script == prop_value) == prop_fail_result)
5032 break;
5033 eptr+= len;
5034 }
5035 break;
5036
5037 case PT_ALNUM:
5038 for (i = min; i < max; i++)
5039 {
5040 int len = 1;
5041 if (eptr >= md->end_subject)
5042 {
5043 SCHECK_PARTIAL();
5044 break;
5045 }
5046 GETCHARLENTEST(c, eptr, len);
5047 prop_category = UCD_CATEGORY(c);
5048 if ((prop_category == ucp_L || prop_category == ucp_N)
5049 == prop_fail_result)
5050 break;
5051 eptr+= len;
5052 }
5053 break;
5054
5055 case PT_SPACE: /* Perl space */
5056 for (i = min; i < max; i++)
5057 {
5058 int len = 1;
5059 if (eptr >= md->end_subject)
5060 {
5061 SCHECK_PARTIAL();
5062 break;
5063 }
5064 GETCHARLENTEST(c, eptr, len);
5065 prop_category = UCD_CATEGORY(c);
5066 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5067 c == CHAR_FF || c == CHAR_CR)
5068 == prop_fail_result)
5069 break;
5070 eptr+= len;
5071 }
5072 break;
5073
5074 case PT_PXSPACE: /* POSIX space */
5075 for (i = min; i < max; i++)
5076 {
5077 int len = 1;
5078 if (eptr >= md->end_subject)
5079 {
5080 SCHECK_PARTIAL();
5081 break;
5082 }
5083 GETCHARLENTEST(c, eptr, len);
5084 prop_category = UCD_CATEGORY(c);
5085 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5086 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5087 == prop_fail_result)
5088 break;
5089 eptr+= len;
5090 }
5091 break;
5092
5093 case PT_WORD:
5094 for (i = min; i < max; i++)
5095 {
5096 int len = 1;
5097 if (eptr >= md->end_subject)
5098 {
5099 SCHECK_PARTIAL();
5100 break;
5101 }
5102 GETCHARLENTEST(c, eptr, len);
5103 prop_category = UCD_CATEGORY(c);
5104 if ((prop_category == ucp_L || prop_category == ucp_N ||
5105 c == CHAR_UNDERSCORE) == prop_fail_result)
5106 break;
5107 eptr+= len;
5108 }
5109 break;
5110
5111 default:
5112 RRETURN(PCRE_ERROR_INTERNAL);
5113 }
5114
5115 /* eptr is now past the end of the maximum run */
5116
5117 if (possessive) continue;
5118 for(;;)
5119 {
5120 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5122 if (eptr-- == pp) break; /* Stop if tried at original pos */
5123 if (utf8) BACKCHAR(eptr);
5124 }
5125 }
5126
5127 /* Match extended Unicode sequences. We will get here only if the
5128 support is in the binary; otherwise a compile-time error occurs. */
5129
5130 else if (ctype == OP_EXTUNI)
5131 {
5132 for (i = min; i < max; i++)
5133 {
5134 if (eptr >= md->end_subject)
5135 {
5136 SCHECK_PARTIAL();
5137 break;
5138 }
5139 GETCHARINCTEST(c, eptr);
5140 prop_category = UCD_CATEGORY(c);
5141 if (prop_category == ucp_M) break;
5142 while (eptr < md->end_subject)
5143 {
5144 int len = 1;
5145 if (!utf8) c = *eptr; else
5146 {
5147 GETCHARLEN(c, eptr, len);
5148 }
5149 prop_category = UCD_CATEGORY(c);
5150 if (prop_category != ucp_M) break;
5151 eptr += len;
5152 }
5153 }
5154
5155 /* eptr is now past the end of the maximum run */
5156
5157 if (possessive) continue;
5158
5159 for(;;)
5160 {
5161 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5163 if (eptr-- == pp) break; /* Stop if tried at original pos */
5164 for (;;) /* Move back over one extended */
5165 {
5166 int len = 1;
5167 if (!utf8) c = *eptr; else
5168 {
5169 BACKCHAR(eptr);
5170 GETCHARLEN(c, eptr, len);
5171 }
5172 prop_category = UCD_CATEGORY(c);
5173 if (prop_category != ucp_M) break;
5174 eptr--;
5175 }
5176 }
5177 }
5178
5179 else
5180 #endif /* SUPPORT_UCP */
5181
5182 #ifdef SUPPORT_UTF8
5183 /* UTF-8 mode */
5184
5185 if (utf8)
5186 {
5187 switch(ctype)
5188 {
5189 case OP_ANY:
5190 if (max < INT_MAX)
5191 {
5192 for (i = min; i < max; i++)
5193 {
5194 if (eptr >= md->end_subject)
5195 {
5196 SCHECK_PARTIAL();
5197 break;
5198 }
5199 if (IS_NEWLINE(eptr)) break;
5200 eptr++;
5201 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5202 }
5203 }
5204
5205 /* Handle unlimited UTF-8 repeat */
5206
5207 else
5208 {
5209 for (i = min; i < max; i++)
5210 {
5211 if (eptr >= md->end_subject)
5212 {
5213 SCHECK_PARTIAL();
5214 break;
5215 }
5216 if (IS_NEWLINE(eptr)) break;
5217 eptr++;
5218 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5219 }
5220 }
5221 break;
5222
5223 case OP_ALLANY:
5224 if (max < INT_MAX)
5225 {
5226 for (i = min; i < max; i++)
5227 {
5228 if (eptr >= md->end_subject)
5229 {
5230 SCHECK_PARTIAL();
5231 break;
5232 }
5233 eptr++;
5234 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5235 }
5236 }
5237 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5238 break;
5239
5240 /* The byte case is the same as non-UTF8 */
5241
5242 case OP_ANYBYTE:
5243 c = max - min;
5244 if (c > (unsigned int)(md->end_subject - eptr))
5245 {
5246 eptr = md->end_subject;
5247 SCHECK_PARTIAL();
5248 }
5249 else eptr += c;
5250 break;
5251
5252 case OP_ANYNL:
5253 for (i = min; i < max; i++)
5254 {
5255 int len = 1;
5256 if (eptr >= md->end_subject)
5257 {
5258 SCHECK_PARTIAL();
5259 break;
5260 }
5261 GETCHARLEN(c, eptr, len);
5262 if (c == 0x000d)
5263 {
5264 if (++eptr >= md->end_subject) break;
5265 if (*eptr == 0x000a) eptr++;
5266 }
5267 else
5268 {
5269 if (c != 0x000a &&
5270 (md->bsr_anycrlf ||
5271 (c != 0x000b && c != 0x000c &&
5272 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5273 break;
5274 eptr += len;
5275 }
5276 }
5277 break;
5278
5279 case OP_NOT_HSPACE:
5280 case OP_HSPACE:
5281 for (i = min; i < max; i++)
5282 {
5283 BOOL gotspace;
5284 int len = 1;
5285 if (eptr >= md->end_subject)
5286 {
5287 SCHECK_PARTIAL();
5288 break;
5289 }
5290 GETCHARLEN(c, eptr, len);
5291 switch(c)
5292 {
5293 default: gotspace = FALSE; break;
5294 case 0x09: /* HT */
5295 case 0x20: /* SPACE */
5296 case 0xa0: /* NBSP */
5297 case 0x1680: /* OGHAM SPACE MARK */
5298 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5299 case 0x2000: /* EN QUAD */
5300 case 0x2001: /* EM QUAD */
5301 case 0x2002: /* EN SPACE */
5302 case 0x2003: /* EM SPACE */
5303 case 0x2004: /* THREE-PER-EM SPACE */
5304 case 0x2005: /* FOUR-PER-EM SPACE */
5305 case 0x2006: /* SIX-PER-EM SPACE */
5306 case 0x2007: /* FIGURE SPACE */
5307 case 0x2008: /* PUNCTUATION SPACE */
5308 case 0x2009: /* THIN SPACE */
5309 case 0x200A: /* HAIR SPACE */
5310 case 0x202f: /* NARROW NO-BREAK SPACE */
5311 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5312 case 0x3000: /* IDEOGRAPHIC SPACE */
5313 gotspace = TRUE;
5314 break;
5315 }
5316 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5317 eptr += len;
5318 }
5319 break;
5320
5321 case OP_NOT_VSPACE:
5322 case OP_VSPACE:
5323 for (i = min; i < max; i++)
5324 {
5325 BOOL gotspace;
5326 int len = 1;
5327 if (eptr >= md->end_subject)
5328 {
5329 SCHECK_PARTIAL();
5330 break;
5331 }
5332 GETCHARLEN(c, eptr, len);
5333 switch(c)
5334 {
5335 default: gotspace = FALSE; break;
5336 case 0x0a: /* LF */
5337 case 0x0b: /* VT */
5338 case 0x0c: /* FF */
5339 case 0x0d: /* CR */
5340 case 0x85: /* NEL */
5341 case 0x2028: /* LINE SEPARATOR */
5342 case 0x2029: /* PARAGRAPH SEPARATOR */
5343 gotspace = TRUE;
5344 break;
5345 }
5346 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5347 eptr += len;
5348 }
5349 break;
5350
5351 case OP_NOT_DIGIT:
5352 for (i = min; i < max; i++)
5353 {
5354 int len = 1;
5355 if (eptr >= md->end_subject)
5356 {
5357 SCHECK_PARTIAL();
5358 break;
5359 }
5360 GETCHARLEN(c, eptr, len);
5361 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5362 eptr+= len;
5363 }
5364 break;
5365
5366 case OP_DIGIT:
5367 for (i = min; i < max; i++)
5368 {
5369 int len = 1;
5370 if (eptr >= md->end_subject)
5371 {
5372 SCHECK_PARTIAL();
5373 break;
5374 }
5375 GETCHARLEN(c, eptr, len);
5376 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5377 eptr+= len;
5378 }
5379 break;
5380
5381 case OP_NOT_WHITESPACE:
5382 for (i = min; i < max; i++)
5383 {
5384 int len = 1;
5385 if (eptr >= md->end_subject)
5386 {
5387 SCHECK_PARTIAL();
5388 break;
5389 }
5390 GETCHARLEN(c, eptr, len);
5391 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5392 eptr+= len;
5393 }
5394 break;
5395
5396 case OP_WHITESPACE:
5397 for (i = min; i < max; i++)
5398 {
5399 int len = 1;
5400 if (eptr >= md->end_subject)
5401 {
5402 SCHECK_PARTIAL();
5403 break;
5404 }
5405 GETCHARLEN(c, eptr, len);
5406 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5407 eptr+= len;
5408 }
5409 break;
5410
5411 case OP_NOT_WORDCHAR:
5412 for (i = min; i < max; i++)
5413 {
5414 int len = 1;
5415 if (eptr >= md->end_subject)
5416 {
5417 SCHECK_PARTIAL();
5418 break;
5419 }
5420 GETCHARLEN(c, eptr, len);
5421 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5422 eptr+= len;
5423 }
5424 break;
5425
5426 case OP_WORDCHAR:
5427 for (i = min; i < max; i++)
5428 {
5429 int len = 1;
5430 if (eptr >= md->end_subject)
5431 {
5432 SCHECK_PARTIAL();
5433 break;
5434 }
5435 GETCHARLEN(c, eptr, len);
5436 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5437 eptr+= len;
5438 }
5439 break;
5440
5441 default:
5442 RRETURN(PCRE_ERROR_INTERNAL);
5443 }
5444
5445 /* eptr is now past the end of the maximum run. If possessive, we are
5446 done (no backing up). Otherwise, match at this position; anything other
5447 than no match is immediately returned. For nomatch, back up one
5448 character, unless we are matching \R and the last thing matched was
5449 \r\n, in which case, back up two bytes. */
5450
5451 if (possessive) continue;
5452 for(;;)
5453 {
5454 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5455 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5456 if (eptr-- == pp) break; /* Stop if tried at original pos */
5457 BACKCHAR(eptr);
5458 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5459 eptr[-1] == '\r') eptr--;
5460 }
5461 }
5462 else
5463 #endif /* SUPPORT_UTF8 */
5464
5465 /* Not UTF-8 mode */
5466 {
5467 switch(ctype)
5468 {
5469 case OP_ANY:
5470 for (i = min; i < max; i++)
5471 {
5472 if (eptr >= md->end_subject)
5473 {
5474 SCHECK_PARTIAL();
5475 break;
5476 }
5477 if (IS_NEWLINE(eptr)) break;
5478 eptr++;
5479 }
5480 break;
5481
5482 case OP_ALLANY:
5483 case OP_ANYBYTE:
5484 c = max - min;
5485 if (c > (unsigned int)(md->end_subject - eptr))
5486 {
5487 eptr = md->end_subject;
5488 SCHECK_PARTIAL();
5489 }
5490 else eptr += c;
5491 break;
5492
5493 case OP_ANYNL:
5494 for (i = min; i < max; i++)
5495 {
5496 if (eptr >= md->end_subject)
5497 {
5498 SCHECK_PARTIAL();
5499 break;
5500 }
5501 c = *eptr;
5502 if (c == 0x000d)
5503 {
5504 if (++eptr >= md->end_subject) break;
5505 if (*eptr == 0x000a) eptr++;
5506 }
5507 else
5508 {
5509 if (c != 0x000a &&
5510 (md->bsr_anycrlf ||
5511 (c != 0x000b && c != 0x000c && c != 0x0085)))
5512 break;
5513 eptr++;
5514 }
5515 }
5516 break;
5517
5518 case OP_NOT_HSPACE:
5519 for (i = min; i < max; i++)
5520 {
5521 if (eptr >= md->end_subject)
5522 {
5523 SCHECK_PARTIAL();
5524 break;
5525 }
5526 c = *eptr;
5527 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5528 eptr++;
5529 }
5530 break;
5531
5532 case OP_HSPACE:
5533 for (i = min; i < max; i++)
5534 {
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 c = *eptr;
5541 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5542 eptr++;
5543 }
5544 break;
5545
5546 case OP_NOT_VSPACE:
5547 for (i = min; i < max; i++)
5548 {
5549 if (eptr >= md->end_subject)
5550 {
5551 SCHECK_PARTIAL();
5552 break;
5553 }
5554 c = *eptr;
5555 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5556 break;
5557 eptr++;
5558 }
5559 break;
5560
5561 case OP_VSPACE:
5562 for (i = min; i < max; i++)
5563 {
5564 if (eptr >= md->end_subject)
5565 {
5566 SCHECK_PARTIAL();
5567 break;
5568 }
5569 c = *eptr;
5570 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5571 break;
5572 eptr++;
5573 }
5574 break;
5575
5576 case OP_NOT_DIGIT:
5577 for (i = min; i < max; i++)
5578 {
5579 if (eptr >= md->end_subject)
5580 {
5581 SCHECK_PARTIAL();
5582 break;
5583 }
5584 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5585 eptr++;
5586 }
5587 break;
5588
5589 case OP_DIGIT:
5590 for (i = min; i < max; i++)
5591 {
5592 if (eptr >= md->end_subject)
5593 {
5594 SCHECK_PARTIAL();
5595 break;
5596 }
5597 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5598 eptr++;
5599 }
5600 break;
5601
5602 case OP_NOT_WHITESPACE:
5603 for (i = min; i < max; i++)
5604 {
5605 if (eptr >= md->end_subject)
5606 {
5607 SCHECK_PARTIAL();
5608 break;
5609 }
5610 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5611 eptr++;
5612 }
5613 break;
5614
5615 case OP_WHITESPACE:
5616 for (i = min; i < max; i++)
5617 {
5618 if (eptr >= md->end_subject)
5619 {
5620 SCHECK_PARTIAL();
5621 break;
5622 }
5623 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5624 eptr++;
5625 }
5626 break;
5627
5628 case OP_NOT_WORDCHAR:
5629 for (i = min; i < max; i++)
5630 {
5631 if (eptr >= md->end_subject)
5632 {
5633 SCHECK_PARTIAL();
5634 break;
5635 }
5636 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5637 eptr++;
5638 }
5639 break;
5640
5641 case OP_WORDCHAR:
5642 for (i = min; i < max; i++)
5643 {
5644 if (eptr >= md->end_subject)
5645 {
5646 SCHECK_PARTIAL();
5647 break;
5648 }
5649 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5650 eptr++;
5651 }
5652 break;
5653
5654 default:
5655 RRETURN(PCRE_ERROR_INTERNAL);
5656 }
5657
5658 /* eptr is now past the end of the maximum run. If possessive, we are
5659 done (no backing up). Otherwise, match at this position; anything other
5660 than no match is immediately returned. For nomatch, back up one
5661 character (byte), unless we are matching \R and the last thing matched
5662 was \r\n, in which case, back up two bytes. */
5663
5664 if (possessive) continue;
5665 while (eptr >= pp)
5666 {
5667 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5669 eptr--;
5670 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5671 eptr[-1] == '\r') eptr--;
5672 }
5673 }
5674
5675 /* Get here if we can't make it match with any permitted repetitions */
5676
5677 MRRETURN(MATCH_NOMATCH);
5678 }
5679 /* Control never gets here */
5680
5681 /* There's been some horrible disaster. Arrival here can only mean there is
5682 something seriously wrong in the code above or the OP_xxx definitions. */
5683
5684 default:
5685 DPRINTF(("Unknown opcode %d\n", *ecode));
5686 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5687 }
5688
5689 /* Do not stick any code in here without much thought; it is assumed
5690 that "continue" in the code above comes out to here to repeat the main
5691 loop. */
5692
5693 } /* End of main loop */
5694 /* Control never reaches here */
5695
5696
5697 /* When compiling to use the heap rather than the stack for recursive calls to
5698 match(), the RRETURN() macro jumps here. The number that is saved in
5699 frame->Xwhere indicates which label we actually want to return to. */
5700
5701 #ifdef NO_RECURSE
5702 #define LBL(val) case val: goto L_RM##val;
5703 HEAP_RETURN:
5704 switch (frame->Xwhere)
5705 {
5706 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5707 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5708 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5709 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5710 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5711 #ifdef SUPPORT_UTF8
5712 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5713 LBL(32) LBL(34) LBL(42) LBL(46)
5714 #ifdef SUPPORT_UCP
5715 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5716 LBL(59) LBL(60) LBL(61) LBL(62)
5717 #endif /* SUPPORT_UCP */
5718 #endif /* SUPPORT_UTF8 */
5719 default:
5720 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5721 return PCRE_ERROR_INTERNAL;
5722 }
5723 #undef LBL
5724 #endif /* NO_RECURSE */
5725 }
5726
5727
5728 /***************************************************************************
5729 ****************************************************************************
5730 RECURSION IN THE match() FUNCTION
5731
5732 Undefine all the macros that were defined above to handle this. */
5733
/* Clean-up for NO_RECURSE builds only: remove the short names that match()
used for its working variables, so that they cannot leak into the code that
follows (pcre_exec() declares its own "length" and "flags", for example).
NOTE(review): the matching #define lines are earlier in this file and are not
visible here; presumably each name is an alias for a field of the heap frame
(cf. frame->Xwhere in the HEAP_RETURN switch) - confirm against those
definitions and keep the two lists in step when adding a variable. */

5734 #ifdef NO_RECURSE
5735 #undef eptr
5736 #undef ecode
5737 #undef mstart
5738 #undef offset_top
5739 #undef eptrb
5740 #undef flags
5741
5742 #undef callpat
5743 #undef charptr
5744 #undef data
5745 #undef next
5746 #undef pp
5747 #undef prev
5748 #undef saved_eptr
5749
5750 #undef new_recursive
5751
5752 #undef cur_is_word
5753 #undef condition
5754 #undef prev_is_word
5755
5756 #undef ctype
5757 #undef length
5758 #undef max
5759 #undef min
5760 #undef number
5761 #undef offset
5762 #undef op
5763 #undef save_capture_last
5764 #undef save_offset1
5765 #undef save_offset2
5766 #undef save_offset3
5767 #undef stacksave
5768
5769 #undef newptrb
5770
5771 #endif
5772
5773 /* These two are defined as macros in both cases - presumably meaning both
with and without NO_RECURSE, since these #undefs sit outside the #ifdef above
(TODO confirm against the definitions earlier in the file). They are therefore
removed unconditionally. In the visible part of match(), fi is the counter of
the minimizing-repeat loops ("for (fi = min;; fi++)"). */
5774
5775 #undef fc
5776 #undef fi
5777
5778 /***************************************************************************
5779 ***************************************************************************/
5780
5781
5782
5783 /*************************************************
5784 * Execute a Regular Expression *
5785 *************************************************/
5786
5787 /* This function applies a compiled re to a subject string and picks out
5788 portions of the string if it matches. Two elements in the vector are set for
5789 each substring: the offsets to the start and end of the substring.
5790
5791 Arguments:
5792 argument_re points to the compiled expression
5793 extra_data points to extra data or is NULL
5794 subject points to the subject string
5795 length length of subject string (may contain binary zeros)
5796 start_offset where to start in the subject string
5797 options option bits
5798 offsets points to a vector of ints to be filled in with offsets
5799 offsetcount the number of elements in the vector
5800
5801 Returns: > 0 => success; value is the number of elements filled in
5802 = 0 => success, but offsets is not big enough
5803 -1 => failed to match
5804 < -1 => some kind of unexpected problem
5805 */
5806
5807 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5808 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5809 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5810 int offsetcount)
5811 {
5812 int rc, ocount;
5813 int first_byte = -1;
5814 int req_byte = -1;
5815 int req_byte2 = -1;
5816 int newline;
5817 BOOL using_temporary_offsets = FALSE;
5818 BOOL anchored;
5819 BOOL startline;
5820 BOOL firstline;
5821 BOOL first_byte_caseless = FALSE;
5822 BOOL req_byte_caseless = FALSE;
5823 BOOL utf8;
5824 match_data match_block;
5825 match_data *md = &match_block;
5826 const uschar *tables;
5827 const uschar *start_bits = NULL;
5828 USPTR start_match = (USPTR)subject + start_offset;
5829 USPTR end_subject;
5830 USPTR start_partial = NULL;
5831 USPTR req_byte_ptr = start_match - 1;
5832
5833 pcre_study_data internal_study;
5834 const pcre_study_data *study;
5835
5836 real_pcre internal_re;
5837 const real_pcre *external_re = (const real_pcre *)argument_re;
5838 const real_pcre *re = external_re;
5839
5840 /* Plausibility checks */
5841
5842 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5843 if (re == NULL || subject == NULL ||
5844 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5845 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5846 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5847
5848 /* This information is for finding all the numbers associated with a given
5849 name, for condition testing. */
5850
5851 md->name_table = (uschar *)re + re->name_table_offset;
5852 md->name_count = re->name_count;
5853 md->name_entry_size = re->name_entry_size;
5854
5855 /* Fish out the optional data from the extra_data structure, first setting
5856 the default values. */
5857
5858 study = NULL;
5859 md->match_limit = MATCH_LIMIT;
5860 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5861 md->callout_data = NULL;
5862
5863 /* The table pointer is always in native byte order. */
5864
5865 tables = external_re->tables;
5866
5867 if (extra_data != NULL)
5868 {
5869 register unsigned int flags = extra_data->flags;
5870 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5871 study = (const pcre_study_data *)extra_data->study_data;
5872 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5873 md->match_limit = extra_data->match_limit;
5874 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5875 md->match_limit_recursion = extra_data->match_limit_recursion;
5876 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5877 md->callout_data = extra_data->callout_data;
5878 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5879 }
5880
5881 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5882 is a feature that makes it possible to save compiled regex and re-use them
5883 in other programs later. */
5884
5885 if (tables == NULL) tables = _pcre_default_tables;
5886
5887 /* Check that the first field in the block is the magic number. If it is not,
5888 test for a regex that was compiled on a host of opposite endianness. If this is
5889 the case, flipped values are put in internal_re and internal_study if there was
5890 study data too. */
5891
5892 if (re->magic_number != MAGIC_NUMBER)
5893 {
5894 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5895 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5896 if (study != NULL) study = &internal_study;
5897 }
5898
5899 /* Set up other data */
5900
5901 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5902 startline = (re->flags & PCRE_STARTLINE) != 0;
5903 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5904
5905 /* The code starts after the real_pcre block and the capture name table. */
5906
5907 md->start_code = (const uschar *)external_re + re->name_table_offset +
5908 re->name_count * re->name_entry_size;
5909
5910 md->start_subject = (USPTR)subject;
5911 md->start_offset = start_offset;
5912 md->end_subject = md->start_subject + length;
5913 end_subject = md->end_subject;
5914
5915 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5916 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5917 md->use_ucp = (re->options & PCRE_UCP) != 0;
5918 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5919
5920 /* Some options are unpacked into BOOL variables in the hope that testing
5921 them will be faster than individual option bits. */
5922
5923 md->notbol = (options & PCRE_NOTBOL) != 0;
5924 md->noteol = (options & PCRE_NOTEOL) != 0;
5925 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5926 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5927 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5928 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5929
5930
5931 md->hitend = FALSE;
5932 md->mark = NULL; /* In case never set */
5933
5934 md->recursive = NULL; /* No recursion at top level */
5935
5936 md->lcc = tables + lcc_offset;
5937 md->ctypes = tables + ctypes_offset;
5938
5939 /* Handle different \R options. */
5940
5941 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5942 {
5943 case 0:
5944 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5945 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5946 else
5947 #ifdef BSR_ANYCRLF
5948 md->bsr_anycrlf = TRUE;
5949 #else
5950 md->bsr_anycrlf = FALSE;
5951 #endif
5952 break;
5953
5954 case PCRE_BSR_ANYCRLF:
5955 md->bsr_anycrlf = TRUE;
5956 break;
5957
5958 case PCRE_BSR_UNICODE:
5959 md->bsr_anycrlf = FALSE;
5960 break;
5961
5962 default: return PCRE_ERROR_BADNEWLINE;
5963 }
5964
5965 /* Handle different types of newline. The three bits give eight cases. If
5966 nothing is set at run time, whatever was used at compile time applies. */
5967
5968 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5969 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5970 {
5971 case 0: newline = NEWLINE; break; /* Compile-time default */
5972 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5973 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5974 case PCRE_NEWLINE_CR+
5975 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5976 case PCRE_NEWLINE_ANY: newline = -1; break;
5977 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5978 default: return PCRE_ERROR_BADNEWLINE;
5979 }
5980
5981 if (newline == -2)
5982 {
5983 md->nltype = NLTYPE_ANYCRLF;
5984 }
5985 else if (newline < 0)
5986 {
5987 md->nltype = NLTYPE_ANY;
5988 }
5989 else
5990 {
5991 md->nltype = NLTYPE_FIXED;
5992 if (newline > 255)
5993 {
5994 md->nllen = 2;
5995 md->nl[0] = (newline >> 8) & 255;
5996 md->nl[1] = newline & 255;
5997 }
5998 else
5999 {
6000 md->nllen = 1;
6001 md->nl[0] = newline;
6002 }
6003 }
6004
6005 /* Partial matching was originally supported only for a restricted set of
6006 regexes; from release 8.00 there are no restrictions, but the bits are still
6007 defined (though never set). So there's no harm in leaving this code. */
6008
6009 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6010 return PCRE_ERROR_BADPARTIAL;
6011
6012 /* Check a UTF-8 string if required. Pass back the character offset and error
6013 code for an invalid string if a results vector is available. */
6014
6015 #ifdef SUPPORT_UTF8
6016 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6017 {
6018 int erroroffset;
6019 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
6020 if (errorcode != 0)
6021 {
6022 if (offsetcount >= 2)
6023 {
6024 offsets[0] = erroroffset;
6025 offsets[1] = errorcode;
6026 }
6027 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6028 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6029 }
6030
6031 /* Check that a start_offset points to the start of a UTF-8 character. */
6032
6033 if (start_offset > 0 && start_offset < length &&
6034 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6035 return PCRE_ERROR_BADUTF8_OFFSET;
6036 }
6037 #endif
6038
6039 /* If the expression has got more back references than the offsets supplied can
6040 hold, we get a temporary chunk of working store to use during the matching.
6041 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6042 of 3. */
6043
6044 ocount = offsetcount - (offsetcount % 3);
6045
6046 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6047 {
6048 ocount = re->top_backref * 3 + 3;
6049 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6050 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6051 using_temporary_offsets = TRUE;
6052 DPRINTF(("Got memory to hold back references\n"));
6053 }
6054 else md->offset_vector = offsets;
6055
6056 md->offset_end = ocount;
6057 md->offset_max = (2*ocount)/3;
6058 md->offset_overflow = FALSE;
6059 md->capture_last = -1;
6060
6061 /* Reset the working variable associated with each extraction. These should
6062 never be used unless previously set, but they get saved and restored, and so we
6063 initialize them to avoid reading uninitialized locations. Also, unset the
6064 offsets for the matched string. This is really just for tidiness with callouts,
6065 in case they inspect these fields. */
6066
6067 if (md->offset_vector != NULL)
6068 {
6069 register int *iptr = md->offset_vector + ocount;
6070 register int *iend = iptr - re->top_bracket;
6071 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6072 while (--iptr >= iend) *iptr = -1;
6073 md->offset_vector[0] = md->offset_vector[1] = -1;
6074 }
6075
6076 /* Set up the first character to match, if available. The first_byte value is
6077 never set for an anchored regular expression, but the anchoring may be forced
6078 at run time, so we have to test for anchoring. The first char may be unset for
6079 an unanchored pattern, of course. If there's no first char and the pattern was
6080 studied, there may be a bitmap of possible first characters. */
6081
6082 if (!anchored)
6083 {
6084 if ((re->flags & PCRE_FIRSTSET) != 0)
6085 {
6086 first_byte = re->first_byte & 255;
6087 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6088 first_byte = md->lcc[first_byte];
6089 }
6090 else
6091 if (!startline && study != NULL &&
6092 (study->flags & PCRE_STUDY_MAPPED) != 0)
6093 start_bits = study->start_bits;
6094 }
6095
6096 /* For anchored or unanchored matches, there may be a "last known required
6097 character" set. */
6098
6099 if ((re->flags & PCRE_REQCHSET) != 0)
6100 {
6101 req_byte = re->req_byte & 255;
6102 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6103 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6104 }
6105
6106
6107
6108
6109 /* ==========================================================================*/
6110
6111 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6112 the loop runs just once. */
6113
6114 for(;;)
6115 {
6116 USPTR save_end_subject = end_subject;
6117 USPTR new_start_match;
6118
6119 /* If firstline is TRUE, the start of the match is constrained to the first
6120 line of a multiline string. That is, the match must be before or at the first
6121 newline. Implement this by temporarily adjusting end_subject so that we stop
6122 scanning at a newline. If the match fails at the newline, later code breaks
6123 this loop. */
6124
6125 if (firstline)
6126 {
6127 USPTR t = start_match;
6128 #ifdef SUPPORT_UTF8
6129 if (utf8)
6130 {
6131 while (t < md->end_subject && !IS_NEWLINE(t))
6132 {
6133 t++;
6134 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6135 }
6136 }
6137 else
6138 #endif
6139 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6140 end_subject = t;
6141 }
6142
6143 /* There are some optimizations that avoid running the match if a known
6144 starting point is not found, or if a known later character is not present.
6145 However, there is an option that disables these, for testing and for ensuring
6146 that all callouts do actually occur. The option can be set in the regex by
6147 (*NO_START_OPT) or passed in match-time options. */
6148
6149 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6150 {
6151 /* Advance to a unique first byte if there is one. */
6152
6153 if (first_byte >= 0)
6154 {
6155 if (first_byte_caseless)
6156 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6157 start_match++;
6158 else
6159 while (start_match < end_subject && *start_match != first_byte)
6160 start_match++;
6161 }
6162
6163 /* Or to just after a linebreak for a multiline match */
6164
6165 else if (startline)
6166 {
6167 if (start_match > md->start_subject + start_offset)
6168 {
6169 #ifdef SUPPORT_UTF8
6170 if (utf8)
6171 {
6172 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6173 {
6174 start_match++;
6175 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6176 start_match++;
6177 }
6178 }
6179 else
6180 #endif
6181 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6182 start_match++;
6183
6184 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6185 and we are now at a LF, advance the match position by one more character.
6186 */
6187
6188 if (start_match[-1] == CHAR_CR &&
6189 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6190 start_match < end_subject &&
6191 *start_match == CHAR_NL)
6192 start_match++;
6193 }
6194 }
6195
6196 /* Or to a non-unique first byte after study */
6197
6198 else if (start_bits != NULL)
6199 {
6200 while (start_match < end_subject)
6201 {
6202 register unsigned int c = *start_match;
6203 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6204 {
6205 start_match++;
6206 #ifdef SUPPORT_UTF8
6207 if (utf8)
6208 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6209 start_match++;
6210 #endif
6211 }
6212 else break;
6213 }
6214 }
6215 } /* Starting optimizations */
6216
6217 /* Restore fudged end_subject */
6218
6219 end_subject = save_end_subject;
6220
6221 /* The following two optimizations are disabled for partial matching or if
6222 disabling is explicitly requested. */
6223
6224 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6225 {
6226 /* If the pattern was studied, a minimum subject length may be set. This is
6227 a lower bound; no actual string of that length may actually match the
6228 pattern. Although the value is, strictly, in characters, we treat it as
6229 bytes to avoid spending too much time in this optimization. */
6230
6231 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6232 (pcre_uint32)(end_subject - start_match) < study->minlength)
6233 {
6234 rc = MATCH_NOMATCH;
6235 break;
6236 }
6237
6238 /* If req_byte is set, we know that that character must appear in the
6239 subject for the match to succeed. If the first character is set, req_byte
6240 must be later in the subject; otherwise the test starts at the match point.
6241 This optimization can save a huge amount of backtracking in patterns with
6242 nested unlimited repeats that aren't going to match. Writing separate code
6243 for cased/caseless versions makes it go faster, as does using an
6244 autoincrement and backing off on a match.
6245
6246 HOWEVER: when the subject string is very, very long, searching to its end
6247 can take a long time, and give bad performance on quite ordinary patterns.
6248 This showed up when somebody was matching something like /^\d+C/ on a
6249 32-megabyte string... so we don't do this when the string is sufficiently
6250 long. */
6251
6252 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6253 {
6254 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6255
6256 /* We don't need to repeat the search if we haven't yet reached the
6257 place we found it at last time. */
6258
6259 if (p > req_byte_ptr)
6260 {
6261 if (req_byte_caseless)
6262 {
6263 while (p < end_subject)
6264 {
6265 register int pp = *p++;
6266 if (pp == req_byte || pp == req_byte2) { p--; break; }
6267 }
6268 }
6269 else
6270 {
6271 while (p < end_subject)
6272 {
6273 if (*p++ == req_byte) { p--; break; }
6274 }
6275 }
6276
6277 /* If we can't find the required character, break the matching loop,
6278 forcing a match failure. */
6279
6280 if (p >= end_subject)
6281 {
6282 rc = MATCH_NOMATCH;
6283 break;
6284 }
6285
6286 /* If we have found the required character, save the point where we
6287 found it, so that we don't search again next time round the loop if
6288 the start hasn't passed this character yet. */
6289
6290 req_byte_ptr = p;
6291 }
6292 }
6293 }
6294
6295 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6296 printf(">>>> Match against: ");
6297 pchars(start_match, end_subject - start_match, TRUE, md);
6298 printf("\n");
6299 #endif
6300
6301 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6302 first starting point for which a partial match was found. */
6303
6304 md->start_match_ptr = start_match;
6305 md->start_used_ptr = start_match;
6306 md->match_call_count = 0;
6307 md->match_function_type = 0;
6308 md->end_offset_top = 0;
6309 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6310 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6311
6312 switch(rc)
6313 {
6314 /* SKIP passes back the next starting point explicitly, but if it is the
6315 same as the match we have just done, treat it as NOMATCH. */
6316
6317 case MATCH_SKIP:
6318 if (md->start_match_ptr != start_match)
6319 {
6320 new_start_match = md->start_match_ptr;
6321 break;
6322 }
6323 /* Fall through */
6324
6325 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6326 the SKIP's arg was not found. We also treat this as NOMATCH. */
6327
6328 case MATCH_SKIP_ARG:
6329 /* Fall through */
6330
6331 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6332 exactly like PRUNE. */
6333
6334 case MATCH_NOMATCH:
6335 case MATCH_PRUNE:
6336 case MATCH_THEN:
6337 new_start_match = start_match + 1;
6338 #ifdef SUPPORT_UTF8
6339 if (utf8)
6340 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6341 new_start_match++;
6342 #endif
6343 break;
6344
6345 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6346
6347 case MATCH_COMMIT:
6348 rc = MATCH_NOMATCH;
6349 goto ENDLOOP;
6350
6351 /* Any other return is either a match, or some kind of error. */
6352
6353 default:
6354 goto ENDLOOP;
6355 }
6356
6357 /* Control reaches here for the various types of "no match at this point"
6358 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6359
6360 rc = MATCH_NOMATCH;
6361
6362 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6363 newline in the subject (though it may continue over the newline). Therefore,
6364 if we have just failed to match, starting at a newline, do not continue. */
6365
6366 if (firstline && IS_NEWLINE(start_match)) break;
6367
6368 /* Advance to new matching position */
6369
6370 start_match = new_start_match;
6371
6372 /* Break the loop if the pattern is anchored or if we have passed the end of
6373 the subject. */
6374
6375 if (anchored || start_match > end_subject) break;
6376
6377 /* If we have just passed a CR and we are now at a LF, and the pattern does
6378 not contain any explicit matches for \r or \n, and the newline option is CRLF
6379 or ANY or ANYCRLF, advance the match position by one more character. */
6380
6381 if (start_match[-1] == CHAR_CR &&
6382 start_match < end_subject &&
6383 *start_match == CHAR_NL &&
6384 (re->flags & PCRE_HASCRORLF) == 0 &&
6385 (md->nltype == NLTYPE_ANY ||
6386 md->nltype == NLTYPE_ANYCRLF ||
6387 md->nllen == 2))
6388 start_match++;
6389
6390 md->mark = NULL; /* Reset for start of next match attempt */
6391 } /* End of for(;;) "bumpalong" loop */
6392
6393 /* ==========================================================================*/
6394
6395 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6396 conditions is true:
6397
6398 (1) The pattern is anchored or the match was failed by (*COMMIT);
6399
6400 (2) We are past the end of the subject;
6401
6402 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6403 this option requests that a match occur at or before the first newline in
6404 the subject.
6405
6406 When we have a match and the offset vector is big enough to deal with any
6407 backreferences, captured substring offsets will already be set up. In the case
6408 where we had to get some local store to hold offsets for backreference
6409 processing, copy those that we can. In this case there need not be overflow if
6410 certain parts of the pattern were not used, even though there are more
6411 capturing parentheses than vector slots. */
6412
6413 ENDLOOP:
6414
6415 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6416 {
6417 if (using_temporary_offsets)
6418 {
6419 if (offsetcount >= 4)
6420 {
6421 memcpy(offsets + 2, md->offset_vector + 2,
6422 (offsetcount - 2) * sizeof(int));
6423 DPRINTF(("Copied offsets from temporary memory\n"));
6424 }
6425 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6426 DPRINTF(("Freeing temporary memory\n"));
6427 (pcre_free)(md->offset_vector);
6428 }
6429
6430 /* Set the return code to the number of captured strings, or 0 if there are
6431 too many to fit into the vector. */
6432
6433 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6434
6435 /* If there is space, set up the whole thing as substring 0. The value of
6436 md->start_match_ptr might be modified if \K was encountered on the success
6437 matching path. */
6438
6439 if (offsetcount < 2) rc = 0; else
6440 {
6441 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6442 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6443 }
6444
6445 DPRINTF((">>>> returning %d\n", rc));
6446 goto RETURN_MARK;
6447 }
6448
6449 /* Control gets here if there has been an error, or if the overall match
6450 attempt has failed at all permitted starting positions. */
6451
6452 if (using_temporary_offsets)
6453 {
6454 DPRINTF(("Freeing temporary memory\n"));
6455 (pcre_free)(md->offset_vector);
6456 }
6457
6458 /* For anything other than nomatch or partial match, just return the code. */
6459
6460 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6461 {
6462 DPRINTF((">>>> error: returning %d\n", rc));
6463 return rc;
6464 }
6465
6466 /* Handle partial matches - disable any mark data */
6467
6468 if (start_partial != NULL)
6469 {
6470 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6471 md->mark = NULL;
6472 if (offsetcount > 1)
6473 {
6474 offsets[0] = (int)(start_partial - (USPTR)subject);
6475 offsets[1] = (int)(end_subject - (USPTR)subject);
6476 }
6477 rc = PCRE_ERROR_PARTIAL;
6478 }
6479
6480 /* This is the classic nomatch case */
6481
6482 else
6483 {
6484 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6485 rc = PCRE_ERROR_NOMATCH;
6486 }
6487
6488 /* Return the MARK data if it has been requested. */
6489
6490 RETURN_MARK:
6491
6492 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6493 *(extra_data->mark) = (unsigned char *)(md->mark);
6494 return rc;
6495 }
6496
6497 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5