/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 608 - (show annotations)
Sun Jun 12 16:25:55 2011 UTC (4 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 194678 byte(s)
Error occurred while calculating annotation data.
Fix problems with capturing parens and *ACCEPT with recursion.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_PRUNE (-996)
80 #define MATCH_SKIP (-995)
81 #define MATCH_SKIP_ARG (-994)
82 #define MATCH_THEN (-993)
83
84 /* This is a convenience macro for code that occurs many times. */
85
86 #define MRRETURN(ra) \
87 { \
88 md->mark = markptr; \
89 RRETURN(ra); \
90 }
91
92 /* Maximum number of ints of offset to save on the stack for recursive calls.
93 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94 because the offset vector is always a multiple of 3 long. */
95
96 #define REC_STACK_SAVE_MAX 30
97
98 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99
100 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102
103
104
105 #ifdef PCRE_DEBUG
106 /*************************************************
107 * Debugging function to print chars *
108 *************************************************/
109
110 /* Print a sequence of chars in printable format, stopping at the end of the
111 subject if the requested.
112
113 Arguments:
114 p points to characters
115 length number to print
116 is_subject TRUE if printing from within md->start_subject
117 md pointer to matching data block, if is_subject is TRUE
118
119 Returns: nothing
120 */
121
122 static void
123 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124 {
125 unsigned int c;
126 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127 while (length-- > 0)
128 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129 }
130 #endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
270 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271 below must be updated in sync. */
272
273 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 RM61, RM62, RM63, RM64 };
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
286 #define REGISTER register
287
288 #ifdef PCRE_DEBUG
289 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 { \
291 printf("match() called in line %d\n", __LINE__); \
292 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 printf("to line %d\n", __LINE__); \
294 }
295 #define RRETURN(ra) \
296 { \
297 printf("match() returned %d from line %d ", ra, __LINE__); \
298 return ra; \
299 }
300 #else
301 #define RMATCH(ra,rb,rc,rd,re,rw) \
302 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 #define RRETURN(ra) return ra
304 #endif
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. Note that
310 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311 argument of match(), which never changes. */
312
313 #define REGISTER
314
315 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 {\
317 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 frame->Xwhere = rw; \
320 newframe->Xeptr = ra;\
321 newframe->Xecode = rb;\
322 newframe->Xmstart = mstart;\
323 newframe->Xmarkptr = markptr;\
324 newframe->Xoffset_top = rc;\
325 newframe->Xeptrb = re;\
326 newframe->Xrdepth = frame->Xrdepth + 1;\
327 newframe->Xprevframe = frame;\
328 frame = newframe;\
329 DPRINTF(("restarting from line %d\n", __LINE__));\
330 goto HEAP_RECURSE;\
331 L_##rw:\
332 DPRINTF(("jumped back to line %d\n", __LINE__));\
333 }
334
335 #define RRETURN(ra)\
336 {\
337 heapframe *oldframe = frame;\
338 frame = oldframe->Xprevframe;\
339 (pcre_stack_free)(oldframe);\
340 if (frame != NULL)\
341 {\
342 rrc = ra;\
343 goto HEAP_RETURN;\
344 }\
345 return ra;\
346 }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
351 typedef struct heapframe {
352 struct heapframe *Xprevframe;
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr;
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth;
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word;
379 BOOL Xcondition;
380 BOOL Xprev_is_word;
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink;
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb;
408
409 /* Where to jump back to */
410
411 int Xwhere;
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past the start of the subject (i.e.
434 something has been matched). For hard partial matching, we then return
435 immediately. The second one is used when we already know we are past the end of
436 the subject. */
437
438 #define CHECK_PARTIAL()\
439 if (md->partial != 0 && eptr >= md->end_subject && \
440 eptr > md->start_used_ptr) \
441 { \
442 md->hitend = TRUE; \
443 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 }
445
446 #define SCHECK_PARTIAL()\
447 if (md->partial != 0 && eptr > md->start_used_ptr) \
448 { \
449 md->hitend = TRUE; \
450 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. If we fail to match, we need to restore this value and also the
812 values of the final offsets, in case they were set by a previous iteration
813 of the same bracket.
814
815 If there isn't enough space in the offset vector, treat this as if it were
816 a non-capturing bracket. Don't worry about setting the flag for the error
817 case here; that is handled in the code for KET. */
818
819 case OP_CBRA:
820 case OP_SCBRA:
821 number = GET2(ecode, 1+LINK_SIZE);
822 offset = number << 1;
823
824 #ifdef PCRE_DEBUG
825 printf("start bracket %d\n", number);
826 printf("subject=");
827 pchars(eptr, 16, TRUE, md);
828 printf("\n");
829 #endif
830
831 if (offset < md->offset_max)
832 {
833 save_offset1 = md->offset_vector[offset];
834 save_offset2 = md->offset_vector[offset+1];
835 save_offset3 = md->offset_vector[md->offset_end - number];
836 save_capture_last = md->capture_last;
837
838 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 md->offset_vector[md->offset_end - number] =
840 (int)(eptr - md->start_subject);
841
842 for (;;)
843 {
844 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846 eptrb, RM1);
847 if (rrc != MATCH_NOMATCH &&
848 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849 RRETURN(rrc);
850 md->capture_last = save_capture_last;
851 ecode += GET(ecode, 1);
852 if (*ecode != OP_ALT) break;
853 }
854
855 DPRINTF(("bracket %d failed\n", number));
856
857 md->offset_vector[offset] = save_offset1;
858 md->offset_vector[offset+1] = save_offset2;
859 md->offset_vector[md->offset_end - number] = save_offset3;
860
861 if (rrc != MATCH_THEN) md->mark = markptr;
862 RRETURN(MATCH_NOMATCH);
863 }
864
865 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866 as a non-capturing bracket. */
867
868 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870
871 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877 for all the alternatives. When we get to the final alternative within the
878 brackets, we would return the result of a recursive call to match()
879 whatever happened. We can reduce stack usage by turning this into a tail
880 recursion, except in the case of a possibly empty group.*/
881
882 case OP_BRA:
883 case OP_SBRA:
884 DPRINTF(("start non-capturing bracket\n"));
885 for (;;)
886 {
887 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
888 {
889 if (op >= OP_SBRA) /* Possibly empty group */
890 {
891 md->match_function_type = MATCH_CBEGROUP;
892 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
893 RM48);
894 if (rrc == MATCH_NOMATCH) md->mark = markptr;
895 RRETURN(rrc);
896 }
897 /* Not a possibly empty group; use tail recursion */
898 ecode += _pcre_OP_lengths[*ecode];
899 DPRINTF(("bracket 0 tail recursion\n"));
900 goto TAIL_RECURSE;
901 }
902
903 /* For non-final alternatives, continue the loop for a NOMATCH result;
904 otherwise return. */
905
906 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
907 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
908 RM2);
909 if (rrc != MATCH_NOMATCH &&
910 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
911 RRETURN(rrc);
912 ecode += GET(ecode, 1);
913 }
914 /* Control never reaches here. */
915
916 /* Handle possessive capturing brackets with an unlimited repeat. We come
917 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
918 handled similarly to the normal case above. However, the matching is
919 different. The end of these brackets will always be OP_KETRPOS, which
920 returns MATCH_KETRPOS without going further in the pattern. By this means
921 we can handle the group by iteration rather than recursion, thereby
922 reducing the amount of stack needed. */
923
924 case OP_CBRAPOS:
925 case OP_SCBRAPOS:
926 allow_zero = FALSE;
927
928 POSSESSIVE_CAPTURE:
929 number = GET2(ecode, 1+LINK_SIZE);
930 offset = number << 1;
931
932 #ifdef PCRE_DEBUG
933 printf("start possessive bracket %d\n", number);
934 printf("subject=");
935 pchars(eptr, 16, TRUE, md);
936 printf("\n");
937 #endif
938
939 if (offset < md->offset_max)
940 {
941 matched_once = FALSE;
942 code_offset = ecode - md->start_code;
943
944 save_offset1 = md->offset_vector[offset];
945 save_offset2 = md->offset_vector[offset+1];
946 save_offset3 = md->offset_vector[md->offset_end - number];
947 save_capture_last = md->capture_last;
948
949 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
950
951 /* Each time round the loop, save the current subject position for use
952 when the group matches. For MATCH_MATCH, the group has matched, so we
953 restart it with a new subject starting position, remembering that we had
954 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
955 usual. If we haven't matched any alternatives in any iteration, check to
956 see if a previous iteration matched. If so, the group has matched;
957 continue from afterwards. Otherwise it has failed; restore the previous
958 capture values before returning NOMATCH. */
959
960 for (;;)
961 {
962 md->offset_vector[md->offset_end - number] =
963 (int)(eptr - md->start_subject);
964 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
965 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
966 eptrb, RM63);
967 if (rrc == MATCH_KETRPOS)
968 {
969 offset_top = md->end_offset_top;
970 eptr = md->end_match_ptr;
971 ecode = md->start_code + code_offset;
972 save_capture_last = md->capture_last;
973 matched_once = TRUE;
974 continue;
975 }
976 if (rrc != MATCH_NOMATCH &&
977 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
978 RRETURN(rrc);
979 md->capture_last = save_capture_last;
980 ecode += GET(ecode, 1);
981 if (*ecode != OP_ALT) break;
982 }
983
984 if (!matched_once)
985 {
986 md->offset_vector[offset] = save_offset1;
987 md->offset_vector[offset+1] = save_offset2;
988 md->offset_vector[md->offset_end - number] = save_offset3;
989 }
990
991 if (rrc != MATCH_THEN) md->mark = markptr;
992 if (allow_zero || matched_once)
993 {
994 ecode += 1 + LINK_SIZE;
995 break;
996 }
997
998 RRETURN(MATCH_NOMATCH);
999 }
1000
1001 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1002 as a non-capturing bracket. */
1003
1004 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006
1007 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1008
1009 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1010 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1011
1012 /* Non-capturing possessive bracket with unlimited repeat. We come here
1013 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1014 without the capturing complication. It is written out separately for speed
1015 and cleanliness. */
1016
1017 case OP_BRAPOS:
1018 case OP_SBRAPOS:
1019 allow_zero = FALSE;
1020
1021 POSSESSIVE_NON_CAPTURE:
1022 matched_once = FALSE;
1023 code_offset = ecode - md->start_code;
1024
1025 for (;;)
1026 {
1027 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1028 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1029 eptrb, RM64);
1030 if (rrc == MATCH_KETRPOS)
1031 {
1032 eptr = md->end_match_ptr;
1033 ecode = md->start_code + code_offset;
1034 matched_once = TRUE;
1035 continue;
1036 }
1037 if (rrc != MATCH_NOMATCH &&
1038 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1039 RRETURN(rrc);
1040 ecode += GET(ecode, 1);
1041 if (*ecode != OP_ALT) break;
1042 }
1043
1044 if (matched_once || allow_zero)
1045 {
1046 ecode += 1 + LINK_SIZE;
1047 break;
1048 }
1049 RRETURN(MATCH_NOMATCH);
1050
1051 /* Control never reaches here. */
1052
1053 /* Conditional group: compilation checked that there are no more than
1054 two branches. If the condition is false, skipping the first branch takes us
1055 past the end if there is only one branch, but that's OK because that is
1056 exactly what going to the ket would do. As there is only one branch to be
1057 obeyed, we can use tail recursion to avoid using another stack frame. */
1058
1059 case OP_COND:
1060 case OP_SCOND:
1061 codelink = GET(ecode, 1);
1062
1063 /* Because of the way auto-callout works during compile, a callout item is
1064 inserted between OP_COND and an assertion condition. */
1065
1066 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1067 {
1068 if (pcre_callout != NULL)
1069 {
1070 pcre_callout_block cb;
1071 cb.version = 1; /* Version 1 of the callout block */
1072 cb.callout_number = ecode[LINK_SIZE+2];
1073 cb.offset_vector = md->offset_vector;
1074 cb.subject = (PCRE_SPTR)md->start_subject;
1075 cb.subject_length = (int)(md->end_subject - md->start_subject);
1076 cb.start_match = (int)(mstart - md->start_subject);
1077 cb.current_position = (int)(eptr - md->start_subject);
1078 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1079 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1080 cb.capture_top = offset_top/2;
1081 cb.capture_last = md->capture_last;
1082 cb.callout_data = md->callout_data;
1083 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1084 if (rrc < 0) RRETURN(rrc);
1085 }
1086 ecode += _pcre_OP_lengths[OP_CALLOUT];
1087 }
1088
1089 condcode = ecode[LINK_SIZE+1];
1090
1091 /* Now see what the actual condition is */
1092
1093 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1094 {
1095 if (md->recursive == NULL) /* Not recursing => FALSE */
1096 {
1097 condition = FALSE;
1098 ecode += GET(ecode, 1);
1099 }
1100 else
1101 {
1102 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1103 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1104
1105 /* If the test is for recursion into a specific subpattern, and it is
1106 false, but the test was set up by name, scan the table to see if the
1107 name refers to any other numbers, and test them. The condition is true
1108 if any one is set. */
1109
1110 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1111 {
1112 uschar *slotA = md->name_table;
1113 for (i = 0; i < md->name_count; i++)
1114 {
1115 if (GET2(slotA, 0) == recno) break;
1116 slotA += md->name_entry_size;
1117 }
1118
1119 /* Found a name for the number - there can be only one; duplicate
1120 names for different numbers are allowed, but not vice versa. First
1121 scan down for duplicates. */
1122
1123 if (i < md->name_count)
1124 {
1125 uschar *slotB = slotA;
1126 while (slotB > md->name_table)
1127 {
1128 slotB -= md->name_entry_size;
1129 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1130 {
1131 condition = GET2(slotB, 0) == md->recursive->group_num;
1132 if (condition) break;
1133 }
1134 else break;
1135 }
1136
1137 /* Scan up for duplicates */
1138
1139 if (!condition)
1140 {
1141 slotB = slotA;
1142 for (i++; i < md->name_count; i++)
1143 {
1144 slotB += md->name_entry_size;
1145 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1146 {
1147 condition = GET2(slotB, 0) == md->recursive->group_num;
1148 if (condition) break;
1149 }
1150 else break;
1151 }
1152 }
1153 }
1154 }
1155
1156 /* Choose branch according to the condition */
1157
1158 ecode += condition? 3 : GET(ecode, 1);
1159 }
1160 }
1161
1162 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1163 {
1164 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1165 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1166
1167 /* If the numbered capture is unset, but the reference was by name,
1168 scan the table to see if the name refers to any other numbers, and test
1169 them. The condition is true if any one is set. This is tediously similar
1170 to the code above, but not close enough to try to amalgamate. */
1171
1172 if (!condition && condcode == OP_NCREF)
1173 {
1174 int refno = offset >> 1;
1175 uschar *slotA = md->name_table;
1176
1177 for (i = 0; i < md->name_count; i++)
1178 {
1179 if (GET2(slotA, 0) == refno) break;
1180 slotA += md->name_entry_size;
1181 }
1182
1183 /* Found a name for the number - there can be only one; duplicate names
1184 for different numbers are allowed, but not vice versa. First scan down
1185 for duplicates. */
1186
1187 if (i < md->name_count)
1188 {
1189 uschar *slotB = slotA;
1190 while (slotB > md->name_table)
1191 {
1192 slotB -= md->name_entry_size;
1193 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1194 {
1195 offset = GET2(slotB, 0) << 1;
1196 condition = offset < offset_top &&
1197 md->offset_vector[offset] >= 0;
1198 if (condition) break;
1199 }
1200 else break;
1201 }
1202
1203 /* Scan up for duplicates */
1204
1205 if (!condition)
1206 {
1207 slotB = slotA;
1208 for (i++; i < md->name_count; i++)
1209 {
1210 slotB += md->name_entry_size;
1211 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1212 {
1213 offset = GET2(slotB, 0) << 1;
1214 condition = offset < offset_top &&
1215 md->offset_vector[offset] >= 0;
1216 if (condition) break;
1217 }
1218 else break;
1219 }
1220 }
1221 }
1222 }
1223
1224 /* Choose branch according to the condition */
1225
1226 ecode += condition? 3 : GET(ecode, 1);
1227 }
1228
1229 else if (condcode == OP_DEF) /* DEFINE - always false */
1230 {
1231 condition = FALSE;
1232 ecode += GET(ecode, 1);
1233 }
1234
1235 /* The condition is an assertion. Call match() to evaluate it - setting
1236 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1237 an assertion. */
1238
1239 else
1240 {
1241 md->match_function_type = MATCH_CONDASSERT;
1242 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1243 if (rrc == MATCH_MATCH)
1244 {
1245 condition = TRUE;
1246 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1247 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1248 }
1249 else if (rrc != MATCH_NOMATCH &&
1250 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1251 {
1252 RRETURN(rrc); /* Need braces because of following else */
1253 }
1254 else
1255 {
1256 condition = FALSE;
1257 ecode += codelink;
1258 }
1259 }
1260
1261 /* We are now at the branch that is to be obeyed. As there is only one,
1262 we can use tail recursion to avoid using another stack frame, except when
1263 we have an unlimited repeat of a possibly empty group. If the second
1264 alternative doesn't exist, we can just plough on. */
1265
1266 if (condition || *ecode == OP_ALT)
1267 {
1268 ecode += 1 + LINK_SIZE;
1269 if (op == OP_SCOND) /* Possibly empty group */
1270 {
1271 md->match_function_type = MATCH_CBEGROUP;
1272 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1273 RRETURN(rrc);
1274 }
1275 else goto TAIL_RECURSE;
1276 }
1277 else /* Condition false & no alternative */
1278 {
1279 ecode += 1 + LINK_SIZE;
1280 }
1281 break;
1282
1283
1284 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1285 to close any currently open capturing brackets. */
1286
1287 case OP_CLOSE:
1288 number = GET2(ecode, 1);
1289 offset = number << 1;
1290
1291 #ifdef PCRE_DEBUG
1292 printf("end bracket %d at *ACCEPT", number);
1293 printf("\n");
1294 #endif
1295
1296 md->capture_last = number;
1297 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1298 {
1299 md->offset_vector[offset] =
1300 md->offset_vector[md->offset_end - number];
1301 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1302 if (offset_top <= offset) offset_top = offset + 2;
1303 }
1304 ecode += 3;
1305 break;
1306
1307
1308 /* End of the pattern, either real or forced. If we are in a recursion, we
1309 should restore the offsets appropriately, and if it's a top-level
1310 recursion, continue from after the call. */
1311
1312 case OP_ACCEPT:
1313 case OP_END:
1314 if (md->recursive != NULL)
1315 {
1316 recursion_info *rec = md->recursive;
1317 md->recursive = rec->prevrec;
1318 memmove(md->offset_vector, rec->offset_save,
1319 rec->saved_max * sizeof(int));
1320 offset_top = rec->save_offset_top;
1321 if (rec->group_num == 0)
1322 {
1323 ecode = rec->after_call;
1324 break;
1325 }
1326 }
1327
1328 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1329 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1330 the subject. In both cases, backtracking will then try other alternatives,
1331 if any. */
1332
1333 else if (eptr == mstart &&
1334 (md->notempty ||
1335 (md->notempty_atstart &&
1336 mstart == md->start_subject + md->start_offset)))
1337 MRRETURN(MATCH_NOMATCH);
1338
1339 /* Otherwise, we have a match. */
1340
1341 md->end_match_ptr = eptr; /* Record where we ended */
1342 md->end_offset_top = offset_top; /* and how many extracts were taken */
1343 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1344
1345 /* For some reason, the macros don't work properly if an expression is
1346 given as the argument to MRRETURN when the heap is in use. */
1347
1348 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1349 MRRETURN(rrc);
1350
1351 /* Assertion brackets. Check the alternative branches in turn - the
1352 matching won't pass the KET for an assertion. If any one branch matches,
1353 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1354 start of each branch to move the current point backwards, so the code at
1355 this level is identical to the lookahead case. When the assertion is part
1356 of a condition, we want to return immediately afterwards. The caller of
1357 this incarnation of the match() function will have set MATCH_CONDASSERT in
1358 md->match_function_type, and one of these opcodes will be the first opcode
1359 that is processed. We use a local variable that is preserved over calls to
1360 match() to remember this case. */
1361
1362 case OP_ASSERT:
1363 case OP_ASSERTBACK:
1364 if (md->match_function_type == MATCH_CONDASSERT)
1365 {
1366 condassert = TRUE;
1367 md->match_function_type = 0;
1368 }
1369 else condassert = FALSE;
1370
1371 do
1372 {
1373 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1374 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1375 {
1376 mstart = md->start_match_ptr; /* In case \K reset it */
1377 break;
1378 }
1379 if (rrc != MATCH_NOMATCH &&
1380 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1381 RRETURN(rrc);
1382 ecode += GET(ecode, 1);
1383 }
1384 while (*ecode == OP_ALT);
1385
1386 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1387
1388 /* If checking an assertion for a condition, return MATCH_MATCH. */
1389
1390 if (condassert) RRETURN(MATCH_MATCH);
1391
1392 /* Continue from after the assertion, updating the offsets high water
1393 mark, since extracts may have been taken during the assertion. */
1394
1395 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1396 ecode += 1 + LINK_SIZE;
1397 offset_top = md->end_offset_top;
1398 continue;
1399
1400 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1401 PRUNE, or COMMIT means we must assume failure without checking subsequent
1402 branches. */
1403
1404 case OP_ASSERT_NOT:
1405 case OP_ASSERTBACK_NOT:
1406 if (md->match_function_type == MATCH_CONDASSERT)
1407 {
1408 condassert = TRUE;
1409 md->match_function_type = 0;
1410 }
1411 else condassert = FALSE;
1412
1413 do
1414 {
1415 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1416 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1417 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1418 {
1419 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1420 break;
1421 }
1422 if (rrc != MATCH_NOMATCH &&
1423 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1424 RRETURN(rrc);
1425 ecode += GET(ecode,1);
1426 }
1427 while (*ecode == OP_ALT);
1428
1429 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1430
1431 ecode += 1 + LINK_SIZE;
1432 continue;
1433
1434 /* Move the subject pointer back. This occurs only at the start of
1435 each branch of a lookbehind assertion. If we are too close to the start to
1436 move back, this match function fails. When working with UTF-8 we move
1437 back a number of characters, not bytes. */
1438
1439 case OP_REVERSE:
1440 #ifdef SUPPORT_UTF8
1441 if (utf8)
1442 {
1443 i = GET(ecode, 1);
1444 while (i-- > 0)
1445 {
1446 eptr--;
1447 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1448 BACKCHAR(eptr);
1449 }
1450 }
1451 else
1452 #endif
1453
1454 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1455
1456 {
1457 eptr -= GET(ecode, 1);
1458 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1459 }
1460
1461 /* Save the earliest consulted character, then skip to next op code */
1462
1463 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1464 ecode += 1 + LINK_SIZE;
1465 break;
1466
1467 /* The callout item calls an external function, if one is provided, passing
1468 details of the match so far. This is mainly for debugging, though the
1469 function is able to force a failure. */
1470
1471 case OP_CALLOUT:
1472 if (pcre_callout != NULL)
1473 {
1474 pcre_callout_block cb;
1475 cb.version = 1; /* Version 1 of the callout block */
1476 cb.callout_number = ecode[1];
1477 cb.offset_vector = md->offset_vector;
1478 cb.subject = (PCRE_SPTR)md->start_subject;
1479 cb.subject_length = (int)(md->end_subject - md->start_subject);
1480 cb.start_match = (int)(mstart - md->start_subject);
1481 cb.current_position = (int)(eptr - md->start_subject);
1482 cb.pattern_position = GET(ecode, 2);
1483 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1484 cb.capture_top = offset_top/2;
1485 cb.capture_last = md->capture_last;
1486 cb.callout_data = md->callout_data;
1487 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1488 if (rrc < 0) RRETURN(rrc);
1489 }
1490 ecode += 2 + 2*LINK_SIZE;
1491 break;
1492
1493 /* Recursion either matches the current regex, or some subexpression. The
1494 offset data is the offset to the starting bracket from the start of the
1495 whole pattern. (This is so that it works from duplicated subpatterns.)
1496
1497 If there are any capturing brackets started but not finished, we have to
1498 save their starting points and reinstate them after the recursion. However,
1499 we don't know how many such there are (offset_top records the completed
1500 total) so we just have to save all the potential data. There may be up to
1501 65535 such values, which is too large to put on the stack, but using malloc
1502 for small numbers seems expensive. As a compromise, the stack is used when
1503 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1504 is used. If the malloc fails, the recursion is not attempted: the error
1505 PCRE_ERROR_NOMEMORY is returned, and, not being MATCH_NOMATCH, it is passed
1506 straight back up through the calling levels.
1507
1508 There are also other values that have to be saved. We use a chained
1509 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1510 for the original version of this logic. */
1511
1512 case OP_RECURSE:
1513 {
1514 callpat = md->start_code + GET(ecode, 1);
1515 new_recursive.group_num = (callpat == md->start_code)? 0 :
1516 GET2(callpat, 1 + LINK_SIZE);
1517
1518 /* Add to "recursing stack" */
1519
1520 new_recursive.prevrec = md->recursive;
1521 md->recursive = &new_recursive;
1522
1523 /* Find where to continue from afterwards */
1524
1525 ecode += 1 + LINK_SIZE;
1526 new_recursive.after_call = ecode;
1527
1528 /* Now save the offset data. */
1529
1530 new_recursive.saved_max = md->offset_end;
1531 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1532 new_recursive.offset_save = stacksave;
1533 else
1534 {
1535 new_recursive.offset_save =
1536 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1537 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1538 }
1539
1540 memcpy(new_recursive.offset_save, md->offset_vector,
1541 new_recursive.saved_max * sizeof(int));
1542 new_recursive.save_offset_top = offset_top;
1543
1544 /* OK, now we can do the recursion. For each top-level alternative we
1545 restore the offset and recursion data. */
1546
1547 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1548 cbegroup = (*callpat >= OP_SBRA);
1549 do
1550 {
1551 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1552 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1553 md, eptrb, RM6);
1554 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1555 {
1556 DPRINTF(("Recursion matched\n"));
1557 md->recursive = new_recursive.prevrec;
1558 if (new_recursive.offset_save != stacksave)
1559 (pcre_free)(new_recursive.offset_save);
1560 MRRETURN(MATCH_MATCH);
1561 }
1562 else if (rrc != MATCH_NOMATCH &&
1563 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1564 {
1565 DPRINTF(("Recursion gave error %d\n", rrc));
1566 if (new_recursive.offset_save != stacksave)
1567 (pcre_free)(new_recursive.offset_save);
1568 RRETURN(rrc);
1569 }
1570
1571 md->recursive = &new_recursive;
1572 memcpy(md->offset_vector, new_recursive.offset_save,
1573 new_recursive.saved_max * sizeof(int));
1574 callpat += GET(callpat, 1);
1575 }
1576 while (*callpat == OP_ALT);
1577
1578 DPRINTF(("Recursion didn't match\n"));
1579 md->recursive = new_recursive.prevrec;
1580 if (new_recursive.offset_save != stacksave)
1581 (pcre_free)(new_recursive.offset_save);
1582 MRRETURN(MATCH_NOMATCH);
1583 }
1584 /* Control never reaches here */
1585
1586 /* "Once" brackets are like assertion brackets except that after a match,
1587 the point in the subject string is not moved back. Thus there can never be
1588 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1589 Check the alternative branches in turn - the matching won't pass the KET
1590 for this kind of subpattern. If any one branch matches, we carry on as at
1591 the end of a normal bracket, leaving the subject pointer, but resetting
1592 the start-of-match value in case it was changed by \K. */
1593
1594 case OP_ONCE:
1595 prev = ecode;
1596 saved_eptr = eptr;
1597
1598 do
1599 {
1600 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1601 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1602 {
1603 mstart = md->start_match_ptr;
1604 break;
1605 }
1606 if (rrc != MATCH_NOMATCH &&
1607 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1608 RRETURN(rrc);
1609 ecode += GET(ecode,1);
1610 }
1611 while (*ecode == OP_ALT);
1612
1613 /* If hit the end of the group (which could be repeated), fail */
1614
1615 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1616
1617 /* Continue as from after the assertion, updating the offsets high water
1618 mark, since extracts may have been taken. */
1619
1620 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1621
1622 offset_top = md->end_offset_top;
1623 eptr = md->end_match_ptr;
1624
1625 /* For a non-repeating ket, just continue at this level. This also
1626 happens for a repeating ket if no characters were matched in the group.
1627 This is the forcible breaking of infinite loops as implemented in Perl
1628 5.005. If there is an options reset, it will get obeyed in the normal
1629 course of events. */
1630
1631 if (*ecode == OP_KET || eptr == saved_eptr)
1632 {
1633 ecode += 1+LINK_SIZE;
1634 break;
1635 }
1636
1637 /* The repeating kets try the rest of the pattern or restart from the
1638 preceding bracket, in the appropriate order. The second "call" of match()
1639 uses tail recursion, to avoid using another stack frame. */
1640
1641 if (*ecode == OP_KETRMIN)
1642 {
1643 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1644 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1645 ecode = prev;
1646 goto TAIL_RECURSE;
1647 }
1648 else /* OP_KETRMAX */
1649 {
1650 md->match_function_type = MATCH_CBEGROUP;
1651 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1653 ecode += 1 + LINK_SIZE;
1654 goto TAIL_RECURSE;
1655 }
1656 /* Control never gets here */
1657
1658 /* An alternation is the end of a branch; scan along to find the end of the
1659 bracketed group and go to there. */
1660
1661 case OP_ALT:
1662 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1663 break;
1664
1665 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1666 indicating that it may occur zero times. It may repeat infinitely, or not
1667 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1668 with fixed upper repeat limits are compiled as a number of copies, with the
1669 optional ones preceded by BRAZERO or BRAMINZERO. */
1670
1671 case OP_BRAZERO:
1672 next = ecode + 1;
1673 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1674 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1675 do next += GET(next, 1); while (*next == OP_ALT);
1676 ecode = next + 1 + LINK_SIZE;
1677 break;
1678
1679 case OP_BRAMINZERO:
1680 next = ecode + 1;
1681 do next += GET(next, 1); while (*next == OP_ALT);
1682 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1684 ecode++;
1685 break;
1686
1687 case OP_SKIPZERO:
1688 next = ecode+1;
1689 do next += GET(next,1); while (*next == OP_ALT);
1690 ecode = next + 1 + LINK_SIZE;
1691 break;
1692
1693 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1694 here; just jump to the group, with allow_zero set TRUE. */
1695
1696 case OP_BRAPOSZERO:
1697 op = *(++ecode);
1698 allow_zero = TRUE;
1699 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1700 goto POSSESSIVE_NON_CAPTURE;
1701
1702 /* End of a group, repeated or non-repeating. */
1703
1704 case OP_KET:
1705 case OP_KETRMIN:
1706 case OP_KETRMAX:
1707 case OP_KETRPOS:
1708 prev = ecode - GET(ecode, 1);
1709
1710 /* If this was a group that remembered the subject start, in order to break
1711 infinite repeats of empty string matches, retrieve the subject start from
1712 the chain. Otherwise, set it NULL. */
1713
1714 if (*prev >= OP_SBRA)
1715 {
1716 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1717 eptrb = eptrb->epb_prev; /* Backup to previous group */
1718 }
1719 else saved_eptr = NULL;
1720
1721 /* If we are at the end of an assertion group or an atomic group, stop
1722 matching and return MATCH_MATCH, but record the current high water mark for
1723 use by positive assertions. We also need to record the match start in case
1724 it was changed by \K. */
1725
1726 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1727 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1728 *prev == OP_ONCE)
1729 {
1730 md->end_match_ptr = eptr; /* For ONCE */
1731 md->end_offset_top = offset_top;
1732 md->start_match_ptr = mstart;
1733 MRRETURN(MATCH_MATCH);
1734 }
1735
1736 /* For capturing groups we have to check the group number back at the start
1737 and if necessary complete handling an extraction by setting the offsets and
1738 bumping the high water mark. Note that whole-pattern recursion is coded as
1739 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1740 when the OP_END is reached. Other recursion is handled here. */
1741
1742 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1743 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1744 {
1745 number = GET2(prev, 1+LINK_SIZE);
1746 offset = number << 1;
1747
1748 #ifdef PCRE_DEBUG
1749 printf("end bracket %d", number);
1750 printf("\n");
1751 #endif
1752
1753 md->capture_last = number;
1754 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1755 {
1756 md->offset_vector[offset] =
1757 md->offset_vector[md->offset_end - number];
1758 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1759 if (offset_top <= offset) offset_top = offset + 2;
1760 }
1761
1762 /* Handle a recursively called group. Restore the offsets
1763 appropriately and continue from after the call. */
1764
1765 if (md->recursive != NULL && md->recursive->group_num == number)
1766 {
1767 recursion_info *rec = md->recursive;
1768 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1769 md->recursive = rec->prevrec;
1770 memcpy(md->offset_vector, rec->offset_save,
1771 rec->saved_max * sizeof(int));
1772 offset_top = rec->save_offset_top;
1773 ecode = rec->after_call;
1774 break;
1775 }
1776 }
1777
1778 /* For a non-repeating ket, just continue at this level. This also
1779 happens for a repeating ket if no characters were matched in the group.
1780 This is the forcible breaking of infinite loops as implemented in Perl
1781 5.005. If there is an options reset, it will get obeyed in the normal
1782 course of events. */
1783
1784 if (*ecode == OP_KET || eptr == saved_eptr)
1785 {
1786 ecode += 1 + LINK_SIZE;
1787 break;
1788 }
1789
1790 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1791 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1792 at a time from the outer level, thus saving stack. */
1793
1794 if (*ecode == OP_KETRPOS)
1795 {
1796 md->end_match_ptr = eptr;
1797 md->end_offset_top = offset_top;
1798 RRETURN(MATCH_KETRPOS);
1799 }
1800
1801 /* The normal repeating kets try the rest of the pattern or restart from
1802 the preceding bracket, in the appropriate order. In the second case, we can
1803 use tail recursion to avoid using another stack frame, unless we have an
1804 unlimited repeat of a group that can match an empty string. */
1805
1806 if (*ecode == OP_KETRMIN)
1807 {
1808 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1810 if (*prev >= OP_SBRA) /* Could match an empty string */
1811 {
1812 md->match_function_type = MATCH_CBEGROUP;
1813 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1814 RRETURN(rrc);
1815 }
1816 ecode = prev;
1817 goto TAIL_RECURSE;
1818 }
1819 else /* OP_KETRMAX */
1820 {
1821 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1822 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1824 ecode += 1 + LINK_SIZE;
1825 goto TAIL_RECURSE;
1826 }
1827 /* Control never gets here */
1828
1829 /* Not multiline mode: start of subject assertion, unless notbol. */
1830
1831 case OP_CIRC:
1832 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1833
1834 /* Start of subject assertion */
1835
1836 case OP_SOD:
1837 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1838 ecode++;
1839 break;
1840
1841 /* Multiline mode: start of subject unless notbol, or after any newline. */
1842
1843 case OP_CIRCM:
1844 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1845 if (eptr != md->start_subject &&
1846 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1847 MRRETURN(MATCH_NOMATCH);
1848 ecode++;
1849 break;
1850
1851 /* Start of match assertion */
1852
1853 case OP_SOM:
1854 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1855 ecode++;
1856 break;
1857
1858 /* Reset the start of match point */
1859
1860 case OP_SET_SOM:
1861 mstart = eptr;
1862 ecode++;
1863 break;
1864
1865 /* Multiline mode: assert before any newline, or before end of subject
1866 unless noteol is set. */
1867
1868 case OP_DOLLM:
1869 if (eptr < md->end_subject)
1870 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1871 else
1872 {
1873 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1874 SCHECK_PARTIAL();
1875 }
1876 ecode++;
1877 break;
1878
1879 /* Not multiline mode: assert before a terminating newline or before end of
1880 subject unless noteol is set. */
1881
1882 case OP_DOLL:
1883 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1884 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1885
1886 /* ... else fall through for endonly */
1887
1888 /* End of subject assertion (\z) */
1889
1890 case OP_EOD:
1891 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1892 SCHECK_PARTIAL();
1893 ecode++;
1894 break;
1895
1896 /* End of subject or ending \n assertion (\Z) */
1897
1898 case OP_EODN:
1899 ASSERT_NL_OR_EOS:
1900 if (eptr < md->end_subject &&
1901 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1902 MRRETURN(MATCH_NOMATCH);
1903
1904 /* Either at end of string or \n before end. */
1905
1906 SCHECK_PARTIAL();
1907 ecode++;
1908 break;
1909
1910 /* Word boundary assertions */
1911
1912 case OP_NOT_WORD_BOUNDARY:
1913 case OP_WORD_BOUNDARY:
1914 {
1915
1916 /* Find out if the previous and current characters are "word" characters.
1917 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1918 be "non-word" characters. Remember the earliest consulted character for
1919 partial matching. */
1920
1921 #ifdef SUPPORT_UTF8
1922 if (utf8)
1923 {
1924 /* Get status of previous character */
1925
1926 if (eptr == md->start_subject) prev_is_word = FALSE; else
1927 {
1928 USPTR lastptr = eptr - 1;
1929 while((*lastptr & 0xc0) == 0x80) lastptr--;
1930 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1931 GETCHAR(c, lastptr);
1932 #ifdef SUPPORT_UCP
1933 if (md->use_ucp)
1934 {
1935 if (c == '_') prev_is_word = TRUE; else
1936 {
1937 int cat = UCD_CATEGORY(c);
1938 prev_is_word = (cat == ucp_L || cat == ucp_N);
1939 }
1940 }
1941 else
1942 #endif
1943 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1944 }
1945
1946 /* Get status of next character */
1947
1948 if (eptr >= md->end_subject)
1949 {
1950 SCHECK_PARTIAL();
1951 cur_is_word = FALSE;
1952 }
1953 else
1954 {
1955 GETCHAR(c, eptr);
1956 #ifdef SUPPORT_UCP
1957 if (md->use_ucp)
1958 {
1959 if (c == '_') cur_is_word = TRUE; else
1960 {
1961 int cat = UCD_CATEGORY(c);
1962 cur_is_word = (cat == ucp_L || cat == ucp_N);
1963 }
1964 }
1965 else
1966 #endif
1967 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1968 }
1969 }
1970 else
1971 #endif
1972
1973 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1974 consistency with the behaviour of \w we do use it in this case. */
1975
1976 {
1977 /* Get status of previous character */
1978
1979 if (eptr == md->start_subject) prev_is_word = FALSE; else
1980 {
1981 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1982 #ifdef SUPPORT_UCP
1983 if (md->use_ucp)
1984 {
1985 c = eptr[-1];
1986 if (c == '_') prev_is_word = TRUE; else
1987 {
1988 int cat = UCD_CATEGORY(c);
1989 prev_is_word = (cat == ucp_L || cat == ucp_N);
1990 }
1991 }
1992 else
1993 #endif
1994 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1995 }
1996
1997 /* Get status of next character */
1998
1999 if (eptr >= md->end_subject)
2000 {
2001 SCHECK_PARTIAL();
2002 cur_is_word = FALSE;
2003 }
2004 else
2005 #ifdef SUPPORT_UCP
2006 if (md->use_ucp)
2007 {
2008 c = *eptr;
2009 if (c == '_') cur_is_word = TRUE; else
2010 {
2011 int cat = UCD_CATEGORY(c);
2012 cur_is_word = (cat == ucp_L || cat == ucp_N);
2013 }
2014 }
2015 else
2016 #endif
2017 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2018 }
2019
2020 /* Now see if the situation is what we want */
2021
2022 if ((*ecode++ == OP_WORD_BOUNDARY)?
2023 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2024 MRRETURN(MATCH_NOMATCH);
2025 }
2026 break;
2027
2028 /* Match a single character type; inline for speed */
2029
2030 case OP_ANY:
2031 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2032 /* Fall through */
2033
2034 case OP_ALLANY:
2035 if (eptr++ >= md->end_subject)
2036 {
2037 SCHECK_PARTIAL();
2038 MRRETURN(MATCH_NOMATCH);
2039 }
2040 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2041 ecode++;
2042 break;
2043
2044 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2045 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2046
2047 case OP_ANYBYTE:
2048 if (eptr++ >= md->end_subject)
2049 {
2050 SCHECK_PARTIAL();
2051 MRRETURN(MATCH_NOMATCH);
2052 }
2053 ecode++;
2054 break;
2055
2056 case OP_NOT_DIGIT:
2057 if (eptr >= md->end_subject)
2058 {
2059 SCHECK_PARTIAL();
2060 MRRETURN(MATCH_NOMATCH);
2061 }
2062 GETCHARINCTEST(c, eptr);
2063 if (
2064 #ifdef SUPPORT_UTF8
2065 c < 256 &&
2066 #endif
2067 (md->ctypes[c] & ctype_digit) != 0
2068 )
2069 MRRETURN(MATCH_NOMATCH);
2070 ecode++;
2071 break;
2072
2073 case OP_DIGIT:
2074 if (eptr >= md->end_subject)
2075 {
2076 SCHECK_PARTIAL();
2077 MRRETURN(MATCH_NOMATCH);
2078 }
2079 GETCHARINCTEST(c, eptr);
2080 if (
2081 #ifdef SUPPORT_UTF8
2082 c >= 256 ||
2083 #endif
2084 (md->ctypes[c] & ctype_digit) == 0
2085 )
2086 MRRETURN(MATCH_NOMATCH);
2087 ecode++;
2088 break;
2089
2090 case OP_NOT_WHITESPACE:
2091 if (eptr >= md->end_subject)
2092 {
2093 SCHECK_PARTIAL();
2094 MRRETURN(MATCH_NOMATCH);
2095 }
2096 GETCHARINCTEST(c, eptr);
2097 if (
2098 #ifdef SUPPORT_UTF8
2099 c < 256 &&
2100 #endif
2101 (md->ctypes[c] & ctype_space) != 0
2102 )
2103 MRRETURN(MATCH_NOMATCH);
2104 ecode++;
2105 break;
2106
2107 case OP_WHITESPACE:
2108 if (eptr >= md->end_subject)
2109 {
2110 SCHECK_PARTIAL();
2111 MRRETURN(MATCH_NOMATCH);
2112 }
2113 GETCHARINCTEST(c, eptr);
2114 if (
2115 #ifdef SUPPORT_UTF8
2116 c >= 256 ||
2117 #endif
2118 (md->ctypes[c] & ctype_space) == 0
2119 )
2120 MRRETURN(MATCH_NOMATCH);
2121 ecode++;
2122 break;
2123
2124 case OP_NOT_WORDCHAR:
2125 if (eptr >= md->end_subject)
2126 {
2127 SCHECK_PARTIAL();
2128 MRRETURN(MATCH_NOMATCH);
2129 }
2130 GETCHARINCTEST(c, eptr);
2131 if (
2132 #ifdef SUPPORT_UTF8
2133 c < 256 &&
2134 #endif
2135 (md->ctypes[c] & ctype_word) != 0
2136 )
2137 MRRETURN(MATCH_NOMATCH);
2138 ecode++;
2139 break;
2140
2141 case OP_WORDCHAR:
2142 if (eptr >= md->end_subject)
2143 {
2144 SCHECK_PARTIAL();
2145 MRRETURN(MATCH_NOMATCH);
2146 }
2147 GETCHARINCTEST(c, eptr);
2148 if (
2149 #ifdef SUPPORT_UTF8
2150 c >= 256 ||
2151 #endif
2152 (md->ctypes[c] & ctype_word) == 0
2153 )
2154 MRRETURN(MATCH_NOMATCH);
2155 ecode++;
2156 break;
2157
2158 case OP_ANYNL:
2159 if (eptr >= md->end_subject)
2160 {
2161 SCHECK_PARTIAL();
2162 MRRETURN(MATCH_NOMATCH);
2163 }
2164 GETCHARINCTEST(c, eptr);
2165 switch(c)
2166 {
2167 default: MRRETURN(MATCH_NOMATCH);
2168
2169 case 0x000d:
2170 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2171 break;
2172
2173 case 0x000a:
2174 break;
2175
2176 case 0x000b:
2177 case 0x000c:
2178 case 0x0085:
2179 case 0x2028:
2180 case 0x2029:
2181 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2182 break;
2183 }
2184 ecode++;
2185 break;
2186
2187 case OP_NOT_HSPACE:
2188 if (eptr >= md->end_subject)
2189 {
2190 SCHECK_PARTIAL();
2191 MRRETURN(MATCH_NOMATCH);
2192 }
2193 GETCHARINCTEST(c, eptr);
2194 switch(c)
2195 {
2196 default: break;
2197 case 0x09: /* HT */
2198 case 0x20: /* SPACE */
2199 case 0xa0: /* NBSP */
2200 case 0x1680: /* OGHAM SPACE MARK */
2201 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2202 case 0x2000: /* EN QUAD */
2203 case 0x2001: /* EM QUAD */
2204 case 0x2002: /* EN SPACE */
2205 case 0x2003: /* EM SPACE */
2206 case 0x2004: /* THREE-PER-EM SPACE */
2207 case 0x2005: /* FOUR-PER-EM SPACE */
2208 case 0x2006: /* SIX-PER-EM SPACE */
2209 case 0x2007: /* FIGURE SPACE */
2210 case 0x2008: /* PUNCTUATION SPACE */
2211 case 0x2009: /* THIN SPACE */
2212 case 0x200A: /* HAIR SPACE */
2213 case 0x202f: /* NARROW NO-BREAK SPACE */
2214 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2215 case 0x3000: /* IDEOGRAPHIC SPACE */
2216 MRRETURN(MATCH_NOMATCH);
2217 }
2218 ecode++;
2219 break;
2220
2221 case OP_HSPACE:
2222 if (eptr >= md->end_subject)
2223 {
2224 SCHECK_PARTIAL();
2225 MRRETURN(MATCH_NOMATCH);
2226 }
2227 GETCHARINCTEST(c, eptr);
2228 switch(c)
2229 {
2230 default: MRRETURN(MATCH_NOMATCH);
2231 case 0x09: /* HT */
2232 case 0x20: /* SPACE */
2233 case 0xa0: /* NBSP */
2234 case 0x1680: /* OGHAM SPACE MARK */
2235 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2236 case 0x2000: /* EN QUAD */
2237 case 0x2001: /* EM QUAD */
2238 case 0x2002: /* EN SPACE */
2239 case 0x2003: /* EM SPACE */
2240 case 0x2004: /* THREE-PER-EM SPACE */
2241 case 0x2005: /* FOUR-PER-EM SPACE */
2242 case 0x2006: /* SIX-PER-EM SPACE */
2243 case 0x2007: /* FIGURE SPACE */
2244 case 0x2008: /* PUNCTUATION SPACE */
2245 case 0x2009: /* THIN SPACE */
2246 case 0x200A: /* HAIR SPACE */
2247 case 0x202f: /* NARROW NO-BREAK SPACE */
2248 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2249 case 0x3000: /* IDEOGRAPHIC SPACE */
2250 break;
2251 }
2252 ecode++;
2253 break;
2254
2255 case OP_NOT_VSPACE:
2256 if (eptr >= md->end_subject)
2257 {
2258 SCHECK_PARTIAL();
2259 MRRETURN(MATCH_NOMATCH);
2260 }
2261 GETCHARINCTEST(c, eptr);
2262 switch(c)
2263 {
2264 default: break;
2265 case 0x0a: /* LF */
2266 case 0x0b: /* VT */
2267 case 0x0c: /* FF */
2268 case 0x0d: /* CR */
2269 case 0x85: /* NEL */
2270 case 0x2028: /* LINE SEPARATOR */
2271 case 0x2029: /* PARAGRAPH SEPARATOR */
2272 MRRETURN(MATCH_NOMATCH);
2273 }
2274 ecode++;
2275 break;
2276
2277 case OP_VSPACE:
2278 if (eptr >= md->end_subject)
2279 {
2280 SCHECK_PARTIAL();
2281 MRRETURN(MATCH_NOMATCH);
2282 }
2283 GETCHARINCTEST(c, eptr);
2284 switch(c)
2285 {
2286 default: MRRETURN(MATCH_NOMATCH);
2287 case 0x0a: /* LF */
2288 case 0x0b: /* VT */
2289 case 0x0c: /* FF */
2290 case 0x0d: /* CR */
2291 case 0x85: /* NEL */
2292 case 0x2028: /* LINE SEPARATOR */
2293 case 0x2029: /* PARAGRAPH SEPARATOR */
2294 break;
2295 }
2296 ecode++;
2297 break;
2298
2299 #ifdef SUPPORT_UCP
2300 /* Check the next character by Unicode property. We will get here only
2301 if the support is in the binary; otherwise a compile-time error occurs. */
2302
2303 case OP_PROP:
2304 case OP_NOTPROP:
2305 if (eptr >= md->end_subject)
2306 {
2307 SCHECK_PARTIAL();
2308 MRRETURN(MATCH_NOMATCH);
2309 }
2310 GETCHARINCTEST(c, eptr);
2311 {
2312 const ucd_record *prop = GET_UCD(c);
2313
2314 switch(ecode[1])
2315 {
2316 case PT_ANY:
2317 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2318 break;
2319
2320 case PT_LAMP:
2321 if ((prop->chartype == ucp_Lu ||
2322 prop->chartype == ucp_Ll ||
2323 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2324 MRRETURN(MATCH_NOMATCH);
2325 break;
2326
2327 case PT_GC:
2328 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2329 MRRETURN(MATCH_NOMATCH);
2330 break;
2331
2332 case PT_PC:
2333 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2334 MRRETURN(MATCH_NOMATCH);
2335 break;
2336
2337 case PT_SC:
2338 if ((ecode[2] != prop->script) == (op == OP_PROP))
2339 MRRETURN(MATCH_NOMATCH);
2340 break;
2341
2342 /* These are specials */
2343
2344 case PT_ALNUM:
2345 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2346 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2347 MRRETURN(MATCH_NOMATCH);
2348 break;
2349
2350 case PT_SPACE: /* Perl space */
2351 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2352 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2353 == (op == OP_NOTPROP))
2354 MRRETURN(MATCH_NOMATCH);
2355 break;
2356
2357 case PT_PXSPACE: /* POSIX space */
2358 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2359 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2360 c == CHAR_FF || c == CHAR_CR)
2361 == (op == OP_NOTPROP))
2362 MRRETURN(MATCH_NOMATCH);
2363 break;
2364
2365 case PT_WORD:
2366 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2367 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2368 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2369 MRRETURN(MATCH_NOMATCH);
2370 break;
2371
2372 /* This should never occur */
2373
2374 default:
2375 RRETURN(PCRE_ERROR_INTERNAL);
2376 }
2377
2378 ecode += 3;
2379 }
2380 break;
2381
2382 /* Match an extended Unicode sequence. We will get here only if the support
2383 is in the binary; otherwise a compile-time error occurs. */
2384
2385 case OP_EXTUNI:
2386 if (eptr >= md->end_subject)
2387 {
2388 SCHECK_PARTIAL();
2389 MRRETURN(MATCH_NOMATCH);
2390 }
2391 GETCHARINCTEST(c, eptr);
2392 {
2393 int category = UCD_CATEGORY(c);
2394 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2395 while (eptr < md->end_subject)
2396 {
2397 int len = 1;
2398 if (!utf8) c = *eptr; else
2399 {
2400 GETCHARLEN(c, eptr, len);
2401 }
2402 category = UCD_CATEGORY(c);
2403 if (category != ucp_M) break;
2404 eptr += len;
2405 }
2406 }
2407 ecode++;
2408 break;
2409 #endif
2410
2411
2412 /* Match a back reference, possibly repeatedly. Look past the end of the
2413 item to see if there is repeat information following. The code is similar
2414 to that for character classes, but repeated for efficiency. Then obey
2415 similar code to character type repeats - written out again for speed.
2416 However, if the referenced string is the empty string, always treat
2417 it as matched, any number of times (otherwise there could be infinite
2418 loops). */
2419
2420 case OP_REF:
2421 case OP_REFI:
2422 caseless = op == OP_REFI;
2423 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2424 ecode += 3;
2425
2426 /* If the reference is unset, there are two possibilities:
2427
2428 (a) In the default, Perl-compatible state, set the length negative;
2429 this ensures that every attempt at a match fails. We can't just fail
2430 here, because of the possibility of quantifiers with zero minima.
2431
2432 (b) If the JavaScript compatibility flag is set, set the length to zero
2433 so that the back reference matches an empty string.
2434
2435 Otherwise, set the length to the length of what was matched by the
2436 referenced subpattern. */
2437
2438 if (offset >= offset_top || md->offset_vector[offset] < 0)
2439 length = (md->jscript_compat)? 0 : -1;
2440 else
2441 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2442
2443 /* Set up for repetition, or handle the non-repeated case */
2444
2445 switch (*ecode)
2446 {
2447 case OP_CRSTAR:
2448 case OP_CRMINSTAR:
2449 case OP_CRPLUS:
2450 case OP_CRMINPLUS:
2451 case OP_CRQUERY:
2452 case OP_CRMINQUERY:
2453 c = *ecode++ - OP_CRSTAR;
2454 minimize = (c & 1) != 0;
2455 min = rep_min[c]; /* Pick up values from tables; */
2456 max = rep_max[c]; /* zero for max => infinity */
2457 if (max == 0) max = INT_MAX;
2458 break;
2459
2460 case OP_CRRANGE:
2461 case OP_CRMINRANGE:
2462 minimize = (*ecode == OP_CRMINRANGE);
2463 min = GET2(ecode, 1);
2464 max = GET2(ecode, 3);
2465 if (max == 0) max = INT_MAX;
2466 ecode += 5;
2467 break;
2468
2469 default: /* No repeat follows */
2470 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2471 {
2472 CHECK_PARTIAL();
2473 MRRETURN(MATCH_NOMATCH);
2474 }
2475 eptr += length;
2476 continue; /* With the main loop */
2477 }
2478
2479 /* Handle repeated back references. If the length of the reference is
2480 zero, just continue with the main loop. */
2481
2482 if (length == 0) continue;
2483
2484 /* First, ensure the minimum number of matches are present. We get back
2485 the length of the reference string explicitly rather than passing the
2486 address of eptr, so that eptr can be a register variable. */
2487
2488 for (i = 1; i <= min; i++)
2489 {
2490 int slength;
2491 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2492 {
2493 CHECK_PARTIAL();
2494 MRRETURN(MATCH_NOMATCH);
2495 }
2496 eptr += slength;
2497 }
2498
2499 /* If min = max, continue at the same level without recursion.
2500 They are not both allowed to be zero. */
2501
2502 if (min == max) continue;
2503
2504 /* If minimizing, keep trying and advancing the pointer */
2505
2506 if (minimize)
2507 {
2508 for (fi = min;; fi++)
2509 {
2510 int slength;
2511 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2512 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2513 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2514 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2515 {
2516 CHECK_PARTIAL();
2517 MRRETURN(MATCH_NOMATCH);
2518 }
2519 eptr += slength;
2520 }
2521 /* Control never gets here */
2522 }
2523
2524 /* If maximizing, find the longest string and work backwards */
2525
2526 else
2527 {
2528 pp = eptr;
2529 for (i = min; i < max; i++)
2530 {
2531 int slength;
2532 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2533 {
2534 CHECK_PARTIAL();
2535 break;
2536 }
2537 eptr += slength;
2538 }
2539 while (eptr >= pp)
2540 {
2541 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2542 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2543 eptr -= length;
2544 }
2545 MRRETURN(MATCH_NOMATCH);
2546 }
2547 /* Control never gets here */
2548
2549 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2550 used when all the characters in the class have values in the range 0-255,
2551 and either the matching is caseful, or the characters are in the range
2552 0-127 when UTF-8 processing is enabled. The only difference between
2553 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2554 encountered.
2555
2556 First, look past the end of the item to see if there is repeat information
2557 following. Then obey similar code to character type repeats - written out
2558 again for speed. */
2559
2560 case OP_NCLASS:
2561 case OP_CLASS:
2562 {
2563 data = ecode + 1; /* Save for matching */
2564 ecode += 33; /* Advance past the item */
2565
2566 switch (*ecode)
2567 {
2568 case OP_CRSTAR:
2569 case OP_CRMINSTAR:
2570 case OP_CRPLUS:
2571 case OP_CRMINPLUS:
2572 case OP_CRQUERY:
2573 case OP_CRMINQUERY:
2574 c = *ecode++ - OP_CRSTAR;
2575 minimize = (c & 1) != 0;
2576 min = rep_min[c]; /* Pick up values from tables; */
2577 max = rep_max[c]; /* zero for max => infinity */
2578 if (max == 0) max = INT_MAX;
2579 break;
2580
2581 case OP_CRRANGE:
2582 case OP_CRMINRANGE:
2583 minimize = (*ecode == OP_CRMINRANGE);
2584 min = GET2(ecode, 1);
2585 max = GET2(ecode, 3);
2586 if (max == 0) max = INT_MAX;
2587 ecode += 5;
2588 break;
2589
2590 default: /* No repeat follows */
2591 min = max = 1;
2592 break;
2593 }
2594
2595 /* First, ensure the minimum number of matches are present. */
2596
2597 #ifdef SUPPORT_UTF8
2598 /* UTF-8 mode */
2599 if (utf8)
2600 {
2601 for (i = 1; i <= min; i++)
2602 {
2603 if (eptr >= md->end_subject)
2604 {
2605 SCHECK_PARTIAL();
2606 MRRETURN(MATCH_NOMATCH);
2607 }
2608 GETCHARINC(c, eptr);
2609 if (c > 255)
2610 {
2611 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2612 }
2613 else
2614 {
2615 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2616 }
2617 }
2618 }
2619 else
2620 #endif
2621 /* Not UTF-8 mode */
2622 {
2623 for (i = 1; i <= min; i++)
2624 {
2625 if (eptr >= md->end_subject)
2626 {
2627 SCHECK_PARTIAL();
2628 MRRETURN(MATCH_NOMATCH);
2629 }
2630 c = *eptr++;
2631 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2632 }
2633 }
2634
2635 /* If max == min we can continue with the main loop without the
2636 need to recurse. */
2637
2638 if (min == max) continue;
2639
2640 /* If minimizing, keep testing the rest of the expression and advancing
2641 the pointer while it matches the class. */
2642
2643 if (minimize)
2644 {
2645 #ifdef SUPPORT_UTF8
2646 /* UTF-8 mode */
2647 if (utf8)
2648 {
2649 for (fi = min;; fi++)
2650 {
2651 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2654 if (eptr >= md->end_subject)
2655 {
2656 SCHECK_PARTIAL();
2657 MRRETURN(MATCH_NOMATCH);
2658 }
2659 GETCHARINC(c, eptr);
2660 if (c > 255)
2661 {
2662 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2663 }
2664 else
2665 {
2666 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2667 }
2668 }
2669 }
2670 else
2671 #endif
2672 /* Not UTF-8 mode */
2673 {
2674 for (fi = min;; fi++)
2675 {
2676 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2678 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2679 if (eptr >= md->end_subject)
2680 {
2681 SCHECK_PARTIAL();
2682 MRRETURN(MATCH_NOMATCH);
2683 }
2684 c = *eptr++;
2685 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2686 }
2687 }
2688 /* Control never gets here */
2689 }
2690
2691 /* If maximizing, find the longest possible run, then work backwards. */
2692
2693 else
2694 {
2695 pp = eptr;
2696
2697 #ifdef SUPPORT_UTF8
2698 /* UTF-8 mode */
2699 if (utf8)
2700 {
2701 for (i = min; i < max; i++)
2702 {
2703 int len = 1;
2704 if (eptr >= md->end_subject)
2705 {
2706 SCHECK_PARTIAL();
2707 break;
2708 }
2709 GETCHARLEN(c, eptr, len);
2710 if (c > 255)
2711 {
2712 if (op == OP_CLASS) break;
2713 }
2714 else
2715 {
2716 if ((data[c/8] & (1 << (c&7))) == 0) break;
2717 }
2718 eptr += len;
2719 }
2720 for (;;)
2721 {
2722 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2724 if (eptr-- == pp) break; /* Stop if tried at original pos */
2725 BACKCHAR(eptr);
2726 }
2727 }
2728 else
2729 #endif
2730 /* Not UTF-8 mode */
2731 {
2732 for (i = min; i < max; i++)
2733 {
2734 if (eptr >= md->end_subject)
2735 {
2736 SCHECK_PARTIAL();
2737 break;
2738 }
2739 c = *eptr;
2740 if ((data[c/8] & (1 << (c&7))) == 0) break;
2741 eptr++;
2742 }
2743 while (eptr >= pp)
2744 {
2745 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2746 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2747 eptr--;
2748 }
2749 }
2750
2751 MRRETURN(MATCH_NOMATCH);
2752 }
2753 }
2754 /* Control never gets here */
2755
2756
2757 /* Match an extended character class. This opcode is encountered only
2758 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2759 mode, because Unicode properties are supported in non-UTF-8 mode. */
2760
2761 #ifdef SUPPORT_UTF8
2762 case OP_XCLASS:
2763 {
2764 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2765 ecode += GET(ecode, 1); /* Advance past the item */
2766
2767 switch (*ecode)
2768 {
2769 case OP_CRSTAR:
2770 case OP_CRMINSTAR:
2771 case OP_CRPLUS:
2772 case OP_CRMINPLUS:
2773 case OP_CRQUERY:
2774 case OP_CRMINQUERY:
2775 c = *ecode++ - OP_CRSTAR;
2776 minimize = (c & 1) != 0;
2777 min = rep_min[c]; /* Pick up values from tables; */
2778 max = rep_max[c]; /* zero for max => infinity */
2779 if (max == 0) max = INT_MAX;
2780 break;
2781
2782 case OP_CRRANGE:
2783 case OP_CRMINRANGE:
2784 minimize = (*ecode == OP_CRMINRANGE);
2785 min = GET2(ecode, 1);
2786 max = GET2(ecode, 3);
2787 if (max == 0) max = INT_MAX;
2788 ecode += 5;
2789 break;
2790
2791 default: /* No repeat follows */
2792 min = max = 1;
2793 break;
2794 }
2795
2796 /* First, ensure the minimum number of matches are present. */
2797
2798 for (i = 1; i <= min; i++)
2799 {
2800 if (eptr >= md->end_subject)
2801 {
2802 SCHECK_PARTIAL();
2803 MRRETURN(MATCH_NOMATCH);
2804 }
2805 GETCHARINCTEST(c, eptr);
2806 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2807 }
2808
2809 /* If max == min we can continue with the main loop without the
2810 need to recurse. */
2811
2812 if (min == max) continue;
2813
2814 /* If minimizing, keep testing the rest of the expression and advancing
2815 the pointer while it matches the class. */
2816
2817 if (minimize)
2818 {
2819 for (fi = min;; fi++)
2820 {
2821 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2823 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2824 if (eptr >= md->end_subject)
2825 {
2826 SCHECK_PARTIAL();
2827 MRRETURN(MATCH_NOMATCH);
2828 }
2829 GETCHARINCTEST(c, eptr);
2830 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2831 }
2832 /* Control never gets here */
2833 }
2834
2835 /* If maximizing, find the longest possible run, then work backwards. */
2836
2837 else
2838 {
2839 pp = eptr;
2840 for (i = min; i < max; i++)
2841 {
2842 int len = 1;
2843 if (eptr >= md->end_subject)
2844 {
2845 SCHECK_PARTIAL();
2846 break;
2847 }
2848 GETCHARLENTEST(c, eptr, len);
2849 if (!_pcre_xclass(c, data)) break;
2850 eptr += len;
2851 }
2852 for(;;)
2853 {
2854 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2855 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2856 if (eptr-- == pp) break; /* Stop if tried at original pos */
2857 if (utf8) BACKCHAR(eptr);
2858 }
2859 MRRETURN(MATCH_NOMATCH);
2860 }
2861
2862 /* Control never gets here */
2863 }
2864 #endif /* End of XCLASS */
2865
2866 /* Match a single character, casefully */
2867
2868 case OP_CHAR:
2869 #ifdef SUPPORT_UTF8
2870 if (utf8)
2871 {
2872 length = 1;
2873 ecode++;
2874 GETCHARLEN(fc, ecode, length);
2875 if (length > md->end_subject - eptr)
2876 {
2877 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2878 MRRETURN(MATCH_NOMATCH);
2879 }
2880 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2881 }
2882 else
2883 #endif
2884
2885 /* Non-UTF-8 mode */
2886 {
2887 if (md->end_subject - eptr < 1)
2888 {
2889 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2890 MRRETURN(MATCH_NOMATCH);
2891 }
2892 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2893 ecode += 2;
2894 }
2895 break;
2896
2897 /* Match a single character, caselessly */
2898
2899 case OP_CHARI:
2900 #ifdef SUPPORT_UTF8
2901 if (utf8)
2902 {
2903 length = 1;
2904 ecode++;
2905 GETCHARLEN(fc, ecode, length);
2906
2907 if (length > md->end_subject - eptr)
2908 {
2909 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2910 MRRETURN(MATCH_NOMATCH);
2911 }
2912
2913 /* If the pattern character's value is < 128, we have only one byte, and
2914 can use the fast lookup table. */
2915
2916 if (fc < 128)
2917 {
2918 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2919 }
2920
2921 /* Otherwise we must pick up the subject character */
2922
2923 else
2924 {
2925 unsigned int dc;
2926 GETCHARINC(dc, eptr);
2927 ecode += length;
2928
2929 /* If we have Unicode property support, we can use it to test the other
2930 case of the character, if there is one. */
2931
2932 if (fc != dc)
2933 {
2934 #ifdef SUPPORT_UCP
2935 if (dc != UCD_OTHERCASE(fc))
2936 #endif
2937 MRRETURN(MATCH_NOMATCH);
2938 }
2939 }
2940 }
2941 else
2942 #endif /* SUPPORT_UTF8 */
2943
2944 /* Non-UTF-8 mode */
2945 {
2946 if (md->end_subject - eptr < 1)
2947 {
2948 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2949 MRRETURN(MATCH_NOMATCH);
2950 }
2951 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2952 ecode += 2;
2953 }
2954 break;
2955
2956 /* Match a single character repeatedly. */
2957
2958 case OP_EXACT:
2959 case OP_EXACTI:
2960 min = max = GET2(ecode, 1);
2961 ecode += 3;
2962 goto REPEATCHAR;
2963
2964 case OP_POSUPTO:
2965 case OP_POSUPTOI:
2966 possessive = TRUE;
2967 /* Fall through */
2968
2969 case OP_UPTO:
2970 case OP_UPTOI:
2971 case OP_MINUPTO:
2972 case OP_MINUPTOI:
2973 min = 0;
2974 max = GET2(ecode, 1);
2975 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2976 ecode += 3;
2977 goto REPEATCHAR;
2978
2979 case OP_POSSTAR:
2980 case OP_POSSTARI:
2981 possessive = TRUE;
2982 min = 0;
2983 max = INT_MAX;
2984 ecode++;
2985 goto REPEATCHAR;
2986
2987 case OP_POSPLUS:
2988 case OP_POSPLUSI:
2989 possessive = TRUE;
2990 min = 1;
2991 max = INT_MAX;
2992 ecode++;
2993 goto REPEATCHAR;
2994
2995 case OP_POSQUERY:
2996 case OP_POSQUERYI:
2997 possessive = TRUE;
2998 min = 0;
2999 max = 1;
3000 ecode++;
3001 goto REPEATCHAR;
3002
3003 case OP_STAR:
3004 case OP_STARI:
3005 case OP_MINSTAR:
3006 case OP_MINSTARI:
3007 case OP_PLUS:
3008 case OP_PLUSI:
3009 case OP_MINPLUS:
3010 case OP_MINPLUSI:
3011 case OP_QUERY:
3012 case OP_QUERYI:
3013 case OP_MINQUERY:
3014 case OP_MINQUERYI:
3015 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3016 minimize = (c & 1) != 0;
3017 min = rep_min[c]; /* Pick up values from tables; */
3018 max = rep_max[c]; /* zero for max => infinity */
3019 if (max == 0) max = INT_MAX;
3020
3021 /* Common code for all repeated single-character matches. */
3022
3023 REPEATCHAR:
3024 #ifdef SUPPORT_UTF8
3025 if (utf8)
3026 {
3027 length = 1;
3028 charptr = ecode;
3029 GETCHARLEN(fc, ecode, length);
3030 ecode += length;
3031
3032 /* Handle multibyte character matching specially here. There is
3033 support for caseless matching if UCP support is present. */
3034
3035 if (length > 1)
3036 {
3037 #ifdef SUPPORT_UCP
3038 unsigned int othercase;
3039 if (op >= OP_STARI && /* Caseless */
3040 (othercase = UCD_OTHERCASE(fc)) != fc)
3041 oclength = _pcre_ord2utf8(othercase, occhars);
3042 else oclength = 0;
3043 #endif /* SUPPORT_UCP */
3044
3045 for (i = 1; i <= min; i++)
3046 {
3047 if (eptr <= md->end_subject - length &&
3048 memcmp(eptr, charptr, length) == 0) eptr += length;
3049 #ifdef SUPPORT_UCP
3050 else if (oclength > 0 &&
3051 eptr <= md->end_subject - oclength &&
3052 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3053 #endif /* SUPPORT_UCP */
3054 else
3055 {
3056 CHECK_PARTIAL();
3057 MRRETURN(MATCH_NOMATCH);
3058 }
3059 }
3060
3061 if (min == max) continue;
3062
3063 if (minimize)
3064 {
3065 for (fi = min;; fi++)
3066 {
3067 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3068 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3069 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3070 if (eptr <= md->end_subject - length &&
3071 memcmp(eptr, charptr, length) == 0) eptr += length;
3072 #ifdef SUPPORT_UCP
3073 else if (oclength > 0 &&
3074 eptr <= md->end_subject - oclength &&
3075 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3076 #endif /* SUPPORT_UCP */
3077 else
3078 {
3079 CHECK_PARTIAL();
3080 MRRETURN(MATCH_NOMATCH);
3081 }
3082 }
3083 /* Control never gets here */
3084 }
3085
3086 else /* Maximize */
3087 {
3088 pp = eptr;
3089 for (i = min; i < max; i++)
3090 {
3091 if (eptr <= md->end_subject - length &&
3092 memcmp(eptr, charptr, length) == 0) eptr += length;
3093 #ifdef SUPPORT_UCP
3094 else if (oclength > 0 &&
3095 eptr <= md->end_subject - oclength &&
3096 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3097 #endif /* SUPPORT_UCP */
3098 else
3099 {
3100 CHECK_PARTIAL();
3101 break;
3102 }
3103 }
3104
3105 if (possessive) continue;
3106
3107 for(;;)
3108 {
3109 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3111 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3112 #ifdef SUPPORT_UCP
3113 eptr--;
3114 BACKCHAR(eptr);
3115 #else /* without SUPPORT_UCP */
3116 eptr -= length;
3117 #endif /* SUPPORT_UCP */
3118 }
3119 }
3120 /* Control never gets here */
3121 }
3122
3123 /* If the length of a UTF-8 character is 1, we fall through here, and
3124 obey the code as for non-UTF-8 characters below, though in this case the
3125 value of fc will always be < 128. */
3126 }
3127 else
3128 #endif /* SUPPORT_UTF8 */
3129
3130 /* When not in UTF-8 mode, load a single-byte character. */
3131
3132 fc = *ecode++;
3133
3134 /* The value of fc at this point is always less than 256, though we may or
3135 may not be in UTF-8 mode. The code is duplicated for the caseless and
3136 caseful cases, for speed, since matching characters is likely to be quite
3137 common. First, ensure the minimum number of matches are present. If min =
3138 max, continue at the same level without recursing. Otherwise, if
3139 minimizing, keep trying the rest of the expression and advancing one
3140 matching character if failing, up to the maximum. Alternatively, if
3141 maximizing, find the maximum number of characters and work backwards. */
3142
3143 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3144 max, eptr));
3145
3146 if (op >= OP_STARI) /* Caseless */
3147 {
3148 fc = md->lcc[fc];
3149 for (i = 1; i <= min; i++)
3150 {
3151 if (eptr >= md->end_subject)
3152 {
3153 SCHECK_PARTIAL();
3154 MRRETURN(MATCH_NOMATCH);
3155 }
3156 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3157 }
3158 if (min == max) continue;
3159 if (minimize)
3160 {
3161 for (fi = min;; fi++)
3162 {
3163 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3164 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3165 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3166 if (eptr >= md->end_subject)
3167 {
3168 SCHECK_PARTIAL();
3169 MRRETURN(MATCH_NOMATCH);
3170 }
3171 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3172 }
3173 /* Control never gets here */
3174 }
3175 else /* Maximize */
3176 {
3177 pp = eptr;
3178 for (i = min; i < max; i++)
3179 {
3180 if (eptr >= md->end_subject)
3181 {
3182 SCHECK_PARTIAL();
3183 break;
3184 }
3185 if (fc != md->lcc[*eptr]) break;
3186 eptr++;
3187 }
3188
3189 if (possessive) continue;
3190
3191 while (eptr >= pp)
3192 {
3193 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3194 eptr--;
3195 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3196 }
3197 MRRETURN(MATCH_NOMATCH);
3198 }
3199 /* Control never gets here */
3200 }
3201
3202 /* Caseful comparisons (includes all multi-byte characters) */
3203
3204 else
3205 {
3206 for (i = 1; i <= min; i++)
3207 {
3208 if (eptr >= md->end_subject)
3209 {
3210 SCHECK_PARTIAL();
3211 MRRETURN(MATCH_NOMATCH);
3212 }
3213 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3214 }
3215
3216 if (min == max) continue;
3217
3218 if (minimize)
3219 {
3220 for (fi = min;; fi++)
3221 {
3222 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3224 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3225 if (eptr >= md->end_subject)
3226 {
3227 SCHECK_PARTIAL();
3228 MRRETURN(MATCH_NOMATCH);
3229 }
3230 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3231 }
3232 /* Control never gets here */
3233 }
3234 else /* Maximize */
3235 {
3236 pp = eptr;
3237 for (i = min; i < max; i++)
3238 {
3239 if (eptr >= md->end_subject)
3240 {
3241 SCHECK_PARTIAL();
3242 break;
3243 }
3244 if (fc != *eptr) break;
3245 eptr++;
3246 }
3247 if (possessive) continue;
3248
3249 while (eptr >= pp)
3250 {
3251 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3252 eptr--;
3253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3254 }
3255 MRRETURN(MATCH_NOMATCH);
3256 }
3257 }
3258 /* Control never gets here */
3259
3260 /* Match a negated single one-byte character. The pattern character is one
3261 byte, but the subject character we are checking against it can be multibyte. */
3262
3263 case OP_NOT:
3264 case OP_NOTI:
3265 if (eptr >= md->end_subject)
3266 {
3267 SCHECK_PARTIAL();
3268 MRRETURN(MATCH_NOMATCH);
3269 }
3270 ecode++;
3271 GETCHARINCTEST(c, eptr);
3272 if (op == OP_NOTI) /* The caseless case */
3273 {
3274 #ifdef SUPPORT_UTF8
3275 if (c < 256)
3276 #endif
3277 c = md->lcc[c];
3278 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3279 }
3280 else /* Caseful */
3281 {
3282 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3283 }
3284 break;
3285
3286 /* Match a negated single one-byte character repeatedly. This is almost a
3287 repeat of the code for a repeated single character, but I haven't found a
3288 nice way of commoning these up that doesn't require a test of the
3289 positive/negative option for each character match. Maybe that wouldn't add
3290 very much to the time taken, but character matching *is* what this is all
3291 about... */
3292
3293 case OP_NOTEXACT:
3294 case OP_NOTEXACTI:
3295 min = max = GET2(ecode, 1);
3296 ecode += 3;
3297 goto REPEATNOTCHAR;
3298
3299 case OP_NOTUPTO:
3300 case OP_NOTUPTOI:
3301 case OP_NOTMINUPTO:
3302 case OP_NOTMINUPTOI:
3303 min = 0;
3304 max = GET2(ecode, 1);
3305 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3306 ecode += 3;
3307 goto REPEATNOTCHAR;
3308
3309 case OP_NOTPOSSTAR:
3310 case OP_NOTPOSSTARI:
3311 possessive = TRUE;
3312 min = 0;
3313 max = INT_MAX;
3314 ecode++;
3315 goto REPEATNOTCHAR;
3316
3317 case OP_NOTPOSPLUS:
3318 case OP_NOTPOSPLUSI:
3319 possessive = TRUE;
3320 min = 1;
3321 max = INT_MAX;
3322 ecode++;
3323 goto REPEATNOTCHAR;
3324
3325 case OP_NOTPOSQUERY:
3326 case OP_NOTPOSQUERYI:
3327 possessive = TRUE;
3328 min = 0;
3329 max = 1;
3330 ecode++;
3331 goto REPEATNOTCHAR;
3332
3333 case OP_NOTPOSUPTO:
3334 case OP_NOTPOSUPTOI:
3335 possessive = TRUE;
3336 min = 0;
3337 max = GET2(ecode, 1);
3338 ecode += 3;
3339 goto REPEATNOTCHAR;
3340
3341 case OP_NOTSTAR:
3342 case OP_NOTSTARI:
3343 case OP_NOTMINSTAR:
3344 case OP_NOTMINSTARI:
3345 case OP_NOTPLUS:
3346 case OP_NOTPLUSI:
3347 case OP_NOTMINPLUS:
3348 case OP_NOTMINPLUSI:
3349 case OP_NOTQUERY:
3350 case OP_NOTQUERYI:
3351 case OP_NOTMINQUERY:
3352 case OP_NOTMINQUERYI:
3353 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3354 minimize = (c & 1) != 0;
3355 min = rep_min[c]; /* Pick up values from tables; */
3356 max = rep_max[c]; /* zero for max => infinity */
3357 if (max == 0) max = INT_MAX;
3358
3359 /* Common code for all repeated negated single-byte matches. */
3360
3361 REPEATNOTCHAR:
3362 fc = *ecode++;
3363
3364 /* The code is duplicated for the caseless and caseful cases, for speed,
3365 since matching characters is likely to be quite common. First, ensure the
3366 minimum number of matches are present. If min = max, continue at the same
3367 level without recursing. Otherwise, if minimizing, keep trying the rest of
3368 the expression and advancing one matching character if failing, up to the
3369 maximum. Alternatively, if maximizing, find the maximum number of
3370 characters and work backwards. */
3371
3372 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3373 max, eptr));
3374
3375 if (op >= OP_NOTSTARI) /* Caseless */
3376 {
3377 fc = md->lcc[fc];
3378
3379 #ifdef SUPPORT_UTF8
3380 /* UTF-8 mode */
3381 if (utf8)
3382 {
3383 register unsigned int d;
3384 for (i = 1; i <= min; i++)
3385 {
3386 if (eptr >= md->end_subject)
3387 {
3388 SCHECK_PARTIAL();
3389 MRRETURN(MATCH_NOMATCH);
3390 }
3391 GETCHARINC(d, eptr);
3392 if (d < 256) d = md->lcc[d];
3393 if (fc == d) MRRETURN(MATCH_NOMATCH);
3394 }
3395 }
3396 else
3397 #endif
3398
3399 /* Not UTF-8 mode */
3400 {
3401 for (i = 1; i <= min; i++)
3402 {
3403 if (eptr >= md->end_subject)
3404 {
3405 SCHECK_PARTIAL();
3406 MRRETURN(MATCH_NOMATCH);
3407 }
3408 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3409 }
3410 }
3411
3412 if (min == max) continue;
3413
3414 if (minimize)
3415 {
3416 #ifdef SUPPORT_UTF8
3417 /* UTF-8 mode */
3418 if (utf8)
3419 {
3420 register unsigned int d;
3421 for (fi = min;; fi++)
3422 {
3423 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3425 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3426 if (eptr >= md->end_subject)
3427 {
3428 SCHECK_PARTIAL();
3429 MRRETURN(MATCH_NOMATCH);
3430 }
3431 GETCHARINC(d, eptr);
3432 if (d < 256) d = md->lcc[d];
3433 if (fc == d) MRRETURN(MATCH_NOMATCH);
3434 }
3435 }
3436 else
3437 #endif
3438 /* Not UTF-8 mode */
3439 {
3440 for (fi = min;; fi++)
3441 {
3442 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3445 if (eptr >= md->end_subject)
3446 {
3447 SCHECK_PARTIAL();
3448 MRRETURN(MATCH_NOMATCH);
3449 }
3450 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3451 }
3452 }
3453 /* Control never gets here */
3454 }
3455
3456 /* Maximize case */
3457
3458 else
3459 {
3460 pp = eptr;
3461
3462 #ifdef SUPPORT_UTF8
3463 /* UTF-8 mode */
3464 if (utf8)
3465 {
3466 register unsigned int d;
3467 for (i = min; i < max; i++)
3468 {
3469 int len = 1;
3470 if (eptr >= md->end_subject)
3471 {
3472 SCHECK_PARTIAL();
3473 break;
3474 }
3475 GETCHARLEN(d, eptr, len);
3476 if (d < 256) d = md->lcc[d];
3477 if (fc == d) break;
3478 eptr += len;
3479 }
3480 if (possessive) continue;
3481 for(;;)
3482 {
3483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3485 if (eptr-- == pp) break; /* Stop if tried at original pos */
3486 BACKCHAR(eptr);
3487 }
3488 }
3489 else
3490 #endif
3491 /* Not UTF-8 mode */
3492 {
3493 for (i = min; i < max; i++)
3494 {
3495 if (eptr >= md->end_subject)
3496 {
3497 SCHECK_PARTIAL();
3498 break;
3499 }
3500 if (fc == md->lcc[*eptr]) break;
3501 eptr++;
3502 }
3503 if (possessive) continue;
3504 while (eptr >= pp)
3505 {
3506 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3507 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508 eptr--;
3509 }
3510 }
3511
3512 MRRETURN(MATCH_NOMATCH);
3513 }
3514 /* Control never gets here */
3515 }
3516
3517 /* Caseful comparisons */
3518
3519 else
3520 {
3521 #ifdef SUPPORT_UTF8
3522 /* UTF-8 mode */
3523 if (utf8)
3524 {
3525 register unsigned int d;
3526 for (i = 1; i <= min; i++)
3527 {
3528 if (eptr >= md->end_subject)
3529 {
3530 SCHECK_PARTIAL();
3531 MRRETURN(MATCH_NOMATCH);
3532 }
3533 GETCHARINC(d, eptr);
3534 if (fc == d) MRRETURN(MATCH_NOMATCH);
3535 }
3536 }
3537 else
3538 #endif
3539 /* Not UTF-8 mode */
3540 {
3541 for (i = 1; i <= min; i++)
3542 {
3543 if (eptr >= md->end_subject)
3544 {
3545 SCHECK_PARTIAL();
3546 MRRETURN(MATCH_NOMATCH);
3547 }
3548 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3549 }
3550 }
3551
3552 if (min == max) continue;
3553
3554 if (minimize)
3555 {
3556 #ifdef SUPPORT_UTF8
3557 /* UTF-8 mode */
3558 if (utf8)
3559 {
3560 register unsigned int d;
3561 for (fi = min;; fi++)
3562 {
3563 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3564 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3565 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3566 if (eptr >= md->end_subject)
3567 {
3568 SCHECK_PARTIAL();
3569 MRRETURN(MATCH_NOMATCH);
3570 }
3571 GETCHARINC(d, eptr);
3572 if (fc == d) MRRETURN(MATCH_NOMATCH);
3573 }
3574 }
3575 else
3576 #endif
3577 /* Not UTF-8 mode */
3578 {
3579 for (fi = min;; fi++)
3580 {
3581 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3582 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3583 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3584 if (eptr >= md->end_subject)
3585 {
3586 SCHECK_PARTIAL();
3587 MRRETURN(MATCH_NOMATCH);
3588 }
3589 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3590 }
3591 }
3592 /* Control never gets here */
3593 }
3594
3595 /* Maximize case */
3596
3597 else
3598 {
3599 pp = eptr;
3600
3601 #ifdef SUPPORT_UTF8
3602 /* UTF-8 mode */
3603 if (utf8)
3604 {
3605 register unsigned int d;
3606 for (i = min; i < max; i++)
3607 {
3608 int len = 1;
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 break;
3613 }
3614 GETCHARLEN(d, eptr, len);
3615 if (fc == d) break;
3616 eptr += len;
3617 }
3618 if (possessive) continue;
3619 for(;;)
3620 {
3621 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3623 if (eptr-- == pp) break; /* Stop if tried at original pos */
3624 BACKCHAR(eptr);
3625 }
3626 }
3627 else
3628 #endif
3629 /* Not UTF-8 mode */
3630 {
3631 for (i = min; i < max; i++)
3632 {
3633 if (eptr >= md->end_subject)
3634 {
3635 SCHECK_PARTIAL();
3636 break;
3637 }
3638 if (fc == *eptr) break;
3639 eptr++;
3640 }
3641 if (possessive) continue;
3642 while (eptr >= pp)
3643 {
3644 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3645 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3646 eptr--;
3647 }
3648 }
3649
3650 MRRETURN(MATCH_NOMATCH);
3651 }
3652 }
3653 /* Control never gets here */
3654
3655 /* Match a single character type repeatedly; several different opcodes
3656 share code. This is very similar to the code for single characters, but we
3657 repeat it in the interests of efficiency. */
3658
3659 case OP_TYPEEXACT:
3660 min = max = GET2(ecode, 1);
3661 minimize = TRUE;
3662 ecode += 3;
3663 goto REPEATTYPE;
3664
3665 case OP_TYPEUPTO:
3666 case OP_TYPEMINUPTO:
3667 min = 0;
3668 max = GET2(ecode, 1);
3669 minimize = *ecode == OP_TYPEMINUPTO;
3670 ecode += 3;
3671 goto REPEATTYPE;
3672
3673 case OP_TYPEPOSSTAR:
3674 possessive = TRUE;
3675 min = 0;
3676 max = INT_MAX;
3677 ecode++;
3678 goto REPEATTYPE;
3679
3680 case OP_TYPEPOSPLUS:
3681 possessive = TRUE;
3682 min = 1;
3683 max = INT_MAX;
3684 ecode++;
3685 goto REPEATTYPE;
3686
3687 case OP_TYPEPOSQUERY:
3688 possessive = TRUE;
3689 min = 0;
3690 max = 1;
3691 ecode++;
3692 goto REPEATTYPE;
3693
3694 case OP_TYPEPOSUPTO:
3695 possessive = TRUE;
3696 min = 0;
3697 max = GET2(ecode, 1);
3698 ecode += 3;
3699 goto REPEATTYPE;
3700
3701 case OP_TYPESTAR:
3702 case OP_TYPEMINSTAR:
3703 case OP_TYPEPLUS:
3704 case OP_TYPEMINPLUS:
3705 case OP_TYPEQUERY:
3706 case OP_TYPEMINQUERY:
3707 c = *ecode++ - OP_TYPESTAR;
3708 minimize = (c & 1) != 0;
3709 min = rep_min[c]; /* Pick up values from tables; */
3710 max = rep_max[c]; /* zero for max => infinity */
3711 if (max == 0) max = INT_MAX;
3712
3713 /* Common code for all repeated single character type matches. Note that
3714 in UTF-8 mode, '.', the negated types, and the horizontal space, vertical space and newline-sequence types can match a
3715 character of any length, but the positive digit, whitespace and word character types match only one-byte characters. */
3716
3717 REPEATTYPE:
3718 ctype = *ecode++; /* Code for the character type */
3719
3720 #ifdef SUPPORT_UCP
3721 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3722 {
3723 prop_fail_result = ctype == OP_NOTPROP;
3724 prop_type = *ecode++;
3725 prop_value = *ecode++;
3726 }
3727 else prop_type = -1;
3728 #endif
3729
3730 /* First, ensure the minimum number of matches are present. Use inline
3731 code for maximizing the speed, and do the type test once at the start
3732 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3733 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3734 and single-bytes. */
3735
3736 if (min > 0)
3737 {
3738 #ifdef SUPPORT_UCP
3739 if (prop_type >= 0)
3740 {
3741 switch(prop_type)
3742 {
3743 case PT_ANY:
3744 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3745 for (i = 1; i <= min; i++)
3746 {
3747 if (eptr >= md->end_subject)
3748 {
3749 SCHECK_PARTIAL();
3750 MRRETURN(MATCH_NOMATCH);
3751 }
3752 GETCHARINCTEST(c, eptr);
3753 }
3754 break;
3755
3756 case PT_LAMP:
3757 for (i = 1; i <= min; i++)
3758 {
3759 if (eptr >= md->end_subject)
3760 {
3761 SCHECK_PARTIAL();
3762 MRRETURN(MATCH_NOMATCH);
3763 }
3764 GETCHARINCTEST(c, eptr);
3765 prop_chartype = UCD_CHARTYPE(c);
3766 if ((prop_chartype == ucp_Lu ||
3767 prop_chartype == ucp_Ll ||
3768 prop_chartype == ucp_Lt) == prop_fail_result)
3769 MRRETURN(MATCH_NOMATCH);
3770 }
3771 break;
3772
3773 case PT_GC:
3774 for (i = 1; i <= min; i++)
3775 {
3776 if (eptr >= md->end_subject)
3777 {
3778 SCHECK_PARTIAL();
3779 MRRETURN(MATCH_NOMATCH);
3780 }
3781 GETCHARINCTEST(c, eptr);
3782 prop_category = UCD_CATEGORY(c);
3783 if ((prop_category == prop_value) == prop_fail_result)
3784 MRRETURN(MATCH_NOMATCH);
3785 }
3786 break;
3787
3788 case PT_PC:
3789 for (i = 1; i <= min; i++)
3790 {
3791 if (eptr >= md->end_subject)
3792 {
3793 SCHECK_PARTIAL();
3794 MRRETURN(MATCH_NOMATCH);
3795 }
3796 GETCHARINCTEST(c, eptr);
3797 prop_chartype = UCD_CHARTYPE(c);
3798 if ((prop_chartype == prop_value) == prop_fail_result)
3799 MRRETURN(MATCH_NOMATCH);
3800 }
3801 break;
3802
3803 case PT_SC:
3804 for (i = 1; i <= min; i++)
3805 {
3806 if (eptr >= md->end_subject)
3807 {
3808 SCHECK_PARTIAL();
3809 MRRETURN(MATCH_NOMATCH);
3810 }
3811 GETCHARINCTEST(c, eptr);
3812 prop_script = UCD_SCRIPT(c);
3813 if ((prop_script == prop_value) == prop_fail_result)
3814 MRRETURN(MATCH_NOMATCH);
3815 }
3816 break;
3817
3818 case PT_ALNUM:
3819 for (i = 1; i <= min; i++)
3820 {
3821 if (eptr >= md->end_subject)
3822 {
3823 SCHECK_PARTIAL();
3824 MRRETURN(MATCH_NOMATCH);
3825 }
3826 GETCHARINCTEST(c, eptr);
3827 prop_category = UCD_CATEGORY(c);
3828 if ((prop_category == ucp_L || prop_category == ucp_N)
3829 == prop_fail_result)
3830 MRRETURN(MATCH_NOMATCH);
3831 }
3832 break;
3833
3834 case PT_SPACE: /* Perl space */
3835 for (i = 1; i <= min; i++)
3836 {
3837 if (eptr >= md->end_subject)
3838 {
3839 SCHECK_PARTIAL();
3840 MRRETURN(MATCH_NOMATCH);
3841 }
3842 GETCHARINCTEST(c, eptr);
3843 prop_category = UCD_CATEGORY(c);
3844 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3845 c == CHAR_FF || c == CHAR_CR)
3846 == prop_fail_result)
3847 MRRETURN(MATCH_NOMATCH);
3848 }
3849 break;
3850
3851 case PT_PXSPACE: /* POSIX space */
3852 for (i = 1; i <= min; i++)
3853 {
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 MRRETURN(MATCH_NOMATCH);
3858 }
3859 GETCHARINCTEST(c, eptr);
3860 prop_category = UCD_CATEGORY(c);
3861 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3862 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3863 == prop_fail_result)
3864 MRRETURN(MATCH_NOMATCH);
3865 }
3866 break;
3867
3868 case PT_WORD:
3869 for (i = 1; i <= min; i++)
3870 {
3871 if (eptr >= md->end_subject)
3872 {
3873 SCHECK_PARTIAL();
3874 MRRETURN(MATCH_NOMATCH);
3875 }
3876 GETCHARINCTEST(c, eptr);
3877 prop_category = UCD_CATEGORY(c);
3878 if ((prop_category == ucp_L || prop_category == ucp_N ||
3879 c == CHAR_UNDERSCORE)
3880 == prop_fail_result)
3881 MRRETURN(MATCH_NOMATCH);
3882 }
3883 break;
3884
3885 /* This should not occur */
3886
3887 default:
3888 RRETURN(PCRE_ERROR_INTERNAL);
3889 }
3890 }
3891
3892 /* Match extended Unicode sequences. We will get here only if the
3893 support is in the binary; otherwise a compile-time error occurs. */
3894
3895 else if (ctype == OP_EXTUNI)
3896 {
3897 for (i = 1; i <= min; i++)
3898 {
3899 if (eptr >= md->end_subject)
3900 {
3901 SCHECK_PARTIAL();
3902 MRRETURN(MATCH_NOMATCH);
3903 }
3904 GETCHARINCTEST(c, eptr);
3905 prop_category = UCD_CATEGORY(c);
3906 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3907 while (eptr < md->end_subject)
3908 {
3909 int len = 1;
3910 if (!utf8) c = *eptr;
3911 else { GETCHARLEN(c, eptr, len); }
3912 prop_category = UCD_CATEGORY(c);
3913 if (prop_category != ucp_M) break;
3914 eptr += len;
3915 }
3916 }
3917 }
3918
3919 else
3920 #endif /* SUPPORT_UCP */
3921
3922 /* Handle all other cases when the coding is UTF-8 */
3923
3924 #ifdef SUPPORT_UTF8
3925 if (utf8) switch(ctype)
3926 {
3927 case OP_ANY:
3928 for (i = 1; i <= min; i++)
3929 {
3930 if (eptr >= md->end_subject)
3931 {
3932 SCHECK_PARTIAL();
3933 MRRETURN(MATCH_NOMATCH);
3934 }
3935 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3936 eptr++;
3937 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3938 }
3939 break;
3940
3941 case OP_ALLANY:
3942 for (i = 1; i <= min; i++)
3943 {
3944 if (eptr >= md->end_subject)
3945 {
3946 SCHECK_PARTIAL();
3947 MRRETURN(MATCH_NOMATCH);
3948 }
3949 eptr++;
3950 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3951 }
3952 break;
3953
3954 case OP_ANYBYTE:
3955 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3956 eptr += min;
3957 break;
3958
3959 case OP_ANYNL:
3960 for (i = 1; i <= min; i++)
3961 {
3962 if (eptr >= md->end_subject)
3963 {
3964 SCHECK_PARTIAL();
3965 MRRETURN(MATCH_NOMATCH);
3966 }
3967 GETCHARINC(c, eptr);
3968 switch(c)
3969 {
3970 default: MRRETURN(MATCH_NOMATCH);
3971
3972 case 0x000d:
3973 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3974 break;
3975
3976 case 0x000a:
3977 break;
3978
3979 case 0x000b:
3980 case 0x000c:
3981 case 0x0085:
3982 case 0x2028:
3983 case 0x2029:
3984 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3985 break;
3986 }
3987 }
3988 break;
3989
3990 case OP_NOT_HSPACE:
3991 for (i = 1; i <= min; i++)
3992 {
3993 if (eptr >= md->end_subject)
3994 {
3995 SCHECK_PARTIAL();
3996 MRRETURN(MATCH_NOMATCH);
3997 }
3998 GETCHARINC(c, eptr);
3999 switch(c)
4000 {
4001 default: break;
4002 case 0x09: /* HT */
4003 case 0x20: /* SPACE */
4004 case 0xa0: /* NBSP */
4005 case 0x1680: /* OGHAM SPACE MARK */
4006 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4007 case 0x2000: /* EN QUAD */
4008 case 0x2001: /* EM QUAD */
4009 case 0x2002: /* EN SPACE */
4010 case 0x2003: /* EM SPACE */
4011 case 0x2004: /* THREE-PER-EM SPACE */
4012 case 0x2005: /* FOUR-PER-EM SPACE */
4013 case 0x2006: /* SIX-PER-EM SPACE */
4014 case 0x2007: /* FIGURE SPACE */
4015 case 0x2008: /* PUNCTUATION SPACE */
4016 case 0x2009: /* THIN SPACE */
4017 case 0x200A: /* HAIR SPACE */
4018 case 0x202f: /* NARROW NO-BREAK SPACE */
4019 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4020 case 0x3000: /* IDEOGRAPHIC SPACE */
4021 MRRETURN(MATCH_NOMATCH);
4022 }
4023 }
4024 break;
4025
4026 case OP_HSPACE:
4027 for (i = 1; i <= min; i++)
4028 {
4029 if (eptr >= md->end_subject)
4030 {
4031 SCHECK_PARTIAL();
4032 MRRETURN(MATCH_NOMATCH);
4033 }
4034 GETCHARINC(c, eptr);
4035 switch(c)
4036 {
4037 default: MRRETURN(MATCH_NOMATCH);
4038 case 0x09: /* HT */
4039 case 0x20: /* SPACE */
4040 case 0xa0: /* NBSP */
4041 case 0x1680: /* OGHAM SPACE MARK */
4042 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4043 case 0x2000: /* EN QUAD */
4044 case 0x2001: /* EM QUAD */
4045 case 0x2002: /* EN SPACE */
4046 case 0x2003: /* EM SPACE */
4047 case 0x2004: /* THREE-PER-EM SPACE */
4048 case 0x2005: /* FOUR-PER-EM SPACE */
4049 case 0x2006: /* SIX-PER-EM SPACE */
4050 case 0x2007: /* FIGURE SPACE */
4051 case 0x2008: /* PUNCTUATION SPACE */
4052 case 0x2009: /* THIN SPACE */
4053 case 0x200A: /* HAIR SPACE */
4054 case 0x202f: /* NARROW NO-BREAK SPACE */
4055 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4056 case 0x3000: /* IDEOGRAPHIC SPACE */
4057 break;
4058 }
4059 }
4060 break;
4061
4062 case OP_NOT_VSPACE:
4063 for (i = 1; i <= min; i++)
4064 {
4065 if (eptr >= md->end_subject)
4066 {
4067 SCHECK_PARTIAL();
4068 MRRETURN(MATCH_NOMATCH);
4069 }
4070 GETCHARINC(c, eptr);
4071 switch(c)
4072 {
4073 default: break;
4074 case 0x0a: /* LF */
4075 case 0x0b: /* VT */
4076 case 0x0c: /* FF */
4077 case 0x0d: /* CR */
4078 case 0x85: /* NEL */
4079 case 0x2028: /* LINE SEPARATOR */
4080 case 0x2029: /* PARAGRAPH SEPARATOR */
4081 MRRETURN(MATCH_NOMATCH);
4082 }
4083 }
4084 break;
4085
4086 case OP_VSPACE:
4087 for (i = 1; i <= min; i++)
4088 {
4089 if (eptr >= md->end_subject)
4090 {
4091 SCHECK_PARTIAL();
4092 MRRETURN(MATCH_NOMATCH);
4093 }
4094 GETCHARINC(c, eptr);
4095 switch(c)
4096 {
4097 default: MRRETURN(MATCH_NOMATCH);
4098 case 0x0a: /* LF */
4099 case 0x0b: /* VT */
4100 case 0x0c: /* FF */
4101 case 0x0d: /* CR */
4102 case 0x85: /* NEL */
4103 case 0x2028: /* LINE SEPARATOR */
4104 case 0x2029: /* PARAGRAPH SEPARATOR */
4105 break;
4106 }
4107 }
4108 break;
4109
4110 case OP_NOT_DIGIT:
4111 for (i = 1; i <= min; i++)
4112 {
4113 if (eptr >= md->end_subject)
4114 {
4115 SCHECK_PARTIAL();
4116 MRRETURN(MATCH_NOMATCH);
4117 }
4118 GETCHARINC(c, eptr);
4119 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4120 MRRETURN(MATCH_NOMATCH);
4121 }
4122 break;
4123
4124 case OP_DIGIT:
4125 for (i = 1; i <= min; i++)
4126 {
4127 if (eptr >= md->end_subject)
4128 {
4129 SCHECK_PARTIAL();
4130 MRRETURN(MATCH_NOMATCH);
4131 }
4132 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4133 MRRETURN(MATCH_NOMATCH);
4134 /* No need to skip more bytes - we know it's a 1-byte character */
4135 }
4136 break;
4137
4138 case OP_NOT_WHITESPACE:
4139 for (i = 1; i <= min; i++)
4140 {
4141 if (eptr >= md->end_subject)
4142 {
4143 SCHECK_PARTIAL();
4144 MRRETURN(MATCH_NOMATCH);
4145 }
4146 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4147 MRRETURN(MATCH_NOMATCH);
4148 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4149 }
4150 break;
4151
4152 case OP_WHITESPACE:
4153 for (i = 1; i <= min; i++)
4154 {
4155 if (eptr >= md->end_subject)
4156 {
4157 SCHECK_PARTIAL();
4158 MRRETURN(MATCH_NOMATCH);
4159 }
4160 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4161 MRRETURN(MATCH_NOMATCH);
4162 /* No need to skip more bytes - we know it's a 1-byte character */
4163 }
4164 break;
4165
4166 case OP_NOT_WORDCHAR:
4167 for (i = 1; i <= min; i++)
4168 {
4169 if (eptr >= md->end_subject)
4170 {
4171 SCHECK_PARTIAL();
4172 MRRETURN(MATCH_NOMATCH);
4173 }
4174 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4175 MRRETURN(MATCH_NOMATCH);
4176 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4177 }
4178 break;
4179
4180 case OP_WORDCHAR:
4181 for (i = 1; i <= min; i++)
4182 {
4183 if (eptr >= md->end_subject)
4184 {
4185 SCHECK_PARTIAL();
4186 MRRETURN(MATCH_NOMATCH);
4187 }
4188 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4189 MRRETURN(MATCH_NOMATCH);
4190 /* No need to skip more bytes - we know it's a 1-byte character */
4191 }
4192 break;
4193
4194 default:
4195 RRETURN(PCRE_ERROR_INTERNAL);
4196 } /* End switch(ctype) */
4197
4198 else
4199 #endif /* SUPPORT_UTF8 */
4200
4201 /* Code for the non-UTF-8 case for minimum matching of operators other
4202 than OP_PROP and OP_NOTPROP. */
4203
4204 switch(ctype)
4205 {
4206 case OP_ANY:
4207 for (i = 1; i <= min; i++)
4208 {
4209 if (eptr >= md->end_subject)
4210 {
4211 SCHECK_PARTIAL();
4212 MRRETURN(MATCH_NOMATCH);
4213 }
4214 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4215 eptr++;
4216 }
4217 break;
4218
4219 case OP_ALLANY:
4220 if (eptr > md->end_subject - min)
4221 {
4222 SCHECK_PARTIAL();
4223 MRRETURN(MATCH_NOMATCH);
4224 }
4225 eptr += min;
4226 break;
4227
4228 case OP_ANYBYTE:
4229 if (eptr > md->end_subject - min)
4230 {
4231 SCHECK_PARTIAL();
4232 MRRETURN(MATCH_NOMATCH);
4233 }
4234 eptr += min;
4235 break;
4236
4237 case OP_ANYNL:
4238 for (i = 1; i <= min; i++)
4239 {
4240 if (eptr >= md->end_subject)
4241 {
4242 SCHECK_PARTIAL();
4243 MRRETURN(MATCH_NOMATCH);
4244 }
4245 switch(*eptr++)
4246 {
4247 default: MRRETURN(MATCH_NOMATCH);
4248
4249 case 0x000d:
4250 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4251 break;
4252
4253 case 0x000a:
4254 break;
4255
4256 case 0x000b:
4257 case 0x000c:
4258 case 0x0085:
4259 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4260 break;
4261 }
4262 }
4263 break;
4264
4265 case OP_NOT_HSPACE:
4266 for (i = 1; i <= min; i++)
4267 {
4268 if (eptr >= md->end_subject)
4269 {
4270 SCHECK_PARTIAL();
4271 MRRETURN(MATCH_NOMATCH);
4272 }
4273 switch(*eptr++)
4274 {
4275 default: break;
4276 case 0x09: /* HT */
4277 case 0x20: /* SPACE */
4278 case 0xa0: /* NBSP */
4279 MRRETURN(MATCH_NOMATCH);
4280 }
4281 }
4282 break;
4283
4284 case OP_HSPACE:
4285 for (i = 1; i <= min; i++)
4286 {
4287 if (eptr >= md->end_subject)
4288 {
4289 SCHECK_PARTIAL();
4290 MRRETURN(MATCH_NOMATCH);
4291 }
4292 switch(*eptr++)
4293 {
4294 default: MRRETURN(MATCH_NOMATCH);
4295 case 0x09: /* HT */
4296 case 0x20: /* SPACE */
4297 case 0xa0: /* NBSP */
4298 break;
4299 }
4300 }
4301 break;
4302
4303 case OP_NOT_VSPACE:
4304 for (i = 1; i <= min; i++)
4305 {
4306 if (eptr >= md->end_subject)
4307 {
4308 SCHECK_PARTIAL();
4309 MRRETURN(MATCH_NOMATCH);
4310 }
4311 switch(*eptr++)
4312 {
4313 default: break;
4314 case 0x0a: /* LF */
4315 case 0x0b: /* VT */
4316 case 0x0c: /* FF */
4317 case 0x0d: /* CR */
4318 case 0x85: /* NEL */
4319 MRRETURN(MATCH_NOMATCH);
4320 }
4321 }
4322 break;
4323
4324 case OP_VSPACE:
4325 for (i = 1; i <= min; i++)
4326 {
4327 if (eptr >= md->end_subject)
4328 {
4329 SCHECK_PARTIAL();
4330 MRRETURN(MATCH_NOMATCH);
4331 }
4332 switch(*eptr++)
4333 {
4334 default: MRRETURN(MATCH_NOMATCH);
4335 case 0x0a: /* LF */
4336 case 0x0b: /* VT */
4337 case 0x0c: /* FF */
4338 case 0x0d: /* CR */
4339 case 0x85: /* NEL */
4340 break;
4341 }
4342 }
4343 break;
4344
4345 case OP_NOT_DIGIT:
4346 for (i = 1; i <= min; i++)
4347 {
4348 if (eptr >= md->end_subject)
4349 {
4350 SCHECK_PARTIAL();
4351 MRRETURN(MATCH_NOMATCH);
4352 }
4353 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4354 }
4355 break;
4356
4357 case OP_DIGIT:
4358 for (i = 1; i <= min; i++)
4359 {
4360 if (eptr >= md->end_subject)
4361 {
4362 SCHECK_PARTIAL();
4363 MRRETURN(MATCH_NOMATCH);
4364 }
4365 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4366 }
4367 break;
4368
4369 case OP_NOT_WHITESPACE:
4370 for (i = 1; i <= min; i++)
4371 {
4372 if (eptr >= md->end_subject)
4373 {
4374 SCHECK_PARTIAL();
4375 MRRETURN(MATCH_NOMATCH);
4376 }
4377 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4378 }
4379 break;
4380
4381 case OP_WHITESPACE:
4382 for (i = 1; i <= min; i++)
4383 {
4384 if (eptr >= md->end_subject)
4385 {
4386 SCHECK_PARTIAL();
4387 MRRETURN(MATCH_NOMATCH);
4388 }
4389 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4390 }
4391 break;
4392
4393 case OP_NOT_WORDCHAR:
4394 for (i = 1; i <= min; i++)
4395 {
4396 if (eptr >= md->end_subject)
4397 {
4398 SCHECK_PARTIAL();
4399 MRRETURN(MATCH_NOMATCH);
4400 }
4401 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4402 MRRETURN(MATCH_NOMATCH);
4403 }
4404 break;
4405
4406 case OP_WORDCHAR:
4407 for (i = 1; i <= min; i++)
4408 {
4409 if (eptr >= md->end_subject)
4410 {
4411 SCHECK_PARTIAL();
4412 MRRETURN(MATCH_NOMATCH);
4413 }
4414 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4415 MRRETURN(MATCH_NOMATCH);
4416 }
4417 break;
4418
4419 default:
4420 RRETURN(PCRE_ERROR_INTERNAL);
4421 }
4422 }
4423
4424 /* If min = max, continue at the same level without recursing */
4425
4426 if (min == max) continue;
4427
4428 /* If minimizing, we have to test the rest of the pattern before each
4429 subsequent match. Again, separate the UTF-8 case for speed, and also
4430 separate the UCP cases. */
4431
4432 if (minimize)
4433 {
4434 #ifdef SUPPORT_UCP
4435 if (prop_type >= 0)
4436 {
4437 switch(prop_type)
4438 {
4439 case PT_ANY:
4440 for (fi = min;; fi++)
4441 {
4442 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4444 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4445 if (eptr >= md->end_subject)
4446 {
4447 SCHECK_PARTIAL();
4448 MRRETURN(MATCH_NOMATCH);
4449 }
4450 GETCHARINCTEST(c, eptr);
4451 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4452 }
4453 /* Control never gets here */
4454
4455 case PT_LAMP:
4456 for (fi = min;; fi++)
4457 {
4458 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4460 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4461 if (eptr >= md->end_subject)
4462 {
4463 SCHECK_PARTIAL();
4464 MRRETURN(MATCH_NOMATCH);
4465 }
4466 GETCHARINCTEST(c, eptr);
4467 prop_chartype = UCD_CHARTYPE(c);
4468 if ((prop_chartype == ucp_Lu ||
4469 prop_chartype == ucp_Ll ||
4470 prop_chartype == ucp_Lt) == prop_fail_result)
4471 MRRETURN(MATCH_NOMATCH);
4472 }
4473 /* Control never gets here */
4474
4475 case PT_GC:
4476 for (fi = min;; fi++)
4477 {
4478 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4479 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4480 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4481 if (eptr >= md->end_subject)
4482 {
4483 SCHECK_PARTIAL();
4484 MRRETURN(MATCH_NOMATCH);
4485 }
4486 GETCHARINCTEST(c, eptr);
4487 prop_category = UCD_CATEGORY(c);
4488 if ((prop_category == prop_value) == prop_fail_result)
4489 MRRETURN(MATCH_NOMATCH);
4490 }
4491 /* Control never gets here */
4492
4493 case PT_PC:
4494 for (fi = min;; fi++)
4495 {
4496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4498 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4499 if (eptr >= md->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 MRRETURN(MATCH_NOMATCH);
4503 }
4504 GETCHARINCTEST(c, eptr);
4505 prop_chartype = UCD_CHARTYPE(c);
4506 if ((prop_chartype == prop_value) == prop_fail_result)
4507 MRRETURN(MATCH_NOMATCH);
4508 }
4509 /* Control never gets here */
4510
4511 case PT_SC:
4512 for (fi = min;; fi++)
4513 {
4514 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4516 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4517 if (eptr >= md->end_subject)
4518 {
4519 SCHECK_PARTIAL();
4520 MRRETURN(MATCH_NOMATCH);
4521 }
4522 GETCHARINCTEST(c, eptr);
4523 prop_script = UCD_SCRIPT(c);
4524 if ((prop_script == prop_value) == prop_fail_result)
4525 MRRETURN(MATCH_NOMATCH);
4526 }
4527 /* Control never gets here */
4528
4529 case PT_ALNUM:
4530 for (fi = min;; fi++)
4531 {
4532 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4533 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4534 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4535 if (eptr >= md->end_subject)
4536 {
4537 SCHECK_PARTIAL();
4538 MRRETURN(MATCH_NOMATCH);
4539 }
4540 GETCHARINCTEST(c, eptr);
4541 prop_category = UCD_CATEGORY(c);
4542 if ((prop_category == ucp_L || prop_category == ucp_N)
4543 == prop_fail_result)
4544 MRRETURN(MATCH_NOMATCH);
4545 }
4546 /* Control never gets here */
4547
4548 case PT_SPACE: /* Perl space */
4549 for (fi = min;; fi++)
4550 {
4551 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4552 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4553 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4554 if (eptr >= md->end_subject)
4555 {
4556 SCHECK_PARTIAL();
4557 MRRETURN(MATCH_NOMATCH);
4558 }
4559 GETCHARINCTEST(c, eptr);
4560 prop_category = UCD_CATEGORY(c);
4561 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4562 c == CHAR_FF || c == CHAR_CR)
4563 == prop_fail_result)
4564 MRRETURN(MATCH_NOMATCH);
4565 }
4566 /* Control never gets here */
4567
4568 case PT_PXSPACE: /* POSIX space */
4569 for (fi = min;; fi++)
4570 {
4571 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4572 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4573 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4574 if (eptr >= md->end_subject)
4575 {
4576 SCHECK_PARTIAL();
4577 MRRETURN(MATCH_NOMATCH);
4578 }
4579 GETCHARINCTEST(c, eptr);
4580 prop_category = UCD_CATEGORY(c);
4581 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4582 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4583 == prop_fail_result)
4584 MRRETURN(MATCH_NOMATCH);
4585 }
4586 /* Control never gets here */
4587
4588 case PT_WORD:
4589 for (fi = min;; fi++)
4590 {
4591 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4592 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4593 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4594 if (eptr >= md->end_subject)
4595 {
4596 SCHECK_PARTIAL();
4597 MRRETURN(MATCH_NOMATCH);
4598 }
4599 GETCHARINCTEST(c, eptr);
4600 prop_category = UCD_CATEGORY(c);
4601 if ((prop_category == ucp_L ||
4602 prop_category == ucp_N ||
4603 c == CHAR_UNDERSCORE)
4604 == prop_fail_result)
4605 MRRETURN(MATCH_NOMATCH);
4606 }
4607 /* Control never gets here */
4608
4609 /* This should never occur */
4610
4611 default:
4612 RRETURN(PCRE_ERROR_INTERNAL);
4613 }
4614 }
4615
4616 /* Match extended Unicode sequences. We will get here only if the
4617 support is in the binary; otherwise a compile-time error occurs. */
4618
4619 else if (ctype == OP_EXTUNI)
4620 {
4621 for (fi = min;; fi++)
4622 {
4623 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4624 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4625 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4626 if (eptr >= md->end_subject)
4627 {
4628 SCHECK_PARTIAL();
4629 MRRETURN(MATCH_NOMATCH);
4630 }
4631 GETCHARINCTEST(c, eptr);
4632 prop_category = UCD_CATEGORY(c);
4633 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4634 while (eptr < md->end_subject)
4635 {
4636 int len = 1;
4637 if (!utf8) c = *eptr;
4638 else { GETCHARLEN(c, eptr, len); }
4639 prop_category = UCD_CATEGORY(c);
4640 if (prop_category != ucp_M) break;
4641 eptr += len;
4642 }
4643 }
4644 }
4645
4646 else
4647 #endif /* SUPPORT_UCP */
4648
4649 #ifdef SUPPORT_UTF8
4650 /* UTF-8 mode */
4651 if (utf8)
4652 {
4653 for (fi = min;; fi++)
4654 {
4655 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4656 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4657 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4658 if (eptr >= md->end_subject)
4659 {
4660 SCHECK_PARTIAL();
4661 MRRETURN(MATCH_NOMATCH);
4662 }
4663 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4664 MRRETURN(MATCH_NOMATCH);
4665 GETCHARINC(c, eptr);
4666 switch(ctype)
4667 {
4668 case OP_ANY: /* This is the non-NL case */
4669 case OP_ALLANY:
4670 case OP_ANYBYTE:
4671 break;
4672
4673 case OP_ANYNL:
4674 switch(c)
4675 {
4676 default: MRRETURN(MATCH_NOMATCH);
4677 case 0x000d:
4678 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4679 break;
4680 case 0x000a:
4681 break;
4682
4683 case 0x000b:
4684 case 0x000c:
4685 case 0x0085:
4686 case 0x2028:
4687 case 0x2029:
4688 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4689 break;
4690 }
4691 break;
4692
4693 case OP_NOT_HSPACE:
4694 switch(c)
4695 {
4696 default: break;
4697 case 0x09: /* HT */
4698 case 0x20: /* SPACE */
4699 case 0xa0: /* NBSP */
4700 case 0x1680: /* OGHAM SPACE MARK */
4701 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4702 case 0x2000: /* EN QUAD */
4703 case 0x2001: /* EM QUAD */
4704 case 0x2002: /* EN SPACE */
4705 case 0x2003: /* EM SPACE */
4706 case 0x2004: /* THREE-PER-EM SPACE */
4707 case 0x2005: /* FOUR-PER-EM SPACE */
4708 case 0x2006: /* SIX-PER-EM SPACE */
4709 case 0x2007: /* FIGURE SPACE */
4710 case 0x2008: /* PUNCTUATION SPACE */
4711 case 0x2009: /* THIN SPACE */
4712 case 0x200A: /* HAIR SPACE */
4713 case 0x202f: /* NARROW NO-BREAK SPACE */
4714 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4715 case 0x3000: /* IDEOGRAPHIC SPACE */
4716 MRRETURN(MATCH_NOMATCH);
4717 }
4718 break;
4719
4720 case OP_HSPACE:
4721 switch(c)
4722 {
4723 default: MRRETURN(MATCH_NOMATCH);
4724 case 0x09: /* HT */
4725 case 0x20: /* SPACE */
4726 case 0xa0: /* NBSP */
4727 case 0x1680: /* OGHAM SPACE MARK */
4728 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4729 case 0x2000: /* EN QUAD */
4730 case 0x2001: /* EM QUAD */
4731 case 0x2002: /* EN SPACE */
4732 case 0x2003: /* EM SPACE */
4733 case 0x2004: /* THREE-PER-EM SPACE */
4734 case 0x2005: /* FOUR-PER-EM SPACE */
4735 case 0x2006: /* SIX-PER-EM SPACE */
4736 case 0x2007: /* FIGURE SPACE */
4737 case 0x2008: /* PUNCTUATION SPACE */
4738 case 0x2009: /* THIN SPACE */
4739 case 0x200A: /* HAIR SPACE */
4740 case 0x202f: /* NARROW NO-BREAK SPACE */
4741 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4742 case 0x3000: /* IDEOGRAPHIC SPACE */
4743 break;
4744 }
4745 break;
4746
4747 case OP_NOT_VSPACE:
4748 switch(c)
4749 {
4750 default: break;
4751 case 0x0a: /* LF */
4752 case 0x0b: /* VT */
4753 case 0x0c: /* FF */
4754 case 0x0d: /* CR */
4755 case 0x85: /* NEL */
4756 case 0x2028: /* LINE SEPARATOR */
4757 case 0x2029: /* PARAGRAPH SEPARATOR */
4758 MRRETURN(MATCH_NOMATCH);
4759 }
4760 break;
4761
4762 case OP_VSPACE:
4763 switch(c)
4764 {
4765 default: MRRETURN(MATCH_NOMATCH);
4766 case 0x0a: /* LF */
4767 case 0x0b: /* VT */
4768 case 0x0c: /* FF */
4769 case 0x0d: /* CR */
4770 case 0x85: /* NEL */
4771 case 0x2028: /* LINE SEPARATOR */
4772 case 0x2029: /* PARAGRAPH SEPARATOR */
4773 break;
4774 }
4775 break;
4776
4777 case OP_NOT_DIGIT:
4778 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4779 MRRETURN(MATCH_NOMATCH);
4780 break;
4781
4782 case OP_DIGIT:
4783 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4784 MRRETURN(MATCH_NOMATCH);
4785 break;
4786
4787 case OP_NOT_WHITESPACE:
4788 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4789 MRRETURN(MATCH_NOMATCH);
4790 break;
4791
4792 case OP_WHITESPACE:
4793 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4794 MRRETURN(MATCH_NOMATCH);
4795 break;
4796
4797 case OP_NOT_WORDCHAR:
4798 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4799 MRRETURN(MATCH_NOMATCH);
4800 break;
4801
4802 case OP_WORDCHAR:
4803 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4804 MRRETURN(MATCH_NOMATCH);
4805 break;
4806
4807 default:
4808 RRETURN(PCRE_ERROR_INTERNAL);
4809 }
4810 }
4811 }
4812 else
4813 #endif
4814 /* Not UTF-8 mode */
4815 {
4816 for (fi = min;; fi++)
4817 {
4818 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4820 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4821 if (eptr >= md->end_subject)
4822 {
4823 SCHECK_PARTIAL();
4824 MRRETURN(MATCH_NOMATCH);
4825 }
4826 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4827 MRRETURN(MATCH_NOMATCH);
4828 c = *eptr++;
4829 switch(ctype)
4830 {
4831 case OP_ANY: /* This is the non-NL case */
4832 case OP_ALLANY:
4833 case OP_ANYBYTE:
4834 break;
4835
4836 case OP_ANYNL:
4837 switch(c)
4838 {
4839 default: MRRETURN(MATCH_NOMATCH);
4840 case 0x000d:
4841 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4842 break;
4843
4844 case 0x000a:
4845 break;
4846
4847 case 0x000b:
4848 case 0x000c:
4849 case 0x0085:
4850 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4851 break;
4852 }
4853 break;
4854
4855 case OP_NOT_HSPACE:
4856 switch(c)
4857 {
4858 default: break;
4859 case 0x09: /* HT */
4860 case 0x20: /* SPACE */
4861 case 0xa0: /* NBSP */
4862 MRRETURN(MATCH_NOMATCH);
4863 }
4864 break;
4865
4866 case OP_HSPACE:
4867 switch(c)
4868 {
4869 default: MRRETURN(MATCH_NOMATCH);
4870 case 0x09: /* HT */
4871 case 0x20: /* SPACE */
4872 case 0xa0: /* NBSP */
4873 break;
4874 }
4875 break;
4876
4877 case OP_NOT_VSPACE:
4878 switch(c)
4879 {
4880 default: break;
4881 case 0x0a: /* LF */
4882 case 0x0b: /* VT */
4883 case 0x0c: /* FF */
4884 case 0x0d: /* CR */
4885 case 0x85: /* NEL */
4886 MRRETURN(MATCH_NOMATCH);
4887 }
4888 break;
4889
4890 case OP_VSPACE:
4891 switch(c)
4892 {
4893 default: MRRETURN(MATCH_NOMATCH);
4894 case 0x0a: /* LF */
4895 case 0x0b: /* VT */
4896 case 0x0c: /* FF */
4897 case 0x0d: /* CR */
4898 case 0x85: /* NEL */
4899 break;
4900 }
4901 break;
4902
4903 case OP_NOT_DIGIT:
4904 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4905 break;
4906
4907 case OP_DIGIT:
4908 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4909 break;
4910
4911 case OP_NOT_WHITESPACE:
4912 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4913 break;
4914
4915 case OP_WHITESPACE:
4916 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4917 break;
4918
4919 case OP_NOT_WORDCHAR:
4920 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4921 break;
4922
4923 case OP_WORDCHAR:
4924 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4925 break;
4926
4927 default:
4928 RRETURN(PCRE_ERROR_INTERNAL);
4929 }
4930 }
4931 }
4932 /* Control never gets here */
4933 }
4934
4935 /* If maximizing, it is worth using inline code for speed, doing the type
4936 test once at the start (i.e. keep it out of the loop). Again, keep the
4937 UTF-8 and UCP stuff separate. */
4938
4939 else
4940 {
4941 pp = eptr; /* Remember where we started */
4942
4943 #ifdef SUPPORT_UCP
4944 if (prop_type >= 0)
4945 {
4946 switch(prop_type)
4947 {
4948 case PT_ANY:
4949 for (i = min; i < max; i++)
4950 {
4951 int len = 1;
4952 if (eptr >= md->end_subject)
4953 {
4954 SCHECK_PARTIAL();
4955 break;
4956 }
4957 GETCHARLENTEST(c, eptr, len);
4958 if (prop_fail_result) break;
4959 eptr+= len;
4960 }
4961 break;
4962
4963 case PT_LAMP:
4964 for (i = min; i < max; i++)
4965 {
4966 int len = 1;
4967 if (eptr >= md->end_subject)
4968 {
4969 SCHECK_PARTIAL();
4970 break;
4971 }
4972 GETCHARLENTEST(c, eptr, len);
4973 prop_chartype = UCD_CHARTYPE(c);
4974 if ((prop_chartype == ucp_Lu ||
4975 prop_chartype == ucp_Ll ||
4976 prop_chartype == ucp_Lt) == prop_fail_result)
4977 break;
4978 eptr+= len;
4979 }
4980 break;
4981
4982 case PT_GC:
4983 for (i = min; i < max; i++)
4984 {
4985 int len = 1;
4986 if (eptr >= md->end_subject)
4987 {
4988 SCHECK_PARTIAL();
4989 break;
4990 }
4991 GETCHARLENTEST(c, eptr, len);
4992 prop_category = UCD_CATEGORY(c);
4993 if ((prop_category == prop_value) == prop_fail_result)
4994 break;
4995 eptr+= len;
4996 }
4997 break;
4998
4999 case PT_PC:
5000 for (i = min; i < max; i++)
5001 {
5002 int len = 1;
5003 if (eptr >= md->end_subject)
5004 {
5005 SCHECK_PARTIAL();
5006 break;
5007 }
5008 GETCHARLENTEST(c, eptr, len);
5009 prop_chartype = UCD_CHARTYPE(c);
5010 if ((prop_chartype == prop_value) == prop_fail_result)
5011 break;
5012 eptr+= len;
5013 }
5014 break;
5015
5016 case PT_SC:
5017 for (i = min; i < max; i++)
5018 {
5019 int len = 1;
5020 if (eptr >= md->end_subject)
5021 {
5022 SCHECK_PARTIAL();
5023 break;
5024 }
5025 GETCHARLENTEST(c, eptr, len);
5026 prop_script = UCD_SCRIPT(c);
5027 if ((prop_script == prop_value) == prop_fail_result)
5028 break;
5029 eptr+= len;
5030 }
5031 break;
5032
5033 case PT_ALNUM:
5034 for (i = min; i < max; i++)
5035 {
5036 int len = 1;
5037 if (eptr >= md->end_subject)
5038 {
5039 SCHECK_PARTIAL();
5040 break;
5041 }
5042 GETCHARLENTEST(c, eptr, len);
5043 prop_category = UCD_CATEGORY(c);
5044 if ((prop_category == ucp_L || prop_category == ucp_N)
5045 == prop_fail_result)
5046 break;
5047 eptr+= len;
5048 }
5049 break;
5050
5051 case PT_SPACE: /* Perl space */
5052 for (i = min; i < max; i++)
5053 {
5054 int len = 1;
5055 if (eptr >= md->end_subject)
5056 {
5057 SCHECK_PARTIAL();
5058 break;
5059 }
5060 GETCHARLENTEST(c, eptr, len);
5061 prop_category = UCD_CATEGORY(c);
5062 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5063 c == CHAR_FF || c == CHAR_CR)
5064 == prop_fail_result)
5065 break;
5066 eptr+= len;
5067 }
5068 break;
5069
5070 case PT_PXSPACE: /* POSIX space */
5071 for (i = min; i < max; i++)
5072 {
5073 int len = 1;
5074 if (eptr >= md->end_subject)
5075 {
5076 SCHECK_PARTIAL();
5077 break;
5078 }
5079 GETCHARLENTEST(c, eptr, len);
5080 prop_category = UCD_CATEGORY(c);
5081 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5082 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5083 == prop_fail_result)
5084 break;
5085 eptr+= len;
5086 }
5087 break;
5088
5089 case PT_WORD:
5090 for (i = min; i < max; i++)
5091 {
5092 int len = 1;
5093 if (eptr >= md->end_subject)
5094 {
5095 SCHECK_PARTIAL();
5096 break;
5097 }
5098 GETCHARLENTEST(c, eptr, len);
5099 prop_category = UCD_CATEGORY(c);
5100 if ((prop_category == ucp_L || prop_category == ucp_N ||
5101 c == CHAR_UNDERSCORE) == prop_fail_result)
5102 break;
5103 eptr+= len;
5104 }
5105 break;
5106
5107 default:
5108 RRETURN(PCRE_ERROR_INTERNAL);
5109 }
5110
5111 /* eptr is now past the end of the maximum run */
5112
5113 if (possessive) continue;
5114 for(;;)
5115 {
5116 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5118 if (eptr-- == pp) break; /* Stop if tried at original pos */
5119 if (utf8) BACKCHAR(eptr);
5120 }
5121 }
5122
5123 /* Match extended Unicode sequences. We will get here only if the
5124 support is in the binary; otherwise a compile-time error occurs. */
5125
5126 else if (ctype == OP_EXTUNI)
5127 {
5128 for (i = min; i < max; i++)
5129 {
5130 if (eptr >= md->end_subject)
5131 {
5132 SCHECK_PARTIAL();
5133 break;
5134 }
5135 GETCHARINCTEST(c, eptr);
5136 prop_category = UCD_CATEGORY(c);
5137 if (prop_category == ucp_M) break;
5138 while (eptr < md->end_subject)
5139 {
5140 int len = 1;
5141 if (!utf8) c = *eptr; else
5142 {
5143 GETCHARLEN(c, eptr, len);
5144 }
5145 prop_category = UCD_CATEGORY(c);
5146 if (prop_category != ucp_M) break;
5147 eptr += len;
5148 }
5149 }
5150
5151 /* eptr is now past the end of the maximum run */
5152
5153 if (possessive) continue;
5154
5155 for(;;)
5156 {
5157 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5158 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5159 if (eptr-- == pp) break; /* Stop if tried at original pos */
5160 for (;;) /* Move back over one extended */
5161 {
5162 int len = 1;
5163 if (!utf8) c = *eptr; else
5164 {
5165 BACKCHAR(eptr);
5166 GETCHARLEN(c, eptr, len);
5167 }
5168 prop_category = UCD_CATEGORY(c);
5169 if (prop_category != ucp_M) break;
5170 eptr--;
5171 }
5172 }
5173 }
5174
5175 else
5176 #endif /* SUPPORT_UCP */
5177
5178 #ifdef SUPPORT_UTF8
5179 /* UTF-8 mode */
5180
5181 if (utf8)
5182 {
5183 switch(ctype)
5184 {
5185 case OP_ANY:
5186 if (max < INT_MAX)
5187 {
5188 for (i = min; i < max; i++)
5189 {
5190 if (eptr >= md->end_subject)
5191 {
5192 SCHECK_PARTIAL();
5193 break;
5194 }
5195 if (IS_NEWLINE(eptr)) break;
5196 eptr++;
5197 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5198 }
5199 }
5200
5201 /* Handle unlimited UTF-8 repeat */
5202
5203 else
5204 {
5205 for (i = min; i < max; i++)
5206 {
5207 if (eptr >= md->end_subject)
5208 {
5209 SCHECK_PARTIAL();
5210 break;
5211 }
5212 if (IS_NEWLINE(eptr)) break;
5213 eptr++;
5214 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5215 }
5216 }
5217 break;
5218
5219 case OP_ALLANY:
5220 if (max < INT_MAX)
5221 {
5222 for (i = min; i < max; i++)
5223 {
5224 if (eptr >= md->end_subject)
5225 {
5226 SCHECK_PARTIAL();
5227 break;
5228 }
5229 eptr++;
5230 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5231 }
5232 }
5233 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5234 break;
5235
5236 /* The byte case is the same as non-UTF8 */
5237
5238 case OP_ANYBYTE:
5239 c = max - min;
5240 if (c > (unsigned int)(md->end_subject - eptr))
5241 {
5242 eptr = md->end_subject;
5243 SCHECK_PARTIAL();
5244 }
5245 else eptr += c;
5246 break;
5247
5248 case OP_ANYNL:
5249 for (i = min; i < max; i++)
5250 {
5251 int len = 1;
5252 if (eptr >= md->end_subject)
5253 {
5254 SCHECK_PARTIAL();
5255 break;
5256 }
5257 GETCHARLEN(c, eptr, len);
5258 if (c == 0x000d)
5259 {
5260 if (++eptr >= md->end_subject) break;
5261 if (*eptr == 0x000a) eptr++;
5262 }
5263 else
5264 {
5265 if (c != 0x000a &&
5266 (md->bsr_anycrlf ||
5267 (c != 0x000b && c != 0x000c &&
5268 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5269 break;
5270 eptr += len;
5271 }
5272 }
5273 break;
5274
5275 case OP_NOT_HSPACE:
5276 case OP_HSPACE:
5277 for (i = min; i < max; i++)
5278 {
5279 BOOL gotspace;
5280 int len = 1;
5281 if (eptr >= md->end_subject)
5282 {
5283 SCHECK_PARTIAL();
5284 break;
5285 }
5286 GETCHARLEN(c, eptr, len);
5287 switch(c)
5288 {
5289 default: gotspace = FALSE; break;
5290 case 0x09: /* HT */
5291 case 0x20: /* SPACE */
5292 case 0xa0: /* NBSP */
5293 case 0x1680: /* OGHAM SPACE MARK */
5294 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5295 case 0x2000: /* EN QUAD */
5296 case 0x2001: /* EM QUAD */
5297 case 0x2002: /* EN SPACE */
5298 case 0x2003: /* EM SPACE */
5299 case 0x2004: /* THREE-PER-EM SPACE */
5300 case 0x2005: /* FOUR-PER-EM SPACE */
5301 case 0x2006: /* SIX-PER-EM SPACE */
5302 case 0x2007: /* FIGURE SPACE */
5303 case 0x2008: /* PUNCTUATION SPACE */
5304 case 0x2009: /* THIN SPACE */
5305 case 0x200A: /* HAIR SPACE */
5306 case 0x202f: /* NARROW NO-BREAK SPACE */
5307 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5308 case 0x3000: /* IDEOGRAPHIC SPACE */
5309 gotspace = TRUE;
5310 break;
5311 }
5312 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5313 eptr += len;
5314 }
5315 break;
5316
5317 case OP_NOT_VSPACE:
5318 case OP_VSPACE:
5319 for (i = min; i < max; i++)
5320 {
5321 BOOL gotspace;
5322 int len = 1;
5323 if (eptr >= md->end_subject)
5324 {
5325 SCHECK_PARTIAL();
5326 break;
5327 }
5328 GETCHARLEN(c, eptr, len);
5329 switch(c)
5330 {
5331 default: gotspace = FALSE; break;
5332 case 0x0a: /* LF */
5333 case 0x0b: /* VT */
5334 case 0x0c: /* FF */
5335 case 0x0d: /* CR */
5336 case 0x85: /* NEL */
5337 case 0x2028: /* LINE SEPARATOR */
5338 case 0x2029: /* PARAGRAPH SEPARATOR */
5339 gotspace = TRUE;
5340 break;
5341 }
5342 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5343 eptr += len;
5344 }
5345 break;
5346
5347 case OP_NOT_DIGIT:
5348 for (i = min; i < max; i++)
5349 {
5350 int len = 1;
5351 if (eptr >= md->end_subject)
5352 {
5353 SCHECK_PARTIAL();
5354 break;
5355 }
5356 GETCHARLEN(c, eptr, len);
5357 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5358 eptr+= len;
5359 }
5360 break;
5361
5362 case OP_DIGIT:
5363 for (i = min; i < max; i++)
5364 {
5365 int len = 1;
5366 if (eptr >= md->end_subject)
5367 {
5368 SCHECK_PARTIAL();
5369 break;
5370 }
5371 GETCHARLEN(c, eptr, len);
5372 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5373 eptr+= len;
5374 }
5375 break;
5376
5377 case OP_NOT_WHITESPACE:
5378 for (i = min; i < max; i++)
5379 {
5380 int len = 1;
5381 if (eptr >= md->end_subject)
5382 {
5383 SCHECK_PARTIAL();
5384 break;
5385 }
5386 GETCHARLEN(c, eptr, len);
5387 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5388 eptr+= len;
5389 }
5390 break;
5391
5392 case OP_WHITESPACE:
5393 for (i = min; i < max; i++)
5394 {
5395 int len = 1;
5396 if (eptr >= md->end_subject)
5397 {
5398 SCHECK_PARTIAL();
5399 break;
5400 }
5401 GETCHARLEN(c, eptr, len);
5402 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5403 eptr+= len;
5404 }
5405 break;
5406
5407 case OP_NOT_WORDCHAR:
5408 for (i = min; i < max; i++)
5409 {
5410 int len = 1;
5411 if (eptr >= md->end_subject)
5412 {
5413 SCHECK_PARTIAL();
5414 break;
5415 }
5416 GETCHARLEN(c, eptr, len);
5417 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5418 eptr+= len;
5419 }
5420 break;
5421
5422 case OP_WORDCHAR:
5423 for (i = min; i < max; i++)
5424 {
5425 int len = 1;
5426 if (eptr >= md->end_subject)
5427 {
5428 SCHECK_PARTIAL();
5429 break;
5430 }
5431 GETCHARLEN(c, eptr, len);
5432 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5433 eptr+= len;
5434 }
5435 break;
5436
5437 default:
5438 RRETURN(PCRE_ERROR_INTERNAL);
5439 }
5440
5441 /* eptr is now past the end of the maximum run. If possessive, we are
5442 done (no backing up). Otherwise, match at this position; anything other
5443 than no match is immediately returned. For nomatch, back up one
5444 character, unless we are matching \R and the last thing matched was
5445 \r\n, in which case, back up two bytes. */
5446
5447 if (possessive) continue;
5448 for(;;)
5449 {
5450 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5451 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5452 if (eptr-- == pp) break; /* Stop if tried at original pos */
5453 BACKCHAR(eptr);
5454 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5455 eptr[-1] == '\r') eptr--;
5456 }
5457 }
5458 else
5459 #endif /* SUPPORT_UTF8 */
5460
5461 /* Not UTF-8 mode */
5462 {
5463 switch(ctype)
5464 {
5465 case OP_ANY:
5466 for (i = min; i < max; i++)
5467 {
5468 if (eptr >= md->end_subject)
5469 {
5470 SCHECK_PARTIAL();
5471 break;
5472 }
5473 if (IS_NEWLINE(eptr)) break;
5474 eptr++;
5475 }
5476 break;
5477
5478 case OP_ALLANY:
5479 case OP_ANYBYTE:
5480 c = max - min;
5481 if (c > (unsigned int)(md->end_subject - eptr))
5482 {
5483 eptr = md->end_subject;
5484 SCHECK_PARTIAL();
5485 }
5486 else eptr += c;
5487 break;
5488
5489 case OP_ANYNL:
5490 for (i = min; i < max; i++)
5491 {
5492 if (eptr >= md->end_subject)
5493 {
5494 SCHECK_PARTIAL();
5495 break;
5496 }
5497 c = *eptr;
5498 if (c == 0x000d)
5499 {
5500 if (++eptr >= md->end_subject) break;
5501 if (*eptr == 0x000a) eptr++;
5502 }
5503 else
5504 {
5505 if (c != 0x000a &&
5506 (md->bsr_anycrlf ||
5507 (c != 0x000b && c != 0x000c && c != 0x0085)))
5508 break;
5509 eptr++;
5510 }
5511 }
5512 break;
5513
5514 case OP_NOT_HSPACE:
5515 for (i = min; i < max; i++)
5516 {
5517 if (eptr >= md->end_subject)
5518 {
5519 SCHECK_PARTIAL();
5520 break;
5521 }
5522 c = *eptr;
5523 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5524 eptr++;
5525 }
5526 break;
5527
5528 case OP_HSPACE:
5529 for (i = min; i < max; i++)
5530 {
5531 if (eptr >= md->end_subject)
5532 {
5533 SCHECK_PARTIAL();
5534 break;
5535 }
5536 c = *eptr;
5537 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5538 eptr++;
5539 }
5540 break;
5541
5542 case OP_NOT_VSPACE:
5543 for (i = min; i < max; i++)
5544 {
5545 if (eptr >= md->end_subject)
5546 {
5547 SCHECK_PARTIAL();
5548 break;
5549 }
5550 c = *eptr;
5551 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5552 break;
5553 eptr++;
5554 }
5555 break;
5556
5557 case OP_VSPACE:
5558 for (i = min; i < max; i++)
5559 {
5560 if (eptr >= md->end_subject)
5561 {
5562 SCHECK_PARTIAL();
5563 break;
5564 }
5565 c = *eptr;
5566 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5567 break;
5568 eptr++;
5569 }
5570 break;
5571
5572 case OP_NOT_DIGIT:
5573 for (i = min; i < max; i++)
5574 {
5575 if (eptr >= md->end_subject)
5576 {
5577 SCHECK_PARTIAL();
5578 break;
5579 }
5580 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5581 eptr++;
5582 }
5583 break;
5584
5585 case OP_DIGIT:
5586 for (i = min; i < max; i++)
5587 {
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5594 eptr++;
5595 }
5596 break;
5597
5598 case OP_NOT_WHITESPACE:
5599 for (i = min; i < max; i++)
5600 {
5601 if (eptr >= md->end_subject)
5602 {
5603 SCHECK_PARTIAL();
5604 break;
5605 }
5606 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5607 eptr++;
5608 }
5609 break;
5610
5611 case OP_WHITESPACE:
5612 for (i = min; i < max; i++)
5613 {
5614 if (eptr >= md->end_subject)
5615 {
5616 SCHECK_PARTIAL();
5617 break;
5618 }
5619 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5620 eptr++;
5621 }
5622 break;
5623
5624 case OP_NOT_WORDCHAR:
5625 for (i = min; i < max; i++)
5626 {
5627 if (eptr >= md->end_subject)
5628 {
5629 SCHECK_PARTIAL();
5630 break;
5631 }
5632 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5633 eptr++;
5634 }
5635 break;
5636
5637 case OP_WORDCHAR:
5638 for (i = min; i < max; i++)
5639 {
5640 if (eptr >= md->end_subject)
5641 {
5642 SCHECK_PARTIAL();
5643 break;
5644 }
5645 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5646 eptr++;
5647 }
5648 break;
5649
5650 default:
5651 RRETURN(PCRE_ERROR_INTERNAL);
5652 }
5653
5654 /* eptr is now past the end of the maximum run. If possessive, we are
5655 done (no backing up). Otherwise, match at this position; anything other
5656 than no match is immediately returned. For nomatch, back up one
5657 character (byte), unless we are matching \R and the last thing matched
5658 was \r\n, in which case, back up two bytes. */
5659
5660 if (possessive) continue;
5661 while (eptr >= pp)
5662 {
5663 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5664 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5665 eptr--;
5666 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5667 eptr[-1] == '\r') eptr--;
5668 }
5669 }
5670
5671 /* Get here if we can't make it match with any permitted repetitions */
5672
5673 MRRETURN(MATCH_NOMATCH);
5674 }
5675 /* Control never gets here */
5676
5677 /* There's been some horrible disaster. Arrival here can only mean there is
5678 something seriously wrong in the code above or the OP_xxx definitions. */
5679
5680 default:
5681 DPRINTF(("Unknown opcode %d\n", *ecode));
5682 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5683 }
5684
5685 /* Do not stick any code in here without much thought; it is assumed
5686 that "continue" in the code above comes out to here to repeat the main
5687 loop. */
5688
5689 } /* End of main loop */
5690 /* Control never reaches here */
5691
5692
5693 /* When compiling to use the heap rather than the stack for recursive calls to
5694 match(), the RRETURN() macro jumps here. The number that is saved in
5695 frame->Xwhere indicates which label we actually want to return to. */
5696
5697 #ifdef NO_RECURSE
5698 #define LBL(val) case val: goto L_RM##val;
5699 HEAP_RETURN:
5700 switch (frame->Xwhere)
5701 {
5702 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5703 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5704 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5705 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5706 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5707 #ifdef SUPPORT_UTF8
5708 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5709 LBL(32) LBL(34) LBL(42) LBL(46)
5710 #ifdef SUPPORT_UCP
5711 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5712 LBL(59) LBL(60) LBL(61) LBL(62)
5713 #endif /* SUPPORT_UCP */
5714 #endif /* SUPPORT_UTF8 */
5715 default:
5716 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5717 return PCRE_ERROR_INTERNAL;
5718 }
5719 #undef LBL
5720 #endif /* NO_RECURSE */
5721 }
5722
5723
5724 /***************************************************************************
5725 ****************************************************************************
5726 RECURSION IN THE match() FUNCTION
5727
5728 Undefine all the macros that were defined above to handle this. */
5729
5730 #ifdef NO_RECURSE /* Everything down to the matching #endif applies only to NO_RECURSE builds */
5731 #undef eptr /* eptr, ecode, offset_top, eptrb: passed to the RMATCH() calls in match() */
5732 #undef ecode
5733 #undef mstart
5734 #undef offset_top
5735 #undef eptrb
5736 #undef flags /* pcre_exec() below declares its own local called flags */
5737
5738 #undef callpat
5739 #undef charptr
5740 #undef data
5741 #undef next
5742 #undef pp /* match() uses pp to remember where a maximizing repeat started */
5743 #undef prev
5744 #undef saved_eptr
5745
5746 #undef new_recursive
5747
5748 #undef cur_is_word
5749 #undef condition
5750 #undef prev_is_word
5751
5752 #undef ctype /* ctype, max, min: used by the character-type repeat code in match() */
5753 #undef length /* pcre_exec() below has a parameter called length */
5754 #undef max
5755 #undef min
5756 #undef number
5757 #undef offset
5758 #undef op
5759 #undef save_capture_last
5760 #undef save_offset1
5761 #undef save_offset2
5762 #undef save_offset3
5763 #undef stacksave
5764
5765 #undef newptrb
5766
5767 #endif /* NO_RECURSE */
5768
5769 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally, outside the #ifdef above. */
5770
5771 #undef fc
5772 #undef fi /* fi is the counter of the minimizing repeat loops in match() */
5773
5774 /***************************************************************************
5775 ***************************************************************************/
5776
5777
5778
5779 /*************************************************
5780 * Execute a Regular Expression *
5781 *************************************************/
5782
5783 /* This function applies a compiled re to a subject string and picks out
5784 portions of the string if it matches. Two elements in the vector are set for
5785 each substring: the offsets to the start and end of the substring.
5786
5787 Arguments:
5788 argument_re points to the compiled expression
5789 extra_data points to extra data or is NULL
5790 subject points to the subject string
5791 length length of subject string (may contain binary zeros)
5792 start_offset where to start in the subject string
5793 options option bits
5794 offsets points to a vector of ints to be filled in with offsets
5795 offsetcount the number of elements in the vector
5796
5797 Returns: > 0 => success; value is the number of elements filled in
5798 = 0 => success, but offsets is not big enough
5799 -1 => failed to match
5800 < -1 => some kind of unexpected problem
5801 */
5802
5803 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5804 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5805 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5806 int offsetcount)
5807 {
5808 int rc, resetcount, ocount;
5809 int first_byte = -1;
5810 int req_byte = -1;
5811 int req_byte2 = -1;
5812 int newline;
5813 BOOL using_temporary_offsets = FALSE;
5814 BOOL anchored;
5815 BOOL startline;
5816 BOOL firstline;
5817 BOOL first_byte_caseless = FALSE;
5818 BOOL req_byte_caseless = FALSE;
5819 BOOL utf8;
5820 match_data match_block;
5821 match_data *md = &match_block;
5822 const uschar *tables;
5823 const uschar *start_bits = NULL;
5824 USPTR start_match = (USPTR)subject + start_offset;
5825 USPTR end_subject;
5826 USPTR start_partial = NULL;
5827 USPTR req_byte_ptr = start_match - 1;
5828
5829 pcre_study_data internal_study;
5830 const pcre_study_data *study;
5831
5832 real_pcre internal_re;
5833 const real_pcre *external_re = (const real_pcre *)argument_re;
5834 const real_pcre *re = external_re;
5835
5836 /* Plausibility checks */
5837
5838 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5839 if (re == NULL || subject == NULL ||
5840 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5841 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5842 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5843
5844 /* This information is for finding all the numbers associated with a given
5845 name, for condition testing. */
5846
5847 md->name_table = (uschar *)re + re->name_table_offset;
5848 md->name_count = re->name_count;
5849 md->name_entry_size = re->name_entry_size;
5850
5851 /* Fish out the optional data from the extra_data structure, first setting
5852 the default values. */
5853
5854 study = NULL;
5855 md->match_limit = MATCH_LIMIT;
5856 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5857 md->callout_data = NULL;
5858
5859 /* The table pointer is always in native byte order. */
5860
5861 tables = external_re->tables;
5862
5863 if (extra_data != NULL)
5864 {
5865 register unsigned int flags = extra_data->flags;
5866 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5867 study = (const pcre_study_data *)extra_data->study_data;
5868 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5869 md->match_limit = extra_data->match_limit;
5870 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5871 md->match_limit_recursion = extra_data->match_limit_recursion;
5872 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5873 md->callout_data = extra_data->callout_data;
5874 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5875 }
5876
5877 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5878 is a feature that makes it possible to save compiled regexes and re-use them
5879 in other programs later. */
5880
5881 if (tables == NULL) tables = _pcre_default_tables;
5882
5883 /* Check that the first field in the block is the magic number. If it is not,
5884 test for a regex that was compiled on a host of opposite endianness. If this is
5885 the case, flipped values are put in internal_re and internal_study if there was
5886 study data too. */
5887
5888 if (re->magic_number != MAGIC_NUMBER)
5889 {
5890 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5891 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5892 if (study != NULL) study = &internal_study;
5893 }
5894
5895 /* Set up other data */
5896
5897 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5898 startline = (re->flags & PCRE_STARTLINE) != 0;
5899 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5900
5901 /* The code starts after the real_pcre block and the capture name table. */
5902
5903 md->start_code = (const uschar *)external_re + re->name_table_offset +
5904 re->name_count * re->name_entry_size;
5905
5906 md->start_subject = (USPTR)subject;
5907 md->start_offset = start_offset;
5908 md->end_subject = md->start_subject + length;
5909 end_subject = md->end_subject;
5910
5911 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5912 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5913 md->use_ucp = (re->options & PCRE_UCP) != 0;
5914 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5915
5916 md->notbol = (options & PCRE_NOTBOL) != 0;
5917 md->noteol = (options & PCRE_NOTEOL) != 0;
5918 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5919 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5920 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5921 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5922 md->hitend = FALSE;
5923 md->mark = NULL; /* In case never set */
5924
5925 md->recursive = NULL; /* No recursion at top level */
5926
5927 md->lcc = tables + lcc_offset;
5928 md->ctypes = tables + ctypes_offset;
5929
5930 /* Handle different \R options. */
5931
5932 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5933 {
5934 case 0:
5935 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5936 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5937 else
5938 #ifdef BSR_ANYCRLF
5939 md->bsr_anycrlf = TRUE;
5940 #else
5941 md->bsr_anycrlf = FALSE;
5942 #endif
5943 break;
5944
5945 case PCRE_BSR_ANYCRLF:
5946 md->bsr_anycrlf = TRUE;
5947 break;
5948
5949 case PCRE_BSR_UNICODE:
5950 md->bsr_anycrlf = FALSE;
5951 break;
5952
5953 default: return PCRE_ERROR_BADNEWLINE;
5954 }
5955
5956 /* Handle different types of newline. The three bits give eight cases. If
5957 nothing is set at run time, whatever was used at compile time applies. */
5958
5959 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5960 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5961 {
5962 case 0: newline = NEWLINE; break; /* Compile-time default */
5963 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5964 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5965 case PCRE_NEWLINE_CR+
5966 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5967 case PCRE_NEWLINE_ANY: newline = -1; break;
5968 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5969 default: return PCRE_ERROR_BADNEWLINE;
5970 }
5971
5972 if (newline == -2)
5973 {
5974 md->nltype = NLTYPE_ANYCRLF;
5975 }
5976 else if (newline < 0)
5977 {
5978 md->nltype = NLTYPE_ANY;
5979 }
5980 else
5981 {
5982 md->nltype = NLTYPE_FIXED;
5983 if (newline > 255)
5984 {
5985 md->nllen = 2;
5986 md->nl[0] = (newline >> 8) & 255;
5987 md->nl[1] = newline & 255;
5988 }
5989 else
5990 {
5991 md->nllen = 1;
5992 md->nl[0] = newline;
5993 }
5994 }
5995
5996 /* Partial matching was originally supported only for a restricted set of
5997 regexes; from release 8.00 there are no restrictions, but the bits are still
5998 defined (though never set). So there's no harm in leaving this code. */
5999
6000 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6001 return PCRE_ERROR_BADPARTIAL;
6002
6003 /* Check a UTF-8 string if required. Pass back the character offset and error
6004 code for an invalid string if a results vector is available. */
6005
6006 #ifdef SUPPORT_UTF8
6007 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6008 {
6009 int erroroffset;
6010 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
6011 if (errorcode != 0)
6012 {
6013 if (offsetcount >= 2)
6014 {
6015 offsets[0] = erroroffset;
6016 offsets[1] = errorcode;
6017 }
6018 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6019 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6020 }
6021
6022 /* Check that a start_offset points to the start of a UTF-8 character. */
6023
6024 if (start_offset > 0 && start_offset < length &&
6025 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6026 return PCRE_ERROR_BADUTF8_OFFSET;
6027 }
6028 #endif
6029
6030 /* If the expression has got more back references than the offsets supplied can
6031 hold, we get a temporary chunk of working store to use during the matching.
6032 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6033 of 3. */
6034
6035 ocount = offsetcount - (offsetcount % 3);
6036
6037 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6038 {
6039 ocount = re->top_backref * 3 + 3;
6040 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6041 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6042 using_temporary_offsets = TRUE;
6043 DPRINTF(("Got memory to hold back references\n"));
6044 }
6045 else md->offset_vector = offsets;
6046
6047 md->offset_end = ocount;
6048 md->offset_max = (2*ocount)/3;
6049 md->offset_overflow = FALSE;
6050 md->capture_last = -1;
6051
6052 /* Compute the minimum number of offsets that we need to reset each time. Doing
6053 this makes a huge difference to execution time when there aren't many brackets
6054 in the pattern. */
6055
6056 resetcount = 2 + re->top_bracket * 2;
6057 if (resetcount > offsetcount) resetcount = ocount;
6058
6059 /* Reset the working variable associated with each extraction. These should
6060 never be used unless previously set, but they get saved and restored, and so we
6061 initialize them to avoid reading uninitialized locations. */
6062
6063 if (md->offset_vector != NULL)
6064 {
6065 register int *iptr = md->offset_vector + ocount;
6066 register int *iend = iptr - resetcount/2 + 1;
6067 while (--iptr >= iend) *iptr = -1;
6068 }
6069
6070 /* Set up the first character to match, if available. The first_byte value is
6071 never set for an anchored regular expression, but the anchoring may be forced
6072 at run time, so we have to test for anchoring. The first char may be unset for
6073 an unanchored pattern, of course. If there's no first char and the pattern was
6074 studied, there may be a bitmap of possible first characters. */
6075
6076 if (!anchored)
6077 {
6078 if ((re->flags & PCRE_FIRSTSET) != 0)
6079 {
6080 first_byte = re->first_byte & 255;
6081 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6082 first_byte = md->lcc[first_byte];
6083 }
6084 else
6085 if (!startline && study != NULL &&
6086 (study->flags & PCRE_STUDY_MAPPED) != 0)
6087 start_bits = study->start_bits;
6088 }
6089
6090 /* For anchored or unanchored matches, there may be a "last known required
6091 character" set. */
6092
6093 if ((re->flags & PCRE_REQCHSET) != 0)
6094 {
6095 req_byte = re->req_byte & 255;
6096 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6097 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6098 }
6099
6100
6101 /* ==========================================================================*/
6102
6103 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6104 the loop runs just once. */
6105
6106 for(;;)
6107 {
6108 USPTR save_end_subject = end_subject;
6109 USPTR new_start_match;
6110
6111 /* Reset the maximum number of extractions we might see. */
6112
6113 if (md->offset_vector != NULL)
6114 {
6115 register int *iptr = md->offset_vector;
6116 register int *iend = iptr + resetcount;
6117 while (iptr < iend) *iptr++ = -1;
6118 }
6119
6120 /* If firstline is TRUE, the start of the match is constrained to the first
6121 line of a multiline string. That is, the match must be before or at the first
6122 newline. Implement this by temporarily adjusting end_subject so that we stop
6123 scanning at a newline. If the match fails at the newline, later code breaks
6124 this loop. */
6125
6126 if (firstline)
6127 {
6128 USPTR t = start_match;
6129 #ifdef SUPPORT_UTF8
6130 if (utf8)
6131 {
6132 while (t < md->end_subject && !IS_NEWLINE(t))
6133 {
6134 t++;
6135 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6136 }
6137 }
6138 else
6139 #endif
6140 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6141 end_subject = t;
6142 }
6143
6144 /* There are some optimizations that avoid running the match if a known
6145 starting point is not found, or if a known later character is not present.
6146 However, there is an option that disables these, for testing and for ensuring
6147 that all callouts do actually occur. The option can be set in the regex by
6148 (*NO_START_OPT) or passed in match-time options. */
6149
6150 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6151 {
6152 /* Advance to a unique first byte if there is one. */
6153
6154 if (first_byte >= 0)
6155 {
6156 if (first_byte_caseless)
6157 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6158 start_match++;
6159 else
6160 while (start_match < end_subject && *start_match != first_byte)
6161 start_match++;
6162 }
6163
6164 /* Or to just after a linebreak for a multiline match */
6165
6166 else if (startline)
6167 {
6168 if (start_match > md->start_subject + start_offset)
6169 {
6170 #ifdef SUPPORT_UTF8
6171 if (utf8)
6172 {
6173 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6174 {
6175 start_match++;
6176 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6177 start_match++;
6178 }
6179 }
6180 else
6181 #endif
6182 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6183 start_match++;
6184
6185 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6186 and we are now at a LF, advance the match position by one more character.
6187 */
6188
6189 if (start_match[-1] == CHAR_CR &&
6190 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6191 start_match < end_subject &&
6192 *start_match == CHAR_NL)
6193 start_match++;
6194 }
6195 }
6196
6197 /* Or to a non-unique first byte after study */
6198
6199 else if (start_bits != NULL)
6200 {
6201 while (start_match < end_subject)
6202 {
6203 register unsigned int c = *start_match;
6204 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6205 {
6206 start_match++;
6207 #ifdef SUPPORT_UTF8
6208 if (utf8)
6209 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6210 start_match++;
6211 #endif
6212 }
6213 else break;
6214 }
6215 }
6216 } /* Starting optimizations */
6217
6218 /* Restore fudged end_subject */
6219
6220 end_subject = save_end_subject;
6221
6222 /* The next two optimizations are skipped for partial matching or when the
6223 match-time options (not re->options) include PCRE_NO_START_OPTIMIZE. */
6224
6225 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6226 {
6227 /* If the pattern was studied, a minimum subject length may be set. This is
6228 only a lower bound; there need not be any string of exactly that length that
6229 matches the pattern. Although the value is, strictly, in characters, we treat
6230 it as bytes to avoid spending too much time in this optimization. */
6231
6232 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6233 (pcre_uint32)(end_subject - start_match) < study->minlength)
6234 {
6235 rc = MATCH_NOMATCH;
6236 break;
6237 }
6238
6239 /* If req_byte is set, we know that that character must appear in the
6240 subject for the match to succeed. If the first character is set, req_byte
6241 must be later in the subject; otherwise the test starts at the match point.
6242 This optimization can save a huge amount of backtracking in patterns with
6243 nested unlimited repeats that aren't going to match. Writing separate code
6244 for cased/caseless versions makes it go faster, as does using an
6245 autoincrement and backing off on a match.
6246
6247 HOWEVER: when the subject string is very, very long, searching to its end
6248 can take a long time, and give bad performance on quite ordinary patterns.
6249 This showed up when somebody was matching something like /^\d+C/ on a
6250 32-megabyte string... so we don't do this when the string is sufficiently
6251 long. */
6252
6253 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6254 {
6255 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6256
6257 /* We don't need to repeat the search if we haven't yet reached the
6258 place we found it at last time. */
6259
6260 if (p > req_byte_ptr)
6261 {
6262 if (req_byte_caseless)
6263 {
6264 while (p < end_subject)
6265 {
6266 register int pp = *p++;
6267 if (pp == req_byte || pp == req_byte2) { p--; break; }
6268 }
6269 }
6270 else
6271 {
6272 while (p < end_subject)
6273 {
6274 if (*p++ == req_byte) { p--; break; }
6275 }
6276 }
6277
6278 /* If we can't find the required character, break the matching loop,
6279 forcing a match failure. */
6280
6281 if (p >= end_subject)
6282 {
6283 rc = MATCH_NOMATCH;
6284 break;
6285 }
6286
6287 /* If we have found the required character, save the point where we
6288 found it, so that we don't search again next time round the loop if
6289 the start hasn't passed this character yet. */
6290
6291 req_byte_ptr = p;
6292 }
6293 }
6294 }
6295
6296 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6297 printf(">>>> Match against: ");
6298 pchars(start_match, end_subject - start_match, TRUE, md);
6299 printf("\n");
6300 #endif
6301
6302 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6303 first starting point for which a partial match was found. */
6304
6305 md->start_match_ptr = start_match;
6306 md->start_used_ptr = start_match;
6307 md->match_call_count = 0;
6308 md->match_function_type = 0;
6309 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6310 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6311
6312 switch(rc)
6313 {
6314 /* SKIP passes back the next starting point explicitly, but if it is the
6315 same as the match we have just done, treat it as NOMATCH. */
6316
6317 case MATCH_SKIP:
6318 if (md->start_match_ptr != start_match)
6319 {
6320 new_start_match = md->start_match_ptr;
6321 break;
6322 }
6323 /* Fall through */
6324
6325 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6326 the SKIP's arg was not found. We also treat this as NOMATCH. */
6327
6328 case MATCH_SKIP_ARG:
6329 /* Fall through */
6330
6331 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6332 exactly like PRUNE. */
6333
6334 case MATCH_NOMATCH:
6335 case MATCH_PRUNE:
6336 case MATCH_THEN:
6337 new_start_match = start_match + 1;
6338 #ifdef SUPPORT_UTF8
6339 if (utf8)
6340 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6341 new_start_match++;
6342 #endif
6343 break;
6344
6345 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6346
6347 case MATCH_COMMIT:
6348 rc = MATCH_NOMATCH;
6349 goto ENDLOOP;
6350
6351 /* Any other return is either a match, or some kind of error. */
6352
6353 default:
6354 goto ENDLOOP;
6355 }
6356
6357 /* Control reaches here for the various types of "no match at this point"
6358 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6359
6360 rc = MATCH_NOMATCH;
6361
6362 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6363 newline in the subject (though it may continue over the newline). Therefore,
6364 if we have just failed to match, starting at a newline, do not continue. */
6365
6366 if (firstline && IS_NEWLINE(start_match)) break;
6367
6368 /* Advance to new matching position */
6369
6370 start_match = new_start_match;
6371
6372 /* Break the loop if the pattern is anchored or if we have passed the end of
6373 the subject. */
6374
6375 if (anchored || start_match > end_subject) break;
6376
6377 /* If we have just passed a CR and we are now at a LF, and the pattern does
6378 not contain any explicit matches for \r or \n, and the newline option is CRLF
6379 or ANY or ANYCRLF, advance the match position by one more character. */
6380
6381 if (start_match[-1] == CHAR_CR &&
6382 start_match < end_subject &&
6383 *start_match == CHAR_NL &&
6384 (re->flags & PCRE_HASCRORLF) == 0 &&
6385 (md->nltype == NLTYPE_ANY ||
6386 md->nltype == NLTYPE_ANYCRLF ||
6387 md->nllen == 2))
6388 start_match++;
6389
6390 md->mark = NULL; /* Reset for start of next match attempt */
6391 } /* End of for(;;) "bumpalong" loop */
6392
6393 /* ==========================================================================*/
6394
6395 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6396 conditions is true:
6397
6398 (1) The pattern is anchored or the match was failed by (*COMMIT);
6399
6400 (2) We are past the end of the subject;
6401
6402 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6403 this option requests that a match occur at or before the first newline in
6404 the subject.
6405
6406 When we have a match and the offset vector is big enough to deal with any
6407 backreferences, captured substring offsets will already be set up. In the case
6408 where we had to get some local store to hold offsets for backreference
6409 processing, copy those that we can. In this case there need not be overflow if
6410 certain parts of the pattern were not used, even though there are more
6411 capturing parentheses than vector slots. */
6412
6413 ENDLOOP:
6414
6415 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6416 {
6417 if (using_temporary_offsets)
6418 {
6419 if (offsetcount >= 4)
6420 {
6421 memcpy(offsets + 2, md->offset_vector + 2,
6422 (offsetcount - 2) * sizeof(int));
6423 DPRINTF(("Copied offsets from temporary memory\n"));
6424 }
6425 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6426 DPRINTF(("Freeing temporary memory\n"));
6427 (pcre_free)(md->offset_vector);
6428 }
6429
6430 /* Set the return code to the number of captured strings, or 0 if there are
6431 too many to fit into the vector. */
6432
6433 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6434
6435 /* If there is space, set up the whole thing as substring 0. The value of
6436 md->start_match_ptr might be modified if \K was encountered on the successful
6437 matching path. */
6438
6439 if (offsetcount < 2) rc = 0; else
6440 {
6441 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6442 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6443 }
6444
6445 DPRINTF((">>>> returning %d\n", rc));
6446 goto RETURN_MARK;
6447 }
6448
6449 /* Control gets here if there has been an error, or if the overall match
6450 attempt has failed at all permitted starting positions. */
6451
6452 if (using_temporary_offsets)
6453 {
6454 DPRINTF(("Freeing temporary memory\n"));
6455 (pcre_free)(md->offset_vector);
6456 }
6457
6458 /* For anything other than nomatch or partial match, just return the code. */
6459
6460 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6461 {
6462 DPRINTF((">>>> error: returning %d\n", rc));
6463 return rc;
6464 }
6465
6466 /* Handle partial matches - disable any mark data */
6467
6468 if (start_partial != NULL)
6469 {
6470 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6471 md->mark = NULL;
6472 if (offsetcount > 1)
6473 {
6474 offsets[0] = (int)(start_partial - (USPTR)subject);
6475 offsets[1] = (int)(end_subject - (USPTR)subject);
6476 }
6477 rc = PCRE_ERROR_PARTIAL;
6478 }
6479
6480 /* This is the classic nomatch case */
6481
6482 else
6483 {
6484 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6485 rc = PCRE_ERROR_NOMATCH;
6486 }
6487
6488 /* Return the MARK data if it has been requested. */
6489
6490 RETURN_MARK:
6491
6492 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6493 *(extra_data->mark) = (unsigned char *)(md->mark);
6494 return rc;
6495 }
6496
6497 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5