/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 618 - (show annotations)
Sat Jul 16 17:24:16 2011 UTC (8 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 195257 byte(s)
Re-do atomic group processing to fix backtrack capture bugs. Recursion is also 
re-worked.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. On seeing MATCH_CBEGROUP, match() adds the current subject pointer to the eptrb chain and resets the field to 0. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1 /* The pattern matched */
71 #define MATCH_NOMATCH 0 /* The pattern failed to match */
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999) /* NOTE(review): presumably the (*ACCEPT) return; its use is not in this part of the file */
77 #define MATCH_COMMIT (-998) /* Returned by OP_COMMIT when the rest of the pattern fails */
78 #define MATCH_KETRPOS (-997) /* Returned by OP_KETRPOS at the end of a possessive repeated group */
79 #define MATCH_ONCE (-996) /* End of an atomic group was reached but later matching failed */
80 #define MATCH_PRUNE (-995) /* Returned by OP_PRUNE and OP_PRUNE_ARG */
81 #define MATCH_SKIP (-994) /* Returned by OP_SKIP; md->start_match_ptr holds the subject position */
82 #define MATCH_SKIP_ARG (-993) /* Returned by OP_SKIP_ARG; md->start_match_ptr holds the skip name */
83 #define MATCH_THEN (-992) /* Returned by OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the branch start */
84
85 /* This is a convenience macro for code that occurs many times. It copies the
current markptr into md->mark and then leaves match() through RRETURN(ra), so
md and markptr must be in scope wherever it is expanded. The expansion is a
brace-enclosed block, not a do { } while (0) statement. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30 /* Dimension of stacksave[] in match() and of Xstacksave[] in heapframe */
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
Read column-wise, the six (min,max) pairs are (0,inf) (0,inf) (1,inf) (1,inf)
(0,1) (0,1). NOTE(review): the code that indexes these tables is not in this
part of the file; presumably the index is derived from the repeat opcode -
confirm at the point of use. */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Debug-only helper: write length characters starting at p to stdout. Bytes
112 that isprint() accepts are written as themselves, all others as \xhh escapes.
113
114 Arguments:
115 p first character to print
116 length how many characters to print
117 is_subject TRUE to clip length so that printing stops at md->end_subject
118 md pointer to matching data block; only read when is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int ch;
127 if (is_subject && md->end_subject - p < length) length = md->end_subject - p;
128 for (; length > 0; length--)
129 { ch = *p++; if (isprint(ch)) printf("%c", ch); else printf("\\x%02x", ch); }
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector of the slot holding the reference start
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (length > md->end_subject - eptr) return -1; /* Compare lengths: eptr + length could point beyond the subject buffer */
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (length > md->end_subject - eptr) return -1; /* Same length-based space check as in the caseless path */
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return (int)(eptr - eptr_start); /* Subject bytes consumed; may differ from length in caseless UTF-8 mode */
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. RM1 is 1 and the remaining names count up
consecutively to RM66. A value is passed as the rw argument of RMATCH; the
NO_RECURSE version of that macro stores it in the current frame's Xwhere field
and pastes the name into the label (L_RMnn) that follows the call. */
273
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63, RM64, RM65, RM66 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register /* Storage class applied to match()'s eptr and ecode parameters */
288
289 #ifdef PCRE_DEBUG /* Debugging versions: trace every call and return with printf.
Both RMATCH variants call match() with mstart, markptr and rdepth+1 taken from
the expansion site and store the result in rrc, so those names must be in
scope. The debugging RRETURN mentions ra twice (once in the printf, once in
the return), so its argument is evaluated twice. */
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else /* Production versions: a plain recursive call and a plain return */
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif /* PCRE_DEBUG */
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes.

RMATCH obtains a new heapframe from pcre_stack_malloc, leaving through
RRETURN(PCRE_ERROR_NOMEMORY) if that fails. It then records rw in the current
frame's Xwhere, fills in the argument fields of the new frame (Xrdepth is the
current depth plus one), links the new frame to the current one through
Xprevframe, makes it current, and jumps to HEAP_RECURSE. The label L_<rw>
placed after the jump is where execution is meant to resume; the code at
HEAP_RETURN that gets there is not in this part of the file.

RRETURN pops the current frame: it makes Xprevframe the current frame and
frees the old one with pcre_stack_free. If a frame remains, ra is stored in
rrc and control goes to HEAP_RETURN; otherwise ra is returned from match().

NOTE(review): RRETURN reads frame->Xprevframe unconditionally, so it must not
be expanded while frame is NULL. The allocation-failure check at the top of
match() ("if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);") does exactly
that - confirm and fix there. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame. One
frame is allocated per RMATCH "call" when NO_RECURSE is defined; inside match()
each field is reached through a #define that maps the plain variable name (for
example eptr) onto frame->X<name>. */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe; /* Calling frame; NULL marks the top level */
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's value plus one */
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word; /* match() also uses this slot under the name allow_zero */
380 BOOL Xcondition; /* match() also uses this slot as cbegroup and condassert */
381 BOOL Xprev_is_word; /* match() also uses this slot under the name matched_once */
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xprop_category;
388 int Xprop_chartype;
389 int Xprop_script;
390 int Xoclength;
391 uschar Xocchars[8];
392 #endif
393
394 int Xcodelink; /* match() also uses this slot under the name code_offset */
395 int Xctype;
396 unsigned int Xfc;
397 int Xfi;
398 int Xlength;
399 int Xmax;
400 int Xmin;
401 int Xnumber;
402 int Xoffset;
403 int Xop;
404 int Xsave_capture_last;
405 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
406 int Xstacksave[REC_STACK_SAVE_MAX];
407
408 eptrblock Xnewptrb;
409
410 /* Where to jump back to */
411
412 int Xwhere; /* RMnn value stored by RMATCH just before it starts a new frame */
413
414 } heapframe;
415
416 #endif
417
418
419 /***************************************************************************
420 ***************************************************************************/
421
422
423
424 /*************************************************
425 * Match from current position *
426 *************************************************/
427
428 /* This function is called recursively in many circumstances. Whenever it
429 returns a negative (error) response, the outer incarnation must also return the
430 same response. */
431
432 /* These macros pack up tests that are used for partial matching, and which
433 appear several times in the code. We set the "hit end" flag (md->hitend) if the
434 pointer is at the end of the subject and also past md->start_used_ptr (i.e.
435 something has been matched). For hard partial matching (md->partial > 1), we
436 then return PCRE_ERROR_PARTIAL immediately. The second one is used when we
437 already know we are past the end of the subject, so it omits the end-of-subject
test. Both rely on eptr and md being in scope (and, through MRRETURN, markptr),
and each expands to a bare if statement, so take care when one is used directly
before an else. */
438
439 #define CHECK_PARTIAL()\
440 if (md->partial != 0 && eptr >= md->end_subject && \
441 eptr > md->start_used_ptr) \
442 { \
443 md->hitend = TRUE; \
444 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
445 }
446
447 #define SCHECK_PARTIAL()\
448 if (md->partial != 0 && eptr > md->start_used_ptr) \
449 { \
450 md->hitend = TRUE; \
451 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
452 }
453
454
455 /* Performance note: It might be tempting to extract commonly used fields from
456 the md structure (e.g. utf8, end_subject) into individual variables to improve
457 performance. Tests using gcc on a SPARC disproved this; in the first case, it
458 made performance worse.
459
460 Arguments:
461 eptr pointer to current character in subject
462 ecode pointer to current position in compiled code
463 mstart pointer to the current match start position (can be modified
464 by encountering \K)
465 markptr pointer to the most recent MARK name, or NULL
466 offset_top current top pointer
467 md pointer to "static" info for the match
468 eptrb pointer to chain of blocks containing eptr at start of
469 brackets - for testing for empty matches
470 rdepth the recursion depth
471
472 Returns: MATCH_MATCH if matched ) these values are >= 0
473 MATCH_NOMATCH if failed to match )
474 a negative MATCH_xxx value for PRUNE, SKIP, etc
475 a negative PCRE_ERROR_xxx value if aborted by an error condition
476 (e.g. stopped by repeated call or recursion limit)
477 */
478
479 static int
480 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
481 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
482 unsigned int rdepth)
483 {
484 /* These variables do not need to be preserved over recursion in this function,
485 so they can be ordinary variables in all cases. Mark some of them with
486 "register" because they are used a lot in loops. */
487
488 register int rrc; /* Returns from recursive calls */
489 register int i; /* Used for loops not involving calls to RMATCH() */
490 register unsigned int c; /* Character values not kept over RMATCH() calls */
491 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
492
493 BOOL minimize, possessive; /* Quantifier options */
494 BOOL caseless;
495 int condcode;
496
497 /* When recursion is not being used, all "local" variables that have to be
498 preserved over calls to RMATCH() are part of a "frame" which is obtained from
499 heap storage. Set up the top-level frame here; others are obtained from the
500 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
501
502 #ifdef NO_RECURSE
503 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
504 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
505 frame->Xprevframe = NULL; /* Marks the top level */
506
507 /* Copy in the original argument variables */
508
509 frame->Xeptr = eptr;
510 frame->Xecode = ecode;
511 frame->Xmstart = mstart;
512 frame->Xmarkptr = markptr;
513 frame->Xoffset_top = offset_top;
514 frame->Xeptrb = eptrb;
515 frame->Xrdepth = rdepth;
516
517 /* This is where control jumps back to in order to effect "recursion" */
518
519 HEAP_RECURSE:
520
521 /* Macros make the argument variables come from the current frame */
522
523 #define eptr frame->Xeptr
524 #define ecode frame->Xecode
525 #define mstart frame->Xmstart
526 #define markptr frame->Xmarkptr
527 #define offset_top frame->Xoffset_top
528 #define eptrb frame->Xeptrb
529 #define rdepth frame->Xrdepth
530
531 /* Ditto for the local variables */
532
533 #ifdef SUPPORT_UTF8
534 #define charptr frame->Xcharptr
535 #endif
536 #define callpat frame->Xcallpat
537 #define codelink frame->Xcodelink
538 #define data frame->Xdata
539 #define next frame->Xnext
540 #define pp frame->Xpp
541 #define prev frame->Xprev
542 #define saved_eptr frame->Xsaved_eptr
543
544 #define new_recursive frame->Xnew_recursive
545
546 #define cur_is_word frame->Xcur_is_word
547 #define condition frame->Xcondition
548 #define prev_is_word frame->Xprev_is_word
549
550 #ifdef SUPPORT_UCP
551 #define prop_type frame->Xprop_type
552 #define prop_value frame->Xprop_value
553 #define prop_fail_result frame->Xprop_fail_result
554 #define prop_category frame->Xprop_category
555 #define prop_chartype frame->Xprop_chartype
556 #define prop_script frame->Xprop_script
557 #define oclength frame->Xoclength
558 #define occhars frame->Xocchars
559 #endif
560
561 #define ctype frame->Xctype
562 #define fc frame->Xfc
563 #define fi frame->Xfi
564 #define length frame->Xlength
565 #define max frame->Xmax
566 #define min frame->Xmin
567 #define number frame->Xnumber
568 #define offset frame->Xoffset
569 #define op frame->Xop
570 #define save_capture_last frame->Xsave_capture_last
571 #define save_offset1 frame->Xsave_offset1
572 #define save_offset2 frame->Xsave_offset2
573 #define save_offset3 frame->Xsave_offset3
574 #define stacksave frame->Xstacksave
575
576 #define newptrb frame->Xnewptrb
577
578 /* When recursion is being used, local variables are allocated on the stack and
579 get preserved during recursion in the normal way. In this environment, fi and
580 i, and fc and c, can be the same variables. */
581
582 #else /* NO_RECURSE not defined */
583 #define fi i
584 #define fc c
585
586 /* Many of the following variables are used only in small blocks of the code.
587 My normal style of coding would have declared them within each of those blocks.
588 However, in order to accommodate the version of this code that uses an external
589 "stack" implemented on the heap, it is easier to declare them all here, so the
590 declarations can be cut out in a block. The only declarations within blocks
591 below are for variables that do not have to be preserved over a recursive call
592 to RMATCH(). */
593
594 #ifdef SUPPORT_UTF8
595 const uschar *charptr;
596 #endif
597 const uschar *callpat;
598 const uschar *data;
599 const uschar *next;
600 USPTR pp;
601 const uschar *prev;
602 USPTR saved_eptr;
603
604 recursion_info new_recursive;
605
606 BOOL cur_is_word;
607 BOOL condition;
608 BOOL prev_is_word;
609
610 #ifdef SUPPORT_UCP
611 int prop_type;
612 int prop_value;
613 int prop_fail_result;
614 int prop_category;
615 int prop_chartype;
616 int prop_script;
617 int oclength;
618 uschar occhars[8];
619 #endif
620
621 int codelink;
622 int ctype;
623 int length;
624 int max;
625 int min;
626 int number;
627 int offset;
628 int op;
629 int save_capture_last;
630 int save_offset1, save_offset2, save_offset3;
631 int stacksave[REC_STACK_SAVE_MAX];
632
633 eptrblock newptrb;
634 #endif /* NO_RECURSE */
635
636 /* To save space on the stack and in the heap frame, I have doubled up on some
637 of the local variables that are used only in localised parts of the code, but
638 still need to be preserved over recursive calls of match(). These macros define
639 the alternative names that are used. */
640
641 #define allow_zero cur_is_word
642 #define cbegroup condition
643 #define code_offset codelink
644 #define condassert condition
645 #define matched_once prev_is_word
646
647 /* These statements are here to stop the compiler complaining about uninitialized
648 variables. */
649
650 #ifdef SUPPORT_UCP
651 prop_value = 0;
652 prop_fail_result = 0;
653 #endif
654
655
656 /* This label is used for tail recursion, which is used in a few cases even
657 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
658 used. Thanks to Ian Taylor for noticing this possibility and sending the
659 original patch. */
660
661 TAIL_RECURSE:
662
663 /* OK, now we can get on with the real code of the function. Recursive calls
664 are specified by the macro RMATCH and RRETURN is used to return. When
665 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
666 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
667 defined). However, RMATCH isn't like a function call because it's quite a
668 complicated macro. It has to be used in one particular way. This shouldn't,
669 however, impact performance when true recursion is being used. */
670
671 #ifdef SUPPORT_UTF8
672 utf8 = md->utf8; /* Local copy of the flag */
673 #else
674 utf8 = FALSE;
675 #endif
676
677 /* First check that we haven't called match() too many times, or that we
678 haven't exceeded the recursive call limit. */
679
680 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
681 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
682
683 /* At the start of a group with an unlimited repeat that may match an empty
684 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
685 done this way to save having to use another function argument, which would take
686 up space on the stack. See also MATCH_CONDASSERT below.
687
688 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
689 such remembered pointers, to be checked when we hit the closing ket, in order
690 to break infinite loops that match no characters. When match() is called in
691 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
692 NOT be used with tail recursion, because the memory block that is used is on
693 the stack, so a new one may be required for each match(). */
694
695 if (md->match_function_type == MATCH_CBEGROUP)
696 {
697 newptrb.epb_saved_eptr = eptr;
698 newptrb.epb_prev = eptrb;
699 eptrb = &newptrb;
700 md->match_function_type = 0;
701 }
702
703 /* Now start processing the opcodes. */
704
705 for (;;)
706 {
707 minimize = possessive = FALSE;
708 op = *ecode;
709
710 switch(op)
711 {
712 case OP_MARK:
713 markptr = ecode + 2;
714 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
715 eptrb, RM55);
716
717 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
718 argument, and we must check whether that argument matches this MARK's
719 argument. It is passed back in md->start_match_ptr (an overloading of that
720 variable). If it does match, we reset that variable to the current subject
721 position and return MATCH_SKIP. Otherwise, pass back the return code
722 unaltered. */
723
724 if (rrc == MATCH_SKIP_ARG &&
725 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
726 {
727 md->start_match_ptr = eptr;
728 RRETURN(MATCH_SKIP);
729 }
730
731 if (md->mark == NULL) md->mark = markptr;
732 RRETURN(rrc);
733
734 case OP_FAIL:
735 MRRETURN(MATCH_NOMATCH);
736
737 /* COMMIT overrides PRUNE, SKIP, and THEN */
738
739 case OP_COMMIT:
740 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
741 eptrb, RM52);
742 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
743 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
744 rrc != MATCH_THEN)
745 RRETURN(rrc);
746 MRRETURN(MATCH_COMMIT);
747
748 /* PRUNE overrides THEN */
749
750 case OP_PRUNE:
751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
752 eptrb, RM51);
753 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
754 MRRETURN(MATCH_PRUNE);
755
756 case OP_PRUNE_ARG:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
758 eptrb, RM56);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
760 md->mark = ecode + 2;
761 RRETURN(MATCH_PRUNE);
762
763 /* SKIP overrides PRUNE and THEN */
764
765 case OP_SKIP:
766 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
767 eptrb, RM53);
768 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
769 RRETURN(rrc);
770 md->start_match_ptr = eptr; /* Pass back current position */
771 MRRETURN(MATCH_SKIP);
772
773 case OP_SKIP_ARG:
774 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
775 eptrb, RM57);
776 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
777 RRETURN(rrc);
778
779 /* Pass back the current skip name by overloading md->start_match_ptr and
780 returning the special MATCH_SKIP_ARG return code. This will either be
781 caught by a matching MARK, or get to the top, where it is treated the same
782 as PRUNE. */
783
784 md->start_match_ptr = ecode + 2;
785 RRETURN(MATCH_SKIP_ARG);
786
787 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
788 the alt that is at the start of the current branch. This makes it possible
789 to skip back past alternatives that precede the THEN within the current
790 branch. */
791
792 case OP_THEN:
793 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
794 eptrb, RM54);
795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
796 md->start_match_ptr = ecode - GET(ecode, 1);
797 MRRETURN(MATCH_THEN);
798
799 case OP_THEN_ARG:
800 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
801 offset_top, md, eptrb, RM58);
802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
803 md->start_match_ptr = ecode - GET(ecode, 1);
804 md->mark = ecode + LINK_SIZE + 2;
805 RRETURN(MATCH_THEN);
806
807 /* Handle a capturing bracket, other than those that are possessive with an
808 unlimited repeat. If there is space in the offset vector, save the current
809 subject position in the working slot at the top of the vector. We mustn't
810 change the current values of the data slot, because they may be set from a
811 previous iteration of this group, and be referred to by a reference inside
812 the group. A failure to match might occur after the group has succeeded,
813 if something later on doesn't match. For this reason, we need to restore
814 the working value and also the values of the final offsets, in case they
815 were set by a previous iteration of the same bracket.
816
817 If there isn't enough space in the offset vector, treat this as if it were
818 a non-capturing bracket. Don't worry about setting the flag for the error
819 case here; that is handled in the code for KET. */
820
821 case OP_CBRA:
822 case OP_SCBRA:
823 number = GET2(ecode, 1+LINK_SIZE);
824 offset = number << 1;
825
826 #ifdef PCRE_DEBUG
827 printf("start bracket %d\n", number);
828 printf("subject=");
829 pchars(eptr, 16, TRUE, md);
830 printf("\n");
831 #endif
832
833 if (offset < md->offset_max)
834 {
835 save_offset1 = md->offset_vector[offset];
836 save_offset2 = md->offset_vector[offset+1];
837 save_offset3 = md->offset_vector[md->offset_end - number];
838 save_capture_last = md->capture_last;
839
840 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
841 md->offset_vector[md->offset_end - number] =
842 (int)(eptr - md->start_subject);
843
844 for (;;)
845 {
846 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
847 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
848 eptrb, RM1);
849 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
850 if (rrc != MATCH_NOMATCH &&
851 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
852 RRETURN(rrc);
853 md->capture_last = save_capture_last;
854 ecode += GET(ecode, 1);
855 if (*ecode != OP_ALT) break;
856 }
857
858 DPRINTF(("bracket %d failed\n", number));
859 md->offset_vector[offset] = save_offset1;
860 md->offset_vector[offset+1] = save_offset2;
861 md->offset_vector[md->offset_end - number] = save_offset3;
862
863 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
864 MATCH_THEN. */
865
866 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
867 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
868 }
869
870 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
871 as a non-capturing bracket. */
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
877
878 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
879 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
880
881 /* Non-capturing or atomic group, except for possessive with unlimited
882 repeat. Loop for all the alternatives. When we get to the final alternative
883 within the brackets, we used to return the result of a recursive call to
884 match() whatever happened so it was possible to reduce stack usage by
885 turning this into a tail recursion, except in the case of a possibly empty
886 group. However, now that there is the possibility of (*THEN) occurring in
887 the final alternative, this optimization is no longer possible.
888
889 MATCH_ONCE is returned when the end of an atomic group is successfully
890 reached, but subsequent matching fails. It passes back up the tree (causing
891 captured values to be reset) until the original atomic group level is
892 reached. This is tested by comparing md->once_target with the start of the
893 group. At this point, the return is converted into MATCH_NOMATCH so that
894 previous backup points can be taken. */
895
896 case OP_ONCE:
897 case OP_BRA:
898 case OP_SBRA:
899 DPRINTF(("start non-capturing bracket\n"));
900
901 for (;;)
902 {
903 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
904 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
905 RM2);
906 if (rrc != MATCH_NOMATCH &&
907 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
908 {
909 if (rrc == MATCH_ONCE)
910 {
911 const uschar *scode = ecode;
912 if (*scode != OP_ONCE) /* If not at start, find it */
913 {
914 while (*scode == OP_ALT) scode += GET(scode, 1);
915 scode -= GET(scode, 1);
916 }
917 if (md->once_target == scode) rrc = MATCH_NOMATCH;
918 }
919 RRETURN(rrc);
920 }
921 ecode += GET(ecode, 1);
922 if (*ecode != OP_ALT) break;
923 }
924 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
925 RRETURN(MATCH_NOMATCH);
926
927 /* Handle possessive capturing brackets with an unlimited repeat. We come
928 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
929 handled similarly to the normal case above. However, the matching is
930 different. The end of these brackets will always be OP_KETRPOS, which
931 returns MATCH_KETRPOS without going further in the pattern. By this means
932 we can handle the group by iteration rather than recursion, thereby
933 reducing the amount of stack needed. */
934
935 case OP_CBRAPOS:
936 case OP_SCBRAPOS:
937 allow_zero = FALSE;
938
939 POSSESSIVE_CAPTURE:
940 number = GET2(ecode, 1+LINK_SIZE);
941 offset = number << 1;
942
943 #ifdef PCRE_DEBUG
944 printf("start possessive bracket %d\n", number);
945 printf("subject=");
946 pchars(eptr, 16, TRUE, md);
947 printf("\n");
948 #endif
949
950 if (offset < md->offset_max)
951 {
952 matched_once = FALSE;
953 code_offset = ecode - md->start_code;
954
955 save_offset1 = md->offset_vector[offset];
956 save_offset2 = md->offset_vector[offset+1];
957 save_offset3 = md->offset_vector[md->offset_end - number];
958 save_capture_last = md->capture_last;
959
960 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
961
962 /* Each time round the loop, save the current subject position for use
963 when the group matches. For MATCH_MATCH, the group has matched, so we
964 restart it with a new subject starting position, remembering that we had
965 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
966 usual. If we haven't matched any alternatives in any iteration, check to
967 see if a previous iteration matched. If so, the group has matched;
968 continue from afterwards. Otherwise it has failed; restore the previous
969 capture values before returning NOMATCH. */
970
971 for (;;)
972 {
973 md->offset_vector[md->offset_end - number] =
974 (int)(eptr - md->start_subject);
975 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
976 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
977 eptrb, RM63);
978 if (rrc == MATCH_KETRPOS)
979 {
980 offset_top = md->end_offset_top;
981 eptr = md->end_match_ptr;
982 ecode = md->start_code + code_offset;
983 save_capture_last = md->capture_last;
984 matched_once = TRUE;
985 continue;
986 }
987 if (rrc != MATCH_NOMATCH &&
988 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
989 RRETURN(rrc);
990 md->capture_last = save_capture_last;
991 ecode += GET(ecode, 1);
992 if (*ecode != OP_ALT) break;
993 }
994
995 if (!matched_once)
996 {
997 md->offset_vector[offset] = save_offset1;
998 md->offset_vector[offset+1] = save_offset2;
999 md->offset_vector[md->offset_end - number] = save_offset3;
1000 }
1001
1002 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1003 if (allow_zero || matched_once)
1004 {
1005 ecode += 1 + LINK_SIZE;
1006 break;
1007 }
1008
1009 RRETURN(MATCH_NOMATCH);
1010 }
1011
1012 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1013 as a non-capturing bracket. */
1014
1015 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1016 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1017
1018 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1019
1020 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1021 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1022
1023 /* Non-capturing possessive bracket with unlimited repeat. We come here
1024 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1025 without the capturing complication. It is written out separately for speed
1026 and cleanliness. */
1027
1028 case OP_BRAPOS:
1029 case OP_SBRAPOS:
1030 allow_zero = FALSE;
1031
1032 POSSESSIVE_NON_CAPTURE:
1033 matched_once = FALSE;
1034 code_offset = ecode - md->start_code;
1035
1036 for (;;)
1037 {
1038 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1039 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1040 eptrb, RM48);
1041 if (rrc == MATCH_KETRPOS)
1042 {
1043 offset_top = md->end_offset_top;
1044 eptr = md->end_match_ptr;
1045 ecode = md->start_code + code_offset;
1046 matched_once = TRUE;
1047 continue;
1048 }
1049 if (rrc != MATCH_NOMATCH &&
1050 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1051 RRETURN(rrc);
1052 ecode += GET(ecode, 1);
1053 if (*ecode != OP_ALT) break;
1054 }
1055
1056 if (matched_once || allow_zero)
1057 {
1058 ecode += 1 + LINK_SIZE;
1059 break;
1060 }
1061 RRETURN(MATCH_NOMATCH);
1062
1063 /* Control never reaches here. */
1064
1065 /* Conditional group: compilation checked that there are no more than
1066 two branches. If the condition is false, skipping the first branch takes us
1067 past the end if there is only one branch, but that's OK because that is
1068 exactly what going to the ket would do. */
1069
1070 case OP_COND:
1071 case OP_SCOND:
1072 codelink = GET(ecode, 1);
1073
1074 /* Because of the way auto-callout works during compile, a callout item is
1075 inserted between OP_COND and an assertion condition. */
1076
1077 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1078 {
1079 if (pcre_callout != NULL)
1080 {
1081 pcre_callout_block cb;
1082 cb.version = 1; /* Version 1 of the callout block */
1083 cb.callout_number = ecode[LINK_SIZE+2];
1084 cb.offset_vector = md->offset_vector;
1085 cb.subject = (PCRE_SPTR)md->start_subject;
1086 cb.subject_length = (int)(md->end_subject - md->start_subject);
1087 cb.start_match = (int)(mstart - md->start_subject);
1088 cb.current_position = (int)(eptr - md->start_subject);
1089 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1090 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1091 cb.capture_top = offset_top/2;
1092 cb.capture_last = md->capture_last;
1093 cb.callout_data = md->callout_data;
1094 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1095 if (rrc < 0) RRETURN(rrc);
1096 }
1097 ecode += _pcre_OP_lengths[OP_CALLOUT];
1098 }
1099
1100 condcode = ecode[LINK_SIZE+1];
1101
1102 /* Now see what the actual condition is */
1103
1104 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1105 {
1106 if (md->recursive == NULL) /* Not recursing => FALSE */
1107 {
1108 condition = FALSE;
1109 ecode += GET(ecode, 1);
1110 }
1111 else
1112 {
1113 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1114 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1115
1116 /* If the test is for recursion into a specific subpattern, and it is
1117 false, but the test was set up by name, scan the table to see if the
1118 name refers to any other numbers, and test them. The condition is true
1119 if any one is set. */
1120
1121 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1122 {
1123 uschar *slotA = md->name_table;
1124 for (i = 0; i < md->name_count; i++)
1125 {
1126 if (GET2(slotA, 0) == recno) break;
1127 slotA += md->name_entry_size;
1128 }
1129
1130 /* Found a name for the number - there can be only one; duplicate
1131 names for different numbers are allowed, but not vice versa. First
1132 scan down for duplicates. */
1133
1134 if (i < md->name_count)
1135 {
1136 uschar *slotB = slotA;
1137 while (slotB > md->name_table)
1138 {
1139 slotB -= md->name_entry_size;
1140 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1141 {
1142 condition = GET2(slotB, 0) == md->recursive->group_num;
1143 if (condition) break;
1144 }
1145 else break;
1146 }
1147
1148 /* Scan up for duplicates */
1149
1150 if (!condition)
1151 {
1152 slotB = slotA;
1153 for (i++; i < md->name_count; i++)
1154 {
1155 slotB += md->name_entry_size;
1156 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1157 {
1158 condition = GET2(slotB, 0) == md->recursive->group_num;
1159 if (condition) break;
1160 }
1161 else break;
1162 }
1163 }
1164 }
1165 }
1166
1167 /* Choose branch according to the condition */
1168
1169 ecode += condition? 3 : GET(ecode, 1);
1170 }
1171 }
1172
1173 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1174 {
1175 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1176 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1177
1178 /* If the numbered capture is unset, but the reference was by name,
1179 scan the table to see if the name refers to any other numbers, and test
1180 them. The condition is true if any one is set. This is tediously similar
1181 to the code above, but not close enough to try to amalgamate. */
1182
1183 if (!condition && condcode == OP_NCREF)
1184 {
1185 int refno = offset >> 1;
1186 uschar *slotA = md->name_table;
1187
1188 for (i = 0; i < md->name_count; i++)
1189 {
1190 if (GET2(slotA, 0) == refno) break;
1191 slotA += md->name_entry_size;
1192 }
1193
1194 /* Found a name for the number - there can be only one; duplicate names
1195 for different numbers are allowed, but not vice versa. First scan down
1196 for duplicates. */
1197
1198 if (i < md->name_count)
1199 {
1200 uschar *slotB = slotA;
1201 while (slotB > md->name_table)
1202 {
1203 slotB -= md->name_entry_size;
1204 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1205 {
1206 offset = GET2(slotB, 0) << 1;
1207 condition = offset < offset_top &&
1208 md->offset_vector[offset] >= 0;
1209 if (condition) break;
1210 }
1211 else break;
1212 }
1213
1214 /* Scan up for duplicates */
1215
1216 if (!condition)
1217 {
1218 slotB = slotA;
1219 for (i++; i < md->name_count; i++)
1220 {
1221 slotB += md->name_entry_size;
1222 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1223 {
1224 offset = GET2(slotB, 0) << 1;
1225 condition = offset < offset_top &&
1226 md->offset_vector[offset] >= 0;
1227 if (condition) break;
1228 }
1229 else break;
1230 }
1231 }
1232 }
1233 }
1234
1235 /* Choose branch according to the condition */
1236
1237 ecode += condition? 3 : GET(ecode, 1);
1238 }
1239
1240 else if (condcode == OP_DEF) /* DEFINE - always false */
1241 {
1242 condition = FALSE;
1243 ecode += GET(ecode, 1);
1244 }
1245
1246 /* The condition is an assertion. Call match() to evaluate it - setting
1247 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1248 an assertion. */
1249
1250 else
1251 {
1252 md->match_function_type = MATCH_CONDASSERT;
1253 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1254 if (rrc == MATCH_MATCH)
1255 {
1256 condition = TRUE;
1257 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1258 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1259 }
1260 else if (rrc != MATCH_NOMATCH &&
1261 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1262 {
1263 RRETURN(rrc); /* Need braces because of following else */
1264 }
1265 else
1266 {
1267 condition = FALSE;
1268 ecode += codelink;
1269 }
1270 }
1271
1272 /* We are now at the branch that is to be obeyed. As there is only one,
1273 we used to use tail recursion to avoid using another stack frame, except
1274 when there was unlimited repeat of a possibly empty group. However, that
1275 strategy no longer works because of the possibility of (*THEN) being
1276 encountered in the branch. A recursive call to match() is always required,
1277 unless the second alternative doesn't exist, in which case we can just
1278 plough on. */
1279
1280 if (condition || *ecode == OP_ALT)
1281 {
1282 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1283 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1284 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1285 rrc = MATCH_NOMATCH;
1286 RRETURN(rrc);
1287 }
1288 else /* Condition false & no alternative */
1289 {
1290 ecode += 1 + LINK_SIZE;
1291 }
1292 break;
1293
1294
1295 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1296 to close any currently open capturing brackets. */
1297
1298 case OP_CLOSE:
1299 number = GET2(ecode, 1);
1300 offset = number << 1;
1301
1302 #ifdef PCRE_DEBUG
1303 printf("end bracket %d at *ACCEPT", number);
1304 printf("\n");
1305 #endif
1306
1307 md->capture_last = number;
1308 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1309 {
1310 md->offset_vector[offset] =
1311 md->offset_vector[md->offset_end - number];
1312 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1313 if (offset_top <= offset) offset_top = offset + 2;
1314 }
1315 ecode += 3;
1316 break;
1317
1318
1319 /* End of the pattern, either real or forced. If we are in a recursion, we
1320 should restore the offsets appropriately, and if it's a top-level
1321 recursion, continue from after the call. */
1322
1323 case OP_ACCEPT:
1324 case OP_ASSERT_ACCEPT:
1325 case OP_END:
1326
1327 /*
1328 if (md->recursive != NULL)
1329 {
1330 recursion_info *rec = md->recursive;
1331
1332 md->recursive = rec->prevrec;
1333
1334 memmove(md->offset_vector, rec->offset_save,
1335 rec->saved_max * sizeof(int));
1336 offset_top = rec->save_offset_top;
1337 if (rec->group_num == 0)
1338 {
1339 ecode = rec->after_call;
1340 break;
1341 }
1342 }
1343 */
1344 /* Otherwise, if we have matched an empty string, fail if not in an
1345 assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1346 is set and we have matched at the start of the subject. In both cases,
1347 backtracking will then try other alternatives, if any. */
1348
1349 /* else */ if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1350
1351 md->recursive == NULL &&
1352
1353 (md->notempty ||
1354 (md->notempty_atstart &&
1355 mstart == md->start_subject + md->start_offset)))
1356 MRRETURN(MATCH_NOMATCH);
1357
1358 /* Otherwise, we have a match. */
1359
1360 md->end_match_ptr = eptr; /* Record where we ended */
1361 md->end_offset_top = offset_top; /* and how many extracts were taken */
1362 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1363
1364 /* For some reason, the macros don't work properly if an expression is
1365 given as the argument to MRRETURN when the heap is in use. */
1366
1367 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1368 MRRETURN(rrc);
1369
1370 /* Assertion brackets. Check the alternative branches in turn - the
1371 matching won't pass the KET for an assertion. If any one branch matches,
1372 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1373 start of each branch to move the current point backwards, so the code at
1374 this level is identical to the lookahead case. When the assertion is part
1375 of a condition, we want to return immediately afterwards. The caller of
1376 this incarnation of the match() function will have set MATCH_CONDASSERT in
1377 md->match_function_type, and one of these opcodes will be the first opcode
1378 that is processed. We use a local variable that is preserved over calls to
1379 match() to remember this case. */
1380
1381 case OP_ASSERT:
1382 case OP_ASSERTBACK:
1383 if (md->match_function_type == MATCH_CONDASSERT)
1384 {
1385 condassert = TRUE;
1386 md->match_function_type = 0;
1387 }
1388 else condassert = FALSE;
1389
1390 do
1391 {
1392 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1393 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1394 {
1395 mstart = md->start_match_ptr; /* In case \K reset it */
1396 break;
1397 }
1398 if (rrc != MATCH_NOMATCH &&
1399 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1400 RRETURN(rrc);
1401 ecode += GET(ecode, 1);
1402 }
1403 while (*ecode == OP_ALT);
1404
1405 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1406
1407 /* If checking an assertion for a condition, return MATCH_MATCH. */
1408
1409 if (condassert) RRETURN(MATCH_MATCH);
1410
1411 /* Continue from after the assertion, updating the offsets high water
1412 mark, since extracts may have been taken during the assertion. */
1413
1414 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1415 ecode += 1 + LINK_SIZE;
1416 offset_top = md->end_offset_top;
1417 continue;
1418
1419 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1420 PRUNE, or COMMIT means we must assume failure without checking subsequent
1421 branches. */
1422
1423 case OP_ASSERT_NOT:
1424 case OP_ASSERTBACK_NOT:
1425 if (md->match_function_type == MATCH_CONDASSERT)
1426 {
1427 condassert = TRUE;
1428 md->match_function_type = 0;
1429 }
1430 else condassert = FALSE;
1431
1432 do
1433 {
1434 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1435 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1436 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1437 {
1438 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1439 break;
1440 }
1441 if (rrc != MATCH_NOMATCH &&
1442 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1443 RRETURN(rrc);
1444 ecode += GET(ecode,1);
1445 }
1446 while (*ecode == OP_ALT);
1447
1448 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1449
1450 ecode += 1 + LINK_SIZE;
1451 continue;
1452
1453 /* Move the subject pointer back. This occurs only at the start of
1454 each branch of a lookbehind assertion. If we are too close to the start to
1455 move back, this match function fails. When working with UTF-8 we move
1456 back a number of characters, not bytes. */
1457
1458 case OP_REVERSE:
1459 #ifdef SUPPORT_UTF8
1460 if (utf8)
1461 {
1462 i = GET(ecode, 1);
1463 while (i-- > 0)
1464 {
1465 eptr--;
1466 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1467 BACKCHAR(eptr);
1468 }
1469 }
1470 else
1471 #endif
1472
1473 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1474
1475 {
1476 eptr -= GET(ecode, 1);
1477 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1478 }
1479
1480 /* Save the earliest consulted character, then skip to next op code */
1481
1482 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1483 ecode += 1 + LINK_SIZE;
1484 break;
1485
1486 /* The callout item calls an external function, if one is provided, passing
1487 details of the match so far. This is mainly for debugging, though the
1488 function is able to force a failure. */
1489
1490 case OP_CALLOUT:
1491 if (pcre_callout != NULL)
1492 {
1493 pcre_callout_block cb;
1494 cb.version = 1; /* Version 1 of the callout block */
1495 cb.callout_number = ecode[1];
1496 cb.offset_vector = md->offset_vector;
1497 cb.subject = (PCRE_SPTR)md->start_subject;
1498 cb.subject_length = (int)(md->end_subject - md->start_subject);
1499 cb.start_match = (int)(mstart - md->start_subject);
1500 cb.current_position = (int)(eptr - md->start_subject);
1501 cb.pattern_position = GET(ecode, 2);
1502 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1503 cb.capture_top = offset_top/2;
1504 cb.capture_last = md->capture_last;
1505 cb.callout_data = md->callout_data;
1506 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1507 if (rrc < 0) RRETURN(rrc);
1508 }
1509 ecode += 2 + 2*LINK_SIZE;
1510 break;
1511
1512 /* Recursion either matches the current regex, or some subexpression. The
1513 offset data is the offset to the starting bracket from the start of the
1514 whole pattern. (This is so that it works from duplicated subpatterns.)
1515
1516 The state of the capturing groups is preserved over recursion, and
1517 re-instated afterwards. We don't know how many are started and not yet
1518 finished (offset_top records the completed total) so we just have to save
1519 all the potential data. There may be up to 65535 such values, which is too
1520 large to put on the stack, but using malloc for small numbers seems
1521 expensive. As a compromise, the stack is used when there are no more than
1522 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1523
1524 There are also other values that have to be saved. We use a chained
1525 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1526 for the original version of this logic. It has, however, been hacked around
1527 a lot, so he is not to blame for the current way it works. */
1528
1529 case OP_RECURSE:
1530 {
1531 callpat = md->start_code + GET(ecode, 1);
1532 new_recursive.group_num = (callpat == md->start_code)? 0 :
1533 GET2(callpat, 1 + LINK_SIZE);
1534
1535 /* Add to "recursing stack" */
1536
1537 new_recursive.prevrec = md->recursive;
1538 md->recursive = &new_recursive;
1539
1540 /* Where to continue from afterwards */
1541
1542 ecode += 1 + LINK_SIZE;
1543
1544 /* Now save the offset data */
1545
1546 new_recursive.saved_max = md->offset_end;
1547 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1548 new_recursive.offset_save = stacksave;
1549 else
1550 {
1551 new_recursive.offset_save =
1552 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1553 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1554 }
1555 memcpy(new_recursive.offset_save, md->offset_vector,
1556 new_recursive.saved_max * sizeof(int));
1557
1558 /* OK, now we can do the recursion. After processing each alternative,
1559 restore the offset data. If there were nested recursions, md->recursive
1560 might be changed, so reset it before looping. */
1561
1562 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1563 cbegroup = (*callpat >= OP_SBRA);
1564 do
1565 {
1566 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1567 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1568 md, eptrb, RM6);
1569 memcpy(md->offset_vector, new_recursive.offset_save,
1570 new_recursive.saved_max * sizeof(int));
1571 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1572 {
1573 DPRINTF(("Recursion matched\n"));
1574 md->recursive = new_recursive.prevrec;
1575 if (new_recursive.offset_save != stacksave)
1576 (pcre_free)(new_recursive.offset_save);
1577
1578 /* Set where we got to in the subject, and reset the start in case
1579 it was changed by \K. This *is* propagated back out of a recursion,
1580 for Perl compatibility. */
1581
1582 eptr = md->end_match_ptr;
1583 mstart = md->start_match_ptr;
1584 goto RECURSION_MATCHED; /* Exit loop; end processing */
1585 }
1586 else if (rrc != MATCH_NOMATCH &&
1587 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1588 {
1589 DPRINTF(("Recursion gave error %d\n", rrc));
1590 if (new_recursive.offset_save != stacksave)
1591 (pcre_free)(new_recursive.offset_save);
1592 RRETURN(rrc);
1593 }
1594
1595 md->recursive = &new_recursive;
1596 callpat += GET(callpat, 1);
1597 }
1598 while (*callpat == OP_ALT);
1599
1600 DPRINTF(("Recursion didn't match\n"));
1601 md->recursive = new_recursive.prevrec;
1602 if (new_recursive.offset_save != stacksave)
1603 (pcre_free)(new_recursive.offset_save);
1604 MRRETURN(MATCH_NOMATCH);
1605 }
1606
1607 RECURSION_MATCHED:
1608 break;
1609
1610 /* An alternation is the end of a branch; scan along to find the end of the
1611 bracketed group and go to there. */
1612
1613 case OP_ALT:
1614 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1615 break;
1616
1617 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1618 indicating that it may occur zero times. It may repeat infinitely, or not
1619 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1620 with fixed upper repeat limits are compiled as a number of copies, with the
1621 optional ones preceded by BRAZERO or BRAMINZERO. */
1622
1623 case OP_BRAZERO:
1624 next = ecode + 1;
1625 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1627 do next += GET(next, 1); while (*next == OP_ALT);
1628 ecode = next + 1 + LINK_SIZE;
1629 break;
1630
1631 case OP_BRAMINZERO:
1632 next = ecode + 1;
1633 do next += GET(next, 1); while (*next == OP_ALT);
1634 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1636 ecode++;
1637 break;
1638
1639 case OP_SKIPZERO:
1640 next = ecode+1;
1641 do next += GET(next,1); while (*next == OP_ALT);
1642 ecode = next + 1 + LINK_SIZE;
1643 break;
1644
1645 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1646 here; just jump to the group, with allow_zero set TRUE. */
1647
1648 case OP_BRAPOSZERO:
1649 op = *(++ecode);
1650 allow_zero = TRUE;
1651 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1652 goto POSSESSIVE_NON_CAPTURE;
1653
1654 /* End of a group, repeated or non-repeating. */
1655
1656 case OP_KET:
1657 case OP_KETRMIN:
1658 case OP_KETRMAX:
1659 case OP_KETRPOS:
1660 prev = ecode - GET(ecode, 1);
1661
1662 /* If this was a group that remembered the subject start, in order to break
1663 infinite repeats of empty string matches, retrieve the subject start from
1664 the chain. Otherwise, set it NULL. */
1665
1666 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1667 {
1668 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1669 eptrb = eptrb->epb_prev; /* Backup to previous group */
1670 }
1671 else saved_eptr = NULL;
1672
1673 /* If we are at the end of an assertion group, stop matching and return
1674 MATCH_MATCH, but record the current high water mark for use by positive
1675 assertions. We also need to record the match start in case it was changed
1676 by \K. */
1677
1678 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1679 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1680 {
1681 md->end_match_ptr = eptr; /* For ONCE */
1682 md->end_offset_top = offset_top;
1683 md->start_match_ptr = mstart;
1684 MRRETURN(MATCH_MATCH);
1685 }
1686
1687 /* For capturing groups we have to check the group number back at the start
1688 and if necessary complete handling an extraction by setting the offsets and
1689 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1690 into group 0, so it won't be picked up here. Instead, we catch it when the
1691 OP_END is reached. Other recursion is handled here. We just have to record
1692 the current subject position and start match pointer and give a MATCH
1693 return. */
1694
1695 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1696 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1697 {
1698 number = GET2(prev, 1+LINK_SIZE);
1699 offset = number << 1;
1700
1701 #ifdef PCRE_DEBUG
1702 printf("end bracket %d", number);
1703 printf("\n");
1704 #endif
1705
1706 /* Handle a recursively called group. */
1707
1708 if (md->recursive != NULL && md->recursive->group_num == number)
1709 {
1710 md->end_match_ptr = eptr;
1711 md->start_match_ptr = mstart;
1712 RRETURN(MATCH_MATCH);
1713 }
1714
1715 /* Deal with capturing */
1716
1717 md->capture_last = number;
1718 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1719 {
1720 /* If offset is greater than offset_top, it means that we are
1721 "skipping" a capturing group, and that group's offsets must be marked
1722 unset. In earlier versions of PCRE, all the offsets were unset at the
1723 start of matching, but this doesn't work because atomic groups and
1724 assertions can cause a value to be set that should later be unset.
1725 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1726 part of the atomic group, but this is not on the final matching path,
1727 so must be unset when 2 is set. (If there is no group 2, there is no
1728 problem, because offset_top will then be 2, indicating no capture.) */
1729
1730 if (offset > offset_top)
1731 {
1732 register int *iptr = md->offset_vector + offset_top;
1733 register int *iend = md->offset_vector + offset;
1734 while (iptr < iend) *iptr++ = -1;
1735 }
1736
1737 /* Now make the extraction */
1738
1739 md->offset_vector[offset] =
1740 md->offset_vector[md->offset_end - number];
1741 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1742 if (offset_top <= offset) offset_top = offset + 2;
1743 }
1744 }
1745
1746 /* For an ordinary non-repeating ket, just continue at this level. This
1747 also happens for a repeating ket if no characters were matched in the
1748 group. This is the forcible breaking of infinite loops as implemented in
1749 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1750 processing the rest of the pattern at a lower level. If this results in a
1751 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1752 bypassing intermediate backup points, but resetting any captures that
1753 happened along the way. */
1754
1755 if (*ecode == OP_KET || eptr == saved_eptr)
1756 {
1757 if (*prev == OP_ONCE)
1758 {
1759 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1761 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1762 RRETURN(MATCH_ONCE);
1763 }
1764 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1765 break;
1766 }
1767
1768 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1769 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1770 at a time from the outer level, thus saving stack. */
1771
1772 if (*ecode == OP_KETRPOS)
1773 {
1774 md->end_match_ptr = eptr;
1775 md->end_offset_top = offset_top;
1776 RRETURN(MATCH_KETRPOS);
1777 }
1778
1779 /* The normal repeating kets try the rest of the pattern or restart from
1780 the preceding bracket, in the appropriate order. In the second case, we can
1781 use tail recursion to avoid using another stack frame, unless we have an
1782 atomic group or an unlimited repeat of a group that can match an empty
1783 string. */
1784
1785 if (*ecode == OP_KETRMIN)
1786 {
1787 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
1788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1789 if (*prev == OP_ONCE)
1790 {
1791 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
1792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1794 RRETURN(MATCH_ONCE);
1795 }
1796 if (*prev >= OP_SBRA) /* Could match an empty string */
1797 {
1798 md->match_function_type = MATCH_CBEGROUP;
1799 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1800 RRETURN(rrc);
1801 }
1802 ecode = prev;
1803 goto TAIL_RECURSE;
1804 }
1805 else /* OP_KETRMAX */
1806 {
1807 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1808 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1809 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811 if (*prev == OP_ONCE)
1812 {
1813 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
1814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1815 md->once_target = prev;
1816 RRETURN(MATCH_ONCE);
1817 }
1818 ecode += 1 + LINK_SIZE;
1819 goto TAIL_RECURSE;
1820 }
1821 /* Control never gets here */
1822
1823 /* Not multiline mode: start of subject assertion, unless notbol. */
1824
1825 case OP_CIRC:
1826 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1827
1828 /* Start of subject assertion */
1829
1830 case OP_SOD:
1831 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1832 ecode++;
1833 break;
1834
1835 /* Multiline mode: start of subject unless notbol, or after any newline. */
1836
1837 case OP_CIRCM:
1838 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1839 if (eptr != md->start_subject &&
1840 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1841 MRRETURN(MATCH_NOMATCH);
1842 ecode++;
1843 break;
1844
1845 /* Start of match assertion */
1846
1847 case OP_SOM:
1848 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1849 ecode++;
1850 break;
1851
1852 /* Reset the start of match point */
1853
1854 case OP_SET_SOM:
1855 mstart = eptr;
1856 ecode++;
1857 break;
1858
1859 /* Multiline mode: assert before any newline, or before end of subject
1860 unless noteol is set. */
1861
1862 case OP_DOLLM:
1863 if (eptr < md->end_subject)
1864 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1865 else
1866 {
1867 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1868 SCHECK_PARTIAL();
1869 }
1870 ecode++;
1871 break;
1872
1873 /* Not multiline mode: assert before a terminating newline or before end of
1874 subject unless noteol is set. */
1875
1876 case OP_DOLL:
1877 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1878 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1879
1880 /* ... else fall through for endonly */
1881
1882 /* End of subject assertion (\z) */
1883
1884 case OP_EOD:
1885 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1886 SCHECK_PARTIAL();
1887 ecode++;
1888 break;
1889
1890 /* End of subject or ending \n assertion (\Z) */
1891
1892 case OP_EODN:
1893 ASSERT_NL_OR_EOS:
1894 if (eptr < md->end_subject &&
1895 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1896 MRRETURN(MATCH_NOMATCH);
1897
1898 /* Either at end of string or \n before end. */
1899
1900 SCHECK_PARTIAL();
1901 ecode++;
1902 break;
1903
1904 /* Word boundary assertions */
1905
1906 case OP_NOT_WORD_BOUNDARY:
1907 case OP_WORD_BOUNDARY:
1908 {
1909
1910 /* Find out if the previous and current characters are "word" characters.
1911 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1912 be "non-word" characters. Remember the earliest consulted character for
1913 partial matching. */
1914
1915 #ifdef SUPPORT_UTF8
1916 if (utf8)
1917 {
1918 /* Get status of previous character */
1919
1920 if (eptr == md->start_subject) prev_is_word = FALSE; else
1921 {
1922 USPTR lastptr = eptr - 1;
1923 while((*lastptr & 0xc0) == 0x80) lastptr--;
1924 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1925 GETCHAR(c, lastptr);
1926 #ifdef SUPPORT_UCP
1927 if (md->use_ucp)
1928 {
1929 if (c == '_') prev_is_word = TRUE; else
1930 {
1931 int cat = UCD_CATEGORY(c);
1932 prev_is_word = (cat == ucp_L || cat == ucp_N);
1933 }
1934 }
1935 else
1936 #endif
1937 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1938 }
1939
1940 /* Get status of next character */
1941
1942 if (eptr >= md->end_subject)
1943 {
1944 SCHECK_PARTIAL();
1945 cur_is_word = FALSE;
1946 }
1947 else
1948 {
1949 GETCHAR(c, eptr);
1950 #ifdef SUPPORT_UCP
1951 if (md->use_ucp)
1952 {
1953 if (c == '_') cur_is_word = TRUE; else
1954 {
1955 int cat = UCD_CATEGORY(c);
1956 cur_is_word = (cat == ucp_L || cat == ucp_N);
1957 }
1958 }
1959 else
1960 #endif
1961 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1962 }
1963 }
1964 else
1965 #endif
1966
1967 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1968 consistency with the behaviour of \w we do use it in this case. */
1969
1970 {
1971 /* Get status of previous character */
1972
1973 if (eptr == md->start_subject) prev_is_word = FALSE; else
1974 {
1975 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1976 #ifdef SUPPORT_UCP
1977 if (md->use_ucp)
1978 {
1979 c = eptr[-1];
1980 if (c == '_') prev_is_word = TRUE; else
1981 {
1982 int cat = UCD_CATEGORY(c);
1983 prev_is_word = (cat == ucp_L || cat == ucp_N);
1984 }
1985 }
1986 else
1987 #endif
1988 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1989 }
1990
1991 /* Get status of next character */
1992
1993 if (eptr >= md->end_subject)
1994 {
1995 SCHECK_PARTIAL();
1996 cur_is_word = FALSE;
1997 }
1998 else
1999 #ifdef SUPPORT_UCP
2000 if (md->use_ucp)
2001 {
2002 c = *eptr;
2003 if (c == '_') cur_is_word = TRUE; else
2004 {
2005 int cat = UCD_CATEGORY(c);
2006 cur_is_word = (cat == ucp_L || cat == ucp_N);
2007 }
2008 }
2009 else
2010 #endif
2011 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2012 }
2013
2014 /* Now see if the situation is what we want */
2015
2016 if ((*ecode++ == OP_WORD_BOUNDARY)?
2017 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2018 MRRETURN(MATCH_NOMATCH);
2019 }
2020 break;
2021
2022 /* Match a single character type; inline for speed */
2023
2024 case OP_ANY:
2025 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2026 /* Fall through */
2027
2028 case OP_ALLANY:
2029 if (eptr++ >= md->end_subject)
2030 {
2031 SCHECK_PARTIAL();
2032 MRRETURN(MATCH_NOMATCH);
2033 }
2034 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2035 ecode++;
2036 break;
2037
2038 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2039 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2040
2041 case OP_ANYBYTE:
2042 if (eptr++ >= md->end_subject)
2043 {
2044 SCHECK_PARTIAL();
2045 MRRETURN(MATCH_NOMATCH);
2046 }
2047 ecode++;
2048 break;
2049
2050 case OP_NOT_DIGIT:
2051 if (eptr >= md->end_subject)
2052 {
2053 SCHECK_PARTIAL();
2054 MRRETURN(MATCH_NOMATCH);
2055 }
2056 GETCHARINCTEST(c, eptr);
2057 if (
2058 #ifdef SUPPORT_UTF8
2059 c < 256 &&
2060 #endif
2061 (md->ctypes[c] & ctype_digit) != 0
2062 )
2063 MRRETURN(MATCH_NOMATCH);
2064 ecode++;
2065 break;
2066
2067 case OP_DIGIT:
2068 if (eptr >= md->end_subject)
2069 {
2070 SCHECK_PARTIAL();
2071 MRRETURN(MATCH_NOMATCH);
2072 }
2073 GETCHARINCTEST(c, eptr);
2074 if (
2075 #ifdef SUPPORT_UTF8
2076 c >= 256 ||
2077 #endif
2078 (md->ctypes[c] & ctype_digit) == 0
2079 )
2080 MRRETURN(MATCH_NOMATCH);
2081 ecode++;
2082 break;
2083
2084 case OP_NOT_WHITESPACE:
2085 if (eptr >= md->end_subject)
2086 {
2087 SCHECK_PARTIAL();
2088 MRRETURN(MATCH_NOMATCH);
2089 }
2090 GETCHARINCTEST(c, eptr);
2091 if (
2092 #ifdef SUPPORT_UTF8
2093 c < 256 &&
2094 #endif
2095 (md->ctypes[c] & ctype_space) != 0
2096 )
2097 MRRETURN(MATCH_NOMATCH);
2098 ecode++;
2099 break;
2100
2101 case OP_WHITESPACE:
2102 if (eptr >= md->end_subject)
2103 {
2104 SCHECK_PARTIAL();
2105 MRRETURN(MATCH_NOMATCH);
2106 }
2107 GETCHARINCTEST(c, eptr);
2108 if (
2109 #ifdef SUPPORT_UTF8
2110 c >= 256 ||
2111 #endif
2112 (md->ctypes[c] & ctype_space) == 0
2113 )
2114 MRRETURN(MATCH_NOMATCH);
2115 ecode++;
2116 break;
2117
2118 case OP_NOT_WORDCHAR:
2119 if (eptr >= md->end_subject)
2120 {
2121 SCHECK_PARTIAL();
2122 MRRETURN(MATCH_NOMATCH);
2123 }
2124 GETCHARINCTEST(c, eptr);
2125 if (
2126 #ifdef SUPPORT_UTF8
2127 c < 256 &&
2128 #endif
2129 (md->ctypes[c] & ctype_word) != 0
2130 )
2131 MRRETURN(MATCH_NOMATCH);
2132 ecode++;
2133 break;
2134
2135 case OP_WORDCHAR:
2136 if (eptr >= md->end_subject)
2137 {
2138 SCHECK_PARTIAL();
2139 MRRETURN(MATCH_NOMATCH);
2140 }
2141 GETCHARINCTEST(c, eptr);
2142 if (
2143 #ifdef SUPPORT_UTF8
2144 c >= 256 ||
2145 #endif
2146 (md->ctypes[c] & ctype_word) == 0
2147 )
2148 MRRETURN(MATCH_NOMATCH);
2149 ecode++;
2150 break;
2151
2152 case OP_ANYNL:
2153 if (eptr >= md->end_subject)
2154 {
2155 SCHECK_PARTIAL();
2156 MRRETURN(MATCH_NOMATCH);
2157 }
2158 GETCHARINCTEST(c, eptr);
2159 switch(c)
2160 {
2161 default: MRRETURN(MATCH_NOMATCH);
2162
2163 case 0x000d:
2164 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2165 break;
2166
2167 case 0x000a:
2168 break;
2169
2170 case 0x000b:
2171 case 0x000c:
2172 case 0x0085:
2173 case 0x2028:
2174 case 0x2029:
2175 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2176 break;
2177 }
2178 ecode++;
2179 break;
2180
2181 case OP_NOT_HSPACE:
2182 if (eptr >= md->end_subject)
2183 {
2184 SCHECK_PARTIAL();
2185 MRRETURN(MATCH_NOMATCH);
2186 }
2187 GETCHARINCTEST(c, eptr);
2188 switch(c)
2189 {
2190 default: break;
2191 case 0x09: /* HT */
2192 case 0x20: /* SPACE */
2193 case 0xa0: /* NBSP */
2194 case 0x1680: /* OGHAM SPACE MARK */
2195 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2196 case 0x2000: /* EN QUAD */
2197 case 0x2001: /* EM QUAD */
2198 case 0x2002: /* EN SPACE */
2199 case 0x2003: /* EM SPACE */
2200 case 0x2004: /* THREE-PER-EM SPACE */
2201 case 0x2005: /* FOUR-PER-EM SPACE */
2202 case 0x2006: /* SIX-PER-EM SPACE */
2203 case 0x2007: /* FIGURE SPACE */
2204 case 0x2008: /* PUNCTUATION SPACE */
2205 case 0x2009: /* THIN SPACE */
2206 case 0x200A: /* HAIR SPACE */
2207 case 0x202f: /* NARROW NO-BREAK SPACE */
2208 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2209 case 0x3000: /* IDEOGRAPHIC SPACE */
2210 MRRETURN(MATCH_NOMATCH);
2211 }
2212 ecode++;
2213 break;
2214
2215 case OP_HSPACE:
2216 if (eptr >= md->end_subject)
2217 {
2218 SCHECK_PARTIAL();
2219 MRRETURN(MATCH_NOMATCH);
2220 }
2221 GETCHARINCTEST(c, eptr);
2222 switch(c)
2223 {
2224 default: MRRETURN(MATCH_NOMATCH);
2225 case 0x09: /* HT */
2226 case 0x20: /* SPACE */
2227 case 0xa0: /* NBSP */
2228 case 0x1680: /* OGHAM SPACE MARK */
2229 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2230 case 0x2000: /* EN QUAD */
2231 case 0x2001: /* EM QUAD */
2232 case 0x2002: /* EN SPACE */
2233 case 0x2003: /* EM SPACE */
2234 case 0x2004: /* THREE-PER-EM SPACE */
2235 case 0x2005: /* FOUR-PER-EM SPACE */
2236 case 0x2006: /* SIX-PER-EM SPACE */
2237 case 0x2007: /* FIGURE SPACE */
2238 case 0x2008: /* PUNCTUATION SPACE */
2239 case 0x2009: /* THIN SPACE */
2240 case 0x200A: /* HAIR SPACE */
2241 case 0x202f: /* NARROW NO-BREAK SPACE */
2242 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2243 case 0x3000: /* IDEOGRAPHIC SPACE */
2244 break;
2245 }
2246 ecode++;
2247 break;
2248
2249 case OP_NOT_VSPACE:
2250 if (eptr >= md->end_subject)
2251 {
2252 SCHECK_PARTIAL();
2253 MRRETURN(MATCH_NOMATCH);
2254 }
2255 GETCHARINCTEST(c, eptr);
2256 switch(c)
2257 {
2258 default: break;
2259 case 0x0a: /* LF */
2260 case 0x0b: /* VT */
2261 case 0x0c: /* FF */
2262 case 0x0d: /* CR */
2263 case 0x85: /* NEL */
2264 case 0x2028: /* LINE SEPARATOR */
2265 case 0x2029: /* PARAGRAPH SEPARATOR */
2266 MRRETURN(MATCH_NOMATCH);
2267 }
2268 ecode++;
2269 break;
2270
2271 case OP_VSPACE:
2272 if (eptr >= md->end_subject)
2273 {
2274 SCHECK_PARTIAL();
2275 MRRETURN(MATCH_NOMATCH);
2276 }
2277 GETCHARINCTEST(c, eptr);
2278 switch(c)
2279 {
2280 default: MRRETURN(MATCH_NOMATCH);
2281 case 0x0a: /* LF */
2282 case 0x0b: /* VT */
2283 case 0x0c: /* FF */
2284 case 0x0d: /* CR */
2285 case 0x85: /* NEL */
2286 case 0x2028: /* LINE SEPARATOR */
2287 case 0x2029: /* PARAGRAPH SEPARATOR */
2288 break;
2289 }
2290 ecode++;
2291 break;
2292
2293 #ifdef SUPPORT_UCP
2294 /* Check the next character by Unicode property. We will get here only
2295 if the support is in the binary; otherwise a compile-time error occurs. */
2296
2297 case OP_PROP:
2298 case OP_NOTPROP:
2299 if (eptr >= md->end_subject)
2300 {
2301 SCHECK_PARTIAL();
2302 MRRETURN(MATCH_NOMATCH);
2303 }
2304 GETCHARINCTEST(c, eptr);
2305 {
2306 const ucd_record *prop = GET_UCD(c);
2307
2308 switch(ecode[1])
2309 {
2310 case PT_ANY:
2311 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2312 break;
2313
2314 case PT_LAMP:
2315 if ((prop->chartype == ucp_Lu ||
2316 prop->chartype == ucp_Ll ||
2317 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2318 MRRETURN(MATCH_NOMATCH);
2319 break;
2320
2321 case PT_GC:
2322 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2323 MRRETURN(MATCH_NOMATCH);
2324 break;
2325
2326 case PT_PC:
2327 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2328 MRRETURN(MATCH_NOMATCH);
2329 break;
2330
2331 case PT_SC:
2332 if ((ecode[2] != prop->script) == (op == OP_PROP))
2333 MRRETURN(MATCH_NOMATCH);
2334 break;
2335
2336 /* These are specials */
2337
2338 case PT_ALNUM:
2339 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2340 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2341 MRRETURN(MATCH_NOMATCH);
2342 break;
2343
2344 case PT_SPACE: /* Perl space */
2345 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2346 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2347 == (op == OP_NOTPROP))
2348 MRRETURN(MATCH_NOMATCH);
2349 break;
2350
2351 case PT_PXSPACE: /* POSIX space */
2352 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2353 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2354 c == CHAR_FF || c == CHAR_CR)
2355 == (op == OP_NOTPROP))
2356 MRRETURN(MATCH_NOMATCH);
2357 break;
2358
2359 case PT_WORD:
2360 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2361 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2362 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2363 MRRETURN(MATCH_NOMATCH);
2364 break;
2365
2366 /* This should never occur */
2367
2368 default:
2369 RRETURN(PCRE_ERROR_INTERNAL);
2370 }
2371
2372 ecode += 3;
2373 }
2374 break;
2375
2376 /* Match an extended Unicode sequence. We will get here only if the support
2377 is in the binary; otherwise a compile-time error occurs. */
2378
2379 case OP_EXTUNI:
2380 if (eptr >= md->end_subject)
2381 {
2382 SCHECK_PARTIAL();
2383 MRRETURN(MATCH_NOMATCH);
2384 }
2385 GETCHARINCTEST(c, eptr);
2386 {
2387 int category = UCD_CATEGORY(c);
2388 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2389 while (eptr < md->end_subject)
2390 {
2391 int len = 1;
2392 if (!utf8) c = *eptr; else
2393 {
2394 GETCHARLEN(c, eptr, len);
2395 }
2396 category = UCD_CATEGORY(c);
2397 if (category != ucp_M) break;
2398 eptr += len;
2399 }
2400 }
2401 ecode++;
2402 break;
2403 #endif
2404
2405
2406 /* Match a back reference, possibly repeatedly. Look past the end of the
2407 item to see if there is repeat information following. The code is similar
2408 to that for character classes, but repeated for efficiency. Then obey
2409 similar code to character type repeats - written out again for speed.
2410 However, if the referenced string is the empty string, always treat
2411 it as matched, any number of times (otherwise there could be infinite
2412 loops). */
2413
2414 case OP_REF:
2415 case OP_REFI:
2416 caseless = op == OP_REFI;
2417 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2418 ecode += 3;
2419
2420 /* If the reference is unset, there are two possibilities:
2421
2422 (a) In the default, Perl-compatible state, set the length negative;
2423 this ensures that every attempt at a match fails. We can't just fail
2424 here, because of the possibility of quantifiers with zero minima.
2425
2426 (b) If the JavaScript compatibility flag is set, set the length to zero
2427 so that the back reference matches an empty string.
2428
2429 Otherwise, set the length to the length of what was matched by the
2430 referenced subpattern. */
2431
2432 if (offset >= offset_top || md->offset_vector[offset] < 0)
2433 length = (md->jscript_compat)? 0 : -1;
2434 else
2435 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2436
2437 /* Set up for repetition, or handle the non-repeated case */
2438
2439 switch (*ecode)
2440 {
2441 case OP_CRSTAR:
2442 case OP_CRMINSTAR:
2443 case OP_CRPLUS:
2444 case OP_CRMINPLUS:
2445 case OP_CRQUERY:
2446 case OP_CRMINQUERY:
2447 c = *ecode++ - OP_CRSTAR;
2448 minimize = (c & 1) != 0;
2449 min = rep_min[c]; /* Pick up values from tables; */
2450 max = rep_max[c]; /* zero for max => infinity */
2451 if (max == 0) max = INT_MAX;
2452 break;
2453
2454 case OP_CRRANGE:
2455 case OP_CRMINRANGE:
2456 minimize = (*ecode == OP_CRMINRANGE);
2457 min = GET2(ecode, 1);
2458 max = GET2(ecode, 3);
2459 if (max == 0) max = INT_MAX;
2460 ecode += 5;
2461 break;
2462
2463 default: /* No repeat follows */
2464 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2465 {
2466 CHECK_PARTIAL();
2467 MRRETURN(MATCH_NOMATCH);
2468 }
2469 eptr += length;
2470 continue; /* With the main loop */
2471 }
2472
2473 /* Handle repeated back references. If the length of the reference is
2474 zero, just continue with the main loop. */
2475
2476 if (length == 0) continue;
2477
2478 /* First, ensure the minimum number of matches are present. We get back
2479 the length of the reference string explicitly rather than passing the
2480 address of eptr, so that eptr can be a register variable. */
2481
2482 for (i = 1; i <= min; i++)
2483 {
2484 int slength;
2485 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2486 {
2487 CHECK_PARTIAL();
2488 MRRETURN(MATCH_NOMATCH);
2489 }
2490 eptr += slength;
2491 }
2492
2493 /* If min = max, continue at the same level without recursion.
2494 They are not both allowed to be zero. */
2495
2496 if (min == max) continue;
2497
2498 /* If minimizing, keep trying and advancing the pointer */
2499
2500 if (minimize)
2501 {
2502 for (fi = min;; fi++)
2503 {
2504 int slength;
2505 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2506 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2507 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2508 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2509 {
2510 CHECK_PARTIAL();
2511 MRRETURN(MATCH_NOMATCH);
2512 }
2513 eptr += slength;
2514 }
2515 /* Control never gets here */
2516 }
2517
2518 /* If maximizing, find the longest string and work backwards */
2519
2520 else
2521 {
2522 pp = eptr;
2523 for (i = min; i < max; i++)
2524 {
2525 int slength;
2526 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2527 {
2528 CHECK_PARTIAL();
2529 break;
2530 }
2531 eptr += slength;
2532 }
2533 while (eptr >= pp)
2534 {
2535 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2536 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2537 eptr -= length;
2538 }
2539 MRRETURN(MATCH_NOMATCH);
2540 }
2541 /* Control never gets here */
2542
2543 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2544 used when all the characters in the class have values in the range 0-255,
2545 and either the matching is caseful, or the characters are in the range
2546 0-127 when UTF-8 processing is enabled. The only difference between
2547 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2548 encountered.
2549
2550 First, look past the end of the item to see if there is repeat information
2551 following. Then obey similar code to character type repeats - written out
2552 again for speed. */
2553
2554 case OP_NCLASS:
2555 case OP_CLASS:
2556 {
2557 data = ecode + 1; /* Save for matching */
2558 ecode += 33; /* Advance past the item */
2559
2560 switch (*ecode)
2561 {
2562 case OP_CRSTAR:
2563 case OP_CRMINSTAR:
2564 case OP_CRPLUS:
2565 case OP_CRMINPLUS:
2566 case OP_CRQUERY:
2567 case OP_CRMINQUERY:
2568 c = *ecode++ - OP_CRSTAR;
2569 minimize = (c & 1) != 0;
2570 min = rep_min[c]; /* Pick up values from tables; */
2571 max = rep_max[c]; /* zero for max => infinity */
2572 if (max == 0) max = INT_MAX;
2573 break;
2574
2575 case OP_CRRANGE:
2576 case OP_CRMINRANGE:
2577 minimize = (*ecode == OP_CRMINRANGE);
2578 min = GET2(ecode, 1);
2579 max = GET2(ecode, 3);
2580 if (max == 0) max = INT_MAX;
2581 ecode += 5;
2582 break;
2583
2584 default: /* No repeat follows */
2585 min = max = 1;
2586 break;
2587 }
2588
2589 /* First, ensure the minimum number of matches are present. */
2590
2591 #ifdef SUPPORT_UTF8
2592 /* UTF-8 mode */
2593 if (utf8)
2594 {
2595 for (i = 1; i <= min; i++)
2596 {
2597 if (eptr >= md->end_subject)
2598 {
2599 SCHECK_PARTIAL();
2600 MRRETURN(MATCH_NOMATCH);
2601 }
2602 GETCHARINC(c, eptr);
2603 if (c > 255)
2604 {
2605 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2606 }
2607 else
2608 {
2609 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2610 }
2611 }
2612 }
2613 else
2614 #endif
2615 /* Not UTF-8 mode */
2616 {
2617 for (i = 1; i <= min; i++)
2618 {
2619 if (eptr >= md->end_subject)
2620 {
2621 SCHECK_PARTIAL();
2622 MRRETURN(MATCH_NOMATCH);
2623 }
2624 c = *eptr++;
2625 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2626 }
2627 }
2628
2629 /* If max == min we can continue with the main loop without the
2630 need to recurse. */
2631
2632 if (min == max) continue;
2633
2634 /* If minimizing, keep testing the rest of the expression and advancing
2635 the pointer while it matches the class. */
2636
2637 if (minimize)
2638 {
2639 #ifdef SUPPORT_UTF8
2640 /* UTF-8 mode */
2641 if (utf8)
2642 {
2643 for (fi = min;; fi++)
2644 {
2645 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2646 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2647 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2648 if (eptr >= md->end_subject)
2649 {
2650 SCHECK_PARTIAL();
2651 MRRETURN(MATCH_NOMATCH);
2652 }
2653 GETCHARINC(c, eptr);
2654 if (c > 255)
2655 {
2656 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2657 }
2658 else
2659 {
2660 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2661 }
2662 }
2663 }
2664 else
2665 #endif
2666 /* Not UTF-8 mode */
2667 {
2668 for (fi = min;; fi++)
2669 {
2670 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2671 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2672 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2673 if (eptr >= md->end_subject)
2674 {
2675 SCHECK_PARTIAL();
2676 MRRETURN(MATCH_NOMATCH);
2677 }
2678 c = *eptr++;
2679 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2680 }
2681 }
2682 /* Control never gets here */
2683 }
2684
2685 /* If maximizing, find the longest possible run, then work backwards. */
2686
2687 else
2688 {
2689 pp = eptr;
2690
2691 #ifdef SUPPORT_UTF8
2692 /* UTF-8 mode */
2693 if (utf8)
2694 {
2695 for (i = min; i < max; i++)
2696 {
2697 int len = 1;
2698 if (eptr >= md->end_subject)
2699 {
2700 SCHECK_PARTIAL();
2701 break;
2702 }
2703 GETCHARLEN(c, eptr, len);
2704 if (c > 255)
2705 {
2706 if (op == OP_CLASS) break;
2707 }
2708 else
2709 {
2710 if ((data[c/8] & (1 << (c&7))) == 0) break;
2711 }
2712 eptr += len;
2713 }
2714 for (;;)
2715 {
2716 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2717 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2718 if (eptr-- == pp) break; /* Stop if tried at original pos */
2719 BACKCHAR(eptr);
2720 }
2721 }
2722 else
2723 #endif
2724 /* Not UTF-8 mode */
2725 {
2726 for (i = min; i < max; i++)
2727 {
2728 if (eptr >= md->end_subject)
2729 {
2730 SCHECK_PARTIAL();
2731 break;
2732 }
2733 c = *eptr;
2734 if ((data[c/8] & (1 << (c&7))) == 0) break;
2735 eptr++;
2736 }
2737 while (eptr >= pp)
2738 {
2739 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2741 eptr--;
2742 }
2743 }
2744
2745 MRRETURN(MATCH_NOMATCH);
2746 }
2747 }
2748 /* Control never gets here */
2749
2750
2751 /* Match an extended character class. This opcode is encountered only
2752 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2753 mode, because Unicode properties are supported in non-UTF-8 mode. */
2754
2755 #ifdef SUPPORT_UTF8
2756 case OP_XCLASS:
2757 {
2758 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2759 ecode += GET(ecode, 1); /* Advance past the item */
2760
2761 switch (*ecode)
2762 {
2763 case OP_CRSTAR:
2764 case OP_CRMINSTAR:
2765 case OP_CRPLUS:
2766 case OP_CRMINPLUS:
2767 case OP_CRQUERY:
2768 case OP_CRMINQUERY:
2769 c = *ecode++ - OP_CRSTAR;
2770 minimize = (c & 1) != 0;
2771 min = rep_min[c]; /* Pick up values from tables; */
2772 max = rep_max[c]; /* zero for max => infinity */
2773 if (max == 0) max = INT_MAX;
2774 break;
2775
2776 case OP_CRRANGE:
2777 case OP_CRMINRANGE:
2778 minimize = (*ecode == OP_CRMINRANGE);
2779 min = GET2(ecode, 1);
2780 max = GET2(ecode, 3);
2781 if (max == 0) max = INT_MAX;
2782 ecode += 5;
2783 break;
2784
2785 default: /* No repeat follows */
2786 min = max = 1;
2787 break;
2788 }
2789
2790 /* First, ensure the minimum number of matches are present. */
2791
2792 for (i = 1; i <= min; i++)
2793 {
2794 if (eptr >= md->end_subject)
2795 {
2796 SCHECK_PARTIAL();
2797 MRRETURN(MATCH_NOMATCH);
2798 }
2799 GETCHARINCTEST(c, eptr);
2800 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2801 }
2802
2803 /* If max == min we can continue with the main loop without the
2804 need to recurse. */
2805
2806 if (min == max) continue;
2807
2808 /* If minimizing, keep testing the rest of the expression and advancing
2809 the pointer while it matches the class. */
2810
2811 if (minimize)
2812 {
2813 for (fi = min;; fi++)
2814 {
2815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2817 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2818 if (eptr >= md->end_subject)
2819 {
2820 SCHECK_PARTIAL();
2821 MRRETURN(MATCH_NOMATCH);
2822 }
2823 GETCHARINCTEST(c, eptr);
2824 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2825 }
2826 /* Control never gets here */
2827 }
2828
2829 /* If maximizing, find the longest possible run, then work backwards. */
2830
2831 else
2832 {
2833 pp = eptr;
2834 for (i = min; i < max; i++)
2835 {
2836 int len = 1;
2837 if (eptr >= md->end_subject)
2838 {
2839 SCHECK_PARTIAL();
2840 break;
2841 }
2842 GETCHARLENTEST(c, eptr, len);
2843 if (!_pcre_xclass(c, data)) break;
2844 eptr += len;
2845 }
2846 for(;;)
2847 {
2848 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2850 if (eptr-- == pp) break; /* Stop if tried at original pos */
2851 if (utf8) BACKCHAR(eptr);
2852 }
2853 MRRETURN(MATCH_NOMATCH);
2854 }
2855
2856 /* Control never gets here */
2857 }
2858 #endif /* End of XCLASS */
2859
2860 /* Match a single character, casefully */
2861
2862 case OP_CHAR:
2863 #ifdef SUPPORT_UTF8
2864 if (utf8)
2865 {
2866 length = 1;
2867 ecode++;
2868 GETCHARLEN(fc, ecode, length);
2869 if (length > md->end_subject - eptr)
2870 {
2871 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2872 MRRETURN(MATCH_NOMATCH);
2873 }
2874 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2875 }
2876 else
2877 #endif
2878
2879 /* Non-UTF-8 mode */
2880 {
2881 if (md->end_subject - eptr < 1)
2882 {
2883 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2884 MRRETURN(MATCH_NOMATCH);
2885 }
2886 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2887 ecode += 2;
2888 }
2889 break;
2890
2891 /* Match a single character, caselessly */
2892
2893 case OP_CHARI:
2894 #ifdef SUPPORT_UTF8
2895 if (utf8)
2896 {
2897 length = 1;
2898 ecode++;
2899 GETCHARLEN(fc, ecode, length);
2900
2901 if (length > md->end_subject - eptr)
2902 {
2903 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2904 MRRETURN(MATCH_NOMATCH);
2905 }
2906
2907 /* If the pattern character's value is < 128, we have only one byte, and
2908 can use the fast lookup table. */
2909
2910 if (fc < 128)
2911 {
2912 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2913 }
2914
2915 /* Otherwise we must pick up the subject character */
2916
2917 else
2918 {
2919 unsigned int dc;
2920 GETCHARINC(dc, eptr);
2921 ecode += length;
2922
2923 /* If we have Unicode property support, we can use it to test the other
2924 case of the character, if there is one. */
2925
2926 if (fc != dc)
2927 {
2928 #ifdef SUPPORT_UCP
2929 if (dc != UCD_OTHERCASE(fc))
2930 #endif
2931 MRRETURN(MATCH_NOMATCH);
2932 }
2933 }
2934 }
2935 else
2936 #endif /* SUPPORT_UTF8 */
2937
2938 /* Non-UTF-8 mode */
2939 {
2940 if (md->end_subject - eptr < 1)
2941 {
2942 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2943 MRRETURN(MATCH_NOMATCH);
2944 }
2945 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2946 ecode += 2;
2947 }
2948 break;
2949
2950 /* Match a single character repeatedly. */
2951
2952 case OP_EXACT:
2953 case OP_EXACTI:
2954 min = max = GET2(ecode, 1);
2955 ecode += 3;
2956 goto REPEATCHAR;
2957
2958 case OP_POSUPTO:
2959 case OP_POSUPTOI:
2960 possessive = TRUE;
2961 /* Fall through */
2962
2963 case OP_UPTO:
2964 case OP_UPTOI:
2965 case OP_MINUPTO:
2966 case OP_MINUPTOI:
2967 min = 0;
2968 max = GET2(ecode, 1);
2969 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2970 ecode += 3;
2971 goto REPEATCHAR;
2972
2973 case OP_POSSTAR:
2974 case OP_POSSTARI:
2975 possessive = TRUE;
2976 min = 0;
2977 max = INT_MAX;
2978 ecode++;
2979 goto REPEATCHAR;
2980
2981 case OP_POSPLUS:
2982 case OP_POSPLUSI:
2983 possessive = TRUE;
2984 min = 1;
2985 max = INT_MAX;
2986 ecode++;
2987 goto REPEATCHAR;
2988
2989 case OP_POSQUERY:
2990 case OP_POSQUERYI:
2991 possessive = TRUE;
2992 min = 0;
2993 max = 1;
2994 ecode++;
2995 goto REPEATCHAR;
2996
2997 case OP_STAR:
2998 case OP_STARI:
2999 case OP_MINSTAR:
3000 case OP_MINSTARI:
3001 case OP_PLUS:
3002 case OP_PLUSI:
3003 case OP_MINPLUS:
3004 case OP_MINPLUSI:
3005 case OP_QUERY:
3006 case OP_QUERYI:
3007 case OP_MINQUERY:
3008 case OP_MINQUERYI:
3009 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3010 minimize = (c & 1) != 0;
3011 min = rep_min[c]; /* Pick up values from tables; */
3012 max = rep_max[c]; /* zero for max => infinity */
3013 if (max == 0) max = INT_MAX;
3014
3015 /* Common code for all repeated single-character matches. */
3016
3017 REPEATCHAR:
3018 #ifdef SUPPORT_UTF8
3019 if (utf8)
3020 {
3021 length = 1;
3022 charptr = ecode;
3023 GETCHARLEN(fc, ecode, length);
3024 ecode += length;
3025
3026 /* Handle multibyte character matching specially here. There is
3027 support for caseless matching if UCP support is present. */
3028
3029 if (length > 1)
3030 {
3031 #ifdef SUPPORT_UCP
3032 unsigned int othercase;
3033 if (op >= OP_STARI && /* Caseless */
3034 (othercase = UCD_OTHERCASE(fc)) != fc)
3035 oclength = _pcre_ord2utf8(othercase, occhars);
3036 else oclength = 0;
3037 #endif /* SUPPORT_UCP */
3038
3039 for (i = 1; i <= min; i++)
3040 {
3041 if (eptr <= md->end_subject - length &&
3042 memcmp(eptr, charptr, length) == 0) eptr += length;
3043 #ifdef SUPPORT_UCP
3044 else if (oclength > 0 &&
3045 eptr <= md->end_subject - oclength &&
3046 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3047 #endif /* SUPPORT_UCP */
3048 else
3049 {
3050 CHECK_PARTIAL();
3051 MRRETURN(MATCH_NOMATCH);
3052 }
3053 }
3054
3055 if (min == max) continue;
3056
3057 if (minimize)
3058 {
3059 for (fi = min;; fi++)
3060 {
3061 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3062 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3063 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3064 if (eptr <= md->end_subject - length &&
3065 memcmp(eptr, charptr, length) == 0) eptr += length;
3066 #ifdef SUPPORT_UCP
3067 else if (oclength > 0 &&
3068 eptr <= md->end_subject - oclength &&
3069 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3070 #endif /* SUPPORT_UCP */
3071 else
3072 {
3073 CHECK_PARTIAL();
3074 MRRETURN(MATCH_NOMATCH);
3075 }
3076 }
3077 /* Control never gets here */
3078 }
3079
3080 else /* Maximize */
3081 {
3082 pp = eptr;
3083 for (i = min; i < max; i++)
3084 {
3085 if (eptr <= md->end_subject - length &&
3086 memcmp(eptr, charptr, length) == 0) eptr += length;
3087 #ifdef SUPPORT_UCP
3088 else if (oclength > 0 &&
3089 eptr <= md->end_subject - oclength &&
3090 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3091 #endif /* SUPPORT_UCP */
3092 else
3093 {
3094 CHECK_PARTIAL();
3095 break;
3096 }
3097 }
3098
3099 if (possessive) continue;
3100
3101 for(;;)
3102 {
3103 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3104 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3105 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3106 #ifdef SUPPORT_UCP
3107 eptr--;
3108 BACKCHAR(eptr);
3109 #else /* without SUPPORT_UCP */
3110 eptr -= length;
3111 #endif /* SUPPORT_UCP */
3112 }
3113 }
3114 /* Control never gets here */
3115 }
3116
3117 /* If the length of a UTF-8 character is 1, we fall through here, and
3118 obey the code as for non-UTF-8 characters below, though in this case the
3119 value of fc will always be < 128. */
3120 }
3121 else
3122 #endif /* SUPPORT_UTF8 */
3123
3124 /* When not in UTF-8 mode, load a single-byte character. */
3125
3126 fc = *ecode++;
3127
3128 /* The value of fc at this point is always less than 256, though we may or
3129 may not be in UTF-8 mode. The code is duplicated for the caseless and
3130 caseful cases, for speed, since matching characters is likely to be quite
3131 common. First, ensure the minimum number of matches are present. If min =
3132 max, continue at the same level without recursing. Otherwise, if
3133 minimizing, keep trying the rest of the expression and advancing one
3134 matching character if failing, up to the maximum. Alternatively, if
3135 maximizing, find the maximum number of characters and work backwards. */
3136
3137 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3138 max, eptr));
3139
3140 if (op >= OP_STARI) /* Caseless */
3141 {
3142 fc = md->lcc[fc];
3143 for (i = 1; i <= min; i++)
3144 {
3145 if (eptr >= md->end_subject)
3146 {
3147 SCHECK_PARTIAL();
3148 MRRETURN(MATCH_NOMATCH);
3149 }
3150 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3151 }
3152 if (min == max) continue;
3153 if (minimize)
3154 {
3155 for (fi = min;; fi++)
3156 {
3157 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3158 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3159 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3160 if (eptr >= md->end_subject)
3161 {
3162 SCHECK_PARTIAL();
3163 MRRETURN(MATCH_NOMATCH);
3164 }
3165 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3166 }
3167 /* Control never gets here */
3168 }
3169 else /* Maximize */
3170 {
3171 pp = eptr;
3172 for (i = min; i < max; i++)
3173 {
3174 if (eptr >= md->end_subject)
3175 {
3176 SCHECK_PARTIAL();
3177 break;
3178 }
3179 if (fc != md->lcc[*eptr]) break;
3180 eptr++;
3181 }
3182
3183 if (possessive) continue;
3184
3185 while (eptr >= pp)
3186 {
3187 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3188 eptr--;
3189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3190 }
3191 MRRETURN(MATCH_NOMATCH);
3192 }
3193 /* Control never gets here */
3194 }
3195
3196 /* Caseful comparisons (includes all multi-byte characters) */
3197
3198 else
3199 {
3200 for (i = 1; i <= min; i++)
3201 {
3202 if (eptr >= md->end_subject)
3203 {
3204 SCHECK_PARTIAL();
3205 MRRETURN(MATCH_NOMATCH);
3206 }
3207 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3208 }
3209
3210 if (min == max) continue;
3211
3212 if (minimize)
3213 {
3214 for (fi = min;; fi++)
3215 {
3216 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3217 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3218 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3219 if (eptr >= md->end_subject)
3220 {
3221 SCHECK_PARTIAL();
3222 MRRETURN(MATCH_NOMATCH);
3223 }
3224 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3225 }
3226 /* Control never gets here */
3227 }
3228 else /* Maximize */
3229 {
3230 pp = eptr;
3231 for (i = min; i < max; i++)
3232 {
3233 if (eptr >= md->end_subject)
3234 {
3235 SCHECK_PARTIAL();
3236 break;
3237 }
3238 if (fc != *eptr) break;
3239 eptr++;
3240 }
3241 if (possessive) continue;
3242
3243 while (eptr >= pp)
3244 {
3245 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3246 eptr--;
3247 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3248 }
3249 MRRETURN(MATCH_NOMATCH);
3250 }
3251 }
3252 /* Control never gets here */
3253
3254 /* Match a negated single one-byte character. The character we are
3255 checking can be multibyte. */
3256
3257 case OP_NOT:
3258 case OP_NOTI:
3259 if (eptr >= md->end_subject)
3260 {
3261 SCHECK_PARTIAL();
3262 MRRETURN(MATCH_NOMATCH);
3263 }
3264 ecode++;
3265 GETCHARINCTEST(c, eptr);
3266 if (op == OP_NOTI) /* The caseless case */
3267 {
3268 #ifdef SUPPORT_UTF8
3269 if (c < 256)
3270 #endif
3271 c = md->lcc[c];
3272 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3273 }
3274 else /* Caseful */
3275 {
3276 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3277 }
3278 break;
3279
3280 /* Match a negated single one-byte character repeatedly. This is almost a
3281 repeat of the code for a repeated single character, but I haven't found a
3282 nice way of commoning these up that doesn't require a test of the
3283 positive/negative option for each character match. Maybe that wouldn't add
3284 very much to the time taken, but character matching *is* what this is all
3285 about... */
3286
3287 case OP_NOTEXACT:
3288 case OP_NOTEXACTI:
3289 min = max = GET2(ecode, 1);
3290 ecode += 3;
3291 goto REPEATNOTCHAR;
3292
3293 case OP_NOTUPTO:
3294 case OP_NOTUPTOI:
3295 case OP_NOTMINUPTO:
3296 case OP_NOTMINUPTOI:
3297 min = 0;
3298 max = GET2(ecode, 1);
3299 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3300 ecode += 3;
3301 goto REPEATNOTCHAR;
3302
3303 case OP_NOTPOSSTAR:
3304 case OP_NOTPOSSTARI:
3305 possessive = TRUE;
3306 min = 0;
3307 max = INT_MAX;
3308 ecode++;
3309 goto REPEATNOTCHAR;
3310
3311 case OP_NOTPOSPLUS:
3312 case OP_NOTPOSPLUSI:
3313 possessive = TRUE;
3314 min = 1;
3315 max = INT_MAX;
3316 ecode++;
3317 goto REPEATNOTCHAR;
3318
3319 case OP_NOTPOSQUERY:
3320 case OP_NOTPOSQUERYI:
3321 possessive = TRUE;
3322 min = 0;
3323 max = 1;
3324 ecode++;
3325 goto REPEATNOTCHAR;
3326
3327 case OP_NOTPOSUPTO:
3328 case OP_NOTPOSUPTOI:
3329 possessive = TRUE;
3330 min = 0;
3331 max = GET2(ecode, 1);
3332 ecode += 3;
3333 goto REPEATNOTCHAR;
3334
3335 case OP_NOTSTAR:
3336 case OP_NOTSTARI:
3337 case OP_NOTMINSTAR:
3338 case OP_NOTMINSTARI:
3339 case OP_NOTPLUS:
3340 case OP_NOTPLUSI:
3341 case OP_NOTMINPLUS:
3342 case OP_NOTMINPLUSI:
3343 case OP_NOTQUERY:
3344 case OP_NOTQUERYI:
3345 case OP_NOTMINQUERY:
3346 case OP_NOTMINQUERYI:
3347 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3348 minimize = (c & 1) != 0;
3349 min = rep_min[c]; /* Pick up values from tables; */
3350 max = rep_max[c]; /* zero for max => infinity */
3351 if (max == 0) max = INT_MAX;
3352
3353 /* Common code for all repeated single-byte matches. */
3354
3355 REPEATNOTCHAR:
3356 fc = *ecode++;
3357
3358 /* The code is duplicated for the caseless and caseful cases, for speed,
3359 since matching characters is likely to be quite common. First, ensure the
3360 minimum number of matches are present. If min = max, continue at the same
3361 level without recursing. Otherwise, if minimizing, keep trying the rest of
3362 the expression and advancing one matching character if failing, up to the
3363 maximum. Alternatively, if maximizing, find the maximum number of
3364 characters and work backwards. */
3365
3366 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3367 max, eptr));
3368
3369 if (op >= OP_NOTSTARI) /* Caseless */
3370 {
3371 fc = md->lcc[fc];
3372
3373 #ifdef SUPPORT_UTF8
3374 /* UTF-8 mode */
3375 if (utf8)
3376 {
3377 register unsigned int d;
3378 for (i = 1; i <= min; i++)
3379 {
3380 if (eptr >= md->end_subject)
3381 {
3382 SCHECK_PARTIAL();
3383 MRRETURN(MATCH_NOMATCH);
3384 }
3385 GETCHARINC(d, eptr);
3386 if (d < 256) d = md->lcc[d];
3387 if (fc == d) MRRETURN(MATCH_NOMATCH);
3388 }
3389 }
3390 else
3391 #endif
3392
3393 /* Not UTF-8 mode */
3394 {
3395 for (i = 1; i <= min; i++)
3396 {
3397 if (eptr >= md->end_subject)
3398 {
3399 SCHECK_PARTIAL();
3400 MRRETURN(MATCH_NOMATCH);
3401 }
3402 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3403 }
3404 }
3405
3406 if (min == max) continue;
3407
3408 if (minimize)
3409 {
3410 #ifdef SUPPORT_UTF8
3411 /* UTF-8 mode */
3412 if (utf8)
3413 {
3414 register unsigned int d;
3415 for (fi = min;; fi++)
3416 {
3417 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3418 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3420 if (eptr >= md->end_subject)
3421 {
3422 SCHECK_PARTIAL();
3423 MRRETURN(MATCH_NOMATCH);
3424 }
3425 GETCHARINC(d, eptr);
3426 if (d < 256) d = md->lcc[d];
3427 if (fc == d) MRRETURN(MATCH_NOMATCH);
3428 }
3429 }
3430 else
3431 #endif
3432 /* Not UTF-8 mode */
3433 {
3434 for (fi = min;; fi++)
3435 {
3436 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3437 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3439 if (eptr >= md->end_subject)
3440 {
3441 SCHECK_PARTIAL();
3442 MRRETURN(MATCH_NOMATCH);
3443 }
3444 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3445 }
3446 }
3447 /* Control never gets here */
3448 }
3449
3450 /* Maximize case */
3451
3452 else
3453 {
3454 pp = eptr;
3455
3456 #ifdef SUPPORT_UTF8
3457 /* UTF-8 mode */
3458 if (utf8)
3459 {
3460 register unsigned int d;
3461 for (i = min; i < max; i++)
3462 {
3463 int len = 1;
3464 if (eptr >= md->end_subject)
3465 {
3466 SCHECK_PARTIAL();
3467 break;
3468 }
3469 GETCHARLEN(d, eptr, len);
3470 if (d < 256) d = md->lcc[d];
3471 if (fc == d) break;
3472 eptr += len;
3473 }
3474 if (possessive) continue;
3475 for(;;)
3476 {
3477 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3478 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479 if (eptr-- == pp) break; /* Stop if tried at original pos */
3480 BACKCHAR(eptr);
3481 }
3482 }
3483 else
3484 #endif
3485 /* Not UTF-8 mode */
3486 {
3487 for (i = min; i < max; i++)
3488 {
3489 if (eptr >= md->end_subject)
3490 {
3491 SCHECK_PARTIAL();
3492 break;
3493 }
3494 if (fc == md->lcc[*eptr]) break;
3495 eptr++;
3496 }
3497 if (possessive) continue;
3498 while (eptr >= pp)
3499 {
3500 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3501 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3502 eptr--;
3503 }
3504 }
3505
3506 MRRETURN(MATCH_NOMATCH);
3507 }
3508 /* Control never gets here */
3509 }
3510
3511 /* Caseful comparisons */
3512
3513 else
3514 {
3515 #ifdef SUPPORT_UTF8
3516 /* UTF-8 mode */
3517 if (utf8)
3518 {
3519 register unsigned int d;
3520 for (i = 1; i <= min; i++)
3521 {
3522 if (eptr >= md->end_subject)
3523 {
3524 SCHECK_PARTIAL();
3525 MRRETURN(MATCH_NOMATCH);
3526 }
3527 GETCHARINC(d, eptr);
3528 if (fc == d) MRRETURN(MATCH_NOMATCH);
3529 }
3530 }
3531 else
3532 #endif
3533 /* Not UTF-8 mode */
3534 {
3535 for (i = 1; i <= min; i++)
3536 {
3537 if (eptr >= md->end_subject)
3538 {
3539 SCHECK_PARTIAL();
3540 MRRETURN(MATCH_NOMATCH);
3541 }
3542 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3543 }
3544 }
3545
3546 if (min == max) continue;
3547
3548 if (minimize)
3549 {
3550 #ifdef SUPPORT_UTF8
3551 /* UTF-8 mode */
3552 if (utf8)
3553 {
3554 register unsigned int d;
3555 for (fi = min;; fi++)
3556 {
3557 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3559 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3560 if (eptr >= md->end_subject)
3561 {
3562 SCHECK_PARTIAL();
3563 MRRETURN(MATCH_NOMATCH);
3564 }
3565 GETCHARINC(d, eptr);
3566 if (fc == d) MRRETURN(MATCH_NOMATCH);
3567 }
3568 }
3569 else
3570 #endif
3571 /* Not UTF-8 mode */
3572 {
3573 for (fi = min;; fi++)
3574 {
3575 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3577 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3578 if (eptr >= md->end_subject)
3579 {
3580 SCHECK_PARTIAL();
3581 MRRETURN(MATCH_NOMATCH);
3582 }
3583 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3584 }
3585 }
3586 /* Control never gets here */
3587 }
3588
3589 /* Maximize case */
3590
3591 else
3592 {
3593 pp = eptr;
3594
3595 #ifdef SUPPORT_UTF8
3596 /* UTF-8 mode */
3597 if (utf8)
3598 {
3599 register unsigned int d;
3600 for (i = min; i < max; i++)
3601 {
3602 int len = 1;
3603 if (eptr >= md->end_subject)
3604 {
3605 SCHECK_PARTIAL();
3606 break;
3607 }
3608 GETCHARLEN(d, eptr, len);
3609 if (fc == d) break;
3610 eptr += len;
3611 }
3612 if (possessive) continue;
3613 for(;;)
3614 {
3615 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3616 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3617 if (eptr-- == pp) break; /* Stop if tried at original pos */
3618 BACKCHAR(eptr);
3619 }
3620 }
3621 else
3622 #endif
3623 /* Not UTF-8 mode */
3624 {
3625 for (i = min; i < max; i++)
3626 {
3627 if (eptr >= md->end_subject)
3628 {
3629 SCHECK_PARTIAL();
3630 break;
3631 }
3632 if (fc == *eptr) break;
3633 eptr++;
3634 }
3635 if (possessive) continue;
3636 while (eptr >= pp)
3637 {
3638 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3640 eptr--;
3641 }
3642 }
3643
3644 MRRETURN(MATCH_NOMATCH);
3645 }
3646 }
3647 /* Control never gets here */
3648
3649 /* Match a single character type repeatedly; several different opcodes
3650 share code. This is very similar to the code for single characters, but we
3651 repeat it in the interests of efficiency. */
3652
3653 case OP_TYPEEXACT:
3654 min = max = GET2(ecode, 1);
3655 minimize = TRUE;
3656 ecode += 3;
3657 goto REPEATTYPE;
3658
3659 case OP_TYPEUPTO:
3660 case OP_TYPEMINUPTO:
3661 min = 0;
3662 max = GET2(ecode, 1);
3663 minimize = *ecode == OP_TYPEMINUPTO;
3664 ecode += 3;
3665 goto REPEATTYPE;
3666
3667 case OP_TYPEPOSSTAR:
3668 possessive = TRUE;
3669 min = 0;
3670 max = INT_MAX;
3671 ecode++;
3672 goto REPEATTYPE;
3673
3674 case OP_TYPEPOSPLUS:
3675 possessive = TRUE;
3676 min = 1;
3677 max = INT_MAX;
3678 ecode++;
3679 goto REPEATTYPE;
3680
3681 case OP_TYPEPOSQUERY:
3682 possessive = TRUE;
3683 min = 0;
3684 max = 1;
3685 ecode++;
3686 goto REPEATTYPE;
3687
3688 case OP_TYPEPOSUPTO:
3689 possessive = TRUE;
3690 min = 0;
3691 max = GET2(ecode, 1);
3692 ecode += 3;
3693 goto REPEATTYPE;
3694
3695 case OP_TYPESTAR:
3696 case OP_TYPEMINSTAR:
3697 case OP_TYPEPLUS:
3698 case OP_TYPEMINPLUS:
3699 case OP_TYPEQUERY:
3700 case OP_TYPEMINQUERY:
3701 c = *ecode++ - OP_TYPESTAR;
3702 minimize = (c & 1) != 0;
3703 min = rep_min[c]; /* Pick up values from tables; */
3704 max = rep_max[c]; /* zero for max => infinity */
3705 if (max == 0) max = INT_MAX;
3706
3707 /* Common code for all repeated single character type matches. Note that
3708 in UTF-8 mode, '.', the negated types, \h, \v, \R, and the property types can
3709 match multibyte characters; only \d, \s, and \w are always one byte long. */
3710
3711 REPEATTYPE:
3712 ctype = *ecode++; /* Code for the character type */
3713
3714 #ifdef SUPPORT_UCP
3715 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3716 {
3717 prop_fail_result = ctype == OP_NOTPROP;
3718 prop_type = *ecode++;
3719 prop_value = *ecode++;
3720 }
3721 else prop_type = -1;
3722 #endif
3723
3724 /* First, ensure the minimum number of matches are present. Use inline
3725 code for maximizing the speed, and do the type test once at the start
3726 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3727 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3728 and single-bytes. */
3729
3730 if (min > 0)
3731 {
3732 #ifdef SUPPORT_UCP
3733 if (prop_type >= 0)
3734 {
3735 switch(prop_type)
3736 {
3737 case PT_ANY:
3738 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3739 for (i = 1; i <= min; i++)
3740 {
3741 if (eptr >= md->end_subject)
3742 {
3743 SCHECK_PARTIAL();
3744 MRRETURN(MATCH_NOMATCH);
3745 }
3746 GETCHARINCTEST(c, eptr);
3747 }
3748 break;
3749
3750 case PT_LAMP:
3751 for (i = 1; i <= min; i++)
3752 {
3753 if (eptr >= md->end_subject)
3754 {
3755 SCHECK_PARTIAL();
3756 MRRETURN(MATCH_NOMATCH);
3757 }
3758 GETCHARINCTEST(c, eptr);
3759 prop_chartype = UCD_CHARTYPE(c);
3760 if ((prop_chartype == ucp_Lu ||
3761 prop_chartype == ucp_Ll ||
3762 prop_chartype == ucp_Lt) == prop_fail_result)
3763 MRRETURN(MATCH_NOMATCH);
3764 }
3765 break;
3766
3767 case PT_GC:
3768 for (i = 1; i <= min; i++)
3769 {
3770 if (eptr >= md->end_subject)
3771 {
3772 SCHECK_PARTIAL();
3773 MRRETURN(MATCH_NOMATCH);
3774 }
3775 GETCHARINCTEST(c, eptr);
3776 prop_category = UCD_CATEGORY(c);
3777 if ((prop_category == prop_value) == prop_fail_result)
3778 MRRETURN(MATCH_NOMATCH);
3779 }
3780 break;
3781
3782 case PT_PC:
3783 for (i = 1; i <= min; i++)
3784 {
3785 if (eptr >= md->end_subject)
3786 {
3787 SCHECK_PARTIAL();
3788 MRRETURN(MATCH_NOMATCH);
3789 }
3790 GETCHARINCTEST(c, eptr);
3791 prop_chartype = UCD_CHARTYPE(c);
3792 if ((prop_chartype == prop_value) == prop_fail_result)
3793 MRRETURN(MATCH_NOMATCH);
3794 }
3795 break;
3796
3797 case PT_SC:
3798 for (i = 1; i <= min; i++)
3799 {
3800 if (eptr >= md->end_subject)
3801 {
3802 SCHECK_PARTIAL();
3803 MRRETURN(MATCH_NOMATCH);
3804 }
3805 GETCHARINCTEST(c, eptr);
3806 prop_script = UCD_SCRIPT(c);
3807 if ((prop_script == prop_value) == prop_fail_result)
3808 MRRETURN(MATCH_NOMATCH);
3809 }
3810 break;
3811
3812 case PT_ALNUM:
3813 for (i = 1; i <= min; i++)
3814 {
3815 if (eptr >= md->end_subject)
3816 {
3817 SCHECK_PARTIAL();
3818 MRRETURN(MATCH_NOMATCH);
3819 }
3820 GETCHARINCTEST(c, eptr);
3821 prop_category = UCD_CATEGORY(c);
3822 if ((prop_category == ucp_L || prop_category == ucp_N)
3823 == prop_fail_result)
3824 MRRETURN(MATCH_NOMATCH);
3825 }
3826 break;
3827
3828 case PT_SPACE: /* Perl space */
3829 for (i = 1; i <= min; i++)
3830 {
3831 if (eptr >= md->end_subject)
3832 {
3833 SCHECK_PARTIAL();
3834 MRRETURN(MATCH_NOMATCH);
3835 }
3836 GETCHARINCTEST(c, eptr);
3837 prop_category = UCD_CATEGORY(c);
3838 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3839 c == CHAR_FF || c == CHAR_CR)
3840 == prop_fail_result)
3841 MRRETURN(MATCH_NOMATCH);
3842 }
3843 break;
3844
3845 case PT_PXSPACE: /* POSIX space */
3846 for (i = 1; i <= min; i++)
3847 {
3848 if (eptr >= md->end_subject)
3849 {
3850 SCHECK_PARTIAL();
3851 MRRETURN(MATCH_NOMATCH);
3852 }
3853 GETCHARINCTEST(c, eptr);
3854 prop_category = UCD_CATEGORY(c);
3855 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3856 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3857 == prop_fail_result)
3858 MRRETURN(MATCH_NOMATCH);
3859 }
3860 break;
3861
3862 case PT_WORD:
3863 for (i = 1; i <= min; i++)
3864 {
3865 if (eptr >= md->end_subject)
3866 {
3867 SCHECK_PARTIAL();
3868 MRRETURN(MATCH_NOMATCH);
3869 }
3870 GETCHARINCTEST(c, eptr);
3871 prop_category = UCD_CATEGORY(c);
3872 if ((prop_category == ucp_L || prop_category == ucp_N ||
3873 c == CHAR_UNDERSCORE)
3874 == prop_fail_result)
3875 MRRETURN(MATCH_NOMATCH);
3876 }
3877 break;
3878
3879 /* This should not occur */
3880
3881 default:
3882 RRETURN(PCRE_ERROR_INTERNAL);
3883 }
3884 }
3885
3886 /* Match extended Unicode sequences. We will get here only if the
3887 support is in the binary; otherwise a compile-time error occurs. */
3888
3889 else if (ctype == OP_EXTUNI)
3890 {
3891 for (i = 1; i <= min; i++)
3892 {
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 MRRETURN(MATCH_NOMATCH);
3897 }
3898 GETCHARINCTEST(c, eptr);
3899 prop_category = UCD_CATEGORY(c);
3900 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3901 while (eptr < md->end_subject)
3902 {
3903 int len = 1;
3904 if (!utf8) c = *eptr;
3905 else { GETCHARLEN(c, eptr, len); }
3906 prop_category = UCD_CATEGORY(c);
3907 if (prop_category != ucp_M) break;
3908 eptr += len;
3909 }
3910 }
3911 }
3912
3913 else
3914 #endif /* SUPPORT_UCP */
3915
3916 /* Handle all other cases when the coding is UTF-8 */
3917
3918 #ifdef SUPPORT_UTF8
3919 if (utf8) switch(ctype)
3920 {
3921 case OP_ANY:
3922 for (i = 1; i <= min; i++)
3923 {
3924 if (eptr >= md->end_subject)
3925 {
3926 SCHECK_PARTIAL();
3927 MRRETURN(MATCH_NOMATCH);
3928 }
3929 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3930 eptr++;
3931 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3932 }
3933 break;
3934
3935 case OP_ALLANY:
3936 for (i = 1; i <= min; i++)
3937 {
3938 if (eptr >= md->end_subject)
3939 {
3940 SCHECK_PARTIAL();
3941 MRRETURN(MATCH_NOMATCH);
3942 }
3943 eptr++;
3944 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3945 }
3946 break;
3947
3948 case OP_ANYBYTE:
3949 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3950 eptr += min;
3951 break;
3952
3953 case OP_ANYNL:
3954 for (i = 1; i <= min; i++)
3955 {
3956 if (eptr >= md->end_subject)
3957 {
3958 SCHECK_PARTIAL();
3959 MRRETURN(MATCH_NOMATCH);
3960 }
3961 GETCHARINC(c, eptr);
3962 switch(c)
3963 {
3964 default: MRRETURN(MATCH_NOMATCH);
3965
3966 case 0x000d:
3967 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3968 break;
3969
3970 case 0x000a:
3971 break;
3972
3973 case 0x000b:
3974 case 0x000c:
3975 case 0x0085:
3976 case 0x2028:
3977 case 0x2029:
3978 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3979 break;
3980 }
3981 }
3982 break;
3983
3984 case OP_NOT_HSPACE:
3985 for (i = 1; i <= min; i++)
3986 {
3987 if (eptr >= md->end_subject)
3988 {
3989 SCHECK_PARTIAL();
3990 MRRETURN(MATCH_NOMATCH);
3991 }
3992 GETCHARINC(c, eptr);
3993 switch(c)
3994 {
3995 default: break;
3996 case 0x09: /* HT */
3997 case 0x20: /* SPACE */
3998 case 0xa0: /* NBSP */
3999 case 0x1680: /* OGHAM SPACE MARK */
4000 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4001 case 0x2000: /* EN QUAD */
4002 case 0x2001: /* EM QUAD */
4003 case 0x2002: /* EN SPACE */
4004 case 0x2003: /* EM SPACE */
4005 case 0x2004: /* THREE-PER-EM SPACE */
4006 case 0x2005: /* FOUR-PER-EM SPACE */
4007 case 0x2006: /* SIX-PER-EM SPACE */
4008 case 0x2007: /* FIGURE SPACE */
4009 case 0x2008: /* PUNCTUATION SPACE */
4010 case 0x2009: /* THIN SPACE */
4011 case 0x200A: /* HAIR SPACE */
4012 case 0x202f: /* NARROW NO-BREAK SPACE */
4013 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4014 case 0x3000: /* IDEOGRAPHIC SPACE */
4015 MRRETURN(MATCH_NOMATCH);
4016 }
4017 }
4018 break;
4019
4020 case OP_HSPACE:
4021 for (i = 1; i <= min; i++)
4022 {
4023 if (eptr >= md->end_subject)
4024 {
4025 SCHECK_PARTIAL();
4026 MRRETURN(MATCH_NOMATCH);
4027 }
4028 GETCHARINC(c, eptr);
4029 switch(c)
4030 {
4031 default: MRRETURN(MATCH_NOMATCH);
4032 case 0x09: /* HT */
4033 case 0x20: /* SPACE */
4034 case 0xa0: /* NBSP */
4035 case 0x1680: /* OGHAM SPACE MARK */
4036 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4037 case 0x2000: /* EN QUAD */
4038 case 0x2001: /* EM QUAD */
4039 case 0x2002: /* EN SPACE */
4040 case 0x2003: /* EM SPACE */
4041 case 0x2004: /* THREE-PER-EM SPACE */
4042 case 0x2005: /* FOUR-PER-EM SPACE */
4043 case 0x2006: /* SIX-PER-EM SPACE */
4044 case 0x2007: /* FIGURE SPACE */
4045 case 0x2008: /* PUNCTUATION SPACE */
4046 case 0x2009: /* THIN SPACE */
4047 case 0x200A: /* HAIR SPACE */
4048 case 0x202f: /* NARROW NO-BREAK SPACE */
4049 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4050 case 0x3000: /* IDEOGRAPHIC SPACE */
4051 break;
4052 }
4053 }
4054 break;
4055
4056 case OP_NOT_VSPACE:
4057 for (i = 1; i <= min; i++)
4058 {
4059 if (eptr >= md->end_subject)
4060 {
4061 SCHECK_PARTIAL();
4062 MRRETURN(MATCH_NOMATCH);
4063 }
4064 GETCHARINC(c, eptr);
4065 switch(c)
4066 {
4067 default: break;
4068 case 0x0a: /* LF */
4069 case 0x0b: /* VT */
4070 case 0x0c: /* FF */
4071 case 0x0d: /* CR */
4072 case 0x85: /* NEL */
4073 case 0x2028: /* LINE SEPARATOR */
4074 case 0x2029: /* PARAGRAPH SEPARATOR */
4075 MRRETURN(MATCH_NOMATCH);
4076 }
4077 }
4078 break;
4079
4080 case OP_VSPACE:
4081 for (i = 1; i <= min; i++)
4082 {
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 MRRETURN(MATCH_NOMATCH);
4087 }
4088 GETCHARINC(c, eptr);
4089 switch(c)
4090 {
4091 default: MRRETURN(MATCH_NOMATCH);
4092 case 0x0a: /* LF */
4093 case 0x0b: /* VT */
4094 case 0x0c: /* FF */
4095 case 0x0d: /* CR */
4096 case 0x85: /* NEL */
4097 case 0x2028: /* LINE SEPARATOR */
4098 case 0x2029: /* PARAGRAPH SEPARATOR */
4099 break;
4100 }
4101 }
4102 break;
4103
4104 case OP_NOT_DIGIT:
4105 for (i = 1; i <= min; i++)
4106 {
4107 if (eptr >= md->end_subject)
4108 {
4109 SCHECK_PARTIAL();
4110 MRRETURN(MATCH_NOMATCH);
4111 }
4112 GETCHARINC(c, eptr);
4113 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4114 MRRETURN(MATCH_NOMATCH);
4115 }
4116 break;
4117
4118 case OP_DIGIT:
4119 for (i = 1; i <= min; i++)
4120 {
4121 if (eptr >= md->end_subject)
4122 {
4123 SCHECK_PARTIAL();
4124 MRRETURN(MATCH_NOMATCH);
4125 }
4126 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4127 MRRETURN(MATCH_NOMATCH);
4128 /* No need to skip more bytes - we know it's a 1-byte character */
4129 }
4130 break;
4131
4132 case OP_NOT_WHITESPACE:
4133 for (i = 1; i <= min; i++)
4134 {
4135 if (eptr >= md->end_subject)
4136 {
4137 SCHECK_PARTIAL();
4138 MRRETURN(MATCH_NOMATCH);
4139 }
4140 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4141 MRRETURN(MATCH_NOMATCH);
4142 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4143 }
4144 break;
4145
4146 case OP_WHITESPACE:
4147 for (i = 1; i <= min; i++)
4148 {
4149 if (eptr >= md->end_subject)
4150 {
4151 SCHECK_PARTIAL();
4152 MRRETURN(MATCH_NOMATCH);
4153 }
4154 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4155 MRRETURN(MATCH_NOMATCH);
4156 /* No need to skip more bytes - we know it's a 1-byte character */
4157 }
4158 break;
4159
4160 case OP_NOT_WORDCHAR:
4161 for (i = 1; i <= min; i++)
4162 {
4163 if (eptr >= md->end_subject)
4164 {
4165 SCHECK_PARTIAL();
4166 MRRETURN(MATCH_NOMATCH);
4167 }
4168 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4169 MRRETURN(MATCH_NOMATCH);
4170 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4171 }
4172 break;
4173
4174 case OP_WORDCHAR:
4175 for (i = 1; i <= min; i++)
4176 {
4177 if (eptr >= md->end_subject)
4178 {
4179 SCHECK_PARTIAL();
4180 MRRETURN(MATCH_NOMATCH);
4181 }
4182 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4183 MRRETURN(MATCH_NOMATCH);
4184 /* No need to skip more bytes - we know it's a 1-byte character */
4185 }
4186 break;
4187
4188 default:
4189 RRETURN(PCRE_ERROR_INTERNAL);
4190 } /* End switch(ctype) */
4191
4192 else
4193 #endif /* SUPPORT_UTF8 */
4194
4195 /* Code for the non-UTF-8 case for minimum matching of operators other
4196 than OP_PROP and OP_NOTPROP. */
4197
4198 switch(ctype)
4199 {
4200 case OP_ANY:
4201 for (i = 1; i <= min; i++)
4202 {
4203 if (eptr >= md->end_subject)
4204 {
4205 SCHECK_PARTIAL();
4206 MRRETURN(MATCH_NOMATCH);
4207 }
4208 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4209 eptr++;
4210 }
4211 break;
4212
4213 case OP_ALLANY:
4214 if (eptr > md->end_subject - min)
4215 {
4216 SCHECK_PARTIAL();
4217 MRRETURN(MATCH_NOMATCH);
4218 }
4219 eptr += min;
4220 break;
4221
4222 case OP_ANYBYTE:
4223 if (eptr > md->end_subject - min)
4224 {
4225 SCHECK_PARTIAL();
4226 MRRETURN(MATCH_NOMATCH);
4227 }
4228 eptr += min;
4229 break;
4230
4231 case OP_ANYNL:
4232 for (i = 1; i <= min; i++)
4233 {
4234 if (eptr >= md->end_subject)
4235 {
4236 SCHECK_PARTIAL();
4237 MRRETURN(MATCH_NOMATCH);
4238 }
4239 switch(*eptr++)
4240 {
4241 default: MRRETURN(MATCH_NOMATCH);
4242
4243 case 0x000d:
4244 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4245 break;
4246
4247 case 0x000a:
4248 break;
4249
4250 case 0x000b:
4251 case 0x000c:
4252 case 0x0085:
4253 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4254 break;
4255 }
4256 }
4257 break;
4258
4259 case OP_NOT_HSPACE:
4260 for (i = 1; i <= min; i++)
4261 {
4262 if (eptr >= md->end_subject)
4263 {
4264 SCHECK_PARTIAL();
4265 MRRETURN(MATCH_NOMATCH);
4266 }
4267 switch(*eptr++)
4268 {
4269 default: break;
4270 case 0x09: /* HT */
4271 case 0x20: /* SPACE */
4272 case 0xa0: /* NBSP */
4273 MRRETURN(MATCH_NOMATCH);
4274 }
4275 }
4276 break;
4277
4278 case OP_HSPACE:
4279 for (i = 1; i <= min; i++)
4280 {
4281 if (eptr >= md->end_subject)
4282 {
4283 SCHECK_PARTIAL();
4284 MRRETURN(MATCH_NOMATCH);
4285 }
4286 switch(*eptr++)
4287 {
4288 default: MRRETURN(MATCH_NOMATCH);
4289 case 0x09: /* HT */
4290 case 0x20: /* SPACE */
4291 case 0xa0: /* NBSP */
4292 break;
4293 }
4294 }
4295 break;
4296
4297 case OP_NOT_VSPACE:
4298 for (i = 1; i <= min; i++)
4299 {
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 MRRETURN(MATCH_NOMATCH);
4304 }
4305 switch(*eptr++)
4306 {
4307 default: break;
4308 case 0x0a: /* LF */
4309 case 0x0b: /* VT */
4310 case 0x0c: /* FF */
4311 case 0x0d: /* CR */
4312 case 0x85: /* NEL */
4313 MRRETURN(MATCH_NOMATCH);
4314 }
4315 }
4316 break;
4317
4318 case OP_VSPACE:
4319 for (i = 1; i <= min; i++)
4320 {
4321 if (eptr >= md->end_subject)
4322 {
4323 SCHECK_PARTIAL();
4324 MRRETURN(MATCH_NOMATCH);
4325 }
4326 switch(*eptr++)
4327 {
4328 default: MRRETURN(MATCH_NOMATCH);
4329 case 0x0a: /* LF */
4330 case 0x0b: /* VT */
4331 case 0x0c: /* FF */
4332 case 0x0d: /* CR */
4333 case 0x85: /* NEL */
4334 break;
4335 }
4336 }
4337 break;
4338
4339 case OP_NOT_DIGIT:
4340 for (i = 1; i <= min; i++)
4341 {
4342 if (eptr >= md->end_subject)
4343 {
4344 SCHECK_PARTIAL();
4345 MRRETURN(MATCH_NOMATCH);
4346 }
4347 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4348 }
4349 break;
4350
4351 case OP_DIGIT:
4352 for (i = 1; i <= min; i++)
4353 {
4354 if (eptr >= md->end_subject)
4355 {
4356 SCHECK_PARTIAL();
4357 MRRETURN(MATCH_NOMATCH);
4358 }
4359 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4360 }
4361 break;
4362
4363 case OP_NOT_WHITESPACE:
4364 for (i = 1; i <= min; i++)
4365 {
4366 if (eptr >= md->end_subject)
4367 {
4368 SCHECK_PARTIAL();
4369 MRRETURN(MATCH_NOMATCH);
4370 }
4371 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4372 }
4373 break;
4374
4375 case OP_WHITESPACE:
4376 for (i = 1; i <= min; i++)
4377 {
4378 if (eptr >= md->end_subject)
4379 {
4380 SCHECK_PARTIAL();
4381 MRRETURN(MATCH_NOMATCH);
4382 }
4383 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4384 }
4385 break;
4386
4387 case OP_NOT_WORDCHAR:
4388 for (i = 1; i <= min; i++)
4389 {
4390 if (eptr >= md->end_subject)
4391 {
4392 SCHECK_PARTIAL();
4393 MRRETURN(MATCH_NOMATCH);
4394 }
4395 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4396 MRRETURN(MATCH_NOMATCH);
4397 }
4398 break;
4399
4400 case OP_WORDCHAR:
4401 for (i = 1; i <= min; i++)
4402 {
4403 if (eptr >= md->end_subject)
4404 {
4405 SCHECK_PARTIAL();
4406 MRRETURN(MATCH_NOMATCH);
4407 }
4408 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4409 MRRETURN(MATCH_NOMATCH);
4410 }
4411 break;
4412
4413 default:
4414 RRETURN(PCRE_ERROR_INTERNAL);
4415 }
4416 }
4417
4418 /* If min = max, continue at the same level without recursing */
4419
4420 if (min == max) continue;
4421
4422 /* If minimizing, we have to test the rest of the pattern before each
4423 subsequent match. Again, separate the UTF-8 case for speed, and also
4424 separate the UCP cases. */
4425
4426 if (minimize)
4427 {
4428 #ifdef SUPPORT_UCP
4429 if (prop_type >= 0)
4430 {
4431 switch(prop_type)
4432 {
4433 case PT_ANY:
4434 for (fi = min;; fi++)
4435 {
4436 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4437 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4438 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4439 if (eptr >= md->end_subject)
4440 {
4441 SCHECK_PARTIAL();
4442 MRRETURN(MATCH_NOMATCH);
4443 }
4444 GETCHARINCTEST(c, eptr);
4445 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4446 }
4447 /* Control never gets here */
4448
4449 case PT_LAMP:
4450 for (fi = min;; fi++)
4451 {
4452 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4453 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4454 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4455 if (eptr >= md->end_subject)
4456 {
4457 SCHECK_PARTIAL();
4458 MRRETURN(MATCH_NOMATCH);
4459 }
4460 GETCHARINCTEST(c, eptr);
4461 prop_chartype = UCD_CHARTYPE(c);
4462 if ((prop_chartype == ucp_Lu ||
4463 prop_chartype == ucp_Ll ||
4464 prop_chartype == ucp_Lt) == prop_fail_result)
4465 MRRETURN(MATCH_NOMATCH);
4466 }
4467 /* Control never gets here */
4468
4469 case PT_GC:
4470 for (fi = min;; fi++)
4471 {
4472 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4473 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4474 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4475 if (eptr >= md->end_subject)
4476 {
4477 SCHECK_PARTIAL();
4478 MRRETURN(MATCH_NOMATCH);
4479 }
4480 GETCHARINCTEST(c, eptr);
4481 prop_category = UCD_CATEGORY(c);
4482 if ((prop_category == prop_value) == prop_fail_result)
4483 MRRETURN(MATCH_NOMATCH);
4484 }
4485 /* Control never gets here */
4486
4487 case PT_PC:
4488 for (fi = min;; fi++)
4489 {
4490 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4491 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4492 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4493 if (eptr >= md->end_subject)
4494 {
4495 SCHECK_PARTIAL();
4496 MRRETURN(MATCH_NOMATCH);
4497 }
4498 GETCHARINCTEST(c, eptr);
4499 prop_chartype = UCD_CHARTYPE(c);
4500 if ((prop_chartype == prop_value) == prop_fail_result)
4501 MRRETURN(MATCH_NOMATCH);
4502 }
4503 /* Control never gets here */
4504
4505 case PT_SC:
4506 for (fi = min;; fi++)
4507 {
4508 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4509 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4511 if (eptr >= md->end_subject)
4512 {
4513 SCHECK_PARTIAL();
4514 MRRETURN(MATCH_NOMATCH);
4515 }
4516 GETCHARINCTEST(c, eptr);
4517 prop_script = UCD_SCRIPT(c);
4518 if ((prop_script == prop_value) == prop_fail_result)
4519 MRRETURN(MATCH_NOMATCH);
4520 }
4521 /* Control never gets here */
4522
4523 case PT_ALNUM:
4524 for (fi = min;; fi++)
4525 {
4526 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4527 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4528 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4529 if (eptr >= md->end_subject)
4530 {
4531 SCHECK_PARTIAL();
4532 MRRETURN(MATCH_NOMATCH);
4533 }
4534 GETCHARINCTEST(c, eptr);
4535 prop_category = UCD_CATEGORY(c);
4536 if ((prop_category == ucp_L || prop_category == ucp_N)
4537 == prop_fail_result)
4538 MRRETURN(MATCH_NOMATCH);
4539 }
4540 /* Control never gets here */
4541
4542 case PT_SPACE: /* Perl space */
4543 for (fi = min;; fi++)
4544 {
4545 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4546 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4547 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4548 if (eptr >= md->end_subject)
4549 {
4550 SCHECK_PARTIAL();
4551 MRRETURN(MATCH_NOMATCH);
4552 }
4553 GETCHARINCTEST(c, eptr);
4554 prop_category = UCD_CATEGORY(c);
4555 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4556 c == CHAR_FF || c == CHAR_CR)
4557 == prop_fail_result)
4558 MRRETURN(MATCH_NOMATCH);
4559 }
4560 /* Control never gets here */
4561
4562 case PT_PXSPACE: /* POSIX space */
4563 for (fi = min;; fi++)
4564 {
4565 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4566 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4567 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4568 if (eptr >= md->end_subject)
4569 {
4570 SCHECK_PARTIAL();
4571 MRRETURN(MATCH_NOMATCH);
4572 }
4573 GETCHARINCTEST(c, eptr);
4574 prop_category = UCD_CATEGORY(c);
4575 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4576 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4577 == prop_fail_result)
4578 MRRETURN(MATCH_NOMATCH);
4579 }
4580 /* Control never gets here */
4581
4582 case PT_WORD:
4583 for (fi = min;; fi++)
4584 {
4585 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4586 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4587 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4588 if (eptr >= md->end_subject)
4589 {
4590 SCHECK_PARTIAL();
4591 MRRETURN(MATCH_NOMATCH);
4592 }
4593 GETCHARINCTEST(c, eptr);
4594 prop_category = UCD_CATEGORY(c);
4595 if ((prop_category == ucp_L ||
4596 prop_category == ucp_N ||
4597 c == CHAR_UNDERSCORE)
4598 == prop_fail_result)
4599 MRRETURN(MATCH_NOMATCH);
4600 }
4601 /* Control never gets here */
4602
4603 /* This should never occur */
4604
4605 default:
4606 RRETURN(PCRE_ERROR_INTERNAL);
4607 }
4608 }
4609
4610 /* Match extended Unicode sequences. We will get here only if the
4611 support is in the binary; otherwise a compile-time error occurs. */
4612
4613 else if (ctype == OP_EXTUNI)
4614 {
4615 for (fi = min;; fi++)
4616 {
4617 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4619 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4620 if (eptr >= md->end_subject)
4621 {
4622 SCHECK_PARTIAL();
4623 MRRETURN(MATCH_NOMATCH);
4624 }
4625 GETCHARINCTEST(c, eptr);
4626 prop_category = UCD_CATEGORY(c);
4627 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4628 while (eptr < md->end_subject)
4629 {
4630 int len = 1;
4631 if (!utf8) c = *eptr;
4632 else { GETCHARLEN(c, eptr, len); }
4633 prop_category = UCD_CATEGORY(c);
4634 if (prop_category != ucp_M) break;
4635 eptr += len;
4636 }
4637 }
4638 }
4639
4640 else
4641 #endif /* SUPPORT_UCP */
4642
4643 #ifdef SUPPORT_UTF8
4644 /* UTF-8 mode */
4645 if (utf8)
4646 {
4647 for (fi = min;; fi++)
4648 {
4649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4651 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4652 if (eptr >= md->end_subject)
4653 {
4654 SCHECK_PARTIAL();
4655 MRRETURN(MATCH_NOMATCH);
4656 }
4657 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4658 MRRETURN(MATCH_NOMATCH);
4659 GETCHARINC(c, eptr);
4660 switch(ctype)
4661 {
4662 case OP_ANY: /* This is the non-NL case */
4663 case OP_ALLANY:
4664 case OP_ANYBYTE:
4665 break;
4666
4667 case OP_ANYNL:
4668 switch(c)
4669 {
4670 default: MRRETURN(MATCH_NOMATCH);
4671 case 0x000d:
4672 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4673 break;
4674 case 0x000a:
4675 break;
4676
4677 case 0x000b:
4678 case 0x000c:
4679 case 0x0085:
4680 case 0x2028:
4681 case 0x2029:
4682 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4683 break;
4684 }
4685 break;
4686
4687 case OP_NOT_HSPACE:
4688 switch(c)
4689 {
4690 default: break;
4691 case 0x09: /* HT */
4692 case 0x20: /* SPACE */
4693 case 0xa0: /* NBSP */
4694 case 0x1680: /* OGHAM SPACE MARK */
4695 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4696 case 0x2000: /* EN QUAD */
4697 case 0x2001: /* EM QUAD */
4698 case 0x2002: /* EN SPACE */
4699 case 0x2003: /* EM SPACE */
4700 case 0x2004: /* THREE-PER-EM SPACE */
4701 case 0x2005: /* FOUR-PER-EM SPACE */
4702 case 0x2006: /* SIX-PER-EM SPACE */
4703 case 0x2007: /* FIGURE SPACE */
4704 case 0x2008: /* PUNCTUATION SPACE */
4705 case 0x2009: /* THIN SPACE */
4706 case 0x200A: /* HAIR SPACE */
4707 case 0x202f: /* NARROW NO-BREAK SPACE */
4708 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4709 case 0x3000: /* IDEOGRAPHIC SPACE */
4710 MRRETURN(MATCH_NOMATCH);
4711 }
4712 break;
4713
4714 case OP_HSPACE:
4715 switch(c)
4716 {
4717 default: MRRETURN(MATCH_NOMATCH);
4718 case 0x09: /* HT */
4719 case 0x20: /* SPACE */
4720 case 0xa0: /* NBSP */
4721 case 0x1680: /* OGHAM SPACE MARK */
4722 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4723 case 0x2000: /* EN QUAD */
4724 case 0x2001: /* EM QUAD */
4725 case 0x2002: /* EN SPACE */
4726 case 0x2003: /* EM SPACE */
4727 case 0x2004: /* THREE-PER-EM SPACE */
4728 case 0x2005: /* FOUR-PER-EM SPACE */
4729 case 0x2006: /* SIX-PER-EM SPACE */
4730 case 0x2007: /* FIGURE SPACE */
4731 case 0x2008: /* PUNCTUATION SPACE */
4732 case 0x2009: /* THIN SPACE */
4733 case 0x200A: /* HAIR SPACE */
4734 case 0x202f: /* NARROW NO-BREAK SPACE */
4735 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4736 case 0x3000: /* IDEOGRAPHIC SPACE */
4737 break;
4738 }
4739 break;
4740
4741 case OP_NOT_VSPACE:
4742 switch(c)
4743 {
4744 default: break;
4745 case 0x0a: /* LF */
4746 case 0x0b: /* VT */
4747 case 0x0c: /* FF */
4748 case 0x0d: /* CR */
4749 case 0x85: /* NEL */
4750 case 0x2028: /* LINE SEPARATOR */
4751 case 0x2029: /* PARAGRAPH SEPARATOR */
4752 MRRETURN(MATCH_NOMATCH);
4753 }
4754 break;
4755
4756 case OP_VSPACE:
4757 switch(c)
4758 {
4759 default: MRRETURN(MATCH_NOMATCH);
4760 case 0x0a: /* LF */
4761 case 0x0b: /* VT */
4762 case 0x0c: /* FF */
4763 case 0x0d: /* CR */
4764 case 0x85: /* NEL */
4765 case 0x2028: /* LINE SEPARATOR */
4766 case 0x2029: /* PARAGRAPH SEPARATOR */
4767 break;
4768 }
4769 break;
4770
4771 case OP_NOT_DIGIT:
4772 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4773 MRRETURN(MATCH_NOMATCH);
4774 break;
4775
4776 case OP_DIGIT:
4777 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4778 MRRETURN(MATCH_NOMATCH);
4779 break;
4780
4781 case OP_NOT_WHITESPACE:
4782 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4783 MRRETURN(MATCH_NOMATCH);
4784 break;
4785
4786 case OP_WHITESPACE:
4787 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4788 MRRETURN(MATCH_NOMATCH);
4789 break;
4790
4791 case OP_NOT_WORDCHAR:
4792 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4793 MRRETURN(MATCH_NOMATCH);
4794 break;
4795
4796 case OP_WORDCHAR:
4797 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4798 MRRETURN(MATCH_NOMATCH);
4799 break;
4800
4801 default:
4802 RRETURN(PCRE_ERROR_INTERNAL);
4803 }
4804 }
4805 }
4806 else
4807 #endif
4808 /* Not UTF-8 mode */
4809 {
4810 for (fi = min;; fi++)
4811 {
4812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4814 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4815 if (eptr >= md->end_subject)
4816 {
4817 SCHECK_PARTIAL();
4818 MRRETURN(MATCH_NOMATCH);
4819 }
4820 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4821 MRRETURN(MATCH_NOMATCH);
4822 c = *eptr++;
4823 switch(ctype)
4824 {
4825 case OP_ANY: /* This is the non-NL case */
4826 case OP_ALLANY:
4827 case OP_ANYBYTE:
4828 break;
4829
4830 case OP_ANYNL:
4831 switch(c)
4832 {
4833 default: MRRETURN(MATCH_NOMATCH);
4834 case 0x000d:
4835 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4836 break;
4837
4838 case 0x000a:
4839 break;
4840
4841 case 0x000b:
4842 case 0x000c:
4843 case 0x0085:
4844 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4845 break;
4846 }
4847 break;
4848
4849 case OP_NOT_HSPACE:
4850 switch(c)
4851 {
4852 default: break;
4853 case 0x09: /* HT */
4854 case 0x20: /* SPACE */
4855 case 0xa0: /* NBSP */
4856 MRRETURN(MATCH_NOMATCH);
4857 }
4858 break;
4859
4860 case OP_HSPACE:
4861 switch(c)
4862 {
4863 default: MRRETURN(MATCH_NOMATCH);
4864 case 0x09: /* HT */
4865 case 0x20: /* SPACE */
4866 case 0xa0: /* NBSP */
4867 break;
4868 }
4869 break;
4870
4871 case OP_NOT_VSPACE:
4872 switch(c)
4873 {
4874 default: break;
4875 case 0x0a: /* LF */
4876 case 0x0b: /* VT */
4877 case 0x0c: /* FF */
4878 case 0x0d: /* CR */
4879 case 0x85: /* NEL */
4880 MRRETURN(MATCH_NOMATCH);
4881 }
4882 break;
4883
4884 case OP_VSPACE:
4885 switch(c)
4886 {
4887 default: MRRETURN(MATCH_NOMATCH);
4888 case 0x0a: /* LF */
4889 case 0x0b: /* VT */
4890 case 0x0c: /* FF */
4891 case 0x0d: /* CR */
4892 case 0x85: /* NEL */
4893 break;
4894 }
4895 break;
4896
4897 case OP_NOT_DIGIT:
4898 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4899 break;
4900
4901 case OP_DIGIT:
4902 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4903 break;
4904
4905 case OP_NOT_WHITESPACE:
4906 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4907 break;
4908
4909 case OP_WHITESPACE:
4910 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4911 break;
4912
4913 case OP_NOT_WORDCHAR:
4914 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4915 break;
4916
4917 case OP_WORDCHAR:
4918 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4919 break;
4920
4921 default:
4922 RRETURN(PCRE_ERROR_INTERNAL);
4923 }
4924 }
4925 }
4926 /* Control never gets here */
4927 }
4928
4929 /* If maximizing, it is worth using inline code for speed, doing the type
4930 test once at the start (i.e. keep it out of the loop). Again, keep the
4931 UTF-8 and UCP stuff separate. */
4932
4933 else
4934 {
4935 pp = eptr; /* Remember where we started */
4936
4937 #ifdef SUPPORT_UCP
4938 if (prop_type >= 0)
4939 {
4940 switch(prop_type)
4941 {
4942 case PT_ANY:
4943 for (i = min; i < max; i++)
4944 {
4945 int len = 1;
4946 if (eptr >= md->end_subject)
4947 {
4948 SCHECK_PARTIAL();
4949 break;
4950 }
4951 GETCHARLENTEST(c, eptr, len);
4952 if (prop_fail_result) break;
4953 eptr+= len;
4954 }
4955 break;
4956
4957 case PT_LAMP:
4958 for (i = min; i < max; i++)
4959 {
4960 int len = 1;
4961 if (eptr >= md->end_subject)
4962 {
4963 SCHECK_PARTIAL();
4964 break;
4965 }
4966 GETCHARLENTEST(c, eptr, len);
4967 prop_chartype = UCD_CHARTYPE(c);
4968 if ((prop_chartype == ucp_Lu ||
4969 prop_chartype == ucp_Ll ||
4970 prop_chartype == ucp_Lt) == prop_fail_result)
4971 break;
4972 eptr+= len;
4973 }
4974 break;
4975
4976 case PT_GC:
4977 for (i = min; i < max; i++)
4978 {
4979 int len = 1;
4980 if (eptr >= md->end_subject)
4981 {
4982 SCHECK_PARTIAL();
4983 break;
4984 }
4985 GETCHARLENTEST(c, eptr, len);
4986 prop_category = UCD_CATEGORY(c);
4987 if ((prop_category == prop_value) == prop_fail_result)
4988 break;
4989 eptr+= len;
4990 }
4991 break;
4992
4993 case PT_PC:
4994 for (i = min; i < max; i++)
4995 {
4996 int len = 1;
4997 if (eptr >= md->end_subject)
4998 {
4999 SCHECK_PARTIAL();
5000 break;
5001 }
5002 GETCHARLENTEST(c, eptr, len);
5003 prop_chartype = UCD_CHARTYPE(c);
5004 if ((prop_chartype == prop_value) == prop_fail_result)
5005 break;
5006 eptr+= len;
5007 }
5008 break;
5009
5010 case PT_SC:
5011 for (i = min; i < max; i++)
5012 {
5013 int len = 1;
5014 if (eptr >= md->end_subject)
5015 {
5016 SCHECK_PARTIAL();
5017 break;
5018 }
5019 GETCHARLENTEST(c, eptr, len);
5020 prop_script = UCD_SCRIPT(c);
5021 if ((prop_script == prop_value) == prop_fail_result)
5022 break;
5023 eptr+= len;
5024 }
5025 break;
5026
5027 case PT_ALNUM:
5028 for (i = min; i < max; i++)
5029 {
5030 int len = 1;
5031 if (eptr >= md->end_subject)
5032 {
5033 SCHECK_PARTIAL();
5034 break;
5035 }
5036 GETCHARLENTEST(c, eptr, len);
5037 prop_category = UCD_CATEGORY(c);
5038 if ((prop_category == ucp_L || prop_category == ucp_N)
5039 == prop_fail_result)
5040 break;
5041 eptr+= len;
5042 }
5043 break;
5044
5045 case PT_SPACE: /* Perl space */
5046 for (i = min; i < max; i++)
5047 {
5048 int len = 1;
5049 if (eptr >= md->end_subject)
5050 {
5051 SCHECK_PARTIAL();
5052 break;
5053 }
5054 GETCHARLENTEST(c, eptr, len);
5055 prop_category = UCD_CATEGORY(c);
5056 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5057 c == CHAR_FF || c == CHAR_CR)
5058 == prop_fail_result)
5059 break;
5060 eptr+= len;
5061 }
5062 break;
5063
5064 case PT_PXSPACE: /* POSIX space */
5065 for (i = min; i < max; i++)
5066 {
5067 int len = 1;
5068 if (eptr >= md->end_subject)
5069 {
5070 SCHECK_PARTIAL();
5071 break;
5072 }
5073 GETCHARLENTEST(c, eptr, len);
5074 prop_category = UCD_CATEGORY(c);
5075 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5076 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5077 == prop_fail_result)
5078 break;
5079 eptr+= len;
5080 }
5081 break;
5082
5083 case PT_WORD:
5084 for (i = min; i < max; i++)
5085 {
5086 int len = 1;
5087 if (eptr >= md->end_subject)
5088 {
5089 SCHECK_PARTIAL();
5090 break;
5091 }
5092 GETCHARLENTEST(c, eptr, len);
5093 prop_category = UCD_CATEGORY(c);
5094 if ((prop_category == ucp_L || prop_category == ucp_N ||
5095 c == CHAR_UNDERSCORE) == prop_fail_result)
5096 break;
5097 eptr+= len;
5098 }
5099 break;
5100
5101 default:
5102 RRETURN(PCRE_ERROR_INTERNAL);
5103 }
5104
5105 /* eptr is now past the end of the maximum run */
5106
5107 if (possessive) continue;
5108 for(;;)
5109 {
5110 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5111 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5112 if (eptr-- == pp) break; /* Stop if tried at original pos */
5113 if (utf8) BACKCHAR(eptr);
5114 }
5115 }
5116
5117 /* Match extended Unicode sequences. We will get here only if the
5118 support is in the binary; otherwise a compile-time error occurs. */
5119
5120 else if (ctype == OP_EXTUNI)
5121 {
5122 for (i = min; i < max; i++)
5123 {
5124 if (eptr >= md->end_subject)
5125 {
5126 SCHECK_PARTIAL();
5127 break;
5128 }
5129 GETCHARINCTEST(c, eptr);
5130 prop_category = UCD_CATEGORY(c);
5131 if (prop_category == ucp_M) break;
5132 while (eptr < md->end_subject)
5133 {
5134 int len = 1;
5135 if (!utf8) c = *eptr; else
5136 {
5137 GETCHARLEN(c, eptr, len);
5138 }
5139 prop_category = UCD_CATEGORY(c);
5140 if (prop_category != ucp_M) break;
5141 eptr += len;
5142 }
5143 }
5144
5145 /* eptr is now past the end of the maximum run */
5146
5147 if (possessive) continue;
5148
5149 for(;;)
5150 {
5151 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5152 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5153 if (eptr-- == pp) break; /* Stop if tried at original pos */
5154 for (;;) /* Move back over one extended */
5155 {
5156 int len = 1;
5157 if (!utf8) c = *eptr; else
5158 {
5159 BACKCHAR(eptr);
5160 GETCHARLEN(c, eptr, len);
5161 }
5162 prop_category = UCD_CATEGORY(c);
5163 if (prop_category != ucp_M) break;
5164 eptr--;
5165 }
5166 }
5167 }
5168
5169 else
5170 #endif /* SUPPORT_UCP */
5171
5172 #ifdef SUPPORT_UTF8
5173 /* UTF-8 mode */
5174
5175 if (utf8)
5176 {
5177 switch(ctype)
5178 {
5179 case OP_ANY:
5180 if (max < INT_MAX)
5181 {
5182 for (i = min; i < max; i++)
5183 {
5184 if (eptr >= md->end_subject)
5185 {
5186 SCHECK_PARTIAL();
5187 break;
5188 }
5189 if (IS_NEWLINE(eptr)) break;
5190 eptr++;
5191 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5192 }
5193 }
5194
5195 /* Handle unlimited UTF-8 repeat */
5196
5197 else
5198 {
5199 for (i = min; i < max; i++)
5200 {
5201 if (eptr >= md->end_subject)
5202 {
5203 SCHECK_PARTIAL();
5204 break;
5205 }
5206 if (IS_NEWLINE(eptr)) break;
5207 eptr++;
5208 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5209 }
5210 }
5211 break;
5212
5213 case OP_ALLANY:
5214 if (max < INT_MAX)
5215 {
5216 for (i = min; i < max; i++)
5217 {
5218 if (eptr >= md->end_subject)
5219 {
5220 SCHECK_PARTIAL();
5221 break;
5222 }
5223 eptr++;
5224 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5225 }
5226 }
5227 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5228 break;
5229
5230 /* The byte case is the same as non-UTF8 */
5231
5232 case OP_ANYBYTE:
5233 c = max - min;
5234 if (c > (unsigned int)(md->end_subject - eptr))
5235 {
5236 eptr = md->end_subject;
5237 SCHECK_PARTIAL();
5238 }
5239 else eptr += c;
5240 break;
5241
5242 case OP_ANYNL:
5243 for (i = min; i < max; i++)
5244 {
5245 int len = 1;
5246 if (eptr >= md->end_subject)
5247 {
5248 SCHECK_PARTIAL();
5249 break;
5250 }
5251 GETCHARLEN(c, eptr, len);
5252 if (c == 0x000d)
5253 {
5254 if (++eptr >= md->end_subject) break;
5255 if (*eptr == 0x000a) eptr++;
5256 }
5257 else
5258 {
5259 if (c != 0x000a &&
5260 (md->bsr_anycrlf ||
5261 (c != 0x000b && c != 0x000c &&
5262 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5263 break;
5264 eptr += len;
5265 }
5266 }
5267 break;
5268
5269 case OP_NOT_HSPACE:
5270 case OP_HSPACE:
5271 for (i = min; i < max; i++)
5272 {
5273 BOOL gotspace;
5274 int len = 1;
5275 if (eptr >= md->end_subject)
5276 {
5277 SCHECK_PARTIAL();
5278 break;
5279 }
5280 GETCHARLEN(c, eptr, len);
5281 switch(c)
5282 {
5283 default: gotspace = FALSE; break;
5284 case 0x09: /* HT */
5285 case 0x20: /* SPACE */
5286 case 0xa0: /* NBSP */
5287 case 0x1680: /* OGHAM SPACE MARK */
5288 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5289 case 0x2000: /* EN QUAD */
5290 case 0x2001: /* EM QUAD */
5291 case 0x2002: /* EN SPACE */
5292 case 0x2003: /* EM SPACE */
5293 case 0x2004: /* THREE-PER-EM SPACE */
5294 case 0x2005: /* FOUR-PER-EM SPACE */
5295 case 0x2006: /* SIX-PER-EM SPACE */
5296 case 0x2007: /* FIGURE SPACE */
5297 case 0x2008: /* PUNCTUATION SPACE */
5298 case 0x2009: /* THIN SPACE */
5299 case 0x200A: /* HAIR SPACE */
5300 case 0x202f: /* NARROW NO-BREAK SPACE */
5301 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5302 case 0x3000: /* IDEOGRAPHIC SPACE */
5303 gotspace = TRUE;
5304 break;
5305 }
5306 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5307 eptr += len;
5308 }
5309 break;
5310
5311 case OP_NOT_VSPACE:
5312 case OP_VSPACE:
5313 for (i = min; i < max; i++)
5314 {
5315 BOOL gotspace;
5316 int len = 1;
5317 if (eptr >= md->end_subject)
5318 {
5319 SCHECK_PARTIAL();
5320 break;
5321 }
5322 GETCHARLEN(c, eptr, len);
5323 switch(c)
5324 {
5325 default: gotspace = FALSE; break;
5326 case 0x0a: /* LF */
5327 case 0x0b: /* VT */
5328 case 0x0c: /* FF */
5329 case 0x0d: /* CR */
5330 case 0x85: /* NEL */
5331 case 0x2028: /* LINE SEPARATOR */
5332 case 0x2029: /* PARAGRAPH SEPARATOR */
5333 gotspace = TRUE;
5334 break;
5335 }
5336 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5337 eptr += len;
5338 }
5339 break;
5340
5341 case OP_NOT_DIGIT:
5342 for (i = min; i < max; i++)
5343 {
5344 int len = 1;
5345 if (eptr >= md->end_subject)
5346 {
5347 SCHECK_PARTIAL();
5348 break;
5349 }
5350 GETCHARLEN(c, eptr, len);
5351 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5352 eptr+= len;
5353 }
5354 break;
5355
5356 case OP_DIGIT:
5357 for (i = min; i < max; i++)
5358 {
5359 int len = 1;
5360 if (eptr >= md->end_subject)
5361 {
5362 SCHECK_PARTIAL();
5363 break;
5364 }
5365 GETCHARLEN(c, eptr, len);
5366 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5367 eptr+= len;
5368 }
5369 break;
5370
5371 case OP_NOT_WHITESPACE:
5372 for (i = min; i < max; i++)
5373 {
5374 int len = 1;
5375 if (eptr >= md->end_subject)
5376 {
5377 SCHECK_PARTIAL();
5378 break;
5379 }
5380 GETCHARLEN(c, eptr, len);
5381 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5382 eptr+= len;
5383 }
5384 break;
5385
5386 case OP_WHITESPACE:
5387 for (i = min; i < max; i++)
5388 {
5389 int len = 1;
5390 if (eptr >= md->end_subject)
5391 {
5392 SCHECK_PARTIAL();
5393 break;
5394 }
5395 GETCHARLEN(c, eptr, len);
5396 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5397 eptr+= len;
5398 }
5399 break;
5400
5401 case OP_NOT_WORDCHAR:
5402 for (i = min; i < max; i++)
5403 {
5404 int len = 1;
5405 if (eptr >= md->end_subject)
5406 {
5407 SCHECK_PARTIAL();
5408 break;
5409 }
5410 GETCHARLEN(c, eptr, len);
5411 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5412 eptr+= len;
5413 }
5414 break;
5415
5416 case OP_WORDCHAR:
5417 for (i = min; i < max; i++)
5418 {
5419 int len = 1;
5420 if (eptr >= md->end_subject)
5421 {
5422 SCHECK_PARTIAL();
5423 break;
5424 }
5425 GETCHARLEN(c, eptr, len);
5426 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5427 eptr+= len;
5428 }
5429 break;
5430
5431 default:
5432 RRETURN(PCRE_ERROR_INTERNAL);
5433 }
5434
5435 /* eptr is now past the end of the maximum run. If possessive, we are
5436 done (no backing up). Otherwise, match at this position; anything other
5437 than no match is immediately returned. For nomatch, back up one
5438 character, unless we are matching \R and the last thing matched was
5439 \r\n, in which case, back up two bytes. */
5440
5441 if (possessive) continue;
5442 for(;;)
5443 {
5444 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5446 if (eptr-- == pp) break; /* Stop if tried at original pos */
5447 BACKCHAR(eptr);
5448 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5449 eptr[-1] == '\r') eptr--;
5450 }
5451 }
5452 else
5453 #endif /* SUPPORT_UTF8 */
5454
5455 /* Not UTF-8 mode */
5456 {
5457 switch(ctype)
5458 {
5459 case OP_ANY:
5460 for (i = min; i < max; i++)
5461 {
5462 if (eptr >= md->end_subject)
5463 {
5464 SCHECK_PARTIAL();
5465 break;
5466 }
5467 if (IS_NEWLINE(eptr)) break;
5468 eptr++;
5469 }
5470 break;
5471
5472 case OP_ALLANY:
5473 case OP_ANYBYTE:
5474 c = max - min;
5475 if (c > (unsigned int)(md->end_subject - eptr))
5476 {
5477 eptr = md->end_subject;
5478 SCHECK_PARTIAL();
5479 }
5480 else eptr += c;
5481 break;
5482
5483 case OP_ANYNL:
5484 for (i = min; i < max; i++)
5485 {
5486 if (eptr >= md->end_subject)
5487 {
5488 SCHECK_PARTIAL();
5489 break;
5490 }
5491 c = *eptr;
5492 if (c == 0x000d)
5493 {
5494 if (++eptr >= md->end_subject) break;
5495 if (*eptr == 0x000a) eptr++;
5496 }
5497 else
5498 {
5499 if (c != 0x000a &&
5500 (md->bsr_anycrlf ||
5501 (c != 0x000b && c != 0x000c && c != 0x0085)))
5502 break;
5503 eptr++;
5504 }
5505 }
5506 break;
5507
5508 case OP_NOT_HSPACE:
5509 for (i = min; i < max; i++)
5510 {
5511 if (eptr >= md->end_subject)
5512 {
5513 SCHECK_PARTIAL();
5514 break;
5515 }
5516 c = *eptr;
5517 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5518 eptr++;
5519 }
5520 break;
5521
5522 case OP_HSPACE:
5523 for (i = min; i < max; i++)
5524 {
5525 if (eptr >= md->end_subject)
5526 {
5527 SCHECK_PARTIAL();
5528 break;
5529 }
5530 c = *eptr;
5531 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5532 eptr++;
5533 }
5534 break;
5535
5536 case OP_NOT_VSPACE:
5537 for (i = min; i < max; i++)
5538 {
5539 if (eptr >= md->end_subject)
5540 {
5541 SCHECK_PARTIAL();
5542 break;
5543 }
5544 c = *eptr;
5545 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5546 break;
5547 eptr++;
5548 }
5549 break;
5550
5551 case OP_VSPACE:
5552 for (i = min; i < max; i++)
5553 {
5554 if (eptr >= md->end_subject)
5555 {
5556 SCHECK_PARTIAL();
5557 break;
5558 }
5559 c = *eptr;
5560 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5561 break;
5562 eptr++;
5563 }
5564 break;
5565
5566 case OP_NOT_DIGIT:
5567 for (i = min; i < max; i++)
5568 {
5569 if (eptr >= md->end_subject)
5570 {
5571 SCHECK_PARTIAL();
5572 break;
5573 }
5574 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5575 eptr++;
5576 }
5577 break;
5578
5579 case OP_DIGIT:
5580 for (i = min; i < max; i++)
5581 {
5582 if (eptr >= md->end_subject)
5583 {
5584 SCHECK_PARTIAL();
5585 break;
5586 }
5587 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5588 eptr++;
5589 }
5590 break;
5591
5592 case OP_NOT_WHITESPACE:
5593 for (i = min; i < max; i++)
5594 {
5595 if (eptr >= md->end_subject)
5596 {
5597 SCHECK_PARTIAL();
5598 break;
5599 }
5600 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5601 eptr++;
5602 }
5603 break;
5604
5605 case OP_WHITESPACE:
5606 for (i = min; i < max; i++)
5607 {
5608 if (eptr >= md->end_subject)
5609 {
5610 SCHECK_PARTIAL();
5611 break;
5612 }
5613 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5614 eptr++;
5615 }
5616 break;
5617
5618 case OP_NOT_WORDCHAR:
5619 for (i = min; i < max; i++)
5620 {
5621 if (eptr >= md->end_subject)
5622 {
5623 SCHECK_PARTIAL();
5624 break;
5625 }
5626 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5627 eptr++;
5628 }
5629 break;
5630
5631 case OP_WORDCHAR:
5632 for (i = min; i < max; i++)
5633 {
5634 if (eptr >= md->end_subject)
5635 {
5636 SCHECK_PARTIAL();
5637 break;
5638 }
5639 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5640 eptr++;
5641 }
5642 break;
5643
5644 default:
5645 RRETURN(PCRE_ERROR_INTERNAL);
5646 }
5647
5648 /* eptr is now past the end of the maximum run. If possessive, we are
5649 done (no backing up). Otherwise, match at this position; anything other
5650 than no match is immediately returned. For nomatch, back up one
5651 character (byte), unless we are matching \R and the last thing matched
5652 was \r\n, in which case, back up two bytes. */
5653
5654 if (possessive) continue;
5655 while (eptr >= pp)
5656 {
5657 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5659 eptr--;
5660 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5661 eptr[-1] == '\r') eptr--;
5662 }
5663 }
5664
5665 /* Get here if we can't make it match with any permitted repetitions */
5666
5667 MRRETURN(MATCH_NOMATCH);
5668 }
5669 /* Control never gets here */
5670
5671 /* There's been some horrible disaster. Arrival here can only mean there is
5672 something seriously wrong in the code above or the OP_xxx definitions. */
5673
5674 default:
5675 DPRINTF(("Unknown opcode %d\n", *ecode));
5676 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5677 }
5678
5679 /* Do not stick any code in here without much thought; it is assumed
5680 that "continue" in the code above comes out to here to repeat the main
5681 loop. */
5682
5683 } /* End of main loop */
5684 /* Control never reaches here */
5685
5686
5687 /* When compiling to use the heap rather than the stack for recursive calls to
5688 match(), the RRETURN() macro jumps here. The number that is saved in
5689 frame->Xwhere indicates which label we actually want to return to. */
5690
5691 #ifdef NO_RECURSE
5692 #define LBL(val) case val: goto L_RM##val;
5693 HEAP_RETURN:
5694 switch (frame->Xwhere)
5695 {
5696 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5697 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5698 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5699 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5700 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5701 LBL(65) LBL(66)
5702 #ifdef SUPPORT_UTF8
5703 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5704 LBL(32) LBL(34) LBL(42) LBL(46)
5705 #ifdef SUPPORT_UCP
5706 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5707 LBL(59) LBL(60) LBL(61) LBL(62)
5708 #endif /* SUPPORT_UCP */
5709 #endif /* SUPPORT_UTF8 */
5710 default:
5711 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5712 return PCRE_ERROR_INTERNAL;
5713 }
5714 #undef LBL
5715 #endif /* NO_RECURSE */
5716 }
5717
5718
5719 /***************************************************************************
5720 ****************************************************************************
5721 RECURSION IN THE match() FUNCTION
5722
5723 Undefine all the macros that were defined above to handle this. */
5724
5725 #ifdef NO_RECURSE /* Build in which match() keeps its recursion state on the heap rather than the stack */
5726 #undef eptr /* eptr, ecode, offset_top and eptrb are the names passed at every RMATCH() call in match() */
5727 #undef ecode
5728 #undef mstart
5729 #undef offset_top
5730 #undef eptrb
5731 #undef flags
5732 /* Why this cleanup matters: code after match() uses some of these names as ordinary identifiers (pcre_exec() has a parameter "length" and a local "flags"), so the macros must not stay in force. */
5733 #undef callpat
5734 #undef charptr
5735 #undef data
5736 #undef next
5737 #undef pp /* In match(), pp records where a maximizing repeat started */
5738 #undef prev
5739 #undef saved_eptr
5740
5741 #undef new_recursive
5742
5743 #undef cur_is_word
5744 #undef condition
5745 #undef prev_is_word
5746
5747 #undef ctype /* ctype, max and min select and bound the character-type repeat loops in match() */
5748 #undef length
5749 #undef max
5750 #undef min
5751 #undef number
5752 #undef offset
5753 #undef op
5754 #undef save_capture_last
5755 #undef save_offset1
5756 #undef save_offset2
5757 #undef save_offset3
5758 #undef stacksave
5759
5760 #undef newptrb
5761 /* NOTE(review): the matching #define list is earlier in the file and not visible here - confirm every macro defined for NO_RECURSE has an #undef above. */
5762 #endif /* NO_RECURSE */
5763
5764 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally, outside the #ifdef. */
5765
5766 #undef fc
5767 #undef fi /* In match(), fi is the counter of the minimizing repeat loops */
5768
5769 /***************************************************************************
5770 ***************************************************************************/
5771
5772
5773
5774 /*************************************************
5775 * Execute a Regular Expression *
5776 *************************************************/
5777
5778 /* This function applies a compiled re to a subject string and picks out
5779 portions of the string if it matches. Two elements in the vector are set for
5780 each substring: the offsets to the start and end of the substring.
5781
5782 Arguments:
5783 argument_re points to the compiled expression
5784 extra_data points to extra data or is NULL
5785 subject points to the subject string
5786 length length of subject string (may contain binary zeros)
5787 start_offset where to start in the subject string
5788 options option bits
5789 offsets points to a vector of ints to be filled in with offsets
5790 offsetcount the number of elements in the vector
5791
5792 Returns: > 0 => success; value is the number of elements filled in
5793 = 0 => success, but offsets is not big enough
5794 -1 => failed to match
5795 < -1 => some kind of unexpected problem
5796 */
5797
5798 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5799 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5800 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5801 int offsetcount)
5802 {
5803 int rc, ocount;
5804 int first_byte = -1;
5805 int req_byte = -1;
5806 int req_byte2 = -1;
5807 int newline;
5808 BOOL using_temporary_offsets = FALSE;
5809 BOOL anchored;
5810 BOOL startline;
5811 BOOL firstline;
5812 BOOL first_byte_caseless = FALSE;
5813 BOOL req_byte_caseless = FALSE;
5814 BOOL utf8;
5815 match_data match_block;
5816 match_data *md = &match_block;
5817 const uschar *tables;
5818 const uschar *start_bits = NULL;
5819 USPTR start_match = (USPTR)subject + start_offset;
5820 USPTR end_subject;
5821 USPTR start_partial = NULL;
5822 USPTR req_byte_ptr = start_match - 1;
5823
5824 pcre_study_data internal_study;
5825 const pcre_study_data *study;
5826
5827 real_pcre internal_re;
5828 const real_pcre *external_re = (const real_pcre *)argument_re;
5829 const real_pcre *re = external_re;
5830
5831 /* Plausibility checks */
5832
5833 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5834 if (re == NULL || subject == NULL ||
5835 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5836 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5837 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5838
5839 /* This information is for finding all the numbers associated with a given
5840 name, for condition testing. */
5841
5842 md->name_table = (uschar *)re + re->name_table_offset;
5843 md->name_count = re->name_count;
5844 md->name_entry_size = re->name_entry_size;
5845
5846 /* Fish out the optional data from the extra_data structure, first setting
5847 the default values. */
5848
5849 study = NULL;
5850 md->match_limit = MATCH_LIMIT;
5851 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5852 md->callout_data = NULL;
5853
5854 /* The table pointer is always in native byte order. */
5855
5856 tables = external_re->tables;
5857
5858 if (extra_data != NULL)
5859 {
5860 register unsigned int flags = extra_data->flags;
5861 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5862 study = (const pcre_study_data *)extra_data->study_data;
5863 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5864 md->match_limit = extra_data->match_limit;
5865 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5866 md->match_limit_recursion = extra_data->match_limit_recursion;
5867 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5868 md->callout_data = extra_data->callout_data;
5869 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5870 }
5871
5872 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5873 is a feature that makes it possible to save compiled regex and re-use them
5874 in other programs later. */
5875
5876 if (tables == NULL) tables = _pcre_default_tables;
5877
5878 /* Check that the first field in the block is the magic number. If it is not,
5879 test for a regex that was compiled on a host of opposite endianness. If this is
5880 the case, flipped values are put in internal_re and internal_study if there was
5881 study data too. */
5882
5883 if (re->magic_number != MAGIC_NUMBER)
5884 {
5885 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5886 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5887 if (study != NULL) study = &internal_study;
5888 }
5889
5890 /* Set up other data */
5891
5892 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5893 startline = (re->flags & PCRE_STARTLINE) != 0;
5894 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5895
5896 /* The code starts after the real_pcre block and the capture name table. */
5897
5898 md->start_code = (const uschar *)external_re + re->name_table_offset +
5899 re->name_count * re->name_entry_size;
5900
5901 md->start_subject = (USPTR)subject;
5902 md->start_offset = start_offset;
5903 md->end_subject = md->start_subject + length;
5904 end_subject = md->end_subject;
5905
5906 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5907 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5908 md->use_ucp = (re->options & PCRE_UCP) != 0;
5909 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5910
5911 /* Some options are unpacked into BOOL variables in the hope that testing
5912 them will be faster than individual option bits. */
5913
5914 md->notbol = (options & PCRE_NOTBOL) != 0;
5915 md->noteol = (options & PCRE_NOTEOL) != 0;
5916 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5917 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5918 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5919 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5920
5921
5922 md->hitend = FALSE;
5923 md->mark = NULL; /* In case never set */
5924
5925 md->recursive = NULL; /* No recursion at top level */
5926
5927 md->lcc = tables + lcc_offset;
5928 md->ctypes = tables + ctypes_offset;
5929
5930 /* Handle different \R options. */
5931
5932 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5933 {
5934 case 0:
5935 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5936 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5937 else
5938 #ifdef BSR_ANYCRLF
5939 md->bsr_anycrlf = TRUE;
5940 #else
5941 md->bsr_anycrlf = FALSE;
5942 #endif
5943 break;
5944
5945 case PCRE_BSR_ANYCRLF:
5946 md->bsr_anycrlf = TRUE;
5947 break;
5948
5949 case PCRE_BSR_UNICODE:
5950 md->bsr_anycrlf = FALSE;
5951 break;
5952
5953 default: return PCRE_ERROR_BADNEWLINE;
5954 }
5955
5956 /* Handle different types of newline. The three bits give eight cases. If
5957 nothing is set at run time, whatever was used at compile time applies. */
5958
5959 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5960 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5961 {
5962 case 0: newline = NEWLINE; break; /* Compile-time default */
5963 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5964 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5965 case PCRE_NEWLINE_CR+
5966 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5967 case PCRE_NEWLINE_ANY: newline = -1; break;
5968 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5969 default: return PCRE_ERROR_BADNEWLINE;
5970 }
5971
5972 if (newline == -2)
5973 {
5974 md->nltype = NLTYPE_ANYCRLF;
5975 }
5976 else if (newline < 0)
5977 {
5978 md->nltype = NLTYPE_ANY;
5979 }
5980 else
5981 {
5982 md->nltype = NLTYPE_FIXED;
5983 if (newline > 255)
5984 {
5985 md->nllen = 2;
5986 md->nl[0] = (newline >> 8) & 255;
5987 md->nl[1] = newline & 255;
5988 }
5989 else
5990 {
5991 md->nllen = 1;
5992 md->nl[0] = newline;
5993 }
5994 }
5995
5996 /* Partial matching was originally supported only for a restricted set of
5997 regexes; from release 8.00 there are no restrictions, but the bits are still
5998 defined (though never set). So there's no harm in leaving this code. */
5999
6000 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6001 return PCRE_ERROR_BADPARTIAL;
6002
6003 /* Check a UTF-8 string if required. Pass back the character offset and error
6004 code for an invalid string if a results vector is available. */
6005
6006 #ifdef SUPPORT_UTF8
6007 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6008 {
6009 int erroroffset;
6010 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
6011 if (errorcode != 0)
6012 {
6013 if (offsetcount >= 2)
6014 {
6015 offsets[0] = erroroffset;
6016 offsets[1] = errorcode;
6017 }
6018 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6019 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6020 }
6021
6022 /* Check that a start_offset points to the start of a UTF-8 character. */
6023
6024 if (start_offset > 0 && start_offset < length &&
6025 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6026 return PCRE_ERROR_BADUTF8_OFFSET;
6027 }
6028 #endif
6029
6030 /* If the expression has got more back references than the offsets supplied can
6031 hold, we get a temporary chunk of working store to use during the matching.
6032 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6033 of 3. */
6034
6035 ocount = offsetcount - (offsetcount % 3);
6036
6037 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6038 {
6039 ocount = re->top_backref * 3 + 3;
6040 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6041 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6042 using_temporary_offsets = TRUE;
6043 DPRINTF(("Got memory to hold back references\n"));
6044 }
6045 else md->offset_vector = offsets;
6046
6047 md->offset_end = ocount;
6048 md->offset_max = (2*ocount)/3;
6049 md->offset_overflow = FALSE;
6050 md->capture_last = -1;
6051
6052 /* Reset the working variable associated with each extraction. These should
6053 never be used unless previously set, but they get saved and restored, and so we
6054 initialize them to avoid reading uninitialized locations. Also, unset the
6055 offsets for the matched string. This is really just for tidiness with callouts,
6056 in case they inspect these fields. */
6057
6058 if (md->offset_vector != NULL)
6059 {
6060 register int *iptr = md->offset_vector + ocount;
6061 register int *iend = iptr - re->top_bracket;
6062 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6063 while (--iptr >= iend) *iptr = -1;
6064 md->offset_vector[0] = md->offset_vector[1] = -1;
6065 }
6066
6067 /* Set up the first character to match, if available. The first_byte value is
6068 never set for an anchored regular expression, but the anchoring may be forced
6069 at run time, so we have to test for anchoring. The first char may be unset for
6070 an unanchored pattern, of course. If there's no first char and the pattern was
6071 studied, there may be a bitmap of possible first characters. */
6072
6073 if (!anchored)
6074 {
6075 if ((re->flags & PCRE_FIRSTSET) != 0)
6076 {
6077 first_byte = re->first_byte & 255;
6078 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6079 first_byte = md->lcc[first_byte];
6080 }
6081 else
6082 if (!startline && study != NULL &&
6083 (study->flags & PCRE_STUDY_MAPPED) != 0)
6084 start_bits = study->start_bits;
6085 }
6086
6087 /* For anchored or unanchored matches, there may be a "last known required
6088 character" set. */
6089
6090 if ((re->flags & PCRE_REQCHSET) != 0)
6091 {
6092 req_byte = re->req_byte & 255;
6093 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6094 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6095 }
6096
6097
6098
6099
6100 /* ==========================================================================*/
6101
6102 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6103 the loop runs just once. */
6104
6105 for(;;)
6106 {
6107 USPTR save_end_subject = end_subject;
6108 USPTR new_start_match;
6109
6110 /* If firstline is TRUE, the start of the match is constrained to the first
6111 line of a multiline string. That is, the match must be before or at the first
6112 newline. Implement this by temporarily adjusting end_subject so that we stop
6113 scanning at a newline. If the match fails at the newline, later code breaks
6114 this loop. */
6115
6116 if (firstline)
6117 {
6118 USPTR t = start_match;
6119 #ifdef SUPPORT_UTF8
6120 if (utf8)
6121 {
6122 while (t < md->end_subject && !IS_NEWLINE(t))
6123 {
6124 t++;
6125 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6126 }
6127 }
6128 else
6129 #endif
6130 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6131 end_subject = t;
6132 }
6133
6134 /* There are some optimizations that avoid running the match if a known
6135 starting point is not found, or if a known later character is not present.
6136 However, there is an option that disables these, for testing and for ensuring
6137 that all callouts do actually occur. The option can be set in the regex by
6138 (*NO_START_OPT) or passed in match-time options. */
6139
6140 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6141 {
6142 /* Advance to a unique first byte if there is one. */
6143
6144 if (first_byte >= 0)
6145 {
6146 if (first_byte_caseless)
6147 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6148 start_match++;
6149 else
6150 while (start_match < end_subject && *start_match != first_byte)
6151 start_match++;
6152 }
6153
6154 /* Or to just after a linebreak for a multiline match */
6155
6156 else if (startline)
6157 {
6158 if (start_match > md->start_subject + start_offset)
6159 {
6160 #ifdef SUPPORT_UTF8
6161 if (utf8)
6162 {
6163 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6164 {
6165 start_match++;
6166 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6167 start_match++;
6168 }
6169 }
6170 else
6171 #endif
6172 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6173 start_match++;
6174
6175 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6176 and we are now at a LF, advance the match position by one more character.
6177 */
6178
6179 if (start_match[-1] == CHAR_CR &&
6180 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6181 start_match < end_subject &&
6182 *start_match == CHAR_NL)
6183 start_match++;
6184 }
6185 }
6186
6187 /* Or to a non-unique first byte after study */
6188
6189 else if (start_bits != NULL)
6190 {
6191 while (start_match < end_subject)
6192 {
6193 register unsigned int c = *start_match;
6194 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6195 {
6196 start_match++;
6197 #ifdef SUPPORT_UTF8
6198 if (utf8)
6199 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6200 start_match++;
6201 #endif
6202 }
6203 else break;
6204 }
6205 }
6206 } /* Starting optimizations */
6207
6208 /* Restore fudged end_subject */
6209
6210 end_subject = save_end_subject;
6211
6212 /* The following two optimizations are disabled for partial matching or if
6213 disabling is explicitly requested. */
6214
6215 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6216 {
6217 /* If the pattern was studied, a minimum subject length may be set. This is
6218 a lower bound; no actual string of that length may actually match the
6219 pattern. Although the value is, strictly, in characters, we treat it as
6220 bytes to avoid spending too much time in this optimization. */
6221
6222 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6223 (pcre_uint32)(end_subject - start_match) < study->minlength)
6224 {
6225 rc = MATCH_NOMATCH;
6226 break;
6227 }
6228
6229 /* If req_byte is set, we know that that character must appear in the
6230 subject for the match to succeed. If the first character is set, req_byte
6231 must be later in the subject; otherwise the test starts at the match point.
6232 This optimization can save a huge amount of backtracking in patterns with
6233 nested unlimited repeats that aren't going to match. Writing separate code
6234 for cased/caseless versions makes it go faster, as does using an
6235 autoincrement and backing off on a match.
6236
6237 HOWEVER: when the subject string is very, very long, searching to its end
6238 can take a long time, and give bad performance on quite ordinary patterns.
6239 This showed up when somebody was matching something like /^\d+C/ on a
6240 32-megabyte string... so we don't do this when the string is sufficiently
6241 long. */
6242
6243 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6244 {
6245 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6246
6247 /* We don't need to repeat the search if we haven't yet reached the
6248 place we found it at last time. */
6249
6250 if (p > req_byte_ptr)
6251 {
6252 if (req_byte_caseless)
6253 {
6254 while (p < end_subject)
6255 {
6256 register int pp = *p++;
6257 if (pp == req_byte || pp == req_byte2) { p--; break; }
6258 }
6259 }
6260 else
6261 {
6262 while (p < end_subject)
6263 {
6264 if (*p++ == req_byte) { p--; break; }
6265 }
6266 }
6267
6268 /* If we can't find the required character, break the matching loop,
6269 forcing a match failure. */
6270
6271 if (p >= end_subject)
6272 {
6273 rc = MATCH_NOMATCH;
6274 break;
6275 }
6276
6277 /* If we have found the required character, save the point where we
6278 found it, so that we don't search again next time round the loop if
6279 the start hasn't passed this character yet. */
6280
6281 req_byte_ptr = p;
6282 }
6283 }
6284 }
6285
6286 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6287 printf(">>>> Match against: ");
6288 pchars(start_match, end_subject - start_match, TRUE, md);
6289 printf("\n");
6290 #endif
6291
6292 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6293 first starting point for which a partial match was found. */
6294
6295 md->start_match_ptr = start_match;
6296 md->start_used_ptr = start_match;
6297 md->match_call_count = 0;
6298 md->match_function_type = 0;
6299 md->end_offset_top = 0;
6300 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6301 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6302
6303 switch(rc)
6304 {
6305 /* SKIP passes back the next starting point explicitly, but if it is the
6306 same as the match we have just done, treat it as NOMATCH. */
6307
6308 case MATCH_SKIP:
6309 if (md->start_match_ptr != start_match)
6310 {
6311 new_start_match = md->start_match_ptr;
6312 break;
6313 }
6314 /* Fall through */
6315
6316 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6317 the SKIP's arg was not found. We also treat this as NOMATCH. */
6318
6319 case MATCH_SKIP_ARG:
6320 /* Fall through */
6321
6322 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6323 exactly like PRUNE. */
6324
6325 case MATCH_NOMATCH:
6326 case MATCH_PRUNE:
6327 case MATCH_THEN:
6328 new_start_match = start_match + 1;
6329 #ifdef SUPPORT_UTF8
6330 if (utf8)
6331 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6332 new_start_match++;
6333 #endif
6334 break;
6335
6336 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6337
6338 case MATCH_COMMIT:
6339 rc = MATCH_NOMATCH;
6340 goto ENDLOOP;
6341
6342 /* Any other return is either a match, or some kind of error. */
6343
6344 default:
6345 goto ENDLOOP;
6346 }
6347
6348 /* Control reaches here for the various types of "no match at this point"
6349 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6350
6351 rc = MATCH_NOMATCH;
6352
6353 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6354 newline in the subject (though it may continue over the newline). Therefore,
6355 if we have just failed to match, starting at a newline, do not continue. */
6356
6357 if (firstline && IS_NEWLINE(start_match)) break;
6358
6359 /* Advance to new matching position */
6360
6361 start_match = new_start_match;
6362
6363 /* Break the loop if the pattern is anchored or if we have passed the end of
6364 the subject. */
6365
6366 if (anchored || start_match > end_subject) break;
6367
6368 /* If we have just passed a CR and we are now at a LF, and the pattern does
6369 not contain any explicit matches for \r or \n, and the newline option is CRLF
6370 or ANY or ANYCRLF, advance the match position by one more character. */
6371
6372 if (start_match[-1] == CHAR_CR &&
6373 start_match < end_subject &&
6374 *start_match == CHAR_NL &&
6375 (re->flags & PCRE_HASCRORLF) == 0 &&
6376 (md->nltype == NLTYPE_ANY ||
6377 md->nltype == NLTYPE_ANYCRLF ||
6378 md->nllen == 2))
6379 start_match++;
6380
6381 md->mark = NULL; /* Reset for start of next match attempt */
6382 } /* End of for(;;) "bumpalong" loop */
6383
6384 /* ==========================================================================*/
6385
6386 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6387 conditions is true:
6388
6389 (1) The pattern is anchored or the match was failed by (*COMMIT);
6390
6391 (2) We are past the end of the subject;
6392
6393 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6394 this option requests that a match occur at or before the first newline in
6395 the subject.
6396
6397 When we have a match and the offset vector is big enough to deal with any
6398 backreferences, captured substring offsets will already be set up. In the case
6399 where we had to get some local store to hold offsets for backreference
6400 processing, copy those that we can. In this case there need not be overflow if
6401 certain parts of the pattern were not used, even though there are more
6402 capturing parentheses than vector slots. */
6403
6404 ENDLOOP:
6405
6406 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6407 {
6408 if (using_temporary_offsets)
6409 {
6410 if (offsetcount >= 4)
6411 {
6412 memcpy(offsets + 2, md->offset_vector + 2,
6413 (offsetcount - 2) * sizeof(int));
6414 DPRINTF(("Copied offsets from temporary memory\n"));
6415 }
6416 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6417 DPRINTF(("Freeing temporary memory\n"));
6418 (pcre_free)(md->offset_vector);
6419 }
6420
6421 /* Set the return code to the number of captured strings, or 0 if there are
6422 too many to fit into the vector. */
6423
6424 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6425
6426 /* If there is space, set up the whole thing as substring 0. The value of
6427 md->start_match_ptr might be modified if \K was encountered on the success
6428 matching path. */
6429
6430 if (offsetcount < 2) rc = 0; else
6431 {
6432 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6433 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6434 }
6435
6436 DPRINTF((">>>> returning %d\n", rc));
6437 goto RETURN_MARK;
6438 }
6439
6440 /* Control gets here if there has been an error, or if the overall match
6441 attempt has failed at all permitted starting positions. */
6442
6443 if (using_temporary_offsets)
6444 {
6445 DPRINTF(("Freeing temporary memory\n"));
6446 (pcre_free)(md->offset_vector);
6447 }
6448
6449 /* For anything other than nomatch or partial match, just return the code. */
6450
6451 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6452 {
6453 DPRINTF((">>>> error: returning %d\n", rc));
6454 return rc;
6455 }
6456
6457 /* Handle partial matches - disable any mark data */
6458
6459 if (start_partial != NULL)
6460 {
6461 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6462 md->mark = NULL;
6463 if (offsetcount > 1)
6464 {
6465 offsets[0] = (int)(start_partial - (USPTR)subject);
6466 offsets[1] = (int)(end_subject - (USPTR)subject);
6467 }
6468 rc = PCRE_ERROR_PARTIAL;
6469 }
6470
6471 /* This is the classic nomatch case */
6472
6473 else
6474 {
6475 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6476 rc = PCRE_ERROR_NOMATCH;
6477 }
6478
6479 /* Return the MARK data if it has been requested. */
6480
6481 RETURN_MARK:
6482
6483 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6484 *(extra_data->mark) = (unsigned char *)(md->mark);
6485 return rc;
6486 }
6487
6488 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5