/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 511 - (show annotations)
Mon Mar 29 09:25:38 2010 UTC (5 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 174742 byte(s)
Error occurred while calculating annotation data.
Make (*ACCEPT) work inside an atomic group.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results from match(). Real errors are reported using
the externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Results of match() that are used only internally. The values are made
strongly negative so that they keep clear of the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_PRUNE        (-997)
#define MATCH_SKIP         (-996)
#define MATCH_SKIP_ARG     (-995)
#define MATCH_THEN         (-994)
80
/* Convenience macro for a sequence that is needed in many places: copy the
current mark pointer into md->mark and then leave via RRETURN with the given
value. It expands to a brace-enclosed block, not a single expression. */

#define MRRETURN(ra) \
  { md->mark = markptr; RRETURN(ra); }
88
/* Upper bound on the number of ints of offset data that are saved on the
stack for recursive calls; malloc is used when the offset vector is bigger.
Keep this a multiple of 3, because the offset vector is always a multiple of 3
long. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum counts for the common repeats. In the table of maxima,
a value of 0 stands for "no limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
99
100
101
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
106
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if the requested.
109
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE
115
116 Returns: nothing
117 */
118
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121 {
122 unsigned int c;
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126 }
127 #endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
144
145 Returns: TRUE if matched
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if not enough characters left */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length;
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188 }
189 }
190 else
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE;
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
/* Identifying numbers for the RMATCH calls. The list starts at 1 and runs
consecutively. Whenever it is altered, the code at HEAP_RETURN below has to be
brought into line with it. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54
};
259
260 /* These versions of the macros use the stack, as normal. There are debugging
261 versions and production versions. Note that the "rw" argument of RMATCH isn't
262 actually used in this definition. */
263
264 #ifndef NO_RECURSE
#define REGISTER register

#ifndef PCRE_DEBUG

/* Production versions: RMATCH is a genuine recursive call of match() whose
result is left in rrc, and RRETURN is a plain return. The "rd" argument is
passed on as the md argument; "rw" is ignored. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)

#define RRETURN(ra) return ra

#else  /* PCRE_DEBUG */

/* Debugging versions: the same actions, with trace output on stdout that
shows the source line of each call and each return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif  /* PCRE_DEBUG */
284
285 #else
286
287
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame from pcre_stack_malloc, records in the current frame
the number (rw) of the place to resume at, copies the arguments for the
"recursive call" into the new frame, makes it the current frame, and jumps to
HEAP_RECURSE. The label L_<rw> that it plants is where control comes back to.

If the new frame cannot be obtained, nothing in the current frame has been
changed yet, so the current frame is simply abandoned by means of RRETURN,
giving PCRE_ERROR_NOMEMORY to whatever called it. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
314
/* Leave the current "incarnation" of match(): release its frame and make the
previous frame current again. When there is no previous frame this was the top
level, so really return; otherwise hand the value back in rrc and jump to
HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
327
328
329 /* Structure for remembering the local variables in a private frame */
330
331 typedef struct heapframe {
332 struct heapframe *Xprevframe;
333
334 /* Function arguments that may change */
335
336 USPTR Xeptr;
337 const uschar *Xecode;
338 USPTR Xmstart;
339 USPTR Xmarkptr;
340 int Xoffset_top;
341 long int Xims;
342 eptrblock *Xeptrb;
343 int Xflags;
344 unsigned int Xrdepth;
345
346 /* Function local variables */
347
348 USPTR Xcallpat;
349 #ifdef SUPPORT_UTF8
350 USPTR Xcharptr;
351 #endif
352 USPTR Xdata;
353 USPTR Xnext;
354 USPTR Xpp;
355 USPTR Xprev;
356 USPTR Xsaved_eptr;
357
358 recursion_info Xnew_recursive;
359
360 BOOL Xcur_is_word;
361 BOOL Xcondition;
362 BOOL Xprev_is_word;
363
364 unsigned long int Xoriginal_ims;
365
366 #ifdef SUPPORT_UCP
367 int Xprop_type;
368 int Xprop_value;
369 int Xprop_fail_result;
370 int Xprop_category;
371 int Xprop_chartype;
372 int Xprop_script;
373 int Xoclength;
374 uschar Xocchars[8];
375 #endif
376
377 int Xcodelink;
378 int Xctype;
379 unsigned int Xfc;
380 int Xfi;
381 int Xlength;
382 int Xmax;
383 int Xmin;
384 int Xnumber;
385 int Xoffset;
386 int Xop;
387 int Xsave_capture_last;
388 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
389 int Xstacksave[REC_STACK_SAVE_MAX];
390
391 eptrblock Xnewptrb;
392
393 /* Where to jump back to */
394
395 int Xwhere;
396
397 } heapframe;
398
399 #endif
400
401
402 /***************************************************************************
403 ***************************************************************************/
404
405
406
407 /*************************************************
408 * Match from current position *
409 *************************************************/
410
411 /* This function is called recursively in many circumstances. Whenever it
412 returns a negative (error) response, the outer incarnation must also return the
413 same response. */
414
/* These two macros wrap up the tests for partial matching, which are needed
at several places in the code. When partial matching is enabled, the "hit end"
flag is set if the pointer has reached the end of the subject and is also past
the start of the match (i.e. something has been matched). For hard partial
matching (md->partial greater than 1) there is then an immediate return of
PCRE_ERROR_PARTIAL. SCHECK_PARTIAL is for use where the pointer is already
known to be past the end of the subject, so that test is omitted. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
    }
435
436
437 /* Performance note: It might be tempting to extract commonly used fields from
438 the md structure (e.g. utf8, end_subject) into individual variables to improve
439 performance. Tests using gcc on a SPARC disproved this; in the first case, it
440 made performance worse.
441
442 Arguments:
443 eptr pointer to current character in subject
444 ecode pointer to current position in compiled code
445 mstart pointer to the current match start position (can be modified
446 by encountering \K)
447 markptr pointer to the most recent MARK name, or NULL
448 offset_top current top pointer
449 md pointer to "static" info for the match
450 ims current /i, /m, and /s options
451 eptrb pointer to chain of blocks containing eptr at start of
452 brackets - for testing for empty matches
453 flags can contain
454 match_condassert - this is an assertion condition
455 match_cbegroup - this is the start of an unlimited repeat
456 group that can match an empty string
457 rdepth the recursion depth
458
459 Returns: MATCH_MATCH if matched ) these values are >= 0
460 MATCH_NOMATCH if failed to match )
461 a negative MATCH_xxx value for PRUNE, SKIP, etc
462 a negative PCRE_ERROR_xxx value if aborted by an error condition
463 (e.g. stopped by repeated call or recursion limit)
464 */
465
466 static int
467 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
468 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
469 eptrblock *eptrb, int flags, unsigned int rdepth)
470 {
471 /* These variables do not need to be preserved over recursion in this function,
472 so they can be ordinary variables in all cases. Mark some of them with
473 "register" because they are used a lot in loops. */
474
475 register int rrc; /* Returns from recursive calls */
476 register int i; /* Used for loops not involving calls to RMATCH() */
477 register unsigned int c; /* Character values not kept over RMATCH() calls */
478 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
479
480 BOOL minimize, possessive; /* Quantifier options */
481 int condcode;
482
483 /* When recursion is not being used, all "local" variables that have to be
484 preserved over calls to RMATCH() are part of a "frame" which is obtained from
485 heap storage. Set up the top-level frame here; others are obtained from the
486 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
487
488 #ifdef NO_RECURSE
489 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
490 frame->Xprevframe = NULL; /* Marks the top level */
491
492 /* Copy in the original argument variables */
493
494 frame->Xeptr = eptr;
495 frame->Xecode = ecode;
496 frame->Xmstart = mstart;
497 frame->Xmarkptr = markptr;
498 frame->Xoffset_top = offset_top;
499 frame->Xims = ims;
500 frame->Xeptrb = eptrb;
501 frame->Xflags = flags;
502 frame->Xrdepth = rdepth;
503
504 /* This is where control jumps back to to effect "recursion" */
505
506 HEAP_RECURSE:
507
508 /* Macros make the argument variables come from the current frame */
509
510 #define eptr frame->Xeptr
511 #define ecode frame->Xecode
512 #define mstart frame->Xmstart
513 #define markptr frame->Xmarkptr
514 #define offset_top frame->Xoffset_top
515 #define ims frame->Xims
516 #define eptrb frame->Xeptrb
517 #define flags frame->Xflags
518 #define rdepth frame->Xrdepth
519
520 /* Ditto for the local variables */
521
522 #ifdef SUPPORT_UTF8
523 #define charptr frame->Xcharptr
524 #endif
525 #define callpat frame->Xcallpat
526 #define codelink frame->Xcodelink
527 #define data frame->Xdata
528 #define next frame->Xnext
529 #define pp frame->Xpp
530 #define prev frame->Xprev
531 #define saved_eptr frame->Xsaved_eptr
532
533 #define new_recursive frame->Xnew_recursive
534
535 #define cur_is_word frame->Xcur_is_word
536 #define condition frame->Xcondition
537 #define prev_is_word frame->Xprev_is_word
538
539 #define original_ims frame->Xoriginal_ims
540
541 #ifdef SUPPORT_UCP
542 #define prop_type frame->Xprop_type
543 #define prop_value frame->Xprop_value
544 #define prop_fail_result frame->Xprop_fail_result
545 #define prop_category frame->Xprop_category
546 #define prop_chartype frame->Xprop_chartype
547 #define prop_script frame->Xprop_script
548 #define oclength frame->Xoclength
549 #define occhars frame->Xocchars
550 #endif
551
552 #define ctype frame->Xctype
553 #define fc frame->Xfc
554 #define fi frame->Xfi
555 #define length frame->Xlength
556 #define max frame->Xmax
557 #define min frame->Xmin
558 #define number frame->Xnumber
559 #define offset frame->Xoffset
560 #define op frame->Xop
561 #define save_capture_last frame->Xsave_capture_last
562 #define save_offset1 frame->Xsave_offset1
563 #define save_offset2 frame->Xsave_offset2
564 #define save_offset3 frame->Xsave_offset3
565 #define stacksave frame->Xstacksave
566
567 #define newptrb frame->Xnewptrb
568
569 /* When recursion is being used, local variables are allocated on the stack and
570 get preserved during recursion in the normal way. In this environment, fi and
571 i, and fc and c, can be the same variables. */
572
573 #else /* NO_RECURSE not defined */
574 #define fi i
575 #define fc c
576
577
578 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
579 const uschar *charptr; /* in small blocks of the code. My normal */
580 #endif /* style of coding would have declared */
581 const uschar *callpat; /* them within each of those blocks. */
582 const uschar *data; /* However, in order to accommodate the */
583 const uschar *next; /* version of this code that uses an */
584 USPTR pp; /* external "stack" implemented on the */
585 const uschar *prev; /* heap, it is easier to declare them all */
586 USPTR saved_eptr; /* here, so the declarations can be cut */
587 /* out in a block. The only declarations */
588 recursion_info new_recursive; /* within blocks below are for variables */
589 /* that do not have to be preserved over */
590 BOOL cur_is_word; /* a recursive call to RMATCH(). */
591 BOOL condition;
592 BOOL prev_is_word;
593
594 unsigned long int original_ims;
595
596 #ifdef SUPPORT_UCP
597 int prop_type;
598 int prop_value;
599 int prop_fail_result;
600 int prop_category;
601 int prop_chartype;
602 int prop_script;
603 int oclength;
604 uschar occhars[8];
605 #endif
606
607 int codelink;
608 int ctype;
609 int length;
610 int max;
611 int min;
612 int number;
613 int offset;
614 int op;
615 int save_capture_last;
616 int save_offset1, save_offset2, save_offset3;
617 int stacksave[REC_STACK_SAVE_MAX];
618
619 eptrblock newptrb;
620 #endif /* NO_RECURSE */
621
622 /* These statements are here to stop the compiler complaining about uninitialized
623 variables. */
624
625 #ifdef SUPPORT_UCP
626 prop_value = 0;
627 prop_fail_result = 0;
628 #endif
629
630
631 /* This label is used for tail recursion, which is used in a few cases even
632 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
633 used. Thanks to Ian Taylor for noticing this possibility and sending the
634 original patch. */
635
636 TAIL_RECURSE:
637
638 /* OK, now we can get on with the real code of the function. Recursive calls
639 are specified by the macro RMATCH and RRETURN is used to return. When
640 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
641 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
642 defined). However, RMATCH isn't like a function call because it's quite a
643 complicated macro. It has to be used in one particular way. This shouldn't,
644 however, impact performance when true recursion is being used. */
645
646 #ifdef SUPPORT_UTF8
647 utf8 = md->utf8; /* Local copy of the flag */
648 #else
649 utf8 = FALSE;
650 #endif
651
652 /* First check that we haven't called match() too many times, or that we
653 haven't exceeded the recursive call limit. */
654
655 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
656 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
657
658 original_ims = ims; /* Save for resetting on ')' */
659
660 /* At the start of a group with an unlimited repeat that may match an empty
661 string, the match_cbegroup flag is set. When this is the case, add the current
662 subject pointer to the chain of such remembered pointers, to be checked when we
663 hit the closing ket, in order to break infinite loops that match no characters.
664 When match() is called in other circumstances, don't add to the chain. The
665 match_cbegroup flag must NOT be used with tail recursion, because the memory
666 block that is used is on the stack, so a new one may be required for each
667 match(). */
668
669 if ((flags & match_cbegroup) != 0)
670 {
671 newptrb.epb_saved_eptr = eptr;
672 newptrb.epb_prev = eptrb;
673 eptrb = &newptrb;
674 }
675
676 /* Now start processing the opcodes. */
677
678 for (;;)
679 {
680 minimize = possessive = FALSE;
681 op = *ecode;
682
683 switch(op)
684 {
685 case OP_MARK:
686 markptr = ecode + 2;
687 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
688 ims, eptrb, flags, RM51);
689
690 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
691 argument, and we must check whether that argument matches this MARK's
692 argument. It is passed back in md->start_match_ptr (an overloading of that
693 variable). If it does match, we reset that variable to the current subject
694 position and return MATCH_SKIP. Otherwise, pass back the return code
695 unaltered. */
696
697 if (rrc == MATCH_SKIP_ARG &&
698 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
699 {
700 md->start_match_ptr = eptr;
701 RRETURN(MATCH_SKIP);
702 }
703
704 if (md->mark == NULL) md->mark = markptr;
705 RRETURN(rrc);
706
707 case OP_FAIL:
708 MRRETURN(MATCH_NOMATCH);
709
710 case OP_COMMIT:
711 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
712 ims, eptrb, flags, RM52);
713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
714 MRRETURN(MATCH_COMMIT);
715
716 case OP_PRUNE:
717 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
718 ims, eptrb, flags, RM51);
719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
720 MRRETURN(MATCH_PRUNE);
721
722 case OP_PRUNE_ARG:
723 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
724 ims, eptrb, flags, RM51);
725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
726 md->mark = ecode + 2;
727 RRETURN(MATCH_PRUNE);
728
729 case OP_SKIP:
730 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
731 ims, eptrb, flags, RM53);
732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
733 md->start_match_ptr = eptr; /* Pass back current position */
734 MRRETURN(MATCH_SKIP);
735
736 case OP_SKIP_ARG:
737 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
738 ims, eptrb, flags, RM53);
739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
740
741 /* Pass back the current skip name by overloading md->start_match_ptr and
742 returning the special MATCH_SKIP_ARG return code. This will either be
743 caught by a matching MARK, or get to the top, where it is treated the same
744 as PRUNE. */
745
746 md->start_match_ptr = ecode + 2;
747 RRETURN(MATCH_SKIP_ARG);
748
749 case OP_THEN:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ims, eptrb, flags, RM54);
752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
753 MRRETURN(MATCH_THEN);
754
755 case OP_THEN_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ims, eptrb, flags, RM54);
758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_THEN);
761
762 /* Handle a capturing bracket. If there is space in the offset vector, save
763 the current subject position in the working slot at the top of the vector.
764 We mustn't change the current values of the data slot, because they may be
765 set from a previous iteration of this group, and be referred to by a
766 reference inside the group.
767
768 If the bracket fails to match, we need to restore this value and also the
769 values of the final offsets, in case they were set by a previous iteration
770 of the same bracket.
771
772 If there isn't enough space in the offset vector, treat this as if it were
773 a non-capturing bracket. Don't worry about setting the flag for the error
774 case here; that is handled in the code for KET. */
775
776 case OP_CBRA:
777 case OP_SCBRA:
778 number = GET2(ecode, 1+LINK_SIZE);
779 offset = number << 1;
780
781 #ifdef PCRE_DEBUG
782 printf("start bracket %d\n", number);
783 printf("subject=");
784 pchars(eptr, 16, TRUE, md);
785 printf("\n");
786 #endif
787
788 if (offset < md->offset_max)
789 {
790 save_offset1 = md->offset_vector[offset];
791 save_offset2 = md->offset_vector[offset+1];
792 save_offset3 = md->offset_vector[md->offset_end - number];
793 save_capture_last = md->capture_last;
794
795 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
796 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
797
798 flags = (op == OP_SCBRA)? match_cbegroup : 0;
799 do
800 {
801 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
802 ims, eptrb, flags, RM1);
803 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
804 md->capture_last = save_capture_last;
805 ecode += GET(ecode, 1);
806 }
807 while (*ecode == OP_ALT);
808
809 DPRINTF(("bracket %d failed\n", number));
810
811 md->offset_vector[offset] = save_offset1;
812 md->offset_vector[offset+1] = save_offset2;
813 md->offset_vector[md->offset_end - number] = save_offset3;
814
815 if (rrc != MATCH_THEN) md->mark = markptr;
816 RRETURN(MATCH_NOMATCH);
817 }
818
819 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
820 as a non-capturing bracket. */
821
822 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
823 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
824
825 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
826
827 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
828 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
829
830 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
831 final alternative within the brackets, we would return the result of a
832 recursive call to match() whatever happened. We can reduce stack usage by
833 turning this into a tail recursion, except in the case when match_cbegroup
834 is set.*/
835
836 case OP_BRA:
837 case OP_SBRA:
838 DPRINTF(("start non-capturing bracket\n"));
839 flags = (op >= OP_SBRA)? match_cbegroup : 0;
840 for (;;)
841 {
842 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
843 {
844 if (flags == 0) /* Not a possibly empty group */
845 {
846 ecode += _pcre_OP_lengths[*ecode];
847 DPRINTF(("bracket 0 tail recursion\n"));
848 goto TAIL_RECURSE;
849 }
850
851 /* Possibly empty group; can't use tail recursion. */
852
853 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
854 eptrb, flags, RM48);
855 if (rrc == MATCH_NOMATCH) md->mark = markptr;
856 RRETURN(rrc);
857 }
858
859 /* For non-final alternatives, continue the loop for a NOMATCH result;
860 otherwise return. */
861
862 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
863 eptrb, flags, RM2);
864 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
865 ecode += GET(ecode, 1);
866 }
867 /* Control never reaches here. */
868
869 /* Conditional group: compilation checked that there are no more than
870 two branches. If the condition is false, skipping the first branch takes us
871 past the end if there is only one branch, but that's OK because that is
872 exactly what going to the ket would do. As there is only one branch to be
873 obeyed, we can use tail recursion to avoid using another stack frame. */
874
875 case OP_COND:
876 case OP_SCOND:
877 codelink= GET(ecode, 1);
878
879 /* Because of the way auto-callout works during compile, a callout item is
880 inserted between OP_COND and an assertion condition. */
881
882 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
883 {
884 if (pcre_callout != NULL)
885 {
886 pcre_callout_block cb;
887 cb.version = 1; /* Version 1 of the callout block */
888 cb.callout_number = ecode[LINK_SIZE+2];
889 cb.offset_vector = md->offset_vector;
890 cb.subject = (PCRE_SPTR)md->start_subject;
891 cb.subject_length = md->end_subject - md->start_subject;
892 cb.start_match = mstart - md->start_subject;
893 cb.current_position = eptr - md->start_subject;
894 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
895 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
896 cb.capture_top = offset_top/2;
897 cb.capture_last = md->capture_last;
898 cb.callout_data = md->callout_data;
899 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
900 if (rrc < 0) RRETURN(rrc);
901 }
902 ecode += _pcre_OP_lengths[OP_CALLOUT];
903 }
904
905 condcode = ecode[LINK_SIZE+1];
906
907 /* Now see what the actual condition is */
908
909 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
910 {
911 if (md->recursive == NULL) /* Not recursing => FALSE */
912 {
913 condition = FALSE;
914 ecode += GET(ecode, 1);
915 }
916 else
917 {
918 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
919 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
920
921 /* If the test is for recursion into a specific subpattern, and it is
922 false, but the test was set up by name, scan the table to see if the
923 name refers to any other numbers, and test them. The condition is true
924 if any one is set. */
925
926 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
927 {
928 uschar *slotA = md->name_table;
929 for (i = 0; i < md->name_count; i++)
930 {
931 if (GET2(slotA, 0) == recno) break;
932 slotA += md->name_entry_size;
933 }
934
935 /* Found a name for the number - there can be only one; duplicate
936 names for different numbers are allowed, but not vice versa. First
937 scan down for duplicates. */
938
939 if (i < md->name_count)
940 {
941 uschar *slotB = slotA;
942 while (slotB > md->name_table)
943 {
944 slotB -= md->name_entry_size;
945 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
946 {
947 condition = GET2(slotB, 0) == md->recursive->group_num;
948 if (condition) break;
949 }
950 else break;
951 }
952
953 /* Scan up for duplicates */
954
955 if (!condition)
956 {
957 slotB = slotA;
958 for (i++; i < md->name_count; i++)
959 {
960 slotB += md->name_entry_size;
961 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
962 {
963 condition = GET2(slotB, 0) == md->recursive->group_num;
964 if (condition) break;
965 }
966 else break;
967 }
968 }
969 }
970 }
971
972 /* Choose the branch according to the condition */
973
974 ecode += condition? 3 : GET(ecode, 1);
975 }
976 }
977
978 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
979 {
980 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
981 condition = offset < offset_top && md->offset_vector[offset] >= 0;
982
983 /* If the numbered capture is unset, but the reference was by name,
984 scan the table to see if the name refers to any other numbers, and test
985 them. The condition is true if any one is set. This is tediously similar
986 to the code above, but not close enough to try to amalgamate. */
987
988 if (!condition && condcode == OP_NCREF)
989 {
990 int refno = offset >> 1;
991 uschar *slotA = md->name_table;
992
993 for (i = 0; i < md->name_count; i++)
994 {
995 if (GET2(slotA, 0) == refno) break;
996 slotA += md->name_entry_size;
997 }
998
999 /* Found a name for the number - there can be only one; duplicate names
1000 for different numbers are allowed, but not vice versa. First scan down
1001 for duplicates. */
1002
1003 if (i < md->name_count)
1004 {
1005 uschar *slotB = slotA;
1006 while (slotB > md->name_table)
1007 {
1008 slotB -= md->name_entry_size;
1009 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1010 {
1011 offset = GET2(slotB, 0) << 1;
1012 condition = offset < offset_top &&
1013 md->offset_vector[offset] >= 0;
1014 if (condition) break;
1015 }
1016 else break;
1017 }
1018
1019 /* Scan up for duplicates */
1020
1021 if (!condition)
1022 {
1023 slotB = slotA;
1024 for (i++; i < md->name_count; i++)
1025 {
1026 slotB += md->name_entry_size;
1027 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1028 {
1029 offset = GET2(slotB, 0) << 1;
1030 condition = offset < offset_top &&
1031 md->offset_vector[offset] >= 0;
1032 if (condition) break;
1033 }
1034 else break;
1035 }
1036 }
1037 }
1038 }
1039
1040 /* Choose the branch according to the condition */
1041
1042 ecode += condition? 3 : GET(ecode, 1);
1043 }
1044
1045 else if (condcode == OP_DEF) /* DEFINE - always false */
1046 {
1047 condition = FALSE;
1048 ecode += GET(ecode, 1);
1049 }
1050
1051 /* The condition is an assertion. Call match() to evaluate it - passing
1052 the match_condassert flag causes it to stop at the end of an
1053 assertion. */
1054
1055 else
1056 {
1057 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1058 match_condassert, RM3);
1059 if (rrc == MATCH_MATCH)
1060 {
1061 condition = TRUE;
1062 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1063 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1064 }
1065 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1066 {
1067 RRETURN(rrc); /* Need braces because of following else */
1068 }
1069 else
1070 {
1071 condition = FALSE;
1072 ecode += codelink;
1073 }
1074 }
1075
1076 /* We are now at the branch that is to be obeyed. As there is only one,
1077 we can use tail recursion to avoid using another stack frame, except when
1078 match_cbegroup is required for an unlimited repeat of a possibly empty
1079 group. If the second alternative doesn't exist, we can just plough on. */
1080
1081 if (condition || *ecode == OP_ALT)
1082 {
1083 ecode += 1 + LINK_SIZE;
1084 if (op == OP_SCOND) /* Possibly empty group */
1085 {
1086 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1087 RRETURN(rrc);
1088 }
1089 else /* Group must match something */
1090 {
1091 flags = 0;
1092 goto TAIL_RECURSE;
1093 }
1094 }
1095 else /* Condition false & no alternative */
1096 {
1097 ecode += 1 + LINK_SIZE;
1098 }
1099 break;
1100
1101
1102 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1103 to close any currently open capturing brackets. */
1104
1105 case OP_CLOSE:
1106 number = GET2(ecode, 1);
1107 offset = number << 1;
1108
1109 #ifdef PCRE_DEBUG
1110 printf("end bracket %d at *ACCEPT", number);
1111 printf("\n");
1112 #endif
1113
1114 md->capture_last = number;
1115 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1116 {
1117 md->offset_vector[offset] =
1118 md->offset_vector[md->offset_end - number];
1119 md->offset_vector[offset+1] = eptr - md->start_subject;
1120 if (offset_top <= offset) offset_top = offset + 2;
1121 }
1122 ecode += 3;
1123 break;
1124
1125
1126 /* End of the pattern, either real or forced. If we are in a top-level
1127 recursion, we should restore the offsets appropriately and continue from
1128 after the call. */
1129
1130 case OP_ACCEPT:
1131 case OP_END:
1132 if (md->recursive != NULL && md->recursive->group_num == 0)
1133 {
1134 recursion_info *rec = md->recursive;
1135 DPRINTF(("End of pattern in a (?0) recursion\n"));
1136 md->recursive = rec->prevrec;
1137 memmove(md->offset_vector, rec->offset_save,
1138 rec->saved_max * sizeof(int));
1139 offset_top = rec->save_offset_top;
1140 ims = original_ims;
1141 ecode = rec->after_call;
1142 break;
1143 }
1144
1145 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1146 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1147 the subject. In both cases, backtracking will then try other alternatives,
1148 if any. */
1149
1150 if (eptr == mstart &&
1151 (md->notempty ||
1152 (md->notempty_atstart &&
1153 mstart == md->start_subject + md->start_offset)))
1154 MRRETURN(MATCH_NOMATCH);
1155
1156 /* Otherwise, we have a match. */
1157
1158 md->end_match_ptr = eptr; /* Record where we ended */
1159 md->end_offset_top = offset_top; /* and how many extracts were taken */
1160 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1161 MRRETURN(((op == OP_END)? MATCH_MATCH : MATCH_ACCEPT));
1162
1163 /* Change option settings */
1164
1165 case OP_OPT:
1166 ims = ecode[1];
1167 ecode += 2;
1168 DPRINTF(("ims set to %02lx\n", ims));
1169 break;
1170
1171 /* Assertion brackets. Check the alternative branches in turn - the
1172 matching won't pass the KET for an assertion. If any one branch matches,
1173 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1174 start of each branch to move the current point backwards, so the code at
1175 this level is identical to the lookahead case. */
1176
1177 case OP_ASSERT:
1178 case OP_ASSERTBACK:
1179 do
1180 {
1181 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1182 RM4);
1183 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1184 {
1185 mstart = md->start_match_ptr; /* In case \K reset it */
1186 break;
1187 }
1188 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1189 ecode += GET(ecode, 1);
1190 }
1191 while (*ecode == OP_ALT);
1192 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1193
1194 /* If checking an assertion for a condition, return MATCH_MATCH. */
1195
1196 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1197
1198 /* Continue from after the assertion, updating the offsets high water
1199 mark, since extracts may have been taken during the assertion. */
1200
1201 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1202 ecode += 1 + LINK_SIZE;
1203 offset_top = md->end_offset_top;
1204 continue;
1205
1206 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1207 PRUNE, or COMMIT means we must assume failure without checking subsequent
1208 branches. */
1209
1210 case OP_ASSERT_NOT:
1211 case OP_ASSERTBACK_NOT:
1212 do
1213 {
1214 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1215 RM5);
1216 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1217 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1218 {
1219 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1220 break;
1221 }
1222 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1223 ecode += GET(ecode,1);
1224 }
1225 while (*ecode == OP_ALT);
1226
1227 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1228
1229 ecode += 1 + LINK_SIZE;
1230 continue;
1231
1232 /* Move the subject pointer back. This occurs only at the start of
1233 each branch of a lookbehind assertion. If we are too close to the start to
1234 move back, this match function fails. When working with UTF-8 we move
1235 back a number of characters, not bytes. */
1236
1237 case OP_REVERSE:
1238 #ifdef SUPPORT_UTF8
1239 if (utf8)
1240 {
1241 i = GET(ecode, 1);
1242 while (i-- > 0)
1243 {
1244 eptr--;
1245 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1246 BACKCHAR(eptr);
1247 }
1248 }
1249 else
1250 #endif
1251
1252 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1253
1254 {
1255 eptr -= GET(ecode, 1);
1256 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1257 }
1258
1259 /* Save the earliest consulted character, then skip to next op code */
1260
1261 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1262 ecode += 1 + LINK_SIZE;
1263 break;
1264
1265 /* The callout item calls an external function, if one is provided, passing
1266 details of the match so far. This is mainly for debugging, though the
1267 function is able to force a failure. */
1268
1269 case OP_CALLOUT:
1270 if (pcre_callout != NULL)
1271 {
1272 pcre_callout_block cb;
1273 cb.version = 1; /* Version 1 of the callout block */
1274 cb.callout_number = ecode[1];
1275 cb.offset_vector = md->offset_vector;
1276 cb.subject = (PCRE_SPTR)md->start_subject;
1277 cb.subject_length = md->end_subject - md->start_subject;
1278 cb.start_match = mstart - md->start_subject;
1279 cb.current_position = eptr - md->start_subject;
1280 cb.pattern_position = GET(ecode, 2);
1281 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1282 cb.capture_top = offset_top/2;
1283 cb.capture_last = md->capture_last;
1284 cb.callout_data = md->callout_data;
1285 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1286 if (rrc < 0) RRETURN(rrc);
1287 }
1288 ecode += 2 + 2*LINK_SIZE;
1289 break;
1290
1291 /* Recursion either matches the current regex, or some subexpression. The
1292 offset data is the offset to the starting bracket from the start of the
1293 whole pattern. (This is so that it works from duplicated subpatterns.)
1294
1295 If there are any capturing brackets started but not finished, we have to
1296 save their starting points and reinstate them after the recursion. However,
1297 we don't know how many such there are (offset_top records the completed
1298 total) so we just have to save all the potential data. There may be up to
1299 65535 such values, which is too large to put on the stack, but using malloc
1300 for small numbers seems expensive. As a compromise, the stack is used when
1301 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1302 is used. If the malloc fails, the offsets cannot be saved, so the match is
1303 abandoned at once: the code below returns PCRE_ERROR_NOMEMORY instead of
1304 attempting the recursion.
1305
1306 There are also other values that have to be saved. We use a chained
1307 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1308 for the original version of this logic. */
1309
1310 case OP_RECURSE:
1311 {
1312 callpat = md->start_code + GET(ecode, 1);
1313 new_recursive.group_num = (callpat == md->start_code)? 0 :
1314 GET2(callpat, 1 + LINK_SIZE);
1315
1316 /* Add to "recursing stack" */
1317
1318 new_recursive.prevrec = md->recursive;
1319 md->recursive = &new_recursive;
1320
1321 /* Find where to continue from afterwards */
1322
1323 ecode += 1 + LINK_SIZE;
1324 new_recursive.after_call = ecode;
1325
1326 /* Now save the offset data. */
1327
1328 new_recursive.saved_max = md->offset_end;
1329 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1330 new_recursive.offset_save = stacksave;
1331 else
1332 {
1333 new_recursive.offset_save =
1334 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1335 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1336 }
1337
1338 memcpy(new_recursive.offset_save, md->offset_vector,
1339 new_recursive.saved_max * sizeof(int));
1340 new_recursive.save_offset_top = offset_top;
1341
1342 /* OK, now we can do the recursion. For each top-level alternative we
1343 restore the offset and recursion data. */
1344
1345 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1346 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1347 do
1348 {
1349 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1350 md, ims, eptrb, flags, RM6);
1351 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1352 {
1353 DPRINTF(("Recursion matched\n"));
1354 md->recursive = new_recursive.prevrec;
1355 if (new_recursive.offset_save != stacksave)
1356 (pcre_free)(new_recursive.offset_save);
1357 MRRETURN(MATCH_MATCH);
1358 }
1359 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1360 {
1361 DPRINTF(("Recursion gave error %d\n", rrc));
1362 if (new_recursive.offset_save != stacksave)
1363 (pcre_free)(new_recursive.offset_save);
1364 RRETURN(rrc);
1365 }
1366
1367 md->recursive = &new_recursive;
1368 memcpy(md->offset_vector, new_recursive.offset_save,
1369 new_recursive.saved_max * sizeof(int));
1370 callpat += GET(callpat, 1);
1371 }
1372 while (*callpat == OP_ALT);
1373
1374 DPRINTF(("Recursion didn't match\n"));
1375 md->recursive = new_recursive.prevrec;
1376 if (new_recursive.offset_save != stacksave)
1377 (pcre_free)(new_recursive.offset_save);
1378 MRRETURN(MATCH_NOMATCH);
1379 }
1380 /* Control never reaches here */
1381
1382 /* "Once" brackets are like assertion brackets except that after a match,
1383 the point in the subject string is not moved back. Thus there can never be
1384 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1385 Check the alternative branches in turn - the matching won't pass the KET
1386 for this kind of subpattern. If any one branch matches, we carry on as at
1387 the end of a normal bracket, leaving the subject pointer, but resetting
1388 the start-of-match value in case it was changed by \K. */
1389
1390 case OP_ONCE:
1391 prev = ecode;
1392 saved_eptr = eptr;
1393
1394 do
1395 {
1396 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1397 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1398 {
1399 mstart = md->start_match_ptr;
1400 break;
1401 }
1402 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1403 ecode += GET(ecode,1);
1404 }
1405 while (*ecode == OP_ALT);
1406
1407 /* If we hit the end of the group (which could be repeated), fail */
1408
1409 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1410
1411 /* Continue as from after the assertion, updating the offsets high water
1412 mark, since extracts may have been taken. */
1413
1414 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1415
1416 offset_top = md->end_offset_top;
1417 eptr = md->end_match_ptr;
1418
1419 /* For a non-repeating ket, just continue at this level. This also
1420 happens for a repeating ket if no characters were matched in the group.
1421 This is the forcible breaking of infinite loops as implemented in Perl
1422 5.005. If there is an options reset, it will get obeyed in the normal
1423 course of events. */
1424
1425 if (*ecode == OP_KET || eptr == saved_eptr)
1426 {
1427 ecode += 1+LINK_SIZE;
1428 break;
1429 }
1430
1431 /* The repeating kets try the rest of the pattern or restart from the
1432 preceding bracket, in the appropriate order. The second "call" of match()
1433 uses tail recursion, to avoid using another stack frame. We need to reset
1434 any options that changed within the bracket before re-running it, so
1435 check the next opcode. */
1436
1437 if (ecode[1+LINK_SIZE] == OP_OPT)
1438 {
1439 ims = (ims & ~PCRE_IMS) | ecode[4];
1440 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1441 }
1442
1443 if (*ecode == OP_KETRMIN)
1444 {
1445 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1446 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1447 ecode = prev;
1448 flags = 0;
1449 goto TAIL_RECURSE;
1450 }
1451 else /* OP_KETRMAX */
1452 {
1453 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1454 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1455 ecode += 1 + LINK_SIZE;
1456 flags = 0;
1457 goto TAIL_RECURSE;
1458 }
1459 /* Control never gets here */
1460
1461 /* An alternation is the end of a branch; scan along to find the end of the
1462 bracketed group and go to there. */
1463
1464 case OP_ALT:
1465 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1466 break;
1467
1468 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1469 indicating that it may occur zero times. It may repeat infinitely, or not
1470 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1471 with fixed upper repeat limits are compiled as a number of copies, with the
1472 optional ones preceded by BRAZERO or BRAMINZERO. */
1473
1474 case OP_BRAZERO:
1475 {
1476 next = ecode+1;
1477 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1478 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1479 do next += GET(next,1); while (*next == OP_ALT);
1480 ecode = next + 1 + LINK_SIZE;
1481 }
1482 break;
1483
1484 case OP_BRAMINZERO:
1485 {
1486 next = ecode+1;
1487 do next += GET(next, 1); while (*next == OP_ALT);
1488 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1489 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1490 ecode++;
1491 }
1492 break;
1493
1494 case OP_SKIPZERO:
1495 {
1496 next = ecode+1;
1497 do next += GET(next,1); while (*next == OP_ALT);
1498 ecode = next + 1 + LINK_SIZE;
1499 }
1500 break;
1501
1502 /* End of a group, repeated or non-repeating. */
1503
1504 case OP_KET:
1505 case OP_KETRMIN:
1506 case OP_KETRMAX:
1507 prev = ecode - GET(ecode, 1);
1508
1509 /* If this was a group that remembered the subject start, in order to break
1510 infinite repeats of empty string matches, retrieve the subject start from
1511 the chain. Otherwise, set it NULL. */
1512
1513 if (*prev >= OP_SBRA)
1514 {
1515 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1516 eptrb = eptrb->epb_prev; /* Backup to previous group */
1517 }
1518 else saved_eptr = NULL;
1519
1520 /* If we are at the end of an assertion group or an atomic group, stop
1521 matching and return MATCH_MATCH, but record the current high water mark for
1522 use by positive assertions. We also need to record the match start in case
1523 it was changed by \K. */
1524
1525 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1526 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1527 *prev == OP_ONCE)
1528 {
1529 md->end_match_ptr = eptr; /* For ONCE */
1530 md->end_offset_top = offset_top;
1531 md->start_match_ptr = mstart;
1532 MRRETURN(MATCH_MATCH);
1533 }
1534
1535 /* For capturing groups we have to check the group number back at the start
1536 and if necessary complete handling an extraction by setting the offsets and
1537 bumping the high water mark. Note that whole-pattern recursion is coded as
1538 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1539 when the OP_END is reached. Other recursion is handled here. */
1540
1541 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1542 {
1543 number = GET2(prev, 1+LINK_SIZE);
1544 offset = number << 1;
1545
1546 #ifdef PCRE_DEBUG
1547 printf("end bracket %d", number);
1548 printf("\n");
1549 #endif
1550
1551 md->capture_last = number;
1552 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1553 {
1554 md->offset_vector[offset] =
1555 md->offset_vector[md->offset_end - number];
1556 md->offset_vector[offset+1] = eptr - md->start_subject;
1557 if (offset_top <= offset) offset_top = offset + 2;
1558 }
1559
1560 /* Handle a recursively called group. Restore the offsets
1561 appropriately and continue from after the call. */
1562
1563 if (md->recursive != NULL && md->recursive->group_num == number)
1564 {
1565 recursion_info *rec = md->recursive;
1566 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1567 md->recursive = rec->prevrec;
1568 memcpy(md->offset_vector, rec->offset_save,
1569 rec->saved_max * sizeof(int));
1570 offset_top = rec->save_offset_top;
1571 ecode = rec->after_call;
1572 ims = original_ims;
1573 break;
1574 }
1575 }
1576
1577 /* For both capturing and non-capturing groups, reset the value of the ims
1578 flags, in case they got changed during the group. */
1579
1580 ims = original_ims;
1581 DPRINTF(("ims reset to %02lx\n", ims));
1582
1583 /* For a non-repeating ket, just continue at this level. This also
1584 happens for a repeating ket if no characters were matched in the group.
1585 This is the forcible breaking of infinite loops as implemented in Perl
1586 5.005. If there is an options reset, it will get obeyed in the normal
1587 course of events. */
1588
1589 if (*ecode == OP_KET || eptr == saved_eptr)
1590 {
1591 ecode += 1 + LINK_SIZE;
1592 break;
1593 }
1594
1595 /* The repeating kets try the rest of the pattern or restart from the
1596 preceding bracket, in the appropriate order. In the second case, we can use
1597 tail recursion to avoid using another stack frame, unless we have an
1598 unlimited repeat of a group that can match an empty string. */
1599
1600 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1601
1602 if (*ecode == OP_KETRMIN)
1603 {
1604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1605 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1606 if (flags != 0) /* Could match an empty string */
1607 {
1608 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1609 RRETURN(rrc);
1610 }
1611 ecode = prev;
1612 goto TAIL_RECURSE;
1613 }
1614 else /* OP_KETRMAX */
1615 {
1616 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1617 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1618 ecode += 1 + LINK_SIZE;
1619 flags = 0;
1620 goto TAIL_RECURSE;
1621 }
1622 /* Control never gets here */
1623
1624 /* Start of subject unless notbol, or after internal newline if multiline */
1625
1626 case OP_CIRC:
1627 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1628 if ((ims & PCRE_MULTILINE) != 0)
1629 {
1630 if (eptr != md->start_subject &&
1631 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1632 MRRETURN(MATCH_NOMATCH);
1633 ecode++;
1634 break;
1635 }
1636 /* ... else fall through */
1637
1638 /* Start of subject assertion */
1639
1640 case OP_SOD:
1641 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1642 ecode++;
1643 break;
1644
1645 /* Start of match assertion */
1646
1647 case OP_SOM:
1648 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1649 ecode++;
1650 break;
1651
1652 /* Reset the start of match point */
1653
1654 case OP_SET_SOM:
1655 mstart = eptr;
1656 ecode++;
1657 break;
1658
1659 /* Assert before internal newline if multiline, or before a terminating
1660 newline unless endonly is set, else end of subject unless noteol is set. */
1661
1662 case OP_DOLL:
1663 if ((ims & PCRE_MULTILINE) != 0)
1664 {
1665 if (eptr < md->end_subject)
1666 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1667 else
1668 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1669 ecode++;
1670 break;
1671 }
1672 else
1673 {
1674 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1675 if (!md->endonly)
1676 {
1677 if (eptr != md->end_subject &&
1678 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1679 MRRETURN(MATCH_NOMATCH);
1680 ecode++;
1681 break;
1682 }
1683 }
1684 /* ... else fall through for endonly */
1685
1686 /* End of subject assertion (\z) */
1687
1688 case OP_EOD:
1689 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1690 ecode++;
1691 break;
1692
1693 /* End of subject or ending \n assertion (\Z) */
1694
1695 case OP_EODN:
1696 if (eptr != md->end_subject &&
1697 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1698 MRRETURN(MATCH_NOMATCH);
1699 ecode++;
1700 break;
1701
1702 /* Word boundary assertions */
1703
1704 case OP_NOT_WORD_BOUNDARY:
1705 case OP_WORD_BOUNDARY:
1706 {
1707
1708 /* Find out if the previous and current characters are "word" characters.
1709 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1710 be "non-word" characters. Remember the earliest consulted character for
1711 partial matching. */
1712
1713 #ifdef SUPPORT_UTF8
1714 if (utf8)
1715 {
1716 if (eptr == md->start_subject) prev_is_word = FALSE; else
1717 {
1718 USPTR lastptr = eptr - 1;
1719 while((*lastptr & 0xc0) == 0x80) lastptr--;
1720 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1721 GETCHAR(c, lastptr);
1722 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1723 }
1724 if (eptr >= md->end_subject)
1725 {
1726 SCHECK_PARTIAL();
1727 cur_is_word = FALSE;
1728 }
1729 else
1730 {
1731 GETCHAR(c, eptr);
1732 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1733 }
1734 }
1735 else
1736 #endif
1737
1738 /* Not in UTF-8 mode */
1739
1740 {
1741 if (eptr == md->start_subject) prev_is_word = FALSE; else
1742 {
1743 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1744 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1745 }
1746 if (eptr >= md->end_subject)
1747 {
1748 SCHECK_PARTIAL();
1749 cur_is_word = FALSE;
1750 }
1751 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1752 }
1753
1754 /* Now see if the situation is what we want */
1755
1756 if ((*ecode++ == OP_WORD_BOUNDARY)?
1757 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1758 MRRETURN(MATCH_NOMATCH);
1759 }
1760 break;
1761
1762 /* Match a single character type; inline for speed */
1763
1764 case OP_ANY:
1765 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1766 /* Fall through */
1767
1768 case OP_ALLANY:
1769 if (eptr++ >= md->end_subject)
1770 {
1771 SCHECK_PARTIAL();
1772 MRRETURN(MATCH_NOMATCH);
1773 }
1774 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1775 ecode++;
1776 break;
1777
1778 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1779 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1780
1781 case OP_ANYBYTE:
1782 if (eptr++ >= md->end_subject)
1783 {
1784 SCHECK_PARTIAL();
1785 MRRETURN(MATCH_NOMATCH);
1786 }
1787 ecode++;
1788 break;
1789
1790 case OP_NOT_DIGIT:
1791 if (eptr >= md->end_subject)
1792 {
1793 SCHECK_PARTIAL();
1794 MRRETURN(MATCH_NOMATCH);
1795 }
1796 GETCHARINCTEST(c, eptr);
1797 if (
1798 #ifdef SUPPORT_UTF8
1799 c < 256 &&
1800 #endif
1801 (md->ctypes[c] & ctype_digit) != 0
1802 )
1803 MRRETURN(MATCH_NOMATCH);
1804 ecode++;
1805 break;
1806
1807 case OP_DIGIT:
1808 if (eptr >= md->end_subject)
1809 {
1810 SCHECK_PARTIAL();
1811 MRRETURN(MATCH_NOMATCH);
1812 }
1813 GETCHARINCTEST(c, eptr);
1814 if (
1815 #ifdef SUPPORT_UTF8
1816 c >= 256 ||
1817 #endif
1818 (md->ctypes[c] & ctype_digit) == 0
1819 )
1820 MRRETURN(MATCH_NOMATCH);
1821 ecode++;
1822 break;
1823
1824 case OP_NOT_WHITESPACE:
1825 if (eptr >= md->end_subject)
1826 {
1827 SCHECK_PARTIAL();
1828 MRRETURN(MATCH_NOMATCH);
1829 }
1830 GETCHARINCTEST(c, eptr);
1831 if (
1832 #ifdef SUPPORT_UTF8
1833 c < 256 &&
1834 #endif
1835 (md->ctypes[c] & ctype_space) != 0
1836 )
1837 MRRETURN(MATCH_NOMATCH);
1838 ecode++;
1839 break;
1840
1841 case OP_WHITESPACE:
1842 if (eptr >= md->end_subject)
1843 {
1844 SCHECK_PARTIAL();
1845 MRRETURN(MATCH_NOMATCH);
1846 }
1847 GETCHARINCTEST(c, eptr);
1848 if (
1849 #ifdef SUPPORT_UTF8
1850 c >= 256 ||
1851 #endif
1852 (md->ctypes[c] & ctype_space) == 0
1853 )
1854 MRRETURN(MATCH_NOMATCH);
1855 ecode++;
1856 break;
1857
1858 case OP_NOT_WORDCHAR:
1859 if (eptr >= md->end_subject)
1860 {
1861 SCHECK_PARTIAL();
1862 MRRETURN(MATCH_NOMATCH);
1863 }
1864 GETCHARINCTEST(c, eptr);
1865 if (
1866 #ifdef SUPPORT_UTF8
1867 c < 256 &&
1868 #endif
1869 (md->ctypes[c] & ctype_word) != 0
1870 )
1871 MRRETURN(MATCH_NOMATCH);
1872 ecode++;
1873 break;
1874
1875 case OP_WORDCHAR:
1876 if (eptr >= md->end_subject)
1877 {
1878 SCHECK_PARTIAL();
1879 MRRETURN(MATCH_NOMATCH);
1880 }
1881 GETCHARINCTEST(c, eptr);
1882 if (
1883 #ifdef SUPPORT_UTF8
1884 c >= 256 ||
1885 #endif
1886 (md->ctypes[c] & ctype_word) == 0
1887 )
1888 MRRETURN(MATCH_NOMATCH);
1889 ecode++;
1890 break;
1891
1892 case OP_ANYNL:
1893 if (eptr >= md->end_subject)
1894 {
1895 SCHECK_PARTIAL();
1896 MRRETURN(MATCH_NOMATCH);
1897 }
1898 GETCHARINCTEST(c, eptr);
1899 switch(c)
1900 {
1901 default: MRRETURN(MATCH_NOMATCH);
1902 case 0x000d:
1903 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1904 break;
1905
1906 case 0x000a:
1907 break;
1908
1909 case 0x000b:
1910 case 0x000c:
1911 case 0x0085:
1912 case 0x2028:
1913 case 0x2029:
1914 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1915 break;
1916 }
1917 ecode++;
1918 break;
1919
1920 case OP_NOT_HSPACE:
1921 if (eptr >= md->end_subject)
1922 {
1923 SCHECK_PARTIAL();
1924 MRRETURN(MATCH_NOMATCH);
1925 }
1926 GETCHARINCTEST(c, eptr);
1927 switch(c)
1928 {
1929 default: break;
1930 case 0x09: /* HT */
1931 case 0x20: /* SPACE */
1932 case 0xa0: /* NBSP */
1933 case 0x1680: /* OGHAM SPACE MARK */
1934 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1935 case 0x2000: /* EN QUAD */
1936 case 0x2001: /* EM QUAD */
1937 case 0x2002: /* EN SPACE */
1938 case 0x2003: /* EM SPACE */
1939 case 0x2004: /* THREE-PER-EM SPACE */
1940 case 0x2005: /* FOUR-PER-EM SPACE */
1941 case 0x2006: /* SIX-PER-EM SPACE */
1942 case 0x2007: /* FIGURE SPACE */
1943 case 0x2008: /* PUNCTUATION SPACE */
1944 case 0x2009: /* THIN SPACE */
1945 case 0x200A: /* HAIR SPACE */
1946 case 0x202f: /* NARROW NO-BREAK SPACE */
1947 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1948 case 0x3000: /* IDEOGRAPHIC SPACE */
1949 MRRETURN(MATCH_NOMATCH);
1950 }
1951 ecode++;
1952 break;
1953
1954 case OP_HSPACE:
1955 if (eptr >= md->end_subject)
1956 {
1957 SCHECK_PARTIAL();
1958 MRRETURN(MATCH_NOMATCH);
1959 }
1960 GETCHARINCTEST(c, eptr);
1961 switch(c)
1962 {
1963 default: MRRETURN(MATCH_NOMATCH);
1964 case 0x09: /* HT */
1965 case 0x20: /* SPACE */
1966 case 0xa0: /* NBSP */
1967 case 0x1680: /* OGHAM SPACE MARK */
1968 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1969 case 0x2000: /* EN QUAD */
1970 case 0x2001: /* EM QUAD */
1971 case 0x2002: /* EN SPACE */
1972 case 0x2003: /* EM SPACE */
1973 case 0x2004: /* THREE-PER-EM SPACE */
1974 case 0x2005: /* FOUR-PER-EM SPACE */
1975 case 0x2006: /* SIX-PER-EM SPACE */
1976 case 0x2007: /* FIGURE SPACE */
1977 case 0x2008: /* PUNCTUATION SPACE */
1978 case 0x2009: /* THIN SPACE */
1979 case 0x200A: /* HAIR SPACE */
1980 case 0x202f: /* NARROW NO-BREAK SPACE */
1981 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1982 case 0x3000: /* IDEOGRAPHIC SPACE */
1983 break;
1984 }
1985 ecode++;
1986 break;
1987
1988 case OP_NOT_VSPACE:
1989 if (eptr >= md->end_subject)
1990 {
1991 SCHECK_PARTIAL();
1992 MRRETURN(MATCH_NOMATCH);
1993 }
1994 GETCHARINCTEST(c, eptr);
1995 switch(c)
1996 {
1997 default: break;
1998 case 0x0a: /* LF */
1999 case 0x0b: /* VT */
2000 case 0x0c: /* FF */
2001 case 0x0d: /* CR */
2002 case 0x85: /* NEL */
2003 case 0x2028: /* LINE SEPARATOR */
2004 case 0x2029: /* PARAGRAPH SEPARATOR */
2005 MRRETURN(MATCH_NOMATCH);
2006 }
2007 ecode++;
2008 break;
2009
2010 case OP_VSPACE:
2011 if (eptr >= md->end_subject)
2012 {
2013 SCHECK_PARTIAL();
2014 MRRETURN(MATCH_NOMATCH);
2015 }
2016 GETCHARINCTEST(c, eptr);
2017 switch(c)
2018 {
2019 default: MRRETURN(MATCH_NOMATCH);
2020 case 0x0a: /* LF */
2021 case 0x0b: /* VT */
2022 case 0x0c: /* FF */
2023 case 0x0d: /* CR */
2024 case 0x85: /* NEL */
2025 case 0x2028: /* LINE SEPARATOR */
2026 case 0x2029: /* PARAGRAPH SEPARATOR */
2027 break;
2028 }
2029 ecode++;
2030 break;
2031
2032 #ifdef SUPPORT_UCP
2033 /* Check the next character by Unicode property. We will get here only
2034 if the support is in the binary; otherwise a compile-time error occurs. */
2035
2036 case OP_PROP:
2037 case OP_NOTPROP:
2038 if (eptr >= md->end_subject)
2039 {
2040 SCHECK_PARTIAL();
2041 MRRETURN(MATCH_NOMATCH);
2042 }
2043 GETCHARINCTEST(c, eptr);
2044 {
2045 const ucd_record *prop = GET_UCD(c);
2046
2047 switch(ecode[1])
2048 {
2049 case PT_ANY:
2050 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2051 break;
2052
2053 case PT_LAMP:
2054 if ((prop->chartype == ucp_Lu ||
2055 prop->chartype == ucp_Ll ||
2056 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2057 MRRETURN(MATCH_NOMATCH);
2058 break;
2059
2060 case PT_GC:
2061 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2062 MRRETURN(MATCH_NOMATCH);
2063 break;
2064
2065 case PT_PC:
2066 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2067 MRRETURN(MATCH_NOMATCH);
2068 break;
2069
2070 case PT_SC:
2071 if ((ecode[2] != prop->script) == (op == OP_PROP))
2072 MRRETURN(MATCH_NOMATCH);
2073 break;
2074
2075 default:
2076 RRETURN(PCRE_ERROR_INTERNAL);
2077 }
2078
2079 ecode += 3;
2080 }
2081 break;
2082
2083 /* Match an extended Unicode sequence. We will get here only if the support
2084 is in the binary; otherwise a compile-time error occurs. */
2085
2086 case OP_EXTUNI:
2087 if (eptr >= md->end_subject)
2088 {
2089 SCHECK_PARTIAL();
2090 MRRETURN(MATCH_NOMATCH);
2091 }
2092 GETCHARINCTEST(c, eptr);
2093 {
2094 int category = UCD_CATEGORY(c);
2095 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2096 while (eptr < md->end_subject)
2097 {
2098 int len = 1;
2099 if (!utf8) c = *eptr; else
2100 {
2101 GETCHARLEN(c, eptr, len);
2102 }
2103 category = UCD_CATEGORY(c);
2104 if (category != ucp_M) break;
2105 eptr += len;
2106 }
2107 }
2108 ecode++;
2109 break;
2110 #endif
2111
2112
2113 /* Match a back reference, possibly repeatedly. Look past the end of the
2114 item to see if there is repeat information following. The code is similar
2115 to that for character classes, but repeated for efficiency. Then obey
2116 similar code to character type repeats - written out again for speed.
2117 However, if the referenced string is the empty string, always treat
2118 it as matched, any number of times (otherwise there could be infinite
2119 loops). */
2120
2121 case OP_REF:
2122 {
2123 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2124 ecode += 3;
2125
2126 /* If the reference is unset, there are two possibilities:
2127
2128 (a) In the default, Perl-compatible state, set the length to be longer
2129 than the amount of subject left; this ensures that every attempt at a
2130 match fails. We can't just fail here, because of the possibility of
2131 quantifiers with zero minima.
2132
2133 (b) If the JavaScript compatibility flag is set, set the length to zero
2134 so that the back reference matches an empty string.
2135
2136 Otherwise, set the length to the length of what was matched by the
2137 referenced subpattern. */
2138
2139 if (offset >= offset_top || md->offset_vector[offset] < 0)
2140 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2141 else
2142 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2143
2144 /* Set up for repetition, or handle the non-repeated case */
2145
2146 switch (*ecode)
2147 {
2148 case OP_CRSTAR:
2149 case OP_CRMINSTAR:
2150 case OP_CRPLUS:
2151 case OP_CRMINPLUS:
2152 case OP_CRQUERY:
2153 case OP_CRMINQUERY:
2154 c = *ecode++ - OP_CRSTAR;
2155 minimize = (c & 1) != 0;
2156 min = rep_min[c]; /* Pick up values from tables; */
2157 max = rep_max[c]; /* zero for max => infinity */
2158 if (max == 0) max = INT_MAX;
2159 break;
2160
2161 case OP_CRRANGE:
2162 case OP_CRMINRANGE:
2163 minimize = (*ecode == OP_CRMINRANGE);
2164 min = GET2(ecode, 1);
2165 max = GET2(ecode, 3);
2166 if (max == 0) max = INT_MAX;
2167 ecode += 5;
2168 break;
2169
2170 default: /* No repeat follows */
2171 if (!match_ref(offset, eptr, length, md, ims))
2172 {
2173 CHECK_PARTIAL();
2174 MRRETURN(MATCH_NOMATCH);
2175 }
2176 eptr += length;
2177 continue; /* With the main loop */
2178 }
2179
2180 /* If the length of the reference is zero, just continue with the
2181 main loop. */
2182
2183 if (length == 0) continue;
2184
2185 /* First, ensure the minimum number of matches are present. We get back
2186 the length of the reference string explicitly rather than passing the
2187 address of eptr, so that eptr can be a register variable. */
2188
2189 for (i = 1; i <= min; i++)
2190 {
2191 if (!match_ref(offset, eptr, length, md, ims))
2192 {
2193 CHECK_PARTIAL();
2194 MRRETURN(MATCH_NOMATCH);
2195 }
2196 eptr += length;
2197 }
2198
2199 /* If min = max, continue at the same level without recursion.
2200 They are not both allowed to be zero. */
2201
2202 if (min == max) continue;
2203
2204 /* If minimizing, keep trying and advancing the pointer */
2205
2206 if (minimize)
2207 {
2208 for (fi = min;; fi++)
2209 {
2210 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2211 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2212 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2213 if (!match_ref(offset, eptr, length, md, ims))
2214 {
2215 CHECK_PARTIAL();
2216 MRRETURN(MATCH_NOMATCH);
2217 }
2218 eptr += length;
2219 }
2220 /* Control never gets here */
2221 }
2222
2223 /* If maximizing, find the longest string and work backwards */
2224
2225 else
2226 {
2227 pp = eptr;
2228 for (i = min; i < max; i++)
2229 {
2230 if (!match_ref(offset, eptr, length, md, ims))
2231 {
2232 CHECK_PARTIAL();
2233 break;
2234 }
2235 eptr += length;
2236 }
2237 while (eptr >= pp)
2238 {
2239 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2240 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2241 eptr -= length;
2242 }
2243 MRRETURN(MATCH_NOMATCH);
2244 }
2245 }
2246 /* Control never gets here */
2247
2248 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2249 used when all the characters in the class have values in the range 0-255,
2250 and either the matching is caseful, or the characters are in the range
2251 0-127 when UTF-8 processing is enabled. The only difference between
2252 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2253 encountered.
2254
2255 First, look past the end of the item to see if there is repeat information
2256 following. Then obey similar code to character type repeats - written out
2257 again for speed. */
2258
2259 case OP_NCLASS:
2260 case OP_CLASS:
2261 {
2262 data = ecode + 1; /* Save for matching */
2263 ecode += 33; /* Advance past the item */
2264
2265 switch (*ecode)
2266 {
2267 case OP_CRSTAR:
2268 case OP_CRMINSTAR:
2269 case OP_CRPLUS:
2270 case OP_CRMINPLUS:
2271 case OP_CRQUERY:
2272 case OP_CRMINQUERY:
2273 c = *ecode++ - OP_CRSTAR;
2274 minimize = (c & 1) != 0;
2275 min = rep_min[c]; /* Pick up values from tables; */
2276 max = rep_max[c]; /* zero for max => infinity */
2277 if (max == 0) max = INT_MAX;
2278 break;
2279
2280 case OP_CRRANGE:
2281 case OP_CRMINRANGE:
2282 minimize = (*ecode == OP_CRMINRANGE);
2283 min = GET2(ecode, 1);
2284 max = GET2(ecode, 3);
2285 if (max == 0) max = INT_MAX;
2286 ecode += 5;
2287 break;
2288
2289 default: /* No repeat follows */
2290 min = max = 1;
2291 break;
2292 }
2293
2294 /* First, ensure the minimum number of matches are present. */
2295
2296 #ifdef SUPPORT_UTF8
2297 /* UTF-8 mode */
2298 if (utf8)
2299 {
2300 for (i = 1; i <= min; i++)
2301 {
2302 if (eptr >= md->end_subject)
2303 {
2304 SCHECK_PARTIAL();
2305 MRRETURN(MATCH_NOMATCH);
2306 }
2307 GETCHARINC(c, eptr);
2308 if (c > 255)
2309 {
2310 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2311 }
2312 else
2313 {
2314 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2315 }
2316 }
2317 }
2318 else
2319 #endif
2320 /* Not UTF-8 mode */
2321 {
2322 for (i = 1; i <= min; i++)
2323 {
2324 if (eptr >= md->end_subject)
2325 {
2326 SCHECK_PARTIAL();
2327 MRRETURN(MATCH_NOMATCH);
2328 }
2329 c = *eptr++;
2330 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2331 }
2332 }
2333
2334 /* If max == min we can continue with the main loop without the
2335 need to recurse. */
2336
2337 if (min == max) continue;
2338
2339 /* If minimizing, keep testing the rest of the expression and advancing
2340 the pointer while it matches the class. */
2341
2342 if (minimize)
2343 {
2344 #ifdef SUPPORT_UTF8
2345 /* UTF-8 mode */
2346 if (utf8)
2347 {
2348 for (fi = min;; fi++)
2349 {
2350 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2351 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2352 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2353 if (eptr >= md->end_subject)
2354 {
2355 SCHECK_PARTIAL();
2356 MRRETURN(MATCH_NOMATCH);
2357 }
2358 GETCHARINC(c, eptr);
2359 if (c > 255)
2360 {
2361 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2362 }
2363 else
2364 {
2365 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2366 }
2367 }
2368 }
2369 else
2370 #endif
2371 /* Not UTF-8 mode */
2372 {
2373 for (fi = min;; fi++)
2374 {
2375 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2376 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2377 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2378 if (eptr >= md->end_subject)
2379 {
2380 SCHECK_PARTIAL();
2381 MRRETURN(MATCH_NOMATCH);
2382 }
2383 c = *eptr++;
2384 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2385 }
2386 }
2387 /* Control never gets here */
2388 }
2389
2390 /* If maximizing, find the longest possible run, then work backwards. */
2391
2392 else
2393 {
2394 pp = eptr;
2395
2396 #ifdef SUPPORT_UTF8
2397 /* UTF-8 mode */
2398 if (utf8)
2399 {
2400 for (i = min; i < max; i++)
2401 {
2402 int len = 1;
2403 if (eptr >= md->end_subject)
2404 {
2405 SCHECK_PARTIAL();
2406 break;
2407 }
2408 GETCHARLEN(c, eptr, len);
2409 if (c > 255)
2410 {
2411 if (op == OP_CLASS) break;
2412 }
2413 else
2414 {
2415 if ((data[c/8] & (1 << (c&7))) == 0) break;
2416 }
2417 eptr += len;
2418 }
2419 for (;;)
2420 {
2421 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2423 if (eptr-- == pp) break; /* Stop if tried at original pos */
2424 BACKCHAR(eptr);
2425 }
2426 }
2427 else
2428 #endif
2429 /* Not UTF-8 mode */
2430 {
2431 for (i = min; i < max; i++)
2432 {
2433 if (eptr >= md->end_subject)
2434 {
2435 SCHECK_PARTIAL();
2436 break;
2437 }
2438 c = *eptr;
2439 if ((data[c/8] & (1 << (c&7))) == 0) break;
2440 eptr++;
2441 }
2442 while (eptr >= pp)
2443 {
2444 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2446 eptr--;
2447 }
2448 }
2449
2450 MRRETURN(MATCH_NOMATCH);
2451 }
2452 }
2453 /* Control never gets here */
2454
2455
2456 /* Match an extended character class. This opcode is encountered only
2457 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2458 mode, because Unicode properties are supported in non-UTF-8 mode. */
2459
2460 #ifdef SUPPORT_UTF8
2461 case OP_XCLASS:
2462 {
2463 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2464 ecode += GET(ecode, 1); /* Advance past the item */
2465
2466 switch (*ecode)
2467 {
2468 case OP_CRSTAR:
2469 case OP_CRMINSTAR:
2470 case OP_CRPLUS:
2471 case OP_CRMINPLUS:
2472 case OP_CRQUERY:
2473 case OP_CRMINQUERY:
2474 c = *ecode++ - OP_CRSTAR;
2475 minimize = (c & 1) != 0;
2476 min = rep_min[c]; /* Pick up values from tables; */
2477 max = rep_max[c]; /* zero for max => infinity */
2478 if (max == 0) max = INT_MAX;
2479 break;
2480
2481 case OP_CRRANGE:
2482 case OP_CRMINRANGE:
2483 minimize = (*ecode == OP_CRMINRANGE);
2484 min = GET2(ecode, 1);
2485 max = GET2(ecode, 3);
2486 if (max == 0) max = INT_MAX;
2487 ecode += 5;
2488 break;
2489
2490 default: /* No repeat follows */
2491 min = max = 1;
2492 break;
2493 }
2494
2495 /* First, ensure the minimum number of matches are present. */
2496
2497 for (i = 1; i <= min; i++)
2498 {
2499 if (eptr >= md->end_subject)
2500 {
2501 SCHECK_PARTIAL();
2502 MRRETURN(MATCH_NOMATCH);
2503 }
2504 GETCHARINCTEST(c, eptr);
2505 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2506 }
2507
2508 /* If max == min we can continue with the main loop without the
2509 need to recurse. */
2510
2511 if (min == max) continue;
2512
2513 /* If minimizing, keep testing the rest of the expression and advancing
2514 the pointer while it matches the class. */
2515
2516 if (minimize)
2517 {
2518 for (fi = min;; fi++)
2519 {
2520 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2521 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2522 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2523 if (eptr >= md->end_subject)
2524 {
2525 SCHECK_PARTIAL();
2526 MRRETURN(MATCH_NOMATCH);
2527 }
2528 GETCHARINCTEST(c, eptr);
2529 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2530 }
2531 /* Control never gets here */
2532 }
2533
2534 /* If maximizing, find the longest possible run, then work backwards. */
2535
2536 else
2537 {
2538 pp = eptr;
2539 for (i = min; i < max; i++)
2540 {
2541 int len = 1;
2542 if (eptr >= md->end_subject)
2543 {
2544 SCHECK_PARTIAL();
2545 break;
2546 }
2547 GETCHARLENTEST(c, eptr, len);
2548 if (!_pcre_xclass(c, data)) break;
2549 eptr += len;
2550 }
2551 for(;;)
2552 {
2553 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2554 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2555 if (eptr-- == pp) break; /* Stop if tried at original pos */
2556 if (utf8) BACKCHAR(eptr);
2557 }
2558 MRRETURN(MATCH_NOMATCH);
2559 }
2560
2561 /* Control never gets here */
2562 }
2563 #endif /* End of XCLASS */
2564
2565 /* Match a single character, casefully */
2566
2567 case OP_CHAR:
2568 #ifdef SUPPORT_UTF8
2569 if (utf8)
2570 {
2571 length = 1;
2572 ecode++;
2573 GETCHARLEN(fc, ecode, length);
2574 if (length > md->end_subject - eptr)
2575 {
2576 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2577 MRRETURN(MATCH_NOMATCH);
2578 }
2579 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2580 }
2581 else
2582 #endif
2583
2584 /* Non-UTF-8 mode */
2585 {
2586 if (md->end_subject - eptr < 1)
2587 {
2588 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2589 MRRETURN(MATCH_NOMATCH);
2590 }
2591 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2592 ecode += 2;
2593 }
2594 break;
2595
2596 /* Match a single character, caselessly */
2597
2598 case OP_CHARNC:
2599 #ifdef SUPPORT_UTF8
2600 if (utf8)
2601 {
2602 length = 1;
2603 ecode++;
2604 GETCHARLEN(fc, ecode, length);
2605
2606 if (length > md->end_subject - eptr)
2607 {
2608 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2609 MRRETURN(MATCH_NOMATCH);
2610 }
2611
2612 /* If the pattern character's value is < 128, we have only one byte, and
2613 can use the fast lookup table. */
2614
2615 if (fc < 128)
2616 {
2617 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2618 }
2619
2620 /* Otherwise we must pick up the subject character */
2621
2622 else
2623 {
2624 unsigned int dc;
2625 GETCHARINC(dc, eptr);
2626 ecode += length;
2627
2628 /* If we have Unicode property support, we can use it to test the other
2629 case of the character, if there is one. */
2630
2631 if (fc != dc)
2632 {
2633 #ifdef SUPPORT_UCP
2634 if (dc != UCD_OTHERCASE(fc))
2635 #endif
2636 MRRETURN(MATCH_NOMATCH);
2637 }
2638 }
2639 }
2640 else
2641 #endif /* SUPPORT_UTF8 */
2642
2643 /* Non-UTF-8 mode */
2644 {
2645 if (md->end_subject - eptr < 1)
2646 {
2647 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2648 MRRETURN(MATCH_NOMATCH);
2649 }
2650 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2651 ecode += 2;
2652 }
2653 break;
2654
2655 /* Match a single character repeatedly. */
2656
2657 case OP_EXACT:
2658 min = max = GET2(ecode, 1);
2659 ecode += 3;
2660 goto REPEATCHAR;
2661
2662 case OP_POSUPTO:
2663 possessive = TRUE;
2664 /* Fall through */
2665
2666 case OP_UPTO:
2667 case OP_MINUPTO:
2668 min = 0;
2669 max = GET2(ecode, 1);
2670 minimize = *ecode == OP_MINUPTO;
2671 ecode += 3;
2672 goto REPEATCHAR;
2673
2674 case OP_POSSTAR:
2675 possessive = TRUE;
2676 min = 0;
2677 max = INT_MAX;
2678 ecode++;
2679 goto REPEATCHAR;
2680
2681 case OP_POSPLUS:
2682 possessive = TRUE;
2683 min = 1;
2684 max = INT_MAX;
2685 ecode++;
2686 goto REPEATCHAR;
2687
2688 case OP_POSQUERY:
2689 possessive = TRUE;
2690 min = 0;
2691 max = 1;
2692 ecode++;
2693 goto REPEATCHAR;
2694
2695 case OP_STAR:
2696 case OP_MINSTAR:
2697 case OP_PLUS:
2698 case OP_MINPLUS:
2699 case OP_QUERY:
2700 case OP_MINQUERY:
2701 c = *ecode++ - OP_STAR;
2702 minimize = (c & 1) != 0;
2703
2704 min = rep_min[c]; /* Pick up values from tables; */
2705 max = rep_max[c]; /* zero for max => infinity */
2706 if (max == 0) max = INT_MAX;
2707
2708 /* Common code for all repeated single-character matches. */
2709
2710 REPEATCHAR:
2711 #ifdef SUPPORT_UTF8
2712 if (utf8)
2713 {
2714 length = 1;
2715 charptr = ecode;
2716 GETCHARLEN(fc, ecode, length);
2717 ecode += length;
2718
2719 /* Handle multibyte character matching specially here. There is
2720 support for caseless matching if UCP support is present. */
2721
2722 if (length > 1)
2723 {
2724 #ifdef SUPPORT_UCP
2725 unsigned int othercase;
2726 if ((ims & PCRE_CASELESS) != 0 &&
2727 (othercase = UCD_OTHERCASE(fc)) != fc)
2728 oclength = _pcre_ord2utf8(othercase, occhars);
2729 else oclength = 0;
2730 #endif /* SUPPORT_UCP */
2731
2732 for (i = 1; i <= min; i++)
2733 {
2734 if (eptr <= md->end_subject - length &&
2735 memcmp(eptr, charptr, length) == 0) eptr += length;
2736 #ifdef SUPPORT_UCP
2737 else if (oclength > 0 &&
2738 eptr <= md->end_subject - oclength &&
2739 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2740 #endif /* SUPPORT_UCP */
2741 else
2742 {
2743 CHECK_PARTIAL();
2744 MRRETURN(MATCH_NOMATCH);
2745 }
2746 }
2747
2748 if (min == max) continue;
2749
2750 if (minimize)
2751 {
2752 for (fi = min;; fi++)
2753 {
2754 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2756 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2757 if (eptr <= md->end_subject - length &&
2758 memcmp(eptr, charptr, length) == 0) eptr += length;
2759 #ifdef SUPPORT_UCP
2760 else if (oclength > 0 &&
2761 eptr <= md->end_subject - oclength &&
2762 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2763 #endif /* SUPPORT_UCP */
2764 else
2765 {
2766 CHECK_PARTIAL();
2767 MRRETURN(MATCH_NOMATCH);
2768 }
2769 }
2770 /* Control never gets here */
2771 }
2772
2773 else /* Maximize */
2774 {
2775 pp = eptr;
2776 for (i = min; i < max; i++)
2777 {
2778 if (eptr <= md->end_subject - length &&
2779 memcmp(eptr, charptr, length) == 0) eptr += length;
2780 #ifdef SUPPORT_UCP
2781 else if (oclength > 0 &&
2782 eptr <= md->end_subject - oclength &&
2783 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2784 #endif /* SUPPORT_UCP */
2785 else
2786 {
2787 CHECK_PARTIAL();
2788 break;
2789 }
2790 }
2791
2792 if (possessive) continue;
2793
2794 for(;;)
2795 {
2796 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2798 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2799 #ifdef SUPPORT_UCP
2800 eptr--;
2801 BACKCHAR(eptr);
2802 #else /* without SUPPORT_UCP */
2803 eptr -= length;
2804 #endif /* SUPPORT_UCP */
2805 }
2806 }
2807 /* Control never gets here */
2808 }
2809
2810 /* If the length of a UTF-8 character is 1, we fall through here, and
2811 obey the code as for non-UTF-8 characters below, though in this case the
2812 value of fc will always be < 128. */
2813 }
2814 else
2815 #endif /* SUPPORT_UTF8 */
2816
2817 /* When not in UTF-8 mode, load a single-byte character. */
2818
2819 fc = *ecode++;
2820
2821 /* The value of fc at this point is always less than 256, though we may or
2822 may not be in UTF-8 mode. The code is duplicated for the caseless and
2823 caseful cases, for speed, since matching characters is likely to be quite
2824 common. First, ensure the minimum number of matches are present. If min =
2825 max, continue at the same level without recursing. Otherwise, if
2826 minimizing, keep trying the rest of the expression and advancing one
2827 matching character if failing, up to the maximum. Alternatively, if
2828 maximizing, find the maximum number of characters and work backwards. */
2829
2830 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2831 max, eptr));
2832
2833 if ((ims & PCRE_CASELESS) != 0)
2834 {
2835 fc = md->lcc[fc];
2836 for (i = 1; i <= min; i++)
2837 {
2838 if (eptr >= md->end_subject)
2839 {
2840 SCHECK_PARTIAL();
2841 MRRETURN(MATCH_NOMATCH);
2842 }
2843 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2844 }
2845 if (min == max) continue;
2846 if (minimize)
2847 {
2848 for (fi = min;; fi++)
2849 {
2850 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2852 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2853 if (eptr >= md->end_subject)
2854 {
2855 SCHECK_PARTIAL();
2856 MRRETURN(MATCH_NOMATCH);
2857 }
2858 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2859 }
2860 /* Control never gets here */
2861 }
2862 else /* Maximize */
2863 {
2864 pp = eptr;
2865 for (i = min; i < max; i++)
2866 {
2867 if (eptr >= md->end_subject)
2868 {
2869 SCHECK_PARTIAL();
2870 break;
2871 }
2872 if (fc != md->lcc[*eptr]) break;
2873 eptr++;
2874 }
2875
2876 if (possessive) continue;
2877
2878 while (eptr >= pp)
2879 {
2880 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2881 eptr--;
2882 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2883 }
2884 MRRETURN(MATCH_NOMATCH);
2885 }
2886 /* Control never gets here */
2887 }
2888
2889 /* Caseful comparisons (includes all multi-byte characters) */
2890
2891 else
2892 {
2893 for (i = 1; i <= min; i++)
2894 {
2895 if (eptr >= md->end_subject)
2896 {
2897 SCHECK_PARTIAL();
2898 MRRETURN(MATCH_NOMATCH);
2899 }
2900 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2901 }
2902
2903 if (min == max) continue;
2904
2905 if (minimize)
2906 {
2907 for (fi = min;; fi++)
2908 {
2909 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2911 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2912 if (eptr >= md->end_subject)
2913 {
2914 SCHECK_PARTIAL();
2915 MRRETURN(MATCH_NOMATCH);
2916 }
2917 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2918 }
2919 /* Control never gets here */
2920 }
2921 else /* Maximize */
2922 {
2923 pp = eptr;
2924 for (i = min; i < max; i++)
2925 {
2926 if (eptr >= md->end_subject)
2927 {
2928 SCHECK_PARTIAL();
2929 break;
2930 }
2931 if (fc != *eptr) break;
2932 eptr++;
2933 }
2934 if (possessive) continue;
2935
2936 while (eptr >= pp)
2937 {
2938 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2939 eptr--;
2940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2941 }
2942 MRRETURN(MATCH_NOMATCH);
2943 }
2944 }
2945 /* Control never gets here */
2946
2947 /* Match a negated single one-byte character. The subject character we
2948 are checking can be multibyte. */
2949
2950 case OP_NOT:
2951 if (eptr >= md->end_subject)
2952 {
2953 SCHECK_PARTIAL();
2954 MRRETURN(MATCH_NOMATCH);
2955 }
2956 ecode++;
2957 GETCHARINCTEST(c, eptr);
2958 if ((ims & PCRE_CASELESS) != 0)
2959 {
2960 #ifdef SUPPORT_UTF8
2961 if (c < 256)
2962 #endif
2963 c = md->lcc[c];
2964 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
2965 }
2966 else
2967 {
2968 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
2969 }
2970 break;
2971
2972 /* Match a negated single one-byte character repeatedly. This is almost a
2973 repeat of the code for a repeated single character, but I haven't found a
2974 nice way of commoning these up that doesn't require a test of the
2975 positive/negative option for each character match. Maybe that wouldn't add
2976 very much to the time taken, but character matching *is* what this is all
2977 about... */
2978
2979 case OP_NOTEXACT:
2980 min = max = GET2(ecode, 1);
2981 ecode += 3;
2982 goto REPEATNOTCHAR;
2983
2984 case OP_NOTUPTO:
2985 case OP_NOTMINUPTO:
2986 min = 0;
2987 max = GET2(ecode, 1);
2988 minimize = *ecode == OP_NOTMINUPTO;
2989 ecode += 3;
2990 goto REPEATNOTCHAR;
2991
2992 case OP_NOTPOSSTAR:
2993 possessive = TRUE;
2994 min = 0;
2995 max = INT_MAX;
2996 ecode++;
2997 goto REPEATNOTCHAR;
2998
2999 case OP_NOTPOSPLUS:
3000 possessive = TRUE;
3001 min = 1;
3002 max = INT_MAX;
3003 ecode++;
3004 goto REPEATNOTCHAR;
3005
3006 case OP_NOTPOSQUERY:
3007 possessive = TRUE;
3008 min = 0;
3009 max = 1;
3010 ecode++;
3011 goto REPEATNOTCHAR;
3012
3013 case OP_NOTPOSUPTO:
3014 possessive = TRUE;
3015 min = 0;
3016 max = GET2(ecode, 1);
3017 ecode += 3;
3018 goto REPEATNOTCHAR;
3019
3020 case OP_NOTSTAR:
3021 case OP_NOTMINSTAR:
3022 case OP_NOTPLUS:
3023 case OP_NOTMINPLUS:
3024 case OP_NOTQUERY:
3025 case OP_NOTMINQUERY:
3026 c = *ecode++ - OP_NOTSTAR;
3027 minimize = (c & 1) != 0;
3028 min = rep_min[c]; /* Pick up values from tables; */
3029 max = rep_max[c]; /* zero for max => infinity */
3030 if (max == 0) max = INT_MAX;
3031
3032 /* Common code for all repeated single-byte matches. */
3033
3034 REPEATNOTCHAR:
3035 fc = *ecode++;
3036
3037 /* The code is duplicated for the caseless and caseful cases, for speed,
3038 since matching characters is likely to be quite common. First, ensure the
3039 minimum number of matches are present. If min = max, continue at the same
3040 level without recursing. Otherwise, if minimizing, keep trying the rest of
3041 the expression and advancing one matching character if failing, up to the
3042 maximum. Alternatively, if maximizing, find the maximum number of
3043 characters and work backwards. */
3044
3045 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3046 max, eptr));
3047
3048 if ((ims & PCRE_CASELESS) != 0)
3049 {
3050 fc = md->lcc[fc];
3051
3052 #ifdef SUPPORT_UTF8
3053 /* UTF-8 mode */
3054 if (utf8)
3055 {
3056 register unsigned int d;
3057 for (i = 1; i <= min; i++)
3058 {
3059 if (eptr >= md->end_subject)
3060 {
3061 SCHECK_PARTIAL();
3062 MRRETURN(MATCH_NOMATCH);
3063 }
3064 GETCHARINC(d, eptr);
3065 if (d < 256) d = md->lcc[d];
3066 if (fc == d) MRRETURN(MATCH_NOMATCH);
3067 }
3068 }
3069 else
3070 #endif
3071
3072 /* Not UTF-8 mode */
3073 {
3074 for (i = 1; i <= min; i++)
3075 {
3076 if (eptr >= md->end_subject)
3077 {
3078 SCHECK_PARTIAL();
3079 MRRETURN(MATCH_NOMATCH);
3080 }
3081 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3082 }
3083 }
3084
3085 if (min == max) continue;
3086
3087 if (minimize)
3088 {
3089 #ifdef SUPPORT_UTF8
3090 /* UTF-8 mode */
3091 if (utf8)
3092 {
3093 register unsigned int d;
3094 for (fi = min;; fi++)
3095 {
3096 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3097 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3098 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3099 if (eptr >= md->end_subject)
3100 {
3101 SCHECK_PARTIAL();
3102 MRRETURN(MATCH_NOMATCH);
3103 }
3104 GETCHARINC(d, eptr);
3105 if (d < 256) d = md->lcc[d];
3106 if (fc == d) MRRETURN(MATCH_NOMATCH);
3107 }
3108 }
3109 else
3110 #endif
3111 /* Not UTF-8 mode */
3112 {
3113 for (fi = min;; fi++)
3114 {
3115 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3116 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3117 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3118 if (eptr >= md->end_subject)
3119 {
3120 SCHECK_PARTIAL();
3121 MRRETURN(MATCH_NOMATCH);
3122 }
3123 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3124 }
3125 }
3126 /* Control never gets here */
3127 }
3128
3129 /* Maximize case */
3130
3131 else
3132 {
3133 pp = eptr;
3134
3135 #ifdef SUPPORT_UTF8
3136 /* UTF-8 mode */
3137 if (utf8)
3138 {
3139 register unsigned int d;
3140 for (i = min; i < max; i++)
3141 {
3142 int len = 1;
3143 if (eptr >= md->end_subject)
3144 {
3145 SCHECK_PARTIAL();
3146 break;
3147 }
3148 GETCHARLEN(d, eptr, len);
3149 if (d < 256) d = md->lcc[d];
3150 if (fc == d) break;
3151 eptr += len;
3152 }
3153 if (possessive) continue;
3154 for(;;)
3155 {
3156 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3157 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3158 if (eptr-- == pp) break; /* Stop if tried at original pos */
3159 BACKCHAR(eptr);
3160 }
3161 }
3162 else
3163 #endif
3164 /* Not UTF-8 mode */
3165 {
3166 for (i = min; i < max; i++)
3167 {
3168 if (eptr >= md->end_subject)
3169 {
3170 SCHECK_PARTIAL();
3171 break;
3172 }
3173 if (fc == md->lcc[*eptr]) break;
3174 eptr++;
3175 }
3176 if (possessive) continue;
3177 while (eptr >= pp)
3178 {
3179 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3180 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3181 eptr--;
3182 }
3183 }
3184
3185 MRRETURN(MATCH_NOMATCH);
3186 }
3187 /* Control never gets here */
3188 }
3189
3190 /* Caseful comparisons */
3191
3192 else
3193 {
3194 #ifdef SUPPORT_UTF8
3195 /* UTF-8 mode */
3196 if (utf8)
3197 {
3198 register unsigned int d;
3199 for (i = 1; i <= min; i++)
3200 {
3201 if (eptr >= md->end_subject)
3202 {
3203 SCHECK_PARTIAL();
3204 MRRETURN(MATCH_NOMATCH);
3205 }
3206 GETCHARINC(d, eptr);
3207 if (fc == d) MRRETURN(MATCH_NOMATCH);
3208 }
3209 }
3210 else
3211 #endif
3212 /* Not UTF-8 mode */
3213 {
3214 for (i = 1; i <= min; i++)
3215 {
3216 if (eptr >= md->end_subject)
3217 {
3218 SCHECK_PARTIAL();
3219 MRRETURN(MATCH_NOMATCH);
3220 }
3221 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3222 }
3223 }
3224
3225 if (min == max) continue;
3226
3227 if (minimize)
3228 {
3229 #ifdef SUPPORT_UTF8
3230 /* UTF-8 mode */
3231 if (utf8)
3232 {
3233 register unsigned int d;
3234 for (fi = min;; fi++)
3235 {
3236 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3237 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3238 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3239 if (eptr >= md->end_subject)
3240 {
3241 SCHECK_PARTIAL();
3242 MRRETURN(MATCH_NOMATCH);
3243 }
3244 GETCHARINC(d, eptr);
3245 if (fc == d) MRRETURN(MATCH_NOMATCH);
3246 }
3247 }
3248 else
3249 #endif
3250 /* Not UTF-8 mode */
3251 {
3252 for (fi = min;; fi++)
3253 {
3254 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3255 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3256 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3257 if (eptr >= md->end_subject)
3258 {
3259 SCHECK_PARTIAL();
3260 MRRETURN(MATCH_NOMATCH);
3261 }
3262 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3263 }
3264 }
3265 /* Control never gets here */
3266 }
3267
3268 /* Maximize case */
3269
3270 else
3271 {
3272 pp = eptr;
3273
3274 #ifdef SUPPORT_UTF8
3275 /* UTF-8 mode */
3276 if (utf8)
3277 {
3278 register unsigned int d;
3279 for (i = min; i < max; i++)
3280 {
3281 int len = 1;
3282 if (eptr >= md->end_subject)
3283 {
3284 SCHECK_PARTIAL();
3285 break;
3286 }
3287 GETCHARLEN(d, eptr, len);
3288 if (fc == d) break;
3289 eptr += len;
3290 }
3291 if (possessive) continue;
3292 for(;;)
3293 {
3294 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3295 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3296 if (eptr-- == pp) break; /* Stop if tried at original pos */
3297 BACKCHAR(eptr);
3298 }
3299 }
3300 else
3301 #endif
3302 /* Not UTF-8 mode */
3303 {
3304 for (i = min; i < max; i++)
3305 {
3306 if (eptr >= md->end_subject)
3307 {
3308 SCHECK_PARTIAL();
3309 break;
3310 }
3311 if (fc == *eptr) break;
3312 eptr++;
3313 }
3314 if (possessive) continue;
3315 while (eptr >= pp)
3316 {
3317 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3318 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3319 eptr--;
3320 }
3321 }
3322
3323 MRRETURN(MATCH_NOMATCH);
3324 }
3325 }
3326 /* Control never gets here */
3327
3328 /* Match a single character type repeatedly; several different opcodes
3329 share code. This is very similar to the code for single characters, but we
3330 repeat it in the interests of efficiency. */
3331
3332 case OP_TYPEEXACT:
3333 min = max = GET2(ecode, 1);
3334 minimize = TRUE;
3335 ecode += 3;
3336 goto REPEATTYPE;
3337
3338 case OP_TYPEUPTO:
3339 case OP_TYPEMINUPTO:
3340 min = 0;
3341 max = GET2(ecode, 1);
3342 minimize = *ecode == OP_TYPEMINUPTO;
3343 ecode += 3;
3344 goto REPEATTYPE;
3345
3346 case OP_TYPEPOSSTAR:
3347 possessive = TRUE;
3348 min = 0;
3349 max = INT_MAX;
3350 ecode++;
3351 goto REPEATTYPE;
3352
3353 case OP_TYPEPOSPLUS:
3354 possessive = TRUE;
3355 min = 1;
3356 max = INT_MAX;
3357 ecode++;
3358 goto REPEATTYPE;
3359
3360 case OP_TYPEPOSQUERY:
3361 possessive = TRUE;
3362 min = 0;
3363 max = 1;
3364 ecode++;
3365 goto REPEATTYPE;
3366
3367 case OP_TYPEPOSUPTO:
3368 possessive = TRUE;
3369 min = 0;
3370 max = GET2(ecode, 1);
3371 ecode += 3;
3372 goto REPEATTYPE;
3373
3374 case OP_TYPESTAR:
3375 case OP_TYPEMINSTAR:
3376 case OP_TYPEPLUS:
3377 case OP_TYPEMINPLUS:
3378 case OP_TYPEQUERY:
3379 case OP_TYPEMINQUERY:
3380 c = *ecode++ - OP_TYPESTAR;
3381 minimize = (c & 1) != 0;
3382 min = rep_min[c]; /* Pick up values from tables; */
3383 max = rep_max[c]; /* zero for max => infinity */
3384 if (max == 0) max = INT_MAX;
3385
3386 /* Common code for all repeated single character type matches. Note that
3387 in UTF-8 mode most types can match a multibyte character; only OP_DIGIT,
3388 OP_WHITESPACE, and OP_WORDCHAR are restricted to one-byte characters. */
3389
3390 REPEATTYPE:
3391 ctype = *ecode++; /* Code for the character type */
3392
3393 #ifdef SUPPORT_UCP
3394 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3395 {
3396 prop_fail_result = ctype == OP_NOTPROP;
3397 prop_type = *ecode++;
3398 prop_value = *ecode++;
3399 }
3400 else prop_type = -1;
3401 #endif
3402
3403 /* First, ensure the minimum number of matches are present. Use inline
3404 code for maximizing the speed, and do the type test once at the start
3405 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3406 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3407 and single-bytes. */
3408
3409 if (min > 0)
3410 {
3411 #ifdef SUPPORT_UCP
3412 if (prop_type >= 0)
3413 {
3414 switch(prop_type)
3415 {
3416 case PT_ANY:
3417 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3418 for (i = 1; i <= min; i++)
3419 {
3420 if (eptr >= md->end_subject)
3421 {
3422 SCHECK_PARTIAL();
3423 MRRETURN(MATCH_NOMATCH);
3424 }
3425 GETCHARINCTEST(c, eptr);
3426 }
3427 break;
3428
3429 case PT_LAMP:
3430 for (i = 1; i <= min; i++)
3431 {
3432 if (eptr >= md->end_subject)
3433 {
3434 SCHECK_PARTIAL();
3435 MRRETURN(MATCH_NOMATCH);
3436 }
3437 GETCHARINCTEST(c, eptr);
3438 prop_chartype = UCD_CHARTYPE(c);
3439 if ((prop_chartype == ucp_Lu ||
3440 prop_chartype == ucp_Ll ||
3441 prop_chartype == ucp_Lt) == prop_fail_result)
3442 MRRETURN(MATCH_NOMATCH);
3443 }
3444 break;
3445
3446 case PT_GC:
3447 for (i = 1; i <= min; i++)
3448 {
3449 if (eptr >= md->end_subject)
3450 {
3451 SCHECK_PARTIAL();
3452 MRRETURN(MATCH_NOMATCH);
3453 }
3454 GETCHARINCTEST(c, eptr);
3455 prop_category = UCD_CATEGORY(c);
3456 if ((prop_category == prop_value) == prop_fail_result)
3457 MRRETURN(MATCH_NOMATCH);
3458 }
3459 break;
3460
3461 case PT_PC:
3462 for (i = 1; i <= min; i++)
3463 {
3464 if (eptr >= md->end_subject)
3465 {
3466 SCHECK_PARTIAL();
3467 MRRETURN(MATCH_NOMATCH);
3468 }
3469 GETCHARINCTEST(c, eptr);
3470 prop_chartype = UCD_CHARTYPE(c);
3471 if ((prop_chartype == prop_value) == prop_fail_result)
3472 MRRETURN(MATCH_NOMATCH);
3473 }
3474 break;
3475
3476 case PT_SC:
3477 for (i = 1; i <= min; i++)
3478 {
3479 if (eptr >= md->end_subject)
3480 {
3481 SCHECK_PARTIAL();
3482 MRRETURN(MATCH_NOMATCH);
3483 }
3484 GETCHARINCTEST(c, eptr);
3485 prop_script = UCD_SCRIPT(c);
3486 if ((prop_script == prop_value) == prop_fail_result)
3487 MRRETURN(MATCH_NOMATCH);
3488 }
3489 break;
3490
3491 default:
3492 RRETURN(PCRE_ERROR_INTERNAL);
3493 }
3494 }
3495
3496 /* Match extended Unicode sequences. We will get here only if the
3497 support is in the binary; otherwise a compile-time error occurs. */
3498
3499 else if (ctype == OP_EXTUNI)
3500 {
3501 for (i = 1; i <= min; i++)
3502 {
3503 if (eptr >= md->end_subject)
3504 {
3505 SCHECK_PARTIAL();
3506 MRRETURN(MATCH_NOMATCH);
3507 }
3508 GETCHARINCTEST(c, eptr);
3509 prop_category = UCD_CATEGORY(c);
3510 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3511 while (eptr < md->end_subject)
3512 {
3513 int len = 1;
3514 if (!utf8) c = *eptr;
3515 else { GETCHARLEN(c, eptr, len); }
3516 prop_category = UCD_CATEGORY(c);
3517 if (prop_category != ucp_M) break;
3518 eptr += len;
3519 }
3520 }
3521 }
3522
3523 else
3524 #endif /* SUPPORT_UCP */
3525
3526 /* Handle all other cases when the coding is UTF-8 */
3527
3528 #ifdef SUPPORT_UTF8
3529 if (utf8) switch(ctype)
3530 {
3531 case OP_ANY:
3532 for (i = 1; i <= min; i++)
3533 {
3534 if (eptr >= md->end_subject)
3535 {
3536 SCHECK_PARTIAL();
3537 MRRETURN(MATCH_NOMATCH);
3538 }
3539 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3540 eptr++;
3541 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3542 }
3543 break;
3544
3545 case OP_ALLANY:
3546 for (i = 1; i <= min; i++)
3547 {
3548 if (eptr >= md->end_subject)
3549 {
3550 SCHECK_PARTIAL();
3551 MRRETURN(MATCH_NOMATCH);
3552 }
3553 eptr++;
3554 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3555 }
3556 break;
3557
3558 case OP_ANYBYTE:
3559 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3560 eptr += min;
3561 break;
3562
3563 case OP_ANYNL:
3564 for (i = 1; i <= min; i++)
3565 {
3566 if (eptr >= md->end_subject)
3567 {
3568 SCHECK_PARTIAL();
3569 MRRETURN(MATCH_NOMATCH);
3570 }
3571 GETCHARINC(c, eptr);
3572 switch(c)
3573 {
3574 default: MRRETURN(MATCH_NOMATCH);
3575 case 0x000d:
3576 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3577 break;
3578
3579 case 0x000a:
3580 break;
3581
3582 case 0x000b:
3583 case 0x000c:
3584 case 0x0085:
3585 case 0x2028:
3586 case 0x2029:
3587 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3588 break;
3589 }
3590 }
3591 break;
3592
3593 case OP_NOT_HSPACE:
3594 for (i = 1; i <= min; i++)
3595 {
3596 if (eptr >= md->end_subject)
3597 {
3598 SCHECK_PARTIAL();
3599 MRRETURN(MATCH_NOMATCH);
3600 }
3601 GETCHARINC(c, eptr);
3602 switch(c)
3603 {
3604 default: break;
3605 case 0x09: /* HT */
3606 case 0x20: /* SPACE */
3607 case 0xa0: /* NBSP */
3608 case 0x1680: /* OGHAM SPACE MARK */
3609 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3610 case 0x2000: /* EN QUAD */
3611 case 0x2001: /* EM QUAD */
3612 case 0x2002: /* EN SPACE */
3613 case 0x2003: /* EM SPACE */
3614 case 0x2004: /* THREE-PER-EM SPACE */
3615 case 0x2005: /* FOUR-PER-EM SPACE */
3616 case 0x2006: /* SIX-PER-EM SPACE */
3617 case 0x2007: /* FIGURE SPACE */
3618 case 0x2008: /* PUNCTUATION SPACE */
3619 case 0x2009: /* THIN SPACE */
3620 case 0x200A: /* HAIR SPACE */
3621 case 0x202f: /* NARROW NO-BREAK SPACE */
3622 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3623 case 0x3000: /* IDEOGRAPHIC SPACE */
3624 MRRETURN(MATCH_NOMATCH);
3625 }
3626 }
3627 break;
3628
3629 case OP_HSPACE:
3630 for (i = 1; i <= min; i++)
3631 {
3632 if (eptr >= md->end_subject)
3633 {
3634 SCHECK_PARTIAL();
3635 MRRETURN(MATCH_NOMATCH);
3636 }
3637 GETCHARINC(c, eptr);
3638 switch(c)
3639 {
3640 default: MRRETURN(MATCH_NOMATCH);
3641 case 0x09: /* HT */
3642 case 0x20: /* SPACE */
3643 case 0xa0: /* NBSP */
3644 case 0x1680: /* OGHAM SPACE MARK */
3645 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3646 case 0x2000: /* EN QUAD */
3647 case 0x2001: /* EM QUAD */
3648 case 0x2002: /* EN SPACE */
3649 case 0x2003: /* EM SPACE */
3650 case 0x2004: /* THREE-PER-EM SPACE */
3651 case 0x2005: /* FOUR-PER-EM SPACE */
3652 case 0x2006: /* SIX-PER-EM SPACE */
3653 case 0x2007: /* FIGURE SPACE */
3654 case 0x2008: /* PUNCTUATION SPACE */
3655 case 0x2009: /* THIN SPACE */
3656 case 0x200A: /* HAIR SPACE */
3657 case 0x202f: /* NARROW NO-BREAK SPACE */
3658 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3659 case 0x3000: /* IDEOGRAPHIC SPACE */
3660 break;
3661 }
3662 }
3663 break;
3664
3665 case OP_NOT_VSPACE:
3666 for (i = 1; i <= min; i++)
3667 {
3668 if (eptr >= md->end_subject)
3669 {
3670 SCHECK_PARTIAL();
3671 MRRETURN(MATCH_NOMATCH);
3672 }
3673 GETCHARINC(c, eptr);
3674 switch(c)
3675 {
3676 default: break;
3677 case 0x0a: /* LF */
3678 case 0x0b: /* VT */
3679 case 0x0c: /* FF */
3680 case 0x0d: /* CR */
3681 case 0x85: /* NEL */
3682 case 0x2028: /* LINE SEPARATOR */
3683 case 0x2029: /* PARAGRAPH SEPARATOR */
3684 MRRETURN(MATCH_NOMATCH);
3685 }
3686 }
3687 break;
3688
3689 case OP_VSPACE:
3690 for (i = 1; i <= min; i++)
3691 {
3692 if (eptr >= md->end_subject)
3693 {
3694 SCHECK_PARTIAL();
3695 MRRETURN(MATCH_NOMATCH);
3696 }
3697 GETCHARINC(c, eptr);
3698 switch(c)
3699 {
3700 default: MRRETURN(MATCH_NOMATCH);
3701 case 0x0a: /* LF */
3702 case 0x0b: /* VT */
3703 case 0x0c: /* FF */
3704 case 0x0d: /* CR */
3705 case 0x85: /* NEL */
3706 case 0x2028: /* LINE SEPARATOR */
3707 case 0x2029: /* PARAGRAPH SEPARATOR */
3708 break;
3709 }
3710 }
3711 break;
3712
3713 case OP_NOT_DIGIT:
3714 for (i = 1; i <= min; i++)
3715 {
3716 if (eptr >= md->end_subject)
3717 {
3718 SCHECK_PARTIAL();
3719 MRRETURN(MATCH_NOMATCH);
3720 }
3721 GETCHARINC(c, eptr);
3722 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3723 MRRETURN(MATCH_NOMATCH);
3724 }
3725 break;
3726
3727 case OP_DIGIT:
3728 for (i = 1; i <= min; i++)
3729 {
3730 if (eptr >= md->end_subject)
3731 {
3732 SCHECK_PARTIAL();
3733 MRRETURN(MATCH_NOMATCH);
3734 }
3735 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3736 MRRETURN(MATCH_NOMATCH);
3737 /* No need to skip more bytes - we know it's a 1-byte character */
3738 }
3739 break;
3740
3741 case OP_NOT_WHITESPACE:
3742 for (i = 1; i <= min; i++)
3743 {
3744 if (eptr >= md->end_subject)
3745 {
3746 SCHECK_PARTIAL();
3747 MRRETURN(MATCH_NOMATCH);
3748 }
3749 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3750 MRRETURN(MATCH_NOMATCH);
3751 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3752 }
3753 break;
3754
3755 case OP_WHITESPACE:
3756 for (i = 1; i <= min; i++)
3757 {
3758 if (eptr >= md->end_subject)
3759 {
3760 SCHECK_PARTIAL();
3761 MRRETURN(MATCH_NOMATCH);
3762 }
3763 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3764 MRRETURN(MATCH_NOMATCH);
3765 /* No need to skip more bytes - we know it's a 1-byte character */
3766 }
3767 break;
3768
3769 case OP_NOT_WORDCHAR:
3770 for (i = 1; i <= min; i++)
3771 {
3772 if (eptr >= md->end_subject)
3773 {
3774 SCHECK_PARTIAL();
3775 MRRETURN(MATCH_NOMATCH);
3776 }
3777 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3778 MRRETURN(MATCH_NOMATCH);
3779 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3780 }
3781 break;
3782
3783 case OP_WORDCHAR:
3784 for (i = 1; i <= min; i++)
3785 {
3786 if (eptr >= md->end_subject)
3787 {
3788 SCHECK_PARTIAL();
3789 MRRETURN(MATCH_NOMATCH);
3790 }
3791 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3792 MRRETURN(MATCH_NOMATCH);
3793 /* No need to skip more bytes - we know it's a 1-byte character */
3794 }
3795 break;
3796
3797 default:
3798 RRETURN(PCRE_ERROR_INTERNAL);
3799 } /* End switch(ctype) */
3800
3801 else
3802 #endif /* SUPPORT_UTF8 */
3803
3804 /* Code for the non-UTF-8 case for minimum matching of operators other
3805 than OP_PROP and OP_NOTPROP. */
3806
3807 switch(ctype)
3808 {
3809 case OP_ANY:
3810 for (i = 1; i <= min; i++)
3811 {
3812 if (eptr >= md->end_subject)
3813 {
3814 SCHECK_PARTIAL();
3815 MRRETURN(MATCH_NOMATCH);
3816 }
3817 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3818 eptr++;
3819 }
3820 break;
3821
3822 case OP_ALLANY:
3823 if (eptr > md->end_subject - min)
3824 {
3825 SCHECK_PARTIAL();
3826 MRRETURN(MATCH_NOMATCH);
3827 }
3828 eptr += min;
3829 break;
3830
3831 case OP_ANYBYTE:
3832 if (eptr > md->end_subject - min)
3833 {
3834 SCHECK_PARTIAL();
3835 MRRETURN(MATCH_NOMATCH);
3836 }
3837 eptr += min;
3838 break;
3839
3840 case OP_ANYNL:
3841 for (i = 1; i <= min; i++)
3842 {
3843 if (eptr >= md->end_subject)
3844 {
3845 SCHECK_PARTIAL();
3846 MRRETURN(MATCH_NOMATCH);
3847 }
3848 switch(*eptr++)
3849 {
3850 default: MRRETURN(MATCH_NOMATCH);
3851 case 0x000d:
3852 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3853 break;
3854 case 0x000a:
3855 break;
3856
3857 case 0x000b:
3858 case 0x000c:
3859 case 0x0085:
3860 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3861 break;
3862 }
3863 }
3864 break;
3865
3866 case OP_NOT_HSPACE:
3867 for (i = 1; i <= min; i++)
3868 {
3869 if (eptr >= md->end_subject)
3870 {
3871 SCHECK_PARTIAL();
3872 MRRETURN(MATCH_NOMATCH);
3873 }
3874 switch(*eptr++)
3875 {
3876 default: break;
3877 case 0x09: /* HT */
3878 case 0x20: /* SPACE */
3879 case 0xa0: /* NBSP */
3880 MRRETURN(MATCH_NOMATCH);
3881 }
3882 }
3883 break;
3884
3885 case OP_HSPACE:
3886 for (i = 1; i <= min; i++)
3887 {
3888 if (eptr >= md->end_subject)
3889 {
3890 SCHECK_PARTIAL();
3891 MRRETURN(MATCH_NOMATCH);
3892 }
3893 switch(*eptr++)
3894 {
3895 default: MRRETURN(MATCH_NOMATCH);
3896 case 0x09: /* HT */
3897 case 0x20: /* SPACE */
3898 case 0xa0: /* NBSP */
3899 break;
3900 }
3901 }
3902 break;
3903
3904 case OP_NOT_VSPACE:
3905 for (i = 1; i <= min; i++)
3906 {
3907 if (eptr >= md->end_subject)
3908 {
3909 SCHECK_PARTIAL();
3910 MRRETURN(MATCH_NOMATCH);
3911 }
3912 switch(*eptr++)
3913 {
3914 default: break;
3915 case 0x0a: /* LF */
3916 case 0x0b: /* VT */
3917 case 0x0c: /* FF */
3918 case 0x0d: /* CR */
3919 case 0x85: /* NEL */
3920 MRRETURN(MATCH_NOMATCH);
3921 }
3922 }
3923 break;
3924
3925 case OP_VSPACE:
3926 for (i = 1; i <= min; i++)
3927 {
3928 if (eptr >= md->end_subject)
3929 {
3930 SCHECK_PARTIAL();
3931 MRRETURN(MATCH_NOMATCH);
3932 }
3933 switch(*eptr++)
3934 {
3935 default: MRRETURN(MATCH_NOMATCH);
3936 case 0x0a: /* LF */
3937 case 0x0b: /* VT */
3938 case 0x0c: /* FF */
3939 case 0x0d: /* CR */
3940 case 0x85: /* NEL */
3941 break;
3942 }
3943 }
3944 break;
3945
3946 case OP_NOT_DIGIT:
3947 for (i = 1; i <= min; i++)
3948 {
3949 if (eptr >= md->end_subject)
3950 {
3951 SCHECK_PARTIAL();
3952 MRRETURN(MATCH_NOMATCH);
3953 }
3954 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
3955 }
3956 break;
3957
3958 case OP_DIGIT:
3959 for (i = 1; i <= min; i++)
3960 {
3961 if (eptr >= md->end_subject)
3962 {
3963 SCHECK_PARTIAL();
3964 MRRETURN(MATCH_NOMATCH);
3965 }
3966 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
3967 }
3968 break;
3969
3970 case OP_NOT_WHITESPACE:
3971 for (i = 1; i <= min; i++)
3972 {
3973 if (eptr >= md->end_subject)
3974 {
3975 SCHECK_PARTIAL();
3976 MRRETURN(MATCH_NOMATCH);
3977 }
3978 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
3979 }
3980 break;
3981
3982 case OP_WHITESPACE:
3983 for (i = 1; i <= min; i++)
3984 {
3985 if (eptr >= md->end_subject)
3986 {
3987 SCHECK_PARTIAL();
3988 MRRETURN(MATCH_NOMATCH);
3989 }
3990 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
3991 }
3992 break;
3993
3994 case OP_NOT_WORDCHAR:
3995 for (i = 1; i <= min; i++)
3996 {
3997 if (eptr >= md->end_subject)
3998 {
3999 SCHECK_PARTIAL();
4000 MRRETURN(MATCH_NOMATCH);
4001 }
4002 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4003 MRRETURN(MATCH_NOMATCH);
4004 }
4005 break;
4006
4007 case OP_WORDCHAR:
4008 for (i = 1; i <= min; i++)
4009 {
4010 if (eptr >= md->end_subject)
4011 {
4012 SCHECK_PARTIAL();
4013 MRRETURN(MATCH_NOMATCH);
4014 }
4015 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4016 MRRETURN(MATCH_NOMATCH);
4017 }
4018 break;
4019
4020 default:
4021 RRETURN(PCRE_ERROR_INTERNAL);
4022 }
4023 }
4024
4025 /* If min = max, continue at the same level without recursing */
4026
4027 if (min == max) continue;
4028
4029 /* If minimizing, we have to test the rest of the pattern before each
4030 subsequent match. Again, separate the UTF-8 case for speed, and also
4031 separate the UCP cases. */
4032
4033 if (minimize)
4034 {
4035 #ifdef SUPPORT_UCP
4036 if (prop_type >= 0)
4037 {
4038 switch(prop_type)
4039 {
4040 case PT_ANY:
4041 for (fi = min;; fi++)
4042 {
4043 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4045 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4046 if (eptr >= md->end_subject)
4047 {
4048 SCHECK_PARTIAL();
4049 MRRETURN(MATCH_NOMATCH);
4050 }
4051 GETCHARINC(c, eptr);
4052 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4053 }
4054 /* Control never gets here */
4055
4056 case PT_LAMP:
4057 for (fi = min;; fi++)
4058 {
4059 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4060 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4061 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4062 if (eptr >= md->end_subject)
4063 {
4064 SCHECK_PARTIAL();
4065 MRRETURN(MATCH_NOMATCH);
4066 }
4067 GETCHARINC(c, eptr);
4068 prop_chartype = UCD_CHARTYPE(c);
4069 if ((prop_chartype == ucp_Lu ||
4070 prop_chartype == ucp_Ll ||
4071 prop_chartype == ucp_Lt) == prop_fail_result)
4072 MRRETURN(MATCH_NOMATCH);
4073 }
4074 /* Control never gets here */
4075
4076 case PT_GC:
4077 for (fi = min;; fi++)
4078 {
4079 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4080 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4081 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4082 if (eptr >= md->end_subject)
4083 {
4084 SCHECK_PARTIAL();
4085 MRRETURN(MATCH_NOMATCH);
4086 }
4087 GETCHARINC(c, eptr);
4088 prop_category = UCD_CATEGORY(c);
4089 if ((prop_category == prop_value) == prop_fail_result)
4090 MRRETURN(MATCH_NOMATCH);
4091 }
4092 /* Control never gets here */
4093
4094 case PT_PC:
4095 for (fi = min;; fi++)
4096 {
4097 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4099 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4100 if (eptr >= md->end_subject)
4101 {
4102 SCHECK_PARTIAL();
4103 MRRETURN(MATCH_NOMATCH);
4104 }
4105 GETCHARINC(c, eptr);
4106 prop_chartype = UCD_CHARTYPE(c);
4107 if ((prop_chartype == prop_value) == prop_fail_result)
4108 MRRETURN(MATCH_NOMATCH);
4109 }
4110 /* Control never gets here */
4111
4112 case PT_SC:
4113 for (fi = min;; fi++)
4114 {
4115 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4116 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4117 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4118 if (eptr >= md->end_subject)
4119 {
4120 SCHECK_PARTIAL();
4121 MRRETURN(MATCH_NOMATCH);
4122 }
4123 GETCHARINC(c, eptr);
4124 prop_script = UCD_SCRIPT(c);
4125 if ((prop_script == prop_value) == prop_fail_result)
4126 MRRETURN(MATCH_NOMATCH);
4127 }
4128 /* Control never gets here */
4129
4130 default:
4131 RRETURN(PCRE_ERROR_INTERNAL);
4132 }
4133 }
4134
4135 /* Match extended Unicode sequences. We will get here only if the
4136 support is in the binary; otherwise a compile-time error occurs. */
4137
4138 else if (ctype == OP_EXTUNI)
4139 {
4140 for (fi = min;; fi++)
4141 {
4142 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4143 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4144 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4145 if (eptr >= md->end_subject)
4146 {
4147 SCHECK_PARTIAL();
4148 MRRETURN(MATCH_NOMATCH);
4149 }
4150 GETCHARINCTEST(c, eptr);
4151 prop_category = UCD_CATEGORY(c);
4152 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4153 while (eptr < md->end_subject)
4154 {
4155 int len = 1;
4156 if (!utf8) c = *eptr;
4157 else { GETCHARLEN(c, eptr, len); }
4158 prop_category = UCD_CATEGORY(c);
4159 if (prop_category != ucp_M) break;
4160 eptr += len;
4161 }
4162 }
4163 }
4164
4165 else
4166 #endif /* SUPPORT_UCP */
4167
4168 #ifdef SUPPORT_UTF8
4169 /* UTF-8 mode */
4170 if (utf8)
4171 {
4172 for (fi = min;; fi++)
4173 {
4174 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4175 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4176 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4177 if (eptr >= md->end_subject)
4178 {
4179 SCHECK_PARTIAL();
4180 MRRETURN(MATCH_NOMATCH);
4181 }
4182 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4183 MRRETURN(MATCH_NOMATCH);
4184 GETCHARINC(c, eptr);
4185 switch(ctype)
4186 {
4187 case OP_ANY: /* This is the non-NL case */
4188 case OP_ALLANY:
4189 case OP_ANYBYTE:
4190 break;
4191
4192 case OP_ANYNL:
4193 switch(c)
4194 {
4195 default: MRRETURN(MATCH_NOMATCH);
4196 case 0x000d:
4197 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4198 break;
4199 case 0x000a:
4200 break;
4201
4202 case 0x000b:
4203 case 0x000c:
4204 case 0x0085:
4205 case 0x2028:
4206 case 0x2029:
4207 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4208 break;
4209 }
4210 break;
4211
4212 case OP_NOT_HSPACE:
4213 switch(c)
4214 {
4215 default: break;
4216 case 0x09: /* HT */
4217 case 0x20: /* SPACE */
4218 case 0xa0: /* NBSP */
4219 case 0x1680: /* OGHAM SPACE MARK */
4220 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4221 case 0x2000: /* EN QUAD */
4222 case 0x2001: /* EM QUAD */
4223 case 0x2002: /* EN SPACE */
4224 case 0x2003: /* EM SPACE */
4225 case 0x2004: /* THREE-PER-EM SPACE */
4226 case 0x2005: /* FOUR-PER-EM SPACE */
4227 case 0x2006: /* SIX-PER-EM SPACE */
4228 case 0x2007: /* FIGURE SPACE */
4229 case 0x2008: /* PUNCTUATION SPACE */
4230 case 0x2009: /* THIN SPACE */
4231 case 0x200A: /* HAIR SPACE */
4232 case 0x202f: /* NARROW NO-BREAK SPACE */
4233 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4234 case 0x3000: /* IDEOGRAPHIC SPACE */
4235 MRRETURN(MATCH_NOMATCH);
4236 }
4237 break;
4238
4239 case OP_HSPACE:
4240 switch(c)
4241 {
4242 default: MRRETURN(MATCH_NOMATCH);
4243 case 0x09: /* HT */
4244 case 0x20: /* SPACE */
4245 case 0xa0: /* NBSP */
4246 case 0x1680: /* OGHAM SPACE MARK */
4247 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4248 case 0x2000: /* EN QUAD */
4249 case 0x2001: /* EM QUAD */
4250 case 0x2002: /* EN SPACE */
4251 case 0x2003: /* EM SPACE */
4252 case 0x2004: /* THREE-PER-EM SPACE */
4253 case 0x2005: /* FOUR-PER-EM SPACE */
4254 case 0x2006: /* SIX-PER-EM SPACE */
4255 case 0x2007: /* FIGURE SPACE */
4256 case 0x2008: /* PUNCTUATION SPACE */
4257 case 0x2009: /* THIN SPACE */
4258 case 0x200A: /* HAIR SPACE */
4259 case 0x202f: /* NARROW NO-BREAK SPACE */
4260 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4261 case 0x3000: /* IDEOGRAPHIC SPACE */
4262 break;
4263 }
4264 break;
4265
4266 case OP_NOT_VSPACE:
4267 switch(c)
4268 {
4269 default: break;
4270 case 0x0a: /* LF */
4271 case 0x0b: /* VT */
4272 case 0x0c: /* FF */
4273 case 0x0d: /* CR */
4274 case 0x85: /* NEL */
4275 case 0x2028: /* LINE SEPARATOR */
4276 case 0x2029: /* PARAGRAPH SEPARATOR */
4277 MRRETURN(MATCH_NOMATCH);
4278 }
4279 break;
4280
4281 case OP_VSPACE:
4282 switch(c)
4283 {
4284 default: MRRETURN(MATCH_NOMATCH);
4285 case 0x0a: /* LF */
4286 case 0x0b: /* VT */
4287 case 0x0c: /* FF */
4288 case 0x0d: /* CR */
4289 case 0x85: /* NEL */
4290 case 0x2028: /* LINE SEPARATOR */
4291 case 0x2029: /* PARAGRAPH SEPARATOR */
4292 break;
4293 }
4294 break;
4295
4296 case OP_NOT_DIGIT:
4297 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4298 MRRETURN(MATCH_NOMATCH);
4299 break;
4300
4301 case OP_DIGIT:
4302 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4303 MRRETURN(MATCH_NOMATCH);
4304 break;
4305
4306 case OP_NOT_WHITESPACE:
4307 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4308 MRRETURN(MATCH_NOMATCH);
4309 break;
4310
4311 case OP_WHITESPACE:
4312 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4313 MRRETURN(MATCH_NOMATCH);
4314 break;
4315
4316 case OP_NOT_WORDCHAR:
4317 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4318 MRRETURN(MATCH_NOMATCH);
4319 break;
4320
4321 case OP_WORDCHAR:
4322 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4323 MRRETURN(MATCH_NOMATCH);
4324 break;
4325
4326 default:
4327 RRETURN(PCRE_ERROR_INTERNAL);
4328 }
4329 }
4330 }
4331 else
4332 #endif
4333 /* Not UTF-8 mode */
4334 {
4335 for (fi = min;; fi++)
4336 {
4337 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4339 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 MRRETURN(MATCH_NOMATCH);
4344 }
4345 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4346 MRRETURN(MATCH_NOMATCH);
4347 c = *eptr++;
4348 switch(ctype)
4349 {
4350 case OP_ANY: /* This is the non-NL case */
4351 case OP_ALLANY:
4352 case OP_ANYBYTE:
4353 break;
4354
4355 case OP_ANYNL:
4356 switch(c)
4357 {
4358 default: MRRETURN(MATCH_NOMATCH);
4359 case 0x000d:
4360 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4361 break;
4362
4363 case 0x000a:
4364 break;
4365
4366 case 0x000b:
4367 case 0x000c:
4368 case 0x0085:
4369 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4370 break;
4371 }
4372 break;
4373
4374 case OP_NOT_HSPACE:
4375 switch(c)
4376 {
4377 default: break;
4378 case 0x09: /* HT */
4379 case 0x20: /* SPACE */
4380 case 0xa0: /* NBSP */
4381 MRRETURN(MATCH_NOMATCH);
4382 }
4383 break;
4384
4385 case OP_HSPACE:
4386 switch(c)
4387 {
4388 default: MRRETURN(MATCH_NOMATCH);
4389 case 0x09: /* HT */
4390 case 0x20: /* SPACE */
4391 case 0xa0: /* NBSP */
4392 break;
4393 }
4394 break;
4395
4396 case OP_NOT_VSPACE:
4397 switch(c)
4398 {
4399 default: break;
4400 case 0x0a: /* LF */
4401 case 0x0b: /* VT */
4402 case 0x0c: /* FF */
4403 case 0x0d: /* CR */
4404 case 0x85: /* NEL */
4405 MRRETURN(MATCH_NOMATCH);
4406 }
4407 break;
4408
4409 case OP_VSPACE:
4410 switch(c)
4411 {
4412 default: MRRETURN(MATCH_NOMATCH);
4413 case 0x0a: /* LF */
4414 case 0x0b: /* VT */
4415 case 0x0c: /* FF */
4416 case 0x0d: /* CR */
4417 case 0x85: /* NEL */
4418 break;
4419 }
4420 break;
4421
4422 case OP_NOT_DIGIT:
4423 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4424 break;
4425
4426 case OP_DIGIT:
4427 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4428 break;
4429
4430 case OP_NOT_WHITESPACE:
4431 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4432 break;
4433
4434 case OP_WHITESPACE:
4435 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4436 break;
4437
4438 case OP_NOT_WORDCHAR:
4439 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4440 break;
4441
4442 case OP_WORDCHAR:
4443 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4444 break;
4445
4446 default:
4447 RRETURN(PCRE_ERROR_INTERNAL);
4448 }
4449 }
4450 }
4451 /* Control never gets here */
4452 }
4453
4454 /* If maximizing, it is worth using inline code for speed, doing the type
4455 test once at the start (i.e. keep it out of the loop). Again, keep the
4456 UTF-8 and UCP stuff separate. */
4457
4458 else
4459 {
4460 pp = eptr; /* Remember where we started */
4461
4462 #ifdef SUPPORT_UCP
4463 if (prop_type >= 0)
4464 {
4465 switch(prop_type)
4466 {
4467 case PT_ANY:
4468 for (i = min; i < max; i++)
4469 {
4470 int len = 1;
4471 if (eptr >= md->end_subject)
4472 {
4473 SCHECK_PARTIAL();
4474 break;
4475 }
4476 GETCHARLEN(c, eptr, len);
4477 if (prop_fail_result) break;
4478 eptr+= len;
4479 }
4480 break;
4481
4482 case PT_LAMP:
4483 for (i = min; i < max; i++)
4484 {
4485 int len = 1;
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 break;
4490 }
4491 GETCHARLEN(c, eptr, len);
4492 prop_chartype = UCD_CHARTYPE(c);
4493 if ((prop_chartype == ucp_Lu ||
4494 prop_chartype == ucp_Ll ||
4495 prop_chartype == ucp_Lt) == prop_fail_result)
4496 break;
4497 eptr+= len;
4498 }
4499 break;
4500
4501 case PT_GC:
4502 for (i = min; i < max; i++)
4503 {
4504 int len = 1;
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 break;
4509 }
4510 GETCHARLEN(c, eptr, len);
4511 prop_category = UCD_CATEGORY(c);
4512 if ((prop_category == prop_value) == prop_fail_result)
4513 break;
4514 eptr+= len;
4515 }
4516 break;
4517
4518 case PT_PC:
4519 for (i = min; i < max; i++)
4520 {
4521 int len = 1;
4522 if (eptr >= md->end_subject)
4523 {
4524 SCHECK_PARTIAL();
4525 break;
4526 }
4527 GETCHARLEN(c, eptr, len);
4528 prop_chartype = UCD_CHARTYPE(c);
4529 if ((prop_chartype == prop_value) == prop_fail_result)
4530 break;
4531 eptr+= len;
4532 }
4533 break;
4534
4535 case PT_SC:
4536 for (i = min; i < max; i++)
4537 {
4538 int len = 1;
4539 if (eptr >= md->end_subject)
4540 {
4541 SCHECK_PARTIAL();
4542 break;
4543 }
4544 GETCHARLEN(c, eptr, len);
4545 prop_script = UCD_SCRIPT(c);
4546 if ((prop_script == prop_value) == prop_fail_result)
4547 break;
4548 eptr+= len;
4549 }
4550 break;
4551 }
4552
4553 /* eptr is now past the end of the maximum run */
4554
4555 if (possessive) continue;
4556 for(;;)
4557 {
4558 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4560 if (eptr-- == pp) break; /* Stop if tried at original pos */
4561 if (utf8) BACKCHAR(eptr);
4562 }
4563 }
4564
4565 /* Match extended Unicode sequences. We will get here only if the
4566 support is in the binary; otherwise a compile-time error occurs. */
4567
4568 else if (ctype == OP_EXTUNI)
4569 {
4570 for (i = min; i < max; i++)
4571 {
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 break;
4576 }
4577 GETCHARINCTEST(c, eptr);
4578 prop_category = UCD_CATEGORY(c);
4579 if (prop_category == ucp_M) break;
4580 while (eptr < md->end_subject)
4581 {
4582 int len = 1;
4583 if (!utf8) c = *eptr; else
4584 {
4585 GETCHARLEN(c, eptr, len);
4586 }
4587 prop_category = UCD_CATEGORY(c);
4588 if (prop_category != ucp_M) break;
4589 eptr += len;
4590 }
4591 }
4592
4593 /* eptr is now past the end of the maximum run */
4594
4595 if (possessive) continue;
4596
4597 for(;;)
4598 {
4599 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4600 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4601 if (eptr-- == pp) break; /* Stop if tried at original pos */
4602 for (;;) /* Move back over one extended */
4603 {
4604 int len = 1;
4605 if (!utf8) c = *eptr; else
4606 {
4607 BACKCHAR(eptr);
4608 GETCHARLEN(c, eptr, len);
4609 }
4610 prop_category = UCD_CATEGORY(c);
4611 if (prop_category != ucp_M) break;
4612 eptr--;
4613 }
4614 }
4615 }
4616
4617 else
4618 #endif /* SUPPORT_UCP */
4619
4620 #ifdef SUPPORT_UTF8
4621 /* UTF-8 mode */
4622
4623 if (utf8)
4624 {
4625 switch(ctype)
4626 {
4627 case OP_ANY:
4628 if (max < INT_MAX)
4629 {
4630 for (i = min; i < max; i++)
4631 {
4632 if (eptr >= md->end_subject)
4633 {
4634 SCHECK_PARTIAL();
4635 break;
4636 }
4637 if (IS_NEWLINE(eptr)) break;
4638 eptr++;
4639 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4640 }
4641 }
4642
4643 /* Handle unlimited UTF-8 repeat */
4644
4645 else
4646 {
4647 for (i = min; i < max; i++)
4648 {
4649 if (eptr >= md->end_subject)
4650 {
4651 SCHECK_PARTIAL();
4652 break;
4653 }
4654 if (IS_NEWLINE(eptr)) break;
4655 eptr++;
4656 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4657 }
4658 }
4659 break;
4660
4661 case OP_ALLANY:
4662 if (max < INT_MAX)
4663 {
4664 for (i = min; i < max; i++)
4665 {
4666 if (eptr >= md->end_subject)
4667 {
4668 SCHECK_PARTIAL();
4669 break;
4670 }
4671 eptr++;
4672 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4673 }
4674 }
4675 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4676 break;
4677
4678 /* The byte case is the same as non-UTF8 */
4679
4680 case OP_ANYBYTE:
4681 c = max - min;
4682 if (c > (unsigned int)(md->end_subject - eptr))
4683 {
4684 eptr = md->end_subject;
4685 SCHECK_PARTIAL();
4686 }
4687 else eptr += c;
4688 break;
4689
4690 case OP_ANYNL:
4691 for (i = min; i < max; i++)
4692 {
4693 int len = 1;
4694 if (eptr >= md->end_subject)
4695 {
4696 SCHECK_PARTIAL();
4697 break;
4698 }
4699 GETCHARLEN(c, eptr, len);
4700 if (c == 0x000d)
4701 {
4702 if (++eptr >= md->end_subject) break;
4703 if (*eptr == 0x000a) eptr++;
4704 }
4705 else
4706 {
4707 if (c != 0x000a &&
4708 (md->bsr_anycrlf ||
4709 (c != 0x000b && c != 0x000c &&
4710 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4711 break;
4712 eptr += len;
4713 }
4714 }
4715 break;
4716
4717 case OP_NOT_HSPACE:
4718 case OP_HSPACE:
4719 for (i = min; i < max; i++)
4720 {
4721 BOOL gotspace;
4722 int len = 1;
4723 if (eptr >= md->end_subject)
4724 {
4725 SCHECK_PARTIAL();
4726 break;
4727 }
4728 GETCHARLEN(c, eptr, len);
4729 switch(c)
4730 {
4731 default: gotspace = FALSE; break;
4732 case 0x09: /* HT */
4733 case 0x20: /* SPACE */
4734 case 0xa0: /* NBSP */
4735 case 0x1680: /* OGHAM SPACE MARK */
4736 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4737 case 0x2000: /* EN QUAD */
4738 case 0x2001: /* EM QUAD */
4739 case 0x2002: /* EN SPACE */
4740 case 0x2003: /* EM SPACE */
4741 case 0x2004: /* THREE-PER-EM SPACE */
4742 case 0x2005: /* FOUR-PER-EM SPACE */
4743 case 0x2006: /* SIX-PER-EM SPACE */
4744 case 0x2007: /* FIGURE SPACE */
4745 case 0x2008: /* PUNCTUATION SPACE */
4746 case 0x2009: /* THIN SPACE */
4747 case 0x200A: /* HAIR SPACE */
4748 case 0x202f: /* NARROW NO-BREAK SPACE */
4749 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4750 case 0x3000: /* IDEOGRAPHIC SPACE */
4751 gotspace = TRUE;
4752 break;
4753 }
4754 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4755 eptr += len;
4756 }
4757 break;
4758
4759 case OP_NOT_VSPACE:
4760 case OP_VSPACE:
4761 for (i = min; i < max; i++)
4762 {
4763 BOOL gotspace;
4764 int len = 1;
4765 if (eptr >= md->end_subject)
4766 {
4767 SCHECK_PARTIAL();
4768 break;
4769 }
4770 GETCHARLEN(c, eptr, len);
4771 switch(c)
4772 {
4773 default: gotspace = FALSE; break;
4774 case 0x0a: /* LF */
4775 case 0x0b: /* VT */
4776 case 0x0c: /* FF */
4777 case 0x0d: /* CR */
4778 case 0x85: /* NEL */
4779 case 0x2028: /* LINE SEPARATOR */
4780 case 0x2029: /* PARAGRAPH SEPARATOR */
4781 gotspace = TRUE;
4782 break;
4783 }
4784 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4785 eptr += len;
4786 }
4787 break;
4788
4789 case OP_NOT_DIGIT:
4790 for (i = min; i < max; i++)
4791 {
4792 int len = 1;
4793 if (eptr >= md->end_subject)
4794 {
4795 SCHECK_PARTIAL();
4796 break;
4797 }
4798 GETCHARLEN(c, eptr, len);
4799 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4800 eptr+= len;
4801 }
4802 break;
4803
4804 case OP_DIGIT:
4805 for (i = min; i < max; i++)
4806 {
4807 int len = 1;
4808 if (eptr >= md->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 break;
4812 }
4813 GETCHARLEN(c, eptr, len);
4814 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4815 eptr+= len;
4816 }
4817 break;
4818
4819 case OP_NOT_WHITESPACE:
4820 for (i = min; i < max; i++)
4821 {
4822 int len = 1;
4823 if (eptr >= md->end_subject)
4824 {
4825 SCHECK_PARTIAL();
4826 break;
4827 }
4828 GETCHARLEN(c, eptr, len);
4829 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4830 eptr+= len;
4831 }
4832 break;
4833
4834 case OP_WHITESPACE:
4835 for (i = min; i < max; i++)
4836 {
4837 int len = 1;
4838 if (eptr >= md->end_subject)
4839 {
4840 SCHECK_PARTIAL();
4841 break;
4842 }
4843 GETCHARLEN(c, eptr, len);
4844 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4845 eptr+= len;
4846 }
4847 break;
4848
4849 case OP_NOT_WORDCHAR:
4850 for (i = min; i < max; i++)
4851 {
4852 int len = 1;
4853 if (eptr >= md->end_subject)
4854 {
4855 SCHECK_PARTIAL();
4856 break;
4857 }
4858 GETCHARLEN(c, eptr, len);
4859 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4860 eptr+= len;
4861 }
4862 break;
4863
4864 case OP_WORDCHAR:
4865 for (i = min; i < max; i++)
4866 {
4867 int len = 1;
4868 if (eptr >= md->end_subject)
4869 {
4870 SCHECK_PARTIAL();
4871 break;
4872 }
4873 GETCHARLEN(c, eptr, len);
4874 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4875 eptr+= len;
4876 }
4877 break;
4878
4879 default:
4880 RRETURN(PCRE_ERROR_INTERNAL);
4881 }
4882
4883 /* eptr is now past the end of the maximum run */
4884
4885 if (possessive) continue;
4886 for(;;)
4887 {
4888 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4890 if (eptr-- == pp) break; /* Stop if tried at original pos */
4891 BACKCHAR(eptr);
4892 }
4893 }
4894 else
4895 #endif /* SUPPORT_UTF8 */
4896
4897 /* Not UTF-8 mode */
4898 {
4899 switch(ctype)
4900 {
4901 case OP_ANY:
4902 for (i = min; i < max; i++)
4903 {
4904 if (eptr >= md->end_subject)
4905 {
4906 SCHECK_PARTIAL();
4907 break;
4908 }
4909 if (IS_NEWLINE(eptr)) break;
4910 eptr++;
4911 }
4912 break;
4913
4914 case OP_ALLANY:
4915 case OP_ANYBYTE:
4916 c = max - min;
4917 if (c > (unsigned int)(md->end_subject - eptr))
4918 {
4919 eptr = md->end_subject;
4920 SCHECK_PARTIAL();
4921 }
4922 else eptr += c;
4923 break;
4924
4925 case OP_ANYNL:
4926 for (i = min; i < max; i++)
4927 {
4928 if (eptr >= md->end_subject)
4929 {
4930 SCHECK_PARTIAL();
4931 break;
4932 }
4933 c = *eptr;
4934 if (c == 0x000d)
4935 {
4936 if (++eptr >= md->end_subject) break;
4937 if (*eptr == 0x000a) eptr++;
4938 }
4939 else
4940 {
4941 if (c != 0x000a &&
4942 (md->bsr_anycrlf ||
4943 (c != 0x000b && c != 0x000c && c != 0x0085)))
4944 break;
4945 eptr++;
4946 }
4947 }
4948 break;
4949
4950 case OP_NOT_HSPACE:
4951 for (i = min; i < max; i++)
4952 {
4953 if (eptr >= md->end_subject)
4954 {
4955 SCHECK_PARTIAL();
4956 break;
4957 }
4958 c = *eptr;
4959 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4960 eptr++;
4961 }
4962 break;
4963
4964 case OP_HSPACE:
4965 for (i = min; i < max; i++)
4966 {
4967 if (eptr >= md->end_subject)
4968 {
4969 SCHECK_PARTIAL();
4970 break;
4971 }
4972 c = *eptr;
4973 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4974 eptr++;
4975 }
4976 break;
4977
4978 case OP_NOT_VSPACE:
4979 for (i = min; i < max; i++)
4980 {
4981 if (eptr >= md->end_subject)
4982 {
4983 SCHECK_PARTIAL();
4984 break;
4985 }
4986 c = *eptr;
4987 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4988 break;
4989 eptr++;
4990 }
4991 break;
4992
4993 case OP_VSPACE:
4994 for (i = min; i < max; i++)
4995 {
4996 if (eptr >= md->end_subject)
4997 {
4998 SCHECK_PARTIAL();
4999 break;
5000 }
5001 c = *eptr;
5002 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5003 break;
5004 eptr++;
5005 }
5006 break;
5007
5008 case OP_NOT_DIGIT:
5009 for (i = min; i < max; i++)
5010 {
5011 if (eptr >= md->end_subject)
5012 {
5013 SCHECK_PARTIAL();
5014 break;
5015 }
5016 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5017 eptr++;
5018 }
5019 break;
5020
5021 case OP_DIGIT:
5022 for (i = min; i < max; i++)
5023 {
5024 if (eptr >= md->end_subject)
5025 {
5026 SCHECK_PARTIAL();
5027 break;
5028 }
5029 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5030 eptr++;
5031 }
5032 break;
5033
5034 case OP_NOT_WHITESPACE:
5035 for (i = min; i < max; i++)
5036 {
5037 if (eptr >= md->end_subject)
5038 {
5039 SCHECK_PARTIAL();
5040 break;
5041 }
5042 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5043 eptr++;
5044 }
5045 break;
5046
5047 case OP_WHITESPACE:
5048 for (i = min; i < max; i++)
5049 {
5050 if (eptr >= md->end_subject)
5051 {
5052 SCHECK_PARTIAL();
5053 break;
5054 }
5055 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5056 eptr++;
5057 }
5058 break;
5059
5060 case OP_NOT_WORDCHAR:
5061 for (i = min; i < max; i++)
5062 {
5063 if (eptr >= md->end_subject)
5064 {
5065 SCHECK_PARTIAL();
5066 break;
5067 }
5068 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5069 eptr++;
5070 }
5071 break;
5072
5073 case OP_WORDCHAR:
5074 for (i = min; i < max; i++)
5075 {
5076 if (eptr >= md->end_subject)
5077 {
5078 SCHECK_PARTIAL();
5079 break;
5080 }
5081 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5082 eptr++;
5083 }
5084 break;
5085
5086 default:
5087 RRETURN(PCRE_ERROR_INTERNAL);
5088 }
5089
5090 /* eptr is now past the end of the maximum run */
5091
5092 if (possessive) continue;
5093 while (eptr >= pp)
5094 {
5095 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5096 eptr--;
5097 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5098 }
5099 }
5100
5101 /* Get here if we can't make it match with any permitted repetitions */
5102
5103 MRRETURN(MATCH_NOMATCH);
5104 }
5105 /* Control never gets here */
5106
5107 /* There's been some horrible disaster. Arrival here can only mean there is
5108 something seriously wrong in the code above or the OP_xxx definitions. */
5109
5110 default:
5111 DPRINTF(("Unknown opcode %d\n", *ecode));
5112 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5113 }
5114
5115 /* Do not stick any code in here without much thought; it is assumed
5116 that "continue" in the code above comes out to here to repeat the main
5117 loop. */
5118
5119 } /* End of main loop */
5120 /* Control never reaches here */
5121
5122
5123 /* When compiling to use the heap rather than the stack for recursive calls to
5124 match(), the RRETURN() macro jumps here. The number that is saved in
5125 frame->Xwhere indicates which label we actually want to return to. */
5126
5127 #ifdef NO_RECURSE
5128 #define LBL(val) case val: goto L_RM##val;
5129 HEAP_RETURN:
5130 switch (frame->Xwhere)
5131 {
5132 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5133 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5134 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5135 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5136 LBL(53) LBL(54)
5137 #ifdef SUPPORT_UTF8
5138 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5139 LBL(32) LBL(34) LBL(42) LBL(46)
5140 #ifdef SUPPORT_UCP
5141 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5142 #endif /* SUPPORT_UCP */
5143 #endif /* SUPPORT_UTF8 */
5144 default:
5145 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5146 return PCRE_ERROR_INTERNAL;
5147 }
5148 #undef LBL
5149 #endif /* NO_RECURSE */
5150 }
5151
5152
5153 /***************************************************************************
5154 ****************************************************************************
5155 RECURSION IN THE match() FUNCTION
5156
5157 Undefine all the macros that were defined above to handle this. */
5158
5159 #ifdef NO_RECURSE /* per the banner above, the names below are macros only in this build */
5160 #undef eptr
5161 #undef ecode
5162 #undef mstart
5163 #undef offset_top
5164 #undef ims
5165 #undef eptrb
5166 #undef flags
5167 /* pcre_exec() below declares its own ims, flags, length and pp; presumably why none may stay a macro */
5168 #undef callpat
5169 #undef charptr
5170 #undef data
5171 #undef next
5172 #undef pp
5173 #undef prev
5174 #undef saved_eptr
5175
5176 #undef new_recursive
5177
5178 #undef cur_is_word
5179 #undef condition
5180 #undef prev_is_word
5181
5182 #undef original_ims
5183
5184 #undef ctype
5185 #undef length
5186 #undef max
5187 #undef min
5188 #undef number
5189 #undef offset
5190 #undef op
5191 #undef save_capture_last
5192 #undef save_offset1
5193 #undef save_offset2
5194 #undef save_offset3
5195 #undef stacksave
5196
5197 #undef newptrb
5198 /* NOTE(review): the matching #define list is outside this excerpt; keep both lists in step */
5199 #endif /* NO_RECURSE */
5200
5201 /* These two are defined as macros in both cases (presumably with and without NO_RECURSE) */
5202
5203 #undef fc
5204 #undef fi
5205
5206 /***************************************************************************
5207 ***************************************************************************/
5208
5209
5210
5211 /*************************************************
5212 * Execute a Regular Expression *
5213 *************************************************/
5214
5215 /* This function applies a compiled re to a subject string and picks out
5216 portions of the string if it matches. Two elements in the vector are set for
5217 each substring: the offsets to the start and end of the substring.
5218
5219 Arguments:
5220 argument_re points to the compiled expression
5221 extra_data points to extra data or is NULL
5222 subject points to the subject string
5223 length length of subject string (may contain binary zeros)
5224 start_offset where to start in the subject string
5225 options option bits
5226 offsets points to a vector of ints to be filled in with offsets
5227 offsetcount the number of elements in the vector
5228
5229 Returns: > 0 => success; value is the number of elements filled in
5230 = 0 => success, but offsets is not big enough
5231 -1 => failed to match
5232 < -1 => some kind of unexpected problem
5233 */
5234
5235 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5236 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5237 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5238 int offsetcount)
5239 {
5240 int rc, resetcount, ocount;
5241 int first_byte = -1;
5242 int req_byte = -1;
5243 int req_byte2 = -1;
5244 int newline;
5245 unsigned long int ims;
5246 BOOL using_temporary_offsets = FALSE;
5247 BOOL anchored;
5248 BOOL startline;
5249 BOOL firstline;
5250 BOOL first_byte_caseless = FALSE;
5251 BOOL req_byte_caseless = FALSE;
5252 BOOL utf8;
5253 match_data match_block;
5254 match_data *md = &match_block;
5255 const uschar *tables;
5256 const uschar *start_bits = NULL;
5257 USPTR start_match = (USPTR)subject + start_offset;
5258 USPTR end_subject;
5259 USPTR start_partial = NULL;
5260 USPTR req_byte_ptr = start_match - 1;
5261
5262 pcre_study_data internal_study;
5263 const pcre_study_data *study;
5264
5265 real_pcre internal_re;
5266 const real_pcre *external_re = (const real_pcre *)argument_re;
5267 const real_pcre *re = external_re;
5268
5269 /* Plausibility checks */
5270
5271 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5272 if (re == NULL || subject == NULL ||
5273 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5274 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5275
5276 /* This information is for finding all the numbers associated with a given
5277 name, for condition testing. */
5278
5279 md->name_table = (uschar *)re + re->name_table_offset;
5280 md->name_count = re->name_count;
5281 md->name_entry_size = re->name_entry_size;
5282
5283 /* Fish out the optional data from the extra_data structure, first setting
5284 the default values. */
5285
5286 study = NULL;
5287 md->match_limit = MATCH_LIMIT;
5288 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5289 md->callout_data = NULL;
5290
5291 /* The table pointer is always in native byte order. */
5292
5293 tables = external_re->tables;
5294
5295 if (extra_data != NULL)
5296 {
5297 register unsigned int flags = extra_data->flags;
5298 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5299 study = (const pcre_study_data *)extra_data->study_data;
5300 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5301 md->match_limit = extra_data->match_limit;
5302 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5303 md->match_limit_recursion = extra_data->match_limit_recursion;
5304 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5305 md->callout_data = extra_data->callout_data;
5306 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5307 }
5308
5309 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5310 is a feature that makes it possible to save compiled regexes and re-use them
5311 in other programs later. */
5312
5313 if (tables == NULL) tables = _pcre_default_tables;
5314
5315 /* Check that the first field in the block is the magic number. If it is not,
5316 test for a regex that was compiled on a host of opposite endianness. If this is
5317 the case, flipped values are put in internal_re and internal_study if there was
5318 study data too. */
5319
5320 if (re->magic_number != MAGIC_NUMBER)
5321 {
5322 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5323 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5324 if (study != NULL) study = &internal_study;
5325 }
5326
5327 /* Set up other data */
5328
5329 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5330 startline = (re->flags & PCRE_STARTLINE) != 0;
5331 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5332
5333 /* The code starts after the real_pcre block and the capture name table. */
5334
5335 md->start_code = (const uschar *)external_re + re->name_table_offset +
5336 re->name_count * re->name_entry_size;
5337
5338 md->start_subject = (USPTR)subject;
5339 md->start_offset = start_offset;
5340 md->end_subject = md->start_subject + length;
5341 end_subject = md->end_subject;
5342
5343 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5344 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5345 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5346
5347 md->notbol = (options & PCRE_NOTBOL) != 0;
5348 md->noteol = (options & PCRE_NOTEOL) != 0;
5349 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5350 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5351 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5352 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5353 md->hitend = FALSE;
5354 md->mark = NULL; /* In case never set */
5355
5356 md->recursive = NULL; /* No recursion at top level */
5357
5358 md->lcc = tables + lcc_offset;
5359 md->ctypes = tables + ctypes_offset;
5360
5361 /* Handle different \R options. */
5362
5363 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5364 {
5365 case 0:
5366 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5367 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5368 else
5369 #ifdef BSR_ANYCRLF
5370 md->bsr_anycrlf = TRUE;
5371 #else
5372 md->bsr_anycrlf = FALSE;
5373 #endif
5374 break;
5375
5376 case PCRE_BSR_ANYCRLF:
5377 md->bsr_anycrlf = TRUE;
5378 break;
5379
5380 case PCRE_BSR_UNICODE:
5381 md->bsr_anycrlf = FALSE;
5382 break;
5383
5384 default: return PCRE_ERROR_BADNEWLINE;
5385 }
5386
5387 /* Handle different types of newline. The three bits give eight cases. If
5388 nothing is set at run time, whatever was used at compile time applies. */
5389
5390 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5391 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5392 {
5393 case 0: newline = NEWLINE; break; /* Compile-time default */
5394 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5395 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5396 case PCRE_NEWLINE_CR+
5397 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5398 case PCRE_NEWLINE_ANY: newline = -1; break;
5399 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5400 default: return PCRE_ERROR_BADNEWLINE;
5401 }
5402
5403 if (newline == -2)
5404 {
5405 md->nltype = NLTYPE_ANYCRLF;
5406 }
5407 else if (newline < 0)
5408 {
5409 md->nltype = NLTYPE_ANY;
5410 }
5411 else
5412 {
5413 md->nltype = NLTYPE_FIXED;
5414 if (newline > 255)
5415 {
5416 md->nllen = 2;
5417 md->nl[0] = (newline >> 8) & 255;
5418 md->nl[1] = newline & 255;
5419 }
5420 else
5421 {
5422 md->nllen = 1;
5423 md->nl[0] = newline;
5424 }
5425 }
5426
5427 /* Partial matching was originally supported only for a restricted set of
5428 regexes; from release 8.00 there are no restrictions, but the bits are still
5429 defined (though never set). So there's no harm in leaving this code. */
5430
5431 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5432 return PCRE_ERROR_BADPARTIAL;
5433
5434 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5435 back the character offset. */
5436
5437 #ifdef SUPPORT_UTF8
5438 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5439 {
5440 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5441 return PCRE_ERROR_BADUTF8;
5442 if (start_offset > 0 && start_offset < length)
5443 {
5444 int tb = ((USPTR)subject)[start_offset];
5445 if (tb > 127)
5446 {
5447 tb &= 0xc0;
5448 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5449 }
5450 }
5451 }
5452 #endif
5453
5454 /* The ims options can vary during the matching as a result of the presence
5455 of (?ims) items in the pattern. They are kept in a local variable so that
5456 restoring at the exit of a group is easy. */
5457
5458 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5459
5460 /* If the expression has got more back references than the offsets supplied can
5461 hold, we get a temporary chunk of working store to use during the matching.
5462 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5463 of 3. */
5464
5465 ocount = offsetcount - (offsetcount % 3);
5466
5467 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5468 {
5469 ocount = re->top_backref * 3 + 3;
5470 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5471 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5472 using_temporary_offsets = TRUE;
5473 DPRINTF(("Got memory to hold back references\n"));
5474 }
5475 else md->offset_vector = offsets;
5476
5477 md->offset_end = ocount;
5478 md->offset_max = (2*ocount)/3;
5479 md->offset_overflow = FALSE;
5480 md->capture_last = -1;
5481
5482 /* Compute the minimum number of offsets that we need to reset each time. Doing
5483 this makes a huge difference to execution time when there aren't many brackets
5484 in the pattern. */
5485
5486 resetcount = 2 + re->top_bracket * 2;
5487 if (resetcount > offsetcount) resetcount = ocount;
5488
5489 /* Reset the working variables associated with each extraction. These should
5490 never be used unless previously set, but they get saved and restored, and so we
5491 initialize them to avoid reading uninitialized locations. */
5492
5493 if (md->offset_vector != NULL)
5494 {
5495 register int *iptr = md->offset_vector + ocount;
5496 register int *iend = iptr - resetcount/2 + 1;
5497 while (--iptr >= iend) *iptr = -1;
5498 }
5499
5500 /* Set up the first character to match, if available. The first_byte value is
5501 never set for an anchored regular expression, but the anchoring may be forced
5502 at run time, so we have to test for anchoring. The first char may be unset for
5503 an unanchored pattern, of course. If there's no first char and the pattern was
5504 studied, there may be a bitmap of possible first characters. */
5505
5506 if (!anchored)
5507 {
5508 if ((re->flags & PCRE_FIRSTSET) != 0)
5509 {
5510 first_byte = re->first_byte & 255;
5511 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5512 first_byte = md->lcc[first_byte];
5513 }
5514 else
5515 if (!startline && study != NULL &&
5516 (study->flags & PCRE_STUDY_MAPPED) != 0)
5517 start_bits = study->start_bits;
5518 }
5519
5520 /* For anchored or unanchored matches, there may be a "last known required
5521 character" set. */
5522
5523 if ((re->flags & PCRE_REQCHSET) != 0)
5524 {
5525 req_byte = re->req_byte & 255;
5526 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5527 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5528 }
5529
5530
5531 /* ==========================================================================*/
5532
5533 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5534 the loop runs just once. */
5535
5536 for(;;)
5537 {
5538 USPTR save_end_subject = end_subject;
5539 USPTR new_start_match;
5540
5541 /* Reset the maximum number of extractions we might see. */
5542
5543 if (md->offset_vector != NULL)
5544 {
5545 register int *iptr = md->offset_vector;
5546 register int *iend = iptr + resetcount;
5547 while (iptr < iend) *iptr++ = -1;
5548 }
5549
5550 /* If firstline is TRUE, the start of the match is constrained to the first
5551 line of a multiline string. That is, the match must be before or at the first
5552 newline. Implement this by temporarily adjusting end_subject so that we stop
5553 scanning at a newline. If the match fails at the newline, later code breaks
5554 this loop. */
5555
5556 if (firstline)
5557 {
5558 USPTR t = start_match;
5559 #ifdef SUPPORT_UTF8
5560 if (utf8)
5561 {
5562 while (t < md->end_subject && !IS_NEWLINE(t))
5563 {
5564 t++;
5565 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5566 }
5567 }
5568 else
5569 #endif
5570 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5571 end_subject = t;
5572 }
5573
5574 /* There are some optimizations that avoid running the match if a known
5575 starting point is not found, or if a known later character is not present.
5576 However, there is an option that disables these, for testing and for ensuring
5577 that all callouts do actually occur. */
5578
5579 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5580 {
5581 /* Advance to a unique first byte if there is one. */
5582
5583 if (first_byte >= 0)
5584 {
5585 if (first_byte_caseless)
5586 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5587 start_match++;
5588 else
5589 while (start_match < end_subject && *start_match != first_byte)
5590 start_match++;
5591 }
5592
5593 /* Or to just after a linebreak for a multiline match */
5594
5595 else if (startline)
5596 {
5597 if (start_match > md->start_subject + start_offset)
5598 {
5599 #ifdef SUPPORT_UTF8
5600 if (utf8)
5601 {
5602 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5603 {
5604 start_match++;
5605 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5606 start_match++;
5607 }
5608 }
5609 else
5610 #endif
5611 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5612 start_match++;
5613
5614 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5615 and we are now at a LF, advance the match position by one more character.
5616 */
5617
5618 if (start_match[-1] == CHAR_CR &&
5619 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5620 start_match < end_subject &&
5621 *start_match == CHAR_NL)
5622 start_match++;
5623 }
5624 }
5625
5626 /* Or to a non-unique first byte after study */
5627
5628 else if (start_bits != NULL)
5629 {
5630 while (start_match < end_subject)
5631 {
5632 register unsigned int c = *start_match;
5633 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5634 else break;
5635 }
5636 }
5637 } /* Starting optimizations */
5638
5639 /* Restore fudged end_subject */
5640
5641 end_subject = save_end_subject;
5642
5643 /* The following two optimizations are disabled for partial matching or if
5644 disabling is explicitly requested. */
5645
5646 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5647 {
5648 /* If the pattern was studied, a minimum subject length may be set. This is
5649 only a lower bound: there may be no string of exactly that length that
5650 matches the pattern. Although the value is, strictly, in characters, we treat
5651 it as bytes to avoid spending too much time in this optimization. */
5652
5653 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5654 (pcre_uint32)(end_subject - start_match) < study->minlength)
5655 {
5656 rc = MATCH_NOMATCH;
5657 break;
5658 }
5659
5660 /* If req_byte is set, we know that that character must appear in the
5661 subject for the match to succeed. If the first character is set, req_byte
5662 must be later in the subject; otherwise the test starts at the match point.
5663 This optimization can save a huge amount of backtracking in patterns with
5664 nested unlimited repeats that aren't going to match. Writing separate code
5665 for cased/caseless versions makes it go faster, as does using an
5666 autoincrement and backing off on a match.
5667
5668 HOWEVER: when the subject string is very, very long, searching to its end
5669 can take a long time, and give bad performance on quite ordinary patterns.
5670 This showed up when somebody was matching something like /^\d+C/ on a
5671 32-megabyte string... so we don't do this when the string is sufficiently
5672 long. */
5673
5674 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5675 {
5676 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5677
5678 /* We don't need to repeat the search if we haven't yet reached the
5679 place we found it at last time. */
5680
5681 if (p > req_byte_ptr)
5682 {
5683 if (req_byte_caseless)
5684 {
5685 while (p < end_subject)
5686 {
5687 register int pp = *p++;
5688 if (pp == req_byte || pp == req_byte2) { p--; break; }
5689 }
5690 }
5691 else
5692 {
5693 while (p < end_subject)
5694 {
5695 if (*p++ == req_byte) { p--; break; }
5696 }
5697 }
5698
5699 /* If we can't find the required character, break the matching loop,
5700 forcing a match failure. */
5701
5702 if (p >= end_subject)
5703 {
5704 rc = MATCH_NOMATCH;
5705 break;
5706 }
5707
5708 /* If we have found the required character, save the point where we
5709 found it, so that we don't search again next time round the loop if
5710 the start hasn't passed this character yet. */
5711
5712 req_byte_ptr = p;
5713 }
5714 }
5715 }
5716
5717 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5718 printf(">>>> Match against: ");
5719 pchars(start_match, end_subject - start_match, TRUE, md);
5720 printf("\n");
5721 #endif
5722
5723 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5724 first starting point for which a partial match was found. */
5725
5726 md->start_match_ptr = start_match;
5727 md->start_used_ptr = start_match;
5728 md->match_call_count = 0;
5729 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
5730 0, 0);
5731 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5732
5733 switch(rc)
5734 {
5735 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
5736 this level it means that a MARK that matched the SKIP's arg was not found.
5737 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
5738
5739 case MATCH_NOMATCH:
5740 case MATCH_PRUNE:
5741 case MATCH_SKIP_ARG:
5742 case MATCH_THEN:
5743 new_start_match = start_match + 1;
5744 #ifdef SUPPORT_UTF8
5745 if (utf8)
5746 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5747 new_start_match++;
5748 #endif
5749 break;
5750
5751 /* SKIP passes back the next starting point explicitly. */
5752
5753 case MATCH_SKIP:
5754 new_start_match = md->start_match_ptr;
5755 break;
5756
5757 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5758
5759 case MATCH_COMMIT:
5760 rc = MATCH_NOMATCH;
5761 goto ENDLOOP;
5762
5763 /* Any other return is either a match, or some kind of error. */
5764
5765 default:
5766 goto ENDLOOP;
5767 }
5768
5769 /* Control reaches here for the various types of "no match at this point"
5770 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5771
5772 rc = MATCH_NOMATCH;
5773
5774 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5775 newline in the subject (though it may continue over the newline). Therefore,
5776 if we have just failed to match, starting at a newline, do not continue. */
5777
5778 if (firstline && IS_NEWLINE(start_match)) break;
5779
5780 /* Advance to new matching position */
5781
5782 start_match = new_start_match;
5783
5784 /* Break the loop if the pattern is anchored or if we have passed the end of
5785 the subject. */
5786
5787 if (anchored || start_match > end_subject) break;
5788
5789 /* If we have just passed a CR and we are now at a LF, and the pattern does
5790 not contain any explicit matches for \r or \n, and the newline option is CRLF
5791 or ANY or ANYCRLF, advance the match position by one more character. */
5792
5793 if (start_match[-1] == CHAR_CR &&
5794 start_match < end_subject &&
5795 *start_match == CHAR_NL &&
5796 (re->flags & PCRE_HASCRORLF) == 0 &&
5797 (md->nltype == NLTYPE_ANY ||
5798 md->nltype == NLTYPE_ANYCRLF ||
5799 md->nllen == 2))
5800 start_match++;
5801
5802 md->mark = NULL; /* Reset for start of next match attempt */
5803 } /* End of for(;;) "bumpalong" loop */
5804
5805 /* ==========================================================================*/
5806
5807 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5808 conditions is true:
5809
5810 (1) The pattern is anchored or the match was failed by (*COMMIT);
5811
5812 (2) We are past the end of the subject;
5813
5814 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5815 this option requests that a match occur at or before the first newline in
5816 the subject.
5817
5818 When we have a match and the offset vector is big enough to deal with any
5819 backreferences, captured substring offsets will already be set up. In the case
5820 where we had to get some local store to hold offsets for backreference
5821 processing, copy those that we can. In this case there need not be overflow if
5822 certain parts of the pattern were not used, even though there are more
5823 capturing parentheses than vector slots. */
5824
5825 ENDLOOP:
5826
5827 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
5828 {
5829 if (using_temporary_offsets)
5830 {
5831 if (offsetcount >= 4)
5832 {
5833 memcpy(offsets + 2, md->offset_vector + 2,
5834 (offsetcount - 2) * sizeof(int));
5835 DPRINTF(("Copied offsets from temporary memory\n"));
5836 }
5837 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5838 DPRINTF(("Freeing temporary memory\n"));
5839 (pcre_free)(md->offset_vector);
5840 }
5841
5842 /* Set the return code to the number of captured strings, or 0 if there are
5843 too many to fit into the vector. */
5844
5845 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5846
5847 /* If there is space, set up the whole thing as substring 0. The value of
5848 md->start_match_ptr might be modified if \K was encountered on the
5849 successful matching path. */
5850
5851 if (offsetcount < 2) rc = 0; else
5852 {
5853 offsets[0] = md->start_match_ptr - md->start_subject;
5854 offsets[1] = md->end_match_ptr - md->start_subject;
5855 }
5856
5857 DPRINTF((">>>> returning %d\n", rc));
5858 goto RETURN_MARK;
5859 }
5860
5861 /* Control gets here if there has been an error, or if the overall match
5862 attempt has failed at all permitted starting positions. */
5863
5864 if (using_temporary_offsets)
5865 {
5866 DPRINTF(("Freeing temporary memory\n"));
5867 (pcre_free)(md->offset_vector);
5868 }
5869
5870 /* For anything other than nomatch or partial match, just return the code. */
5871
5872 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5873 {
5874 DPRINTF((">>>> error: returning %d\n", rc));
5875 return rc;
5876 }
5877
5878 /* Handle partial matches - disable any mark data */
5879
5880 if (start_partial != NULL)
5881 {
5882 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5883 md->mark = NULL;
5884 if (offsetcount > 1)
5885 {
5886 offsets[0] = start_partial - (USPTR)subject;
5887 offsets[1] = end_subject - (USPTR)subject;
5888 }
5889 rc = PCRE_ERROR_PARTIAL;
5890 }
5891
5892 /* This is the classic nomatch case */
5893
5894 else
5895 {
5896 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5897 rc = PCRE_ERROR_NOMATCH;
5898 }
5899
5900 /* Return the MARK data if it has been requested. */
5901
5902 RETURN_MARK:
5903
5904 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
5905 *(extra_data->mark) = (unsigned char *)(md->mark);
5906 return rc;
5907 }
5908
5909 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5