/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 512 - (show annotations)
Tue Mar 30 11:11:52 2010 UTC (5 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 174898 byte(s)
Error occurred while calculating annotation data.
Fix compile problems when heap is in use
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_ACCEPT (-999)
75 #define MATCH_COMMIT (-998)
76 #define MATCH_PRUNE (-997)
77 #define MATCH_SKIP (-996)
78 #define MATCH_SKIP_ARG (-995)
79 #define MATCH_THEN (-994)
80
81 /* This is a convenience macro for code that occurs many times. */
82
83 #define MRRETURN(ra) \
84 { \
85 md->mark = markptr; \
86 RRETURN(ra); \
87 }
88
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
92
93 #define REC_STACK_SAVE_MAX 30
94
95 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
96
97 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
98 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99
100
101
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
106
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if requested.
109
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE
115
116 Returns: nothing
117 */
118
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121 {
122 unsigned int c;
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126 }
127 #endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
144
145 Returns: TRUE if matched
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if not enough characters left */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length;
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188 }
189 }
190 else
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE;
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
250 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
251 below must be updated in sync. */
252
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,  RM8,  RM9,  RM10, RM11, RM12,
  RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30,
  RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42,
  RM43, RM44, RM45, RM46, RM47, RM48,
  RM49, RM50, RM51, RM52, RM53, RM54,
  RM55, RM56, RM57, RM58
};
259
260 /* These versions of the macros use the stack, as normal. There are debugging
261 versions and production versions. Note that the "rw" argument of RMATCH isn't
262 actually used in this definition. */
263
264 #ifndef NO_RECURSE
265 #define REGISTER register
266
267 #ifdef PCRE_DEBUG
268 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
269 { \
270 printf("match() called in line %d\n", __LINE__); \
271 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
272 printf("to line %d\n", __LINE__); \
273 }
274 #define RRETURN(ra) \
275 { \
276 printf("match() returned %d from line %d ", ra, __LINE__); \
277 return ra; \
278 }
279 #else
280 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
281 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
282 #define RRETURN(ra) return ra
283 #endif
284
285 #else
286
287
288 /* These versions of the macros manage a private stack on the heap. Note that
289 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
290 argument of match(), which never changes. */
291
292 #define REGISTER
293
/* Heap version of RMATCH. A new frame is obtained with pcre_stack_malloc,
the resume point (rw) is recorded in the current frame, the arguments for the
"call" are copied into the new frame, and control jumps to HEAP_RECURSE. The
label L_<rw> is where execution continues once the "callee" has returned.

If the frame cannot be allocated, the current invocation gives up with
PCRE_ERROR_NOMEMORY via RRETURN instead of writing through a NULL pointer.
RRETURN is expanded at the point of use, so it does not matter that it is
defined after this macro. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
314
315 #define RRETURN(ra)\
316 {\
317 heapframe *newframe = frame;\
318 frame = newframe->Xprevframe;\
319 (pcre_stack_free)(newframe);\
320 if (frame != NULL)\
321 {\
322 rrc = ra;\
323 goto HEAP_RETURN;\
324 }\
325 return ra;\
326 }
327
328
329 /* Structure for remembering the local variables in a private frame */
330
331 typedef struct heapframe {
332 struct heapframe *Xprevframe;
333
334 /* Function arguments that may change */
335
336 USPTR Xeptr;
337 const uschar *Xecode;
338 USPTR Xmstart;
339 USPTR Xmarkptr;
340 int Xoffset_top;
341 long int Xims;
342 eptrblock *Xeptrb;
343 int Xflags;
344 unsigned int Xrdepth;
345
346 /* Function local variables */
347
348 USPTR Xcallpat;
349 #ifdef SUPPORT_UTF8
350 USPTR Xcharptr;
351 #endif
352 USPTR Xdata;
353 USPTR Xnext;
354 USPTR Xpp;
355 USPTR Xprev;
356 USPTR Xsaved_eptr;
357
358 recursion_info Xnew_recursive;
359
360 BOOL Xcur_is_word;
361 BOOL Xcondition;
362 BOOL Xprev_is_word;
363
364 unsigned long int Xoriginal_ims;
365
366 #ifdef SUPPORT_UCP
367 int Xprop_type;
368 int Xprop_value;
369 int Xprop_fail_result;
370 int Xprop_category;
371 int Xprop_chartype;
372 int Xprop_script;
373 int Xoclength;
374 uschar Xocchars[8];
375 #endif
376
377 int Xcodelink;
378 int Xctype;
379 unsigned int Xfc;
380 int Xfi;
381 int Xlength;
382 int Xmax;
383 int Xmin;
384 int Xnumber;
385 int Xoffset;
386 int Xop;
387 int Xsave_capture_last;
388 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
389 int Xstacksave[REC_STACK_SAVE_MAX];
390
391 eptrblock Xnewptrb;
392
393 /* Where to jump back to */
394
395 int Xwhere;
396
397 } heapframe;
398
399 #endif
400
401
402 /***************************************************************************
403 ***************************************************************************/
404
405
406
407 /*************************************************
408 * Match from current position *
409 *************************************************/
410
411 /* This function is called recursively in many circumstances. Whenever it
412 returns a negative (error) response, the outer incarnation must also return the
413 same response. */
414
415 /* These macros pack up tests that are used for partial matching, and which
416 appear several times in the code. We set the "hit end" flag if the pointer is
417 at the end of the subject and also past the start of the subject (i.e.
418 something has been matched). For hard partial matching, we then return
419 immediately. The second one is used when we already know we are past the end of
420 the subject. */
421
422 #define CHECK_PARTIAL()\
423 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
424 {\
425 md->hitend = TRUE;\
426 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
427 }
428
429 #define SCHECK_PARTIAL()\
430 if (md->partial != 0 && eptr > mstart)\
431 {\
432 md->hitend = TRUE;\
433 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
434 }
435
436
437 /* Performance note: It might be tempting to extract commonly used fields from
438 the md structure (e.g. utf8, end_subject) into individual variables to improve
439 performance. Tests using gcc on a SPARC disproved this; in the first case, it
440 made performance worse.
441
442 Arguments:
443 eptr pointer to current character in subject
444 ecode pointer to current position in compiled code
445 mstart pointer to the current match start position (can be modified
446 by encountering \K)
447 markptr pointer to the most recent MARK name, or NULL
448 offset_top current top pointer
449 md pointer to "static" info for the match
450 ims current /i, /m, and /s options
451 eptrb pointer to chain of blocks containing eptr at start of
452 brackets - for testing for empty matches
453 flags can contain
454 match_condassert - this is an assertion condition
455 match_cbegroup - this is the start of an unlimited repeat
456 group that can match an empty string
457 rdepth the recursion depth
458
459 Returns: MATCH_MATCH if matched ) these values are >= 0
460 MATCH_NOMATCH if failed to match )
461 a negative MATCH_xxx value for PRUNE, SKIP, etc
462 a negative PCRE_ERROR_xxx value if aborted by an error condition
463 (e.g. stopped by repeated call or recursion limit)
464 */
465
466 static int
467 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
468 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
469 eptrblock *eptrb, int flags, unsigned int rdepth)
470 {
471 /* These variables do not need to be preserved over recursion in this function,
472 so they can be ordinary variables in all cases. Mark some of them with
473 "register" because they are used a lot in loops. */
474
475 register int rrc; /* Returns from recursive calls */
476 register int i; /* Used for loops not involving calls to RMATCH() */
477 register unsigned int c; /* Character values not kept over RMATCH() calls */
478 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
479
480 BOOL minimize, possessive; /* Quantifier options */
481 int condcode;
482
483 /* When recursion is not being used, all "local" variables that have to be
484 preserved over calls to RMATCH() are part of a "frame" which is obtained from
485 heap storage. Set up the top-level frame here; others are obtained from the
486 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
487
488 #ifdef NO_RECURSE
489 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
490 frame->Xprevframe = NULL; /* Marks the top level */
491
492 /* Copy in the original argument variables */
493
494 frame->Xeptr = eptr;
495 frame->Xecode = ecode;
496 frame->Xmstart = mstart;
497 frame->Xmarkptr = markptr;
498 frame->Xoffset_top = offset_top;
499 frame->Xims = ims;
500 frame->Xeptrb = eptrb;
501 frame->Xflags = flags;
502 frame->Xrdepth = rdepth;
503
504 /* This is where control jumps back to to effect "recursion" */
505
506 HEAP_RECURSE:
507
508 /* Macros make the argument variables come from the current frame */
509
510 #define eptr frame->Xeptr
511 #define ecode frame->Xecode
512 #define mstart frame->Xmstart
513 #define markptr frame->Xmarkptr
514 #define offset_top frame->Xoffset_top
515 #define ims frame->Xims
516 #define eptrb frame->Xeptrb
517 #define flags frame->Xflags
518 #define rdepth frame->Xrdepth
519
520 /* Ditto for the local variables */
521
522 #ifdef SUPPORT_UTF8
523 #define charptr frame->Xcharptr
524 #endif
525 #define callpat frame->Xcallpat
526 #define codelink frame->Xcodelink
527 #define data frame->Xdata
528 #define next frame->Xnext
529 #define pp frame->Xpp
530 #define prev frame->Xprev
531 #define saved_eptr frame->Xsaved_eptr
532
533 #define new_recursive frame->Xnew_recursive
534
535 #define cur_is_word frame->Xcur_is_word
536 #define condition frame->Xcondition
537 #define prev_is_word frame->Xprev_is_word
538
539 #define original_ims frame->Xoriginal_ims
540
541 #ifdef SUPPORT_UCP
542 #define prop_type frame->Xprop_type
543 #define prop_value frame->Xprop_value
544 #define prop_fail_result frame->Xprop_fail_result
545 #define prop_category frame->Xprop_category
546 #define prop_chartype frame->Xprop_chartype
547 #define prop_script frame->Xprop_script
548 #define oclength frame->Xoclength
549 #define occhars frame->Xocchars
550 #endif
551
552 #define ctype frame->Xctype
553 #define fc frame->Xfc
554 #define fi frame->Xfi
555 #define length frame->Xlength
556 #define max frame->Xmax
557 #define min frame->Xmin
558 #define number frame->Xnumber
559 #define offset frame->Xoffset
560 #define op frame->Xop
561 #define save_capture_last frame->Xsave_capture_last
562 #define save_offset1 frame->Xsave_offset1
563 #define save_offset2 frame->Xsave_offset2
564 #define save_offset3 frame->Xsave_offset3
565 #define stacksave frame->Xstacksave
566
567 #define newptrb frame->Xnewptrb
568
569 /* When recursion is being used, local variables are allocated on the stack and
570 get preserved during recursion in the normal way. In this environment, fi and
571 i, and fc and c, can be the same variables. */
572
573 #else /* NO_RECURSE not defined */
574 #define fi i
575 #define fc c
576
577
578 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
579 const uschar *charptr; /* in small blocks of the code. My normal */
580 #endif /* style of coding would have declared */
581 const uschar *callpat; /* them within each of those blocks. */
582 const uschar *data; /* However, in order to accommodate the */
583 const uschar *next; /* version of this code that uses an */
584 USPTR pp; /* external "stack" implemented on the */
585 const uschar *prev; /* heap, it is easier to declare them all */
586 USPTR saved_eptr; /* here, so the declarations can be cut */
587 /* out in a block. The only declarations */
588 recursion_info new_recursive; /* within blocks below are for variables */
589 /* that do not have to be preserved over */
590 BOOL cur_is_word; /* a recursive call to RMATCH(). */
591 BOOL condition;
592 BOOL prev_is_word;
593
594 unsigned long int original_ims;
595
596 #ifdef SUPPORT_UCP
597 int prop_type;
598 int prop_value;
599 int prop_fail_result;
600 int prop_category;
601 int prop_chartype;
602 int prop_script;
603 int oclength;
604 uschar occhars[8];
605 #endif
606
607 int codelink;
608 int ctype;
609 int length;
610 int max;
611 int min;
612 int number;
613 int offset;
614 int op;
615 int save_capture_last;
616 int save_offset1, save_offset2, save_offset3;
617 int stacksave[REC_STACK_SAVE_MAX];
618
619 eptrblock newptrb;
620 #endif /* NO_RECURSE */
621
622 /* These statements are here to stop the compiler complaining about uninitialized
623 variables. */
624
625 #ifdef SUPPORT_UCP
626 prop_value = 0;
627 prop_fail_result = 0;
628 #endif
629
630
631 /* This label is used for tail recursion, which is used in a few cases even
632 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
633 used. Thanks to Ian Taylor for noticing this possibility and sending the
634 original patch. */
635
636 TAIL_RECURSE:
637
638 /* OK, now we can get on with the real code of the function. Recursive calls
639 are specified by the macro RMATCH and RRETURN is used to return. When
640 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
641 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
642 defined). However, RMATCH isn't like a function call because it's quite a
643 complicated macro. It has to be used in one particular way. This shouldn't,
644 however, impact performance when true recursion is being used. */
645
646 #ifdef SUPPORT_UTF8
647 utf8 = md->utf8; /* Local copy of the flag */
648 #else
649 utf8 = FALSE;
650 #endif
651
652 /* First check that we haven't called match() too many times, or that we
653 haven't exceeded the recursive call limit. */
654
655 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
656 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
657
658 original_ims = ims; /* Save for resetting on ')' */
659
660 /* At the start of a group with an unlimited repeat that may match an empty
661 string, the match_cbegroup flag is set. When this is the case, add the current
662 subject pointer to the chain of such remembered pointers, to be checked when we
663 hit the closing ket, in order to break infinite loops that match no characters.
664 When match() is called in other circumstances, don't add to the chain. The
665 match_cbegroup flag must NOT be used with tail recursion, because the memory
666 block that is used is on the stack, so a new one may be required for each
667 match(). */
668
669 if ((flags & match_cbegroup) != 0)
670 {
671 newptrb.epb_saved_eptr = eptr;
672 newptrb.epb_prev = eptrb;
673 eptrb = &newptrb;
674 }
675
676 /* Now start processing the opcodes. */
677
678 for (;;)
679 {
680 minimize = possessive = FALSE;
681 op = *ecode;
682
683 switch(op)
684 {
685 case OP_MARK:
686 markptr = ecode + 2;
687 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
688 ims, eptrb, flags, RM55);
689
690 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
691 argument, and we must check whether that argument matches this MARK's
692 argument. It is passed back in md->start_match_ptr (an overloading of that
693 variable). If it does match, we reset that variable to the current subject
694 position and return MATCH_SKIP. Otherwise, pass back the return code
695 unaltered. */
696
697 if (rrc == MATCH_SKIP_ARG &&
698 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
699 {
700 md->start_match_ptr = eptr;
701 RRETURN(MATCH_SKIP);
702 }
703
704 if (md->mark == NULL) md->mark = markptr;
705 RRETURN(rrc);
706
707 case OP_FAIL:
708 MRRETURN(MATCH_NOMATCH);
709
710 case OP_COMMIT:
711 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
712 ims, eptrb, flags, RM52);
713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
714 MRRETURN(MATCH_COMMIT);
715
716 case OP_PRUNE:
717 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
718 ims, eptrb, flags, RM51);
719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
720 MRRETURN(MATCH_PRUNE);
721
722 case OP_PRUNE_ARG:
723 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
724 ims, eptrb, flags, RM56);
725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
726 md->mark = ecode + 2;
727 RRETURN(MATCH_PRUNE);
728
729 case OP_SKIP:
730 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
731 ims, eptrb, flags, RM53);
732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
733 md->start_match_ptr = eptr; /* Pass back current position */
734 MRRETURN(MATCH_SKIP);
735
736 case OP_SKIP_ARG:
737 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
738 ims, eptrb, flags, RM57);
739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
740
741 /* Pass back the current skip name by overloading md->start_match_ptr and
742 returning the special MATCH_SKIP_ARG return code. This will either be
743 caught by a matching MARK, or get to the top, where it is treated the same
744 as PRUNE. */
745
746 md->start_match_ptr = ecode + 2;
747 RRETURN(MATCH_SKIP_ARG);
748
749 case OP_THEN:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ims, eptrb, flags, RM54);
752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
753 MRRETURN(MATCH_THEN);
754
755 case OP_THEN_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ims, eptrb, flags, RM58);
758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_THEN);
761
762 /* Handle a capturing bracket. If there is space in the offset vector, save
763 the current subject position in the working slot at the top of the vector.
764 We mustn't change the current values of the data slot, because they may be
765 set from a previous iteration of this group, and be referred to by a
766 reference inside the group.
767
768 If the bracket fails to match, we need to restore this value and also the
769 values of the final offsets, in case they were set by a previous iteration
770 of the same bracket.
771
772 If there isn't enough space in the offset vector, treat this as if it were
773 a non-capturing bracket. Don't worry about setting the flag for the error
774 case here; that is handled in the code for KET. */
775
776 case OP_CBRA:
777 case OP_SCBRA:
778 number = GET2(ecode, 1+LINK_SIZE);
779 offset = number << 1;
780
781 #ifdef PCRE_DEBUG
782 printf("start bracket %d\n", number);
783 printf("subject=");
784 pchars(eptr, 16, TRUE, md);
785 printf("\n");
786 #endif
787
788 if (offset < md->offset_max)
789 {
790 save_offset1 = md->offset_vector[offset];
791 save_offset2 = md->offset_vector[offset+1];
792 save_offset3 = md->offset_vector[md->offset_end - number];
793 save_capture_last = md->capture_last;
794
795 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
796 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
797
798 flags = (op == OP_SCBRA)? match_cbegroup : 0;
799 do
800 {
801 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
802 ims, eptrb, flags, RM1);
803 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
804 md->capture_last = save_capture_last;
805 ecode += GET(ecode, 1);
806 }
807 while (*ecode == OP_ALT);
808
809 DPRINTF(("bracket %d failed\n", number));
810
811 md->offset_vector[offset] = save_offset1;
812 md->offset_vector[offset+1] = save_offset2;
813 md->offset_vector[md->offset_end - number] = save_offset3;
814
815 if (rrc != MATCH_THEN) md->mark = markptr;
816 RRETURN(MATCH_NOMATCH);
817 }
818
819 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
820 as a non-capturing bracket. */
821
822 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
823 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
824
825 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
826
827 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
828 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
829
830 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
831 final alternative within the brackets, we would return the result of a
832 recursive call to match() whatever happened. We can reduce stack usage by
833 turning this into a tail recursion, except in the case when match_cbegroup
834 is set.*/
835
836 case OP_BRA:
837 case OP_SBRA:
838 DPRINTF(("start non-capturing bracket\n"));
839 flags = (op >= OP_SBRA)? match_cbegroup : 0;
840 for (;;)
841 {
842 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
843 {
844 if (flags == 0) /* Not a possibly empty group */
845 {
846 ecode += _pcre_OP_lengths[*ecode];
847 DPRINTF(("bracket 0 tail recursion\n"));
848 goto TAIL_RECURSE;
849 }
850
851 /* Possibly empty group; can't use tail recursion. */
852
853 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
854 eptrb, flags, RM48);
855 if (rrc == MATCH_NOMATCH) md->mark = markptr;
856 RRETURN(rrc);
857 }
858
859 /* For non-final alternatives, continue the loop for a NOMATCH result;
860 otherwise return. */
861
862 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
863 eptrb, flags, RM2);
864 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
865 ecode += GET(ecode, 1);
866 }
867 /* Control never reaches here. */
868
869 /* Conditional group: compilation checked that there are no more than
870 two branches. If the condition is false, skipping the first branch takes us
871 past the end if there is only one branch, but that's OK because that is
872 exactly what going to the ket would do. As there is only one branch to be
873 obeyed, we can use tail recursion to avoid using another stack frame. */
874
875 case OP_COND:
876 case OP_SCOND:
877 codelink= GET(ecode, 1);
878
879 /* Because of the way auto-callout works during compile, a callout item is
880 inserted between OP_COND and an assertion condition. */
881
882 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
883 {
884 if (pcre_callout != NULL)
885 {
886 pcre_callout_block cb;
887 cb.version = 1; /* Version 1 of the callout block */
888 cb.callout_number = ecode[LINK_SIZE+2];
889 cb.offset_vector = md->offset_vector;
890 cb.subject = (PCRE_SPTR)md->start_subject;
891 cb.subject_length = md->end_subject - md->start_subject;
892 cb.start_match = mstart - md->start_subject;
893 cb.current_position = eptr - md->start_subject;
894 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
895 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
896 cb.capture_top = offset_top/2;
897 cb.capture_last = md->capture_last;
898 cb.callout_data = md->callout_data;
899 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
900 if (rrc < 0) RRETURN(rrc);
901 }
902 ecode += _pcre_OP_lengths[OP_CALLOUT];
903 }
904
905 condcode = ecode[LINK_SIZE+1];
906
907 /* Now see what the actual condition is */
908
909 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
910 {
911 if (md->recursive == NULL) /* Not recursing => FALSE */
912 {
913 condition = FALSE;
914 ecode += GET(ecode, 1);
915 }
916 else
917 {
918 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
919 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
920
921 /* If the test is for recursion into a specific subpattern, and it is
922 false, but the test was set up by name, scan the table to see if the
923 name refers to any other numbers, and test them. The condition is true
924 if any one is set. */
925
926 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
927 {
928 uschar *slotA = md->name_table;
929 for (i = 0; i < md->name_count; i++)
930 {
931 if (GET2(slotA, 0) == recno) break;
932 slotA += md->name_entry_size;
933 }
934
935 /* Found a name for the number - there can be only one; duplicate
936 names for different numbers are allowed, but not vice versa. First
937 scan down for duplicates. */
938
939 if (i < md->name_count)
940 {
941 uschar *slotB = slotA;
942 while (slotB > md->name_table)
943 {
944 slotB -= md->name_entry_size;
945 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
946 {
947 condition = GET2(slotB, 0) == md->recursive->group_num;
948 if (condition) break;
949 }
950 else break;
951 }
952
953 /* Scan up for duplicates */
954
955 if (!condition)
956 {
957 slotB = slotA;
958 for (i++; i < md->name_count; i++)
959 {
960 slotB += md->name_entry_size;
961 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
962 {
963 condition = GET2(slotB, 0) == md->recursive->group_num;
964 if (condition) break;
965 }
966 else break;
967 }
968 }
969 }
970 }
971
972 /* Choose branch according to the condition */
973
974 ecode += condition? 3 : GET(ecode, 1);
975 }
976 }
977
978 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
979 {
980 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
981 condition = offset < offset_top && md->offset_vector[offset] >= 0;
982
983 /* If the numbered capture is unset, but the reference was by name,
984 scan the table to see if the name refers to any other numbers, and test
985 them. The condition is true if any one is set. This is tediously similar
986 to the code above, but not close enough to try to amalgamate. */
987
988 if (!condition && condcode == OP_NCREF)
989 {
990 int refno = offset >> 1;
991 uschar *slotA = md->name_table;
992
993 for (i = 0; i < md->name_count; i++)
994 {
995 if (GET2(slotA, 0) == refno) break;
996 slotA += md->name_entry_size;
997 }
998
999 /* Found a name for the number - there can be only one; duplicate names
1000 for different numbers are allowed, but not vice versa. First scan down
1001 for duplicates. */
1002
1003 if (i < md->name_count)
1004 {
1005 uschar *slotB = slotA;
1006 while (slotB > md->name_table)
1007 {
1008 slotB -= md->name_entry_size;
1009 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1010 {
1011 offset = GET2(slotB, 0) << 1;
1012 condition = offset < offset_top &&
1013 md->offset_vector[offset] >= 0;
1014 if (condition) break;
1015 }
1016 else break;
1017 }
1018
1019 /* Scan up for duplicates */
1020
1021 if (!condition)
1022 {
1023 slotB = slotA;
1024 for (i++; i < md->name_count; i++)
1025 {
1026 slotB += md->name_entry_size;
1027 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1028 {
1029 offset = GET2(slotB, 0) << 1;
1030 condition = offset < offset_top &&
1031 md->offset_vector[offset] >= 0;
1032 if (condition) break;
1033 }
1034 else break;
1035 }
1036 }
1037 }
1038 }
1039
1040 /* Choose branch according to the condition */
1041
1042 ecode += condition? 3 : GET(ecode, 1);
1043 }
1044
1045 else if (condcode == OP_DEF) /* DEFINE - always false */
1046 {
1047 condition = FALSE;
1048 ecode += GET(ecode, 1);
1049 }
1050
1051 /* The condition is an assertion. Call match() to evaluate it - setting
1052 the final argument match_condassert causes it to stop at the end of an
1053 assertion. */
1054
1055 else
1056 {
1057 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1058 match_condassert, RM3);
1059 if (rrc == MATCH_MATCH)
1060 {
1061 condition = TRUE;
1062 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1063 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1064 }
1065 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1066 {
1067 RRETURN(rrc); /* Need braces because of following else */
1068 }
1069 else
1070 {
1071 condition = FALSE;
1072 ecode += codelink;
1073 }
1074 }
1075
1076 /* We are now at the branch that is to be obeyed. As there is only one,
1077 we can use tail recursion to avoid using another stack frame, except when
1078 match_cbegroup is required for an unlimited repeat of a possibly empty
1079 group. If the second alternative doesn't exist, we can just plough on. */
1080
1081 if (condition || *ecode == OP_ALT)
1082 {
1083 ecode += 1 + LINK_SIZE;
1084 if (op == OP_SCOND) /* Possibly empty group */
1085 {
1086 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1087 RRETURN(rrc);
1088 }
1089 else /* Group must match something */
1090 {
1091 flags = 0;
1092 goto TAIL_RECURSE;
1093 }
1094 }
1095 else /* Condition false & no alternative */
1096 {
1097 ecode += 1 + LINK_SIZE;
1098 }
1099 break;
1100
1101
1102 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1103 to close any currently open capturing brackets. */
1104
1105 case OP_CLOSE:
1106 number = GET2(ecode, 1);
1107 offset = number << 1;
1108
1109 #ifdef PCRE_DEBUG
1110 printf("end bracket %d at *ACCEPT", number);
1111 printf("\n");
1112 #endif
1113
1114 md->capture_last = number;
1115 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1116 {
1117 md->offset_vector[offset] =
1118 md->offset_vector[md->offset_end - number];
1119 md->offset_vector[offset+1] = eptr - md->start_subject;
1120 if (offset_top <= offset) offset_top = offset + 2;
1121 }
1122 ecode += 3;
1123 break;
1124
1125
1126 /* End of the pattern, either real or forced. If we are in a top-level
1127 recursion, we should restore the offsets appropriately and continue from
1128 after the call. */
1129
1130 case OP_ACCEPT:
1131 case OP_END:
1132 if (md->recursive != NULL && md->recursive->group_num == 0)
1133 {
1134 recursion_info *rec = md->recursive;
1135 DPRINTF(("End of pattern in a (?0) recursion\n"));
1136 md->recursive = rec->prevrec;
1137 memmove(md->offset_vector, rec->offset_save,
1138 rec->saved_max * sizeof(int));
1139 offset_top = rec->save_offset_top;
1140 ims = original_ims;
1141 ecode = rec->after_call;
1142 break;
1143 }
1144
1145 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1146 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1147 the subject. In both cases, backtracking will then try other alternatives,
1148 if any. */
1149
1150 if (eptr == mstart &&
1151 (md->notempty ||
1152 (md->notempty_atstart &&
1153 mstart == md->start_subject + md->start_offset)))
1154 MRRETURN(MATCH_NOMATCH);
1155
1156 /* Otherwise, we have a match. */
1157
1158 md->end_match_ptr = eptr; /* Record where we ended */
1159 md->end_offset_top = offset_top; /* and how many extracts were taken */
1160 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1161
1162 /* For some reason, the macros don't work properly if an expression is
1163 given as the argument to MRRETURN when the heap is in use. */
1164
1165 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1166 MRRETURN(rrc);
1167
1168 /* Change option settings */
1169
1170 case OP_OPT:
1171 ims = ecode[1];
1172 ecode += 2;
1173 DPRINTF(("ims set to %02lx\n", ims));
1174 break;
1175
1176 /* Assertion brackets. Check the alternative branches in turn - the
1177 matching won't pass the KET for an assertion. If any one branch matches,
1178 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1179 start of each branch to move the current point backwards, so the code at
1180 this level is identical to the lookahead case. */
1181
1182 case OP_ASSERT:
1183 case OP_ASSERTBACK:
1184 do
1185 {
1186 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1187 RM4);
1188 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1189 {
1190 mstart = md->start_match_ptr; /* In case \K reset it */
1191 break;
1192 }
1193 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1194 ecode += GET(ecode, 1);
1195 }
1196 while (*ecode == OP_ALT);
1197 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1198
1199 /* If checking an assertion for a condition, return MATCH_MATCH. */
1200
1201 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1202
1203 /* Continue from after the assertion, updating the offsets high water
1204 mark, since extracts may have been taken during the assertion. */
1205
1206 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1207 ecode += 1 + LINK_SIZE;
1208 offset_top = md->end_offset_top;
1209 continue;
1210
1211 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1212 PRUNE, or COMMIT means we must assume failure without checking subsequent
1213 branches. */
1214
1215 case OP_ASSERT_NOT:
1216 case OP_ASSERTBACK_NOT:
1217 do
1218 {
1219 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1220 RM5);
1221 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1222 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1223 {
1224 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1225 break;
1226 }
1227 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1228 ecode += GET(ecode,1);
1229 }
1230 while (*ecode == OP_ALT);
1231
1232 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1233
1234 ecode += 1 + LINK_SIZE;
1235 continue;
1236
1237 /* Move the subject pointer back. This occurs only at the start of
1238 each branch of a lookbehind assertion. If we are too close to the start to
1239 move back, this match function fails. When working with UTF-8 we move
1240 back a number of characters, not bytes. */
1241
1242 case OP_REVERSE:
1243 #ifdef SUPPORT_UTF8
1244 if (utf8)
1245 {
1246 i = GET(ecode, 1);
1247 while (i-- > 0)
1248 {
1249 eptr--;
1250 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1251 BACKCHAR(eptr);
1252 }
1253 }
1254 else
1255 #endif
1256
1257 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1258
1259 {
1260 eptr -= GET(ecode, 1);
1261 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1262 }
1263
1264 /* Save the earliest consulted character, then skip to next op code */
1265
1266 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1267 ecode += 1 + LINK_SIZE;
1268 break;
1269
1270 /* The callout item calls an external function, if one is provided, passing
1271 details of the match so far. This is mainly for debugging, though the
1272 function is able to force a failure. */
1273
1274 case OP_CALLOUT:
1275 if (pcre_callout != NULL)
1276 {
1277 pcre_callout_block cb;
1278 cb.version = 1; /* Version 1 of the callout block */
1279 cb.callout_number = ecode[1];
1280 cb.offset_vector = md->offset_vector;
1281 cb.subject = (PCRE_SPTR)md->start_subject;
1282 cb.subject_length = md->end_subject - md->start_subject;
1283 cb.start_match = mstart - md->start_subject;
1284 cb.current_position = eptr - md->start_subject;
1285 cb.pattern_position = GET(ecode, 2);
1286 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1287 cb.capture_top = offset_top/2;
1288 cb.capture_last = md->capture_last;
1289 cb.callout_data = md->callout_data;
1290 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1291 if (rrc < 0) RRETURN(rrc);
1292 }
1293 ecode += 2 + 2*LINK_SIZE;
1294 break;
1295
1296 /* Recursion either matches the current regex, or some subexpression. The
1297 offset data is the offset to the starting bracket from the start of the
1298 whole pattern. (This is so that it works from duplicated subpatterns.)
1299
1300 If there are any capturing brackets started but not finished, we have to
1301 save their starting points and reinstate them after the recursion. However,
1302 we don't know how many such there are (offset_top records the completed
1303 total) so we just have to save all the potential data. There may be up to
1304 65535 such values, which is too large to put on the stack, but using malloc
1305 for small numbers seems expensive. As a compromise, the stack is used when
1306 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1307 is used. A problem is what to do if the malloc fails ... there is no way of
1308 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1309 values on the stack, and accept that the rest may be wrong.
1310
1311 There are also other values that have to be saved. We use a chained
1312 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1313 for the original version of this logic. */
1314
1315 case OP_RECURSE:
1316 {
1317 callpat = md->start_code + GET(ecode, 1);
1318 new_recursive.group_num = (callpat == md->start_code)? 0 :
1319 GET2(callpat, 1 + LINK_SIZE);
1320
1321 /* Add to "recursing stack" */
1322
1323 new_recursive.prevrec = md->recursive;
1324 md->recursive = &new_recursive;
1325
1326 /* Find where to continue from afterwards */
1327
1328 ecode += 1 + LINK_SIZE;
1329 new_recursive.after_call = ecode;
1330
1331 /* Now save the offset data. */
1332
1333 new_recursive.saved_max = md->offset_end;
1334 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1335 new_recursive.offset_save = stacksave;
1336 else
1337 {
1338 new_recursive.offset_save =
1339 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1340 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1341 }
1342
1343 memcpy(new_recursive.offset_save, md->offset_vector,
1344 new_recursive.saved_max * sizeof(int));
1345 new_recursive.save_offset_top = offset_top;
1346
1347 /* OK, now we can do the recursion. For each top-level alternative we
1348 restore the offset and recursion data. */
1349
1350 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1351 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1352 do
1353 {
1354 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1355 md, ims, eptrb, flags, RM6);
1356 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1357 {
1358 DPRINTF(("Recursion matched\n"));
1359 md->recursive = new_recursive.prevrec;
1360 if (new_recursive.offset_save != stacksave)
1361 (pcre_free)(new_recursive.offset_save);
1362 MRRETURN(MATCH_MATCH);
1363 }
1364 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1365 {
1366 DPRINTF(("Recursion gave error %d\n", rrc));
1367 if (new_recursive.offset_save != stacksave)
1368 (pcre_free)(new_recursive.offset_save);
1369 RRETURN(rrc);
1370 }
1371
1372 md->recursive = &new_recursive;
1373 memcpy(md->offset_vector, new_recursive.offset_save,
1374 new_recursive.saved_max * sizeof(int));
1375 callpat += GET(callpat, 1);
1376 }
1377 while (*callpat == OP_ALT);
1378
1379 DPRINTF(("Recursion didn't match\n"));
1380 md->recursive = new_recursive.prevrec;
1381 if (new_recursive.offset_save != stacksave)
1382 (pcre_free)(new_recursive.offset_save);
1383 MRRETURN(MATCH_NOMATCH);
1384 }
1385 /* Control never reaches here */
1386
1387 /* "Once" brackets are like assertion brackets except that after a match,
1388 the point in the subject string is not moved back. Thus there can never be
1389 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1390 Check the alternative branches in turn - the matching won't pass the KET
1391 for this kind of subpattern. If any one branch matches, we carry on as at
1392 the end of a normal bracket, leaving the subject pointer, but resetting
1393 the start-of-match value in case it was changed by \K. */
1394
1395 case OP_ONCE:
1396 prev = ecode;
1397 saved_eptr = eptr;
1398
1399 do
1400 {
1401 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1402 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1403 {
1404 mstart = md->start_match_ptr;
1405 break;
1406 }
1407 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1408 ecode += GET(ecode,1);
1409 }
1410 while (*ecode == OP_ALT);
1411
1412 /* If we hit the end of the group (which could be repeated), fail */
1413
1414 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1415
1416 /* Continue as from after the assertion, updating the offsets high water
1417 mark, since extracts may have been taken. */
1418
1419 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1420
1421 offset_top = md->end_offset_top;
1422 eptr = md->end_match_ptr;
1423
1424 /* For a non-repeating ket, just continue at this level. This also
1425 happens for a repeating ket if no characters were matched in the group.
1426 This is the forcible breaking of infinite loops as implemented in Perl
1427 5.005. If there is an options reset, it will get obeyed in the normal
1428 course of events. */
1429
1430 if (*ecode == OP_KET || eptr == saved_eptr)
1431 {
1432 ecode += 1+LINK_SIZE;
1433 break;
1434 }
1435
1436 /* The repeating kets try the rest of the pattern or restart from the
1437 preceding bracket, in the appropriate order. The second "call" of match()
1438 uses tail recursion, to avoid using another stack frame. We need to reset
1439 any options that changed within the bracket before re-running it, so
1440 check the next opcode. */
1441
1442 if (ecode[1+LINK_SIZE] == OP_OPT)
1443 {
1444 ims = (ims & ~PCRE_IMS) | ecode[4];
1445 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1446 }
1447
1448 if (*ecode == OP_KETRMIN)
1449 {
1450 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1451 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1452 ecode = prev;
1453 flags = 0;
1454 goto TAIL_RECURSE;
1455 }
1456 else /* OP_KETRMAX */
1457 {
1458 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1460 ecode += 1 + LINK_SIZE;
1461 flags = 0;
1462 goto TAIL_RECURSE;
1463 }
1464 /* Control never gets here */
1465
1466 /* An alternation is the end of a branch; scan along to find the end of the
1467 bracketed group and go to there. */
1468
1469 case OP_ALT:
1470 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1471 break;
1472
1473 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1474 indicating that it may occur zero times. It may repeat infinitely, or not
1475 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1476 with fixed upper repeat limits are compiled as a number of copies, with the
1477 optional ones preceded by BRAZERO or BRAMINZERO. */
1478
1479 case OP_BRAZERO:
1480 {
1481 next = ecode+1;
1482 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1484 do next += GET(next,1); while (*next == OP_ALT);
1485 ecode = next + 1 + LINK_SIZE;
1486 }
1487 break;
1488
1489 case OP_BRAMINZERO:
1490 {
1491 next = ecode+1;
1492 do next += GET(next, 1); while (*next == OP_ALT);
1493 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1495 ecode++;
1496 }
1497 break;
1498
1499 case OP_SKIPZERO:
1500 {
1501 next = ecode+1;
1502 do next += GET(next,1); while (*next == OP_ALT);
1503 ecode = next + 1 + LINK_SIZE;
1504 }
1505 break;
1506
1507 /* End of a group, repeated or non-repeating. */
1508
1509 case OP_KET:
1510 case OP_KETRMIN:
1511 case OP_KETRMAX:
1512 prev = ecode - GET(ecode, 1);
1513
1514 /* If this was a group that remembered the subject start, in order to break
1515 infinite repeats of empty string matches, retrieve the subject start from
1516 the chain. Otherwise, set it NULL. */
1517
1518 if (*prev >= OP_SBRA)
1519 {
1520 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1521 eptrb = eptrb->epb_prev; /* Backup to previous group */
1522 }
1523 else saved_eptr = NULL;
1524
1525 /* If we are at the end of an assertion group or an atomic group, stop
1526 matching and return MATCH_MATCH, but record the current high water mark for
1527 use by positive assertions. We also need to record the match start in case
1528 it was changed by \K. */
1529
1530 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1531 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1532 *prev == OP_ONCE)
1533 {
1534 md->end_match_ptr = eptr; /* For ONCE */
1535 md->end_offset_top = offset_top;
1536 md->start_match_ptr = mstart;
1537 MRRETURN(MATCH_MATCH);
1538 }
1539
1540 /* For capturing groups we have to check the group number back at the start
1541 and if necessary complete handling an extraction by setting the offsets and
1542 bumping the high water mark. Note that whole-pattern recursion is coded as
1543 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1544 when the OP_END is reached. Other recursion is handled here. */
1545
1546 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1547 {
1548 number = GET2(prev, 1+LINK_SIZE);
1549 offset = number << 1;
1550
1551 #ifdef PCRE_DEBUG
1552 printf("end bracket %d", number);
1553 printf("\n");
1554 #endif
1555
1556 md->capture_last = number;
1557 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1558 {
1559 md->offset_vector[offset] =
1560 md->offset_vector[md->offset_end - number];
1561 md->offset_vector[offset+1] = eptr - md->start_subject;
1562 if (offset_top <= offset) offset_top = offset + 2;
1563 }
1564
1565 /* Handle a recursively called group. Restore the offsets
1566 appropriately and continue from after the call. */
1567
1568 if (md->recursive != NULL && md->recursive->group_num == number)
1569 {
1570 recursion_info *rec = md->recursive;
1571 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1572 md->recursive = rec->prevrec;
1573 memcpy(md->offset_vector, rec->offset_save,
1574 rec->saved_max * sizeof(int));
1575 offset_top = rec->save_offset_top;
1576 ecode = rec->after_call;
1577 ims = original_ims;
1578 break;
1579 }
1580 }
1581
1582 /* For both capturing and non-capturing groups, reset the value of the ims
1583 flags, in case they got changed during the group. */
1584
1585 ims = original_ims;
1586 DPRINTF(("ims reset to %02lx\n", ims));
1587
1588 /* For a non-repeating ket, just continue at this level. This also
1589 happens for a repeating ket if no characters were matched in the group.
1590 This is the forcible breaking of infinite loops as implemented in Perl
1591 5.005. If there is an options reset, it will get obeyed in the normal
1592 course of events. */
1593
1594 if (*ecode == OP_KET || eptr == saved_eptr)
1595 {
1596 ecode += 1 + LINK_SIZE;
1597 break;
1598 }
1599
1600 /* The repeating kets try the rest of the pattern or restart from the
1601 preceding bracket, in the appropriate order. In the second case, we can use
1602 tail recursion to avoid using another stack frame, unless we have an
1603 unlimited repeat of a group that can match an empty string. */
1604
1605 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1606
1607 if (*ecode == OP_KETRMIN)
1608 {
1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1610 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1611 if (flags != 0) /* Could match an empty string */
1612 {
1613 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1614 RRETURN(rrc);
1615 }
1616 ecode = prev;
1617 goto TAIL_RECURSE;
1618 }
1619 else /* OP_KETRMAX */
1620 {
1621 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1623 ecode += 1 + LINK_SIZE;
1624 flags = 0;
1625 goto TAIL_RECURSE;
1626 }
1627 /* Control never gets here */
1628
1629 /* Start of subject unless notbol, or after internal newline if multiline */
1630
1631 case OP_CIRC:
1632 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1633 if ((ims & PCRE_MULTILINE) != 0)
1634 {
1635 if (eptr != md->start_subject &&
1636 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1637 MRRETURN(MATCH_NOMATCH);
1638 ecode++;
1639 break;
1640 }
1641 /* ... else fall through */
1642
1643 /* Start of subject assertion */
1644
1645 case OP_SOD:
1646 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1647 ecode++;
1648 break;
1649
1650 /* Start of match assertion */
1651
1652 case OP_SOM:
1653 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1654 ecode++;
1655 break;
1656
1657 /* Reset the start of match point */
1658
1659 case OP_SET_SOM:
1660 mstart = eptr;
1661 ecode++;
1662 break;
1663
1664 /* Assert before internal newline if multiline, or before a terminating
1665 newline unless endonly is set, else end of subject unless noteol is set. */
1666
1667 case OP_DOLL:
1668 if ((ims & PCRE_MULTILINE) != 0)
1669 {
1670 if (eptr < md->end_subject)
1671 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1672 else
1673 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1674 ecode++;
1675 break;
1676 }
1677 else
1678 {
1679 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1680 if (!md->endonly)
1681 {
1682 if (eptr != md->end_subject &&
1683 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1684 MRRETURN(MATCH_NOMATCH);
1685 ecode++;
1686 break;
1687 }
1688 }
1689 /* ... else fall through for endonly */
1690
1691 /* End of subject assertion (\z) */
1692
1693 case OP_EOD:
1694 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1695 ecode++;
1696 break;
1697
1698 /* End of subject or ending \n assertion (\Z) */
1699
1700 case OP_EODN:
1701 if (eptr != md->end_subject &&
1702 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1703 MRRETURN(MATCH_NOMATCH);
1704 ecode++;
1705 break;
1706
1707 /* Word boundary assertions */
1708
1709 case OP_NOT_WORD_BOUNDARY:
1710 case OP_WORD_BOUNDARY:
1711 {
1712
1713 /* Find out if the previous and current characters are "word" characters.
1714 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1715 be "non-word" characters. Remember the earliest consulted character for
1716 partial matching. */
1717
1718 #ifdef SUPPORT_UTF8
1719 if (utf8)
1720 {
1721 if (eptr == md->start_subject) prev_is_word = FALSE; else
1722 {
1723 USPTR lastptr = eptr - 1;
1724 while((*lastptr & 0xc0) == 0x80) lastptr--;
1725 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1726 GETCHAR(c, lastptr);
1727 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1728 }
1729 if (eptr >= md->end_subject)
1730 {
1731 SCHECK_PARTIAL();
1732 cur_is_word = FALSE;
1733 }
1734 else
1735 {
1736 GETCHAR(c, eptr);
1737 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1738 }
1739 }
1740 else
1741 #endif
1742
1743 /* Not in UTF-8 mode */
1744
1745 {
1746 if (eptr == md->start_subject) prev_is_word = FALSE; else
1747 {
1748 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1749 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1750 }
1751 if (eptr >= md->end_subject)
1752 {
1753 SCHECK_PARTIAL();
1754 cur_is_word = FALSE;
1755 }
1756 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1757 }
1758
1759 /* Now see if the situation is what we want */
1760
1761 if ((*ecode++ == OP_WORD_BOUNDARY)?
1762 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1763 MRRETURN(MATCH_NOMATCH);
1764 }
1765 break;
1766
1767 /* Match a single character type; inline for speed */
1768
1769 case OP_ANY:
1770 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1771 /* Fall through */
1772
1773 case OP_ALLANY:
1774 if (eptr++ >= md->end_subject)
1775 {
1776 SCHECK_PARTIAL();
1777 MRRETURN(MATCH_NOMATCH);
1778 }
1779 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1780 ecode++;
1781 break;
1782
1783 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1784 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1785
1786 case OP_ANYBYTE:
1787 if (eptr++ >= md->end_subject)
1788 {
1789 SCHECK_PARTIAL();
1790 MRRETURN(MATCH_NOMATCH);
1791 }
1792 ecode++;
1793 break;
1794
1795 case OP_NOT_DIGIT:
1796 if (eptr >= md->end_subject)
1797 {
1798 SCHECK_PARTIAL();
1799 MRRETURN(MATCH_NOMATCH);
1800 }
1801 GETCHARINCTEST(c, eptr);
1802 if (
1803 #ifdef SUPPORT_UTF8
1804 c < 256 &&
1805 #endif
1806 (md->ctypes[c] & ctype_digit) != 0
1807 )
1808 MRRETURN(MATCH_NOMATCH);
1809 ecode++;
1810 break;
1811
1812 case OP_DIGIT:
1813 if (eptr >= md->end_subject)
1814 {
1815 SCHECK_PARTIAL();
1816 MRRETURN(MATCH_NOMATCH);
1817 }
1818 GETCHARINCTEST(c, eptr);
1819 if (
1820 #ifdef SUPPORT_UTF8
1821 c >= 256 ||
1822 #endif
1823 (md->ctypes[c] & ctype_digit) == 0
1824 )
1825 MRRETURN(MATCH_NOMATCH);
1826 ecode++;
1827 break;
1828
1829 case OP_NOT_WHITESPACE:
1830 if (eptr >= md->end_subject)
1831 {
1832 SCHECK_PARTIAL();
1833 MRRETURN(MATCH_NOMATCH);
1834 }
1835 GETCHARINCTEST(c, eptr);
1836 if (
1837 #ifdef SUPPORT_UTF8
1838 c < 256 &&
1839 #endif
1840 (md->ctypes[c] & ctype_space) != 0
1841 )
1842 MRRETURN(MATCH_NOMATCH);
1843 ecode++;
1844 break;
1845
1846 case OP_WHITESPACE:
1847 if (eptr >= md->end_subject)
1848 {
1849 SCHECK_PARTIAL();
1850 MRRETURN(MATCH_NOMATCH);
1851 }
1852 GETCHARINCTEST(c, eptr);
1853 if (
1854 #ifdef SUPPORT_UTF8
1855 c >= 256 ||
1856 #endif
1857 (md->ctypes[c] & ctype_space) == 0
1858 )
1859 MRRETURN(MATCH_NOMATCH);
1860 ecode++;
1861 break;
1862
1863 case OP_NOT_WORDCHAR:
1864 if (eptr >= md->end_subject)
1865 {
1866 SCHECK_PARTIAL();
1867 MRRETURN(MATCH_NOMATCH);
1868 }
1869 GETCHARINCTEST(c, eptr);
1870 if (
1871 #ifdef SUPPORT_UTF8
1872 c < 256 &&
1873 #endif
1874 (md->ctypes[c] & ctype_word) != 0
1875 )
1876 MRRETURN(MATCH_NOMATCH);
1877 ecode++;
1878 break;
1879
1880 case OP_WORDCHAR:
1881 if (eptr >= md->end_subject)
1882 {
1883 SCHECK_PARTIAL();
1884 MRRETURN(MATCH_NOMATCH);
1885 }
1886 GETCHARINCTEST(c, eptr);
1887 if (
1888 #ifdef SUPPORT_UTF8
1889 c >= 256 ||
1890 #endif
1891 (md->ctypes[c] & ctype_word) == 0
1892 )
1893 MRRETURN(MATCH_NOMATCH);
1894 ecode++;
1895 break;
1896
1897 case OP_ANYNL:
1898 if (eptr >= md->end_subject)
1899 {
1900 SCHECK_PARTIAL();
1901 MRRETURN(MATCH_NOMATCH);
1902 }
1903 GETCHARINCTEST(c, eptr);
1904 switch(c)
1905 {
1906 default: MRRETURN(MATCH_NOMATCH);
1907 case 0x000d:
1908 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1909 break;
1910
1911 case 0x000a:
1912 break;
1913
1914 case 0x000b:
1915 case 0x000c:
1916 case 0x0085:
1917 case 0x2028:
1918 case 0x2029:
1919 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1920 break;
1921 }
1922 ecode++;
1923 break;
1924
1925 case OP_NOT_HSPACE:
1926 if (eptr >= md->end_subject)
1927 {
1928 SCHECK_PARTIAL();
1929 MRRETURN(MATCH_NOMATCH);
1930 }
1931 GETCHARINCTEST(c, eptr);
1932 switch(c)
1933 {
1934 default: break;
1935 case 0x09: /* HT */
1936 case 0x20: /* SPACE */
1937 case 0xa0: /* NBSP */
1938 case 0x1680: /* OGHAM SPACE MARK */
1939 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1940 case 0x2000: /* EN QUAD */
1941 case 0x2001: /* EM QUAD */
1942 case 0x2002: /* EN SPACE */
1943 case 0x2003: /* EM SPACE */
1944 case 0x2004: /* THREE-PER-EM SPACE */
1945 case 0x2005: /* FOUR-PER-EM SPACE */
1946 case 0x2006: /* SIX-PER-EM SPACE */
1947 case 0x2007: /* FIGURE SPACE */
1948 case 0x2008: /* PUNCTUATION SPACE */
1949 case 0x2009: /* THIN SPACE */
1950 case 0x200A: /* HAIR SPACE */
1951 case 0x202f: /* NARROW NO-BREAK SPACE */
1952 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1953 case 0x3000: /* IDEOGRAPHIC SPACE */
1954 MRRETURN(MATCH_NOMATCH);
1955 }
1956 ecode++;
1957 break;
1958
1959 case OP_HSPACE:
1960 if (eptr >= md->end_subject)
1961 {
1962 SCHECK_PARTIAL();
1963 MRRETURN(MATCH_NOMATCH);
1964 }
1965 GETCHARINCTEST(c, eptr);
1966 switch(c)
1967 {
1968 default: MRRETURN(MATCH_NOMATCH);
1969 case 0x09: /* HT */
1970 case 0x20: /* SPACE */
1971 case 0xa0: /* NBSP */
1972 case 0x1680: /* OGHAM SPACE MARK */
1973 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1974 case 0x2000: /* EN QUAD */
1975 case 0x2001: /* EM QUAD */
1976 case 0x2002: /* EN SPACE */
1977 case 0x2003: /* EM SPACE */
1978 case 0x2004: /* THREE-PER-EM SPACE */
1979 case 0x2005: /* FOUR-PER-EM SPACE */
1980 case 0x2006: /* SIX-PER-EM SPACE */
1981 case 0x2007: /* FIGURE SPACE */
1982 case 0x2008: /* PUNCTUATION SPACE */
1983 case 0x2009: /* THIN SPACE */
1984 case 0x200A: /* HAIR SPACE */
1985 case 0x202f: /* NARROW NO-BREAK SPACE */
1986 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1987 case 0x3000: /* IDEOGRAPHIC SPACE */
1988 break;
1989 }
1990 ecode++;
1991 break;
1992
1993 case OP_NOT_VSPACE:
1994 if (eptr >= md->end_subject)
1995 {
1996 SCHECK_PARTIAL();
1997 MRRETURN(MATCH_NOMATCH);
1998 }
1999 GETCHARINCTEST(c, eptr);
2000 switch(c)
2001 {
2002 default: break;
2003 case 0x0a: /* LF */
2004 case 0x0b: /* VT */
2005 case 0x0c: /* FF */
2006 case 0x0d: /* CR */
2007 case 0x85: /* NEL */
2008 case 0x2028: /* LINE SEPARATOR */
2009 case 0x2029: /* PARAGRAPH SEPARATOR */
2010 MRRETURN(MATCH_NOMATCH);
2011 }
2012 ecode++;
2013 break;
2014
2015 case OP_VSPACE:
2016 if (eptr >= md->end_subject)
2017 {
2018 SCHECK_PARTIAL();
2019 MRRETURN(MATCH_NOMATCH);
2020 }
2021 GETCHARINCTEST(c, eptr);
2022 switch(c)
2023 {
2024 default: MRRETURN(MATCH_NOMATCH);
2025 case 0x0a: /* LF */
2026 case 0x0b: /* VT */
2027 case 0x0c: /* FF */
2028 case 0x0d: /* CR */
2029 case 0x85: /* NEL */
2030 case 0x2028: /* LINE SEPARATOR */
2031 case 0x2029: /* PARAGRAPH SEPARATOR */
2032 break;
2033 }
2034 ecode++;
2035 break;
2036
2037 #ifdef SUPPORT_UCP
2038 /* Check the next character by Unicode property. We will get here only
2039 if the support is in the binary; otherwise a compile-time error occurs. */
2040
2041 case OP_PROP:
2042 case OP_NOTPROP:
2043 if (eptr >= md->end_subject)
2044 {
2045 SCHECK_PARTIAL();
2046 MRRETURN(MATCH_NOMATCH);
2047 }
2048 GETCHARINCTEST(c, eptr);
2049 {
2050 const ucd_record *prop = GET_UCD(c);
2051
2052 switch(ecode[1])
2053 {
2054 case PT_ANY:
2055 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2056 break;
2057
2058 case PT_LAMP:
2059 if ((prop->chartype == ucp_Lu ||
2060 prop->chartype == ucp_Ll ||
2061 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2062 MRRETURN(MATCH_NOMATCH);
2063 break;
2064
2065 case PT_GC:
2066 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2067 MRRETURN(MATCH_NOMATCH);
2068 break;
2069
2070 case PT_PC:
2071 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2072 MRRETURN(MATCH_NOMATCH);
2073 break;
2074
2075 case PT_SC:
2076 if ((ecode[2] != prop->script) == (op == OP_PROP))
2077 MRRETURN(MATCH_NOMATCH);
2078 break;
2079
2080 default:
2081 RRETURN(PCRE_ERROR_INTERNAL);
2082 }
2083
2084 ecode += 3;
2085 }
2086 break;
2087
2088 /* Match an extended Unicode sequence. We will get here only if the support
2089 is in the binary; otherwise a compile-time error occurs. */
2090
2091 case OP_EXTUNI:
2092 if (eptr >= md->end_subject)
2093 {
2094 SCHECK_PARTIAL();
2095 MRRETURN(MATCH_NOMATCH);
2096 }
2097 GETCHARINCTEST(c, eptr);
2098 {
2099 int category = UCD_CATEGORY(c);
2100 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2101 while (eptr < md->end_subject)
2102 {
2103 int len = 1;
2104 if (!utf8) c = *eptr; else
2105 {
2106 GETCHARLEN(c, eptr, len);
2107 }
2108 category = UCD_CATEGORY(c);
2109 if (category != ucp_M) break;
2110 eptr += len;
2111 }
2112 }
2113 ecode++;
2114 break;
2115 #endif
2116
2117
2118 /* Match a back reference, possibly repeatedly. Look past the end of the
2119 item to see if there is repeat information following. The code is similar
2120 to that for character classes, but repeated for efficiency. Then obey
2121 similar code to character type repeats - written out again for speed.
2122 However, if the referenced string is the empty string, always treat
2123 it as matched, any number of times (otherwise there could be infinite
2124 loops). */
2125
2126 case OP_REF:
2127 {
2128 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2129 ecode += 3;
2130
2131 /* If the reference is unset, there are two possibilities:
2132
2133 (a) In the default, Perl-compatible state, set the length to be longer
2134 than the amount of subject left; this ensures that every attempt at a
2135 match fails. We can't just fail here, because of the possibility of
2136 quantifiers with zero minima.
2137
2138 (b) If the JavaScript compatibility flag is set, set the length to zero
2139 so that the back reference matches an empty string.
2140
2141 Otherwise, set the length to the length of what was matched by the
2142 referenced subpattern. */
2143
2144 if (offset >= offset_top || md->offset_vector[offset] < 0)
2145 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2146 else
2147 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2148
2149 /* Set up for repetition, or handle the non-repeated case */
2150
2151 switch (*ecode)
2152 {
2153 case OP_CRSTAR:
2154 case OP_CRMINSTAR:
2155 case OP_CRPLUS:
2156 case OP_CRMINPLUS:
2157 case OP_CRQUERY:
2158 case OP_CRMINQUERY:
2159 c = *ecode++ - OP_CRSTAR;
2160 minimize = (c & 1) != 0;
2161 min = rep_min[c]; /* Pick up values from tables; */
2162 max = rep_max[c]; /* zero for max => infinity */
2163 if (max == 0) max = INT_MAX;
2164 break;
2165
2166 case OP_CRRANGE:
2167 case OP_CRMINRANGE:
2168 minimize = (*ecode == OP_CRMINRANGE);
2169 min = GET2(ecode, 1);
2170 max = GET2(ecode, 3);
2171 if (max == 0) max = INT_MAX;
2172 ecode += 5;
2173 break;
2174
2175 default: /* No repeat follows */
2176 if (!match_ref(offset, eptr, length, md, ims))
2177 {
2178 CHECK_PARTIAL();
2179 MRRETURN(MATCH_NOMATCH);
2180 }
2181 eptr += length;
2182 continue; /* With the main loop */
2183 }
2184
2185 /* If the length of the reference is zero, just continue with the
2186 main loop. */
2187
2188 if (length == 0) continue;
2189
2190 /* First, ensure the minimum number of matches are present. We get back
2191 the length of the reference string explicitly rather than passing the
2192 address of eptr, so that eptr can be a register variable. */
2193
2194 for (i = 1; i <= min; i++)
2195 {
2196 if (!match_ref(offset, eptr, length, md, ims))
2197 {
2198 CHECK_PARTIAL();
2199 MRRETURN(MATCH_NOMATCH);
2200 }
2201 eptr += length;
2202 }
2203
2204 /* If min = max, continue at the same level without recursion.
2205 They are not both allowed to be zero. */
2206
2207 if (min == max) continue;
2208
2209 /* If minimizing, keep trying and advancing the pointer */
2210
2211 if (minimize)
2212 {
2213 for (fi = min;; fi++)
2214 {
2215 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2216 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2217 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2218 if (!match_ref(offset, eptr, length, md, ims))
2219 {
2220 CHECK_PARTIAL();
2221 MRRETURN(MATCH_NOMATCH);
2222 }
2223 eptr += length;
2224 }
2225 /* Control never gets here */
2226 }
2227
2228 /* If maximizing, find the longest string and work backwards */
2229
2230 else
2231 {
2232 pp = eptr;
2233 for (i = min; i < max; i++)
2234 {
2235 if (!match_ref(offset, eptr, length, md, ims))
2236 {
2237 CHECK_PARTIAL();
2238 break;
2239 }
2240 eptr += length;
2241 }
2242 while (eptr >= pp)
2243 {
2244 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2245 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2246 eptr -= length;
2247 }
2248 MRRETURN(MATCH_NOMATCH);
2249 }
2250 }
2251 /* Control never gets here */
2252
2253 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2254 used when all the characters in the class have values in the range 0-255,
2255 and either the matching is caseful, or the characters are in the range
2256 0-127 when UTF-8 processing is enabled. The only difference between
2257 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2258 encountered.
2259
2260 First, look past the end of the item to see if there is repeat information
2261 following. Then obey similar code to character type repeats - written out
2262 again for speed. */
2263
2264 case OP_NCLASS:
2265 case OP_CLASS:
2266 {
2267 data = ecode + 1; /* Save for matching */
2268 ecode += 33; /* Advance past the item */
2269
2270 switch (*ecode)
2271 {
2272 case OP_CRSTAR:
2273 case OP_CRMINSTAR:
2274 case OP_CRPLUS:
2275 case OP_CRMINPLUS:
2276 case OP_CRQUERY:
2277 case OP_CRMINQUERY:
2278 c = *ecode++ - OP_CRSTAR;
2279 minimize = (c & 1) != 0;
2280 min = rep_min[c]; /* Pick up values from tables; */
2281 max = rep_max[c]; /* zero for max => infinity */
2282 if (max == 0) max = INT_MAX;
2283 break;
2284
2285 case OP_CRRANGE:
2286 case OP_CRMINRANGE:
2287 minimize = (*ecode == OP_CRMINRANGE);
2288 min = GET2(ecode, 1);
2289 max = GET2(ecode, 3);
2290 if (max == 0) max = INT_MAX;
2291 ecode += 5;
2292 break;
2293
2294 default: /* No repeat follows */
2295 min = max = 1;
2296 break;
2297 }
2298
2299 /* First, ensure the minimum number of matches are present. */
2300
2301 #ifdef SUPPORT_UTF8
2302 /* UTF-8 mode */
2303 if (utf8)
2304 {
2305 for (i = 1; i <= min; i++)
2306 {
2307 if (eptr >= md->end_subject)
2308 {
2309 SCHECK_PARTIAL();
2310 MRRETURN(MATCH_NOMATCH);
2311 }
2312 GETCHARINC(c, eptr);
2313 if (c > 255)
2314 {
2315 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2316 }
2317 else
2318 {
2319 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2320 }
2321 }
2322 }
2323 else
2324 #endif
2325 /* Not UTF-8 mode */
2326 {
2327 for (i = 1; i <= min; i++)
2328 {
2329 if (eptr >= md->end_subject)
2330 {
2331 SCHECK_PARTIAL();
2332 MRRETURN(MATCH_NOMATCH);
2333 }
2334 c = *eptr++;
2335 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2336 }
2337 }
2338
2339 /* If max == min we can continue with the main loop without the
2340 need to recurse. */
2341
2342 if (min == max) continue;
2343
2344 /* If minimizing, keep testing the rest of the expression and advancing
2345 the pointer while it matches the class. */
2346
2347 if (minimize)
2348 {
2349 #ifdef SUPPORT_UTF8
2350 /* UTF-8 mode */
2351 if (utf8)
2352 {
2353 for (fi = min;; fi++)
2354 {
2355 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2356 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2357 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2358 if (eptr >= md->end_subject)
2359 {
2360 SCHECK_PARTIAL();
2361 MRRETURN(MATCH_NOMATCH);
2362 }
2363 GETCHARINC(c, eptr);
2364 if (c > 255)
2365 {
2366 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2367 }
2368 else
2369 {
2370 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2371 }
2372 }
2373 }
2374 else
2375 #endif
2376 /* Not UTF-8 mode */
2377 {
2378 for (fi = min;; fi++)
2379 {
2380 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2381 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2382 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2383 if (eptr >= md->end_subject)
2384 {
2385 SCHECK_PARTIAL();
2386 MRRETURN(MATCH_NOMATCH);
2387 }
2388 c = *eptr++;
2389 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2390 }
2391 }
2392 /* Control never gets here */
2393 }
2394
2395 /* If maximizing, find the longest possible run, then work backwards. */
2396
2397 else
2398 {
2399 pp = eptr;
2400
2401 #ifdef SUPPORT_UTF8
2402 /* UTF-8 mode */
2403 if (utf8)
2404 {
2405 for (i = min; i < max; i++)
2406 {
2407 int len = 1;
2408 if (eptr >= md->end_subject)
2409 {
2410 SCHECK_PARTIAL();
2411 break;
2412 }
2413 GETCHARLEN(c, eptr, len);
2414 if (c > 255)
2415 {
2416 if (op == OP_CLASS) break;
2417 }
2418 else
2419 {
2420 if ((data[c/8] & (1 << (c&7))) == 0) break;
2421 }
2422 eptr += len;
2423 }
2424 for (;;)
2425 {
2426 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2428 if (eptr-- == pp) break; /* Stop if tried at original pos */
2429 BACKCHAR(eptr);
2430 }
2431 }
2432 else
2433 #endif
2434 /* Not UTF-8 mode */
2435 {
2436 for (i = min; i < max; i++)
2437 {
2438 if (eptr >= md->end_subject)
2439 {
2440 SCHECK_PARTIAL();
2441 break;
2442 }
2443 c = *eptr;
2444 if ((data[c/8] & (1 << (c&7))) == 0) break;
2445 eptr++;
2446 }
2447 while (eptr >= pp)
2448 {
2449 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2451 eptr--;
2452 }
2453 }
2454
2455 MRRETURN(MATCH_NOMATCH);
2456 }
2457 }
2458 /* Control never gets here */
2459
2460
2461 /* Match an extended character class. This opcode is encountered only
2462 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2463 mode, because Unicode properties are supported in non-UTF-8 mode. */
2464
2465 #ifdef SUPPORT_UTF8
2466 case OP_XCLASS:
2467 {
2468 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2469 ecode += GET(ecode, 1); /* Advance past the item */
2470
2471 switch (*ecode)
2472 {
2473 case OP_CRSTAR:
2474 case OP_CRMINSTAR:
2475 case OP_CRPLUS:
2476 case OP_CRMINPLUS:
2477 case OP_CRQUERY:
2478 case OP_CRMINQUERY:
2479 c = *ecode++ - OP_CRSTAR;
2480 minimize = (c & 1) != 0;
2481 min = rep_min[c]; /* Pick up values from tables; */
2482 max = rep_max[c]; /* zero for max => infinity */
2483 if (max == 0) max = INT_MAX;
2484 break;
2485
2486 case OP_CRRANGE:
2487 case OP_CRMINRANGE:
2488 minimize = (*ecode == OP_CRMINRANGE);
2489 min = GET2(ecode, 1);
2490 max = GET2(ecode, 3);
2491 if (max == 0) max = INT_MAX;
2492 ecode += 5;
2493 break;
2494
2495 default: /* No repeat follows */
2496 min = max = 1;
2497 break;
2498 }
2499
2500 /* First, ensure the minimum number of matches are present. */
2501
2502 for (i = 1; i <= min; i++)
2503 {
2504 if (eptr >= md->end_subject)
2505 {
2506 SCHECK_PARTIAL();
2507 MRRETURN(MATCH_NOMATCH);
2508 }
2509 GETCHARINCTEST(c, eptr);
2510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2511 }
2512
2513 /* If max == min we can continue with the main loop without the
2514 need to recurse. */
2515
2516 if (min == max) continue;
2517
2518 /* If minimizing, keep testing the rest of the expression and advancing
2519 the pointer while it matches the class. */
2520
2521 if (minimize)
2522 {
2523 for (fi = min;; fi++)
2524 {
2525 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2527 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2528 if (eptr >= md->end_subject)
2529 {
2530 SCHECK_PARTIAL();
2531 MRRETURN(MATCH_NOMATCH);
2532 }
2533 GETCHARINCTEST(c, eptr);
2534 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2535 }
2536 /* Control never gets here */
2537 }
2538
2539 /* If maximizing, find the longest possible run, then work backwards. */
2540
2541 else
2542 {
2543 pp = eptr;
2544 for (i = min; i < max; i++)
2545 {
2546 int len = 1;
2547 if (eptr >= md->end_subject)
2548 {
2549 SCHECK_PARTIAL();
2550 break;
2551 }
2552 GETCHARLENTEST(c, eptr, len);
2553 if (!_pcre_xclass(c, data)) break;
2554 eptr += len;
2555 }
2556 for(;;)
2557 {
2558 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2560 if (eptr-- == pp) break; /* Stop if tried at original pos */
2561 if (utf8) BACKCHAR(eptr);
2562 }
2563 MRRETURN(MATCH_NOMATCH);
2564 }
2565
2566 /* Control never gets here */
2567 }
2568 #endif /* End of XCLASS */
2569
2570 /* Match a single character, casefully */
2571
2572 case OP_CHAR:
2573 #ifdef SUPPORT_UTF8
2574 if (utf8)
2575 {
2576 length = 1;
2577 ecode++;
2578 GETCHARLEN(fc, ecode, length);
2579 if (length > md->end_subject - eptr)
2580 {
2581 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2582 MRRETURN(MATCH_NOMATCH);
2583 }
2584 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2585 }
2586 else
2587 #endif
2588
2589 /* Non-UTF-8 mode */
2590 {
2591 if (md->end_subject - eptr < 1)
2592 {
2593 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2594 MRRETURN(MATCH_NOMATCH);
2595 }
2596 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2597 ecode += 2;
2598 }
2599 break;
2600
2601 /* Match a single character, caselessly */
2602
2603 case OP_CHARNC:
2604 #ifdef SUPPORT_UTF8
2605 if (utf8)
2606 {
2607 length = 1;
2608 ecode++;
2609 GETCHARLEN(fc, ecode, length);
2610
2611 if (length > md->end_subject - eptr)
2612 {
2613 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2614 MRRETURN(MATCH_NOMATCH);
2615 }
2616
2617 /* If the pattern character's value is < 128, we have only one byte, and
2618 can use the fast lookup table. */
2619
2620 if (fc < 128)
2621 {
2622 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2623 }
2624
2625 /* Otherwise we must pick up the subject character */
2626
2627 else
2628 {
2629 unsigned int dc;
2630 GETCHARINC(dc, eptr);
2631 ecode += length;
2632
2633 /* If we have Unicode property support, we can use it to test the other
2634 case of the character, if there is one. */
2635
2636 if (fc != dc)
2637 {
2638 #ifdef SUPPORT_UCP
2639 if (dc != UCD_OTHERCASE(fc))
2640 #endif
2641 MRRETURN(MATCH_NOMATCH);
2642 }
2643 }
2644 }
2645 else
2646 #endif /* SUPPORT_UTF8 */
2647
2648 /* Non-UTF-8 mode */
2649 {
2650 if (md->end_subject - eptr < 1)
2651 {
2652 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2653 MRRETURN(MATCH_NOMATCH);
2654 }
2655 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2656 ecode += 2;
2657 }
2658 break;
2659
2660 /* Match a single character repeatedly. */
2661
2662 case OP_EXACT:
2663 min = max = GET2(ecode, 1);
2664 ecode += 3;
2665 goto REPEATCHAR;
2666
2667 case OP_POSUPTO:
2668 possessive = TRUE;
2669 /* Fall through */
2670
2671 case OP_UPTO:
2672 case OP_MINUPTO:
2673 min = 0;
2674 max = GET2(ecode, 1);
2675 minimize = *ecode == OP_MINUPTO;
2676 ecode += 3;
2677 goto REPEATCHAR;
2678
2679 case OP_POSSTAR:
2680 possessive = TRUE;
2681 min = 0;
2682 max = INT_MAX;
2683 ecode++;
2684 goto REPEATCHAR;
2685
2686 case OP_POSPLUS:
2687 possessive = TRUE;
2688 min = 1;
2689 max = INT_MAX;
2690 ecode++;
2691 goto REPEATCHAR;
2692
2693 case OP_POSQUERY:
2694 possessive = TRUE;
2695 min = 0;
2696 max = 1;
2697 ecode++;
2698 goto REPEATCHAR;
2699
2700 case OP_STAR:
2701 case OP_MINSTAR:
2702 case OP_PLUS:
2703 case OP_MINPLUS:
2704 case OP_QUERY:
2705 case OP_MINQUERY:
2706 c = *ecode++ - OP_STAR;
2707 minimize = (c & 1) != 0;
2708
2709 min = rep_min[c]; /* Pick up values from tables; */
2710 max = rep_max[c]; /* zero for max => infinity */
2711 if (max == 0) max = INT_MAX;
2712
2713 /* Common code for all repeated single-character matches. */
2714
2715 REPEATCHAR:
2716 #ifdef SUPPORT_UTF8
2717 if (utf8)
2718 {
2719 length = 1;
2720 charptr = ecode;
2721 GETCHARLEN(fc, ecode, length);
2722 ecode += length;
2723
2724 /* Handle multibyte character matching specially here. There is
2725 support for caseless matching if UCP support is present. */
2726
2727 if (length > 1)
2728 {
2729 #ifdef SUPPORT_UCP
2730 unsigned int othercase;
2731 if ((ims & PCRE_CASELESS) != 0 &&
2732 (othercase = UCD_OTHERCASE(fc)) != fc)
2733 oclength = _pcre_ord2utf8(othercase, occhars);
2734 else oclength = 0;
2735 #endif /* SUPPORT_UCP */
2736
2737 for (i = 1; i <= min; i++)
2738 {
2739 if (eptr <= md->end_subject - length &&
2740 memcmp(eptr, charptr, length) == 0) eptr += length;
2741 #ifdef SUPPORT_UCP
2742 else if (oclength > 0 &&
2743 eptr <= md->end_subject - oclength &&
2744 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2745 #endif /* SUPPORT_UCP */
2746 else
2747 {
2748 CHECK_PARTIAL();
2749 MRRETURN(MATCH_NOMATCH);
2750 }
2751 }
2752
2753 if (min == max) continue;
2754
2755 if (minimize)
2756 {
2757 for (fi = min;; fi++)
2758 {
2759 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2761 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2762 if (eptr <= md->end_subject - length &&
2763 memcmp(eptr, charptr, length) == 0) eptr += length;
2764 #ifdef SUPPORT_UCP
2765 else if (oclength > 0 &&
2766 eptr <= md->end_subject - oclength &&
2767 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2768 #endif /* SUPPORT_UCP */
2769 else
2770 {
2771 CHECK_PARTIAL();
2772 MRRETURN(MATCH_NOMATCH);
2773 }
2774 }
2775 /* Control never gets here */
2776 }
2777
2778 else /* Maximize */
2779 {
2780 pp = eptr;
2781 for (i = min; i < max; i++)
2782 {
2783 if (eptr <= md->end_subject - length &&
2784 memcmp(eptr, charptr, length) == 0) eptr += length;
2785 #ifdef SUPPORT_UCP
2786 else if (oclength > 0 &&
2787 eptr <= md->end_subject - oclength &&
2788 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2789 #endif /* SUPPORT_UCP */
2790 else
2791 {
2792 CHECK_PARTIAL();
2793 break;
2794 }
2795 }
2796
2797 if (possessive) continue;
2798
2799 for(;;)
2800 {
2801 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2803 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2804 #ifdef SUPPORT_UCP
2805 eptr--;
2806 BACKCHAR(eptr);
2807 #else /* without SUPPORT_UCP */
2808 eptr -= length;
2809 #endif /* SUPPORT_UCP */
2810 }
2811 }
2812 /* Control never gets here */
2813 }
2814
2815 /* If the length of a UTF-8 character is 1, we fall through here, and
2816 obey the code as for non-UTF-8 characters below, though in this case the
2817 value of fc will always be < 128. */
2818 }
2819 else
2820 #endif /* SUPPORT_UTF8 */
2821
2822 /* When not in UTF-8 mode, load a single-byte character. */
2823
2824 fc = *ecode++;
2825
2826 /* The value of fc at this point is always less than 256, though we may or
2827 may not be in UTF-8 mode. The code is duplicated for the caseless and
2828 caseful cases, for speed, since matching characters is likely to be quite
2829 common. First, ensure the minimum number of matches are present. If min =
2830 max, continue at the same level without recursing. Otherwise, if
2831 minimizing, keep trying the rest of the expression and advancing one
2832 matching character if failing, up to the maximum. Alternatively, if
2833 maximizing, find the maximum number of characters and work backwards. */
2834
2835 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2836 max, eptr));
2837
2838 if ((ims & PCRE_CASELESS) != 0)
2839 {
2840 fc = md->lcc[fc];
2841 for (i = 1; i <= min; i++)
2842 {
2843 if (eptr >= md->end_subject)
2844 {
2845 SCHECK_PARTIAL();
2846 MRRETURN(MATCH_NOMATCH);
2847 }
2848 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2849 }
2850 if (min == max) continue;
2851 if (minimize)
2852 {
2853 for (fi = min;; fi++)
2854 {
2855 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2857 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2858 if (eptr >= md->end_subject)
2859 {
2860 SCHECK_PARTIAL();
2861 MRRETURN(MATCH_NOMATCH);
2862 }
2863 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2864 }
2865 /* Control never gets here */
2866 }
2867 else /* Maximize */
2868 {
2869 pp = eptr;
2870 for (i = min; i < max; i++)
2871 {
2872 if (eptr >= md->end_subject)
2873 {
2874 SCHECK_PARTIAL();
2875 break;
2876 }
2877 if (fc != md->lcc[*eptr]) break;
2878 eptr++;
2879 }
2880
2881 if (possessive) continue;
2882
2883 while (eptr >= pp)
2884 {
2885 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2886 eptr--;
2887 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2888 }
2889 MRRETURN(MATCH_NOMATCH);
2890 }
2891 /* Control never gets here */
2892 }
2893
2894 /* Caseful comparisons (includes all multi-byte characters) */
2895
2896 else
2897 {
2898 for (i = 1; i <= min; i++)
2899 {
2900 if (eptr >= md->end_subject)
2901 {
2902 SCHECK_PARTIAL();
2903 MRRETURN(MATCH_NOMATCH);
2904 }
2905 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2906 }
2907
2908 if (min == max) continue;
2909
2910 if (minimize)
2911 {
2912 for (fi = min;; fi++)
2913 {
2914 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2916 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2917 if (eptr >= md->end_subject)
2918 {
2919 SCHECK_PARTIAL();
2920 MRRETURN(MATCH_NOMATCH);
2921 }
2922 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2923 }
2924 /* Control never gets here */
2925 }
2926 else /* Maximize */
2927 {
2928 pp = eptr;
2929 for (i = min; i < max; i++)
2930 {
2931 if (eptr >= md->end_subject)
2932 {
2933 SCHECK_PARTIAL();
2934 break;
2935 }
2936 if (fc != *eptr) break;
2937 eptr++;
2938 }
2939 if (possessive) continue;
2940
2941 while (eptr >= pp)
2942 {
2943 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2944 eptr--;
2945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2946 }
2947 MRRETURN(MATCH_NOMATCH);
2948 }
2949 }
2950 /* Control never gets here */
2951
2952 /* Match a negated single one-byte character. The pattern character is one
2953 byte, but the subject character we are checking can be multibyte. */
2954
2955 case OP_NOT:
2956 if (eptr >= md->end_subject)
2957 {
2958 SCHECK_PARTIAL();
2959 MRRETURN(MATCH_NOMATCH);
2960 }
2961 ecode++;
2962 GETCHARINCTEST(c, eptr);
2963 if ((ims & PCRE_CASELESS) != 0)
2964 {
2965 #ifdef SUPPORT_UTF8
2966 if (c < 256)
2967 #endif
2968 c = md->lcc[c];
2969 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
2970 }
2971 else
2972 {
2973 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
2974 }
2975 break;
2976
2977 /* Match a negated single one-byte character repeatedly. This is almost a
2978 repeat of the code for a repeated single character, but I haven't found a
2979 nice way of commoning these up that doesn't require a test of the
2980 positive/negative option for each character match. Maybe that wouldn't add
2981 very much to the time taken, but character matching *is* what this is all
2982 about... */
2983
2984 case OP_NOTEXACT:
2985 min = max = GET2(ecode, 1);
2986 ecode += 3;
2987 goto REPEATNOTCHAR;
2988
2989 case OP_NOTUPTO:
2990 case OP_NOTMINUPTO:
2991 min = 0;
2992 max = GET2(ecode, 1);
2993 minimize = *ecode == OP_NOTMINUPTO;
2994 ecode += 3;
2995 goto REPEATNOTCHAR;
2996
2997 case OP_NOTPOSSTAR:
2998 possessive = TRUE;
2999 min = 0;
3000 max = INT_MAX;
3001 ecode++;
3002 goto REPEATNOTCHAR;
3003
3004 case OP_NOTPOSPLUS:
3005 possessive = TRUE;
3006 min = 1;
3007 max = INT_MAX;
3008 ecode++;
3009 goto REPEATNOTCHAR;
3010
3011 case OP_NOTPOSQUERY:
3012 possessive = TRUE;
3013 min = 0;
3014 max = 1;
3015 ecode++;
3016 goto REPEATNOTCHAR;
3017
3018 case OP_NOTPOSUPTO:
3019 possessive = TRUE;
3020 min = 0;
3021 max = GET2(ecode, 1);
3022 ecode += 3;
3023 goto REPEATNOTCHAR;
3024
3025 case OP_NOTSTAR:
3026 case OP_NOTMINSTAR:
3027 case OP_NOTPLUS:
3028 case OP_NOTMINPLUS:
3029 case OP_NOTQUERY:
3030 case OP_NOTMINQUERY:
3031 c = *ecode++ - OP_NOTSTAR;
3032 minimize = (c & 1) != 0;
3033 min = rep_min[c]; /* Pick up values from tables; */
3034 max = rep_max[c]; /* zero for max => infinity */
3035 if (max == 0) max = INT_MAX;
3036
3037 /* Common code for all repeated single-byte matches. */
3038
3039 REPEATNOTCHAR:
3040 fc = *ecode++;
3041
3042 /* The code is duplicated for the caseless and caseful cases, for speed,
3043 since matching characters is likely to be quite common. First, ensure the
3044 minimum number of matches are present. If min = max, continue at the same
3045 level without recursing. Otherwise, if minimizing, keep trying the rest of
3046 the expression and advancing one matching character if failing, up to the
3047 maximum. Alternatively, if maximizing, find the maximum number of
3048 characters and work backwards. */
3049
3050 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3051 max, eptr));
3052
3053 if ((ims & PCRE_CASELESS) != 0)
3054 {
3055 fc = md->lcc[fc];
3056
3057 #ifdef SUPPORT_UTF8
3058 /* UTF-8 mode */
3059 if (utf8)
3060 {
3061 register unsigned int d;
3062 for (i = 1; i <= min; i++)
3063 {
3064 if (eptr >= md->end_subject)
3065 {
3066 SCHECK_PARTIAL();
3067 MRRETURN(MATCH_NOMATCH);
3068 }
3069 GETCHARINC(d, eptr);
3070 if (d < 256) d = md->lcc[d];
3071 if (fc == d) MRRETURN(MATCH_NOMATCH);
3072 }
3073 }
3074 else
3075 #endif
3076
3077 /* Not UTF-8 mode */
3078 {
3079 for (i = 1; i <= min; i++)
3080 {
3081 if (eptr >= md->end_subject)
3082 {
3083 SCHECK_PARTIAL();
3084 MRRETURN(MATCH_NOMATCH);
3085 }
3086 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3087 }
3088 }
3089
3090 if (min == max) continue;
3091
3092 if (minimize)
3093 {
3094 #ifdef SUPPORT_UTF8
3095 /* UTF-8 mode */
3096 if (utf8)
3097 {
3098 register unsigned int d;
3099 for (fi = min;; fi++)
3100 {
3101 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3102 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3103 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3104 if (eptr >= md->end_subject)
3105 {
3106 SCHECK_PARTIAL();
3107 MRRETURN(MATCH_NOMATCH);
3108 }
3109 GETCHARINC(d, eptr);
3110 if (d < 256) d = md->lcc[d];
3111 if (fc == d) MRRETURN(MATCH_NOMATCH);
3112 }
3113 }
3114 else
3115 #endif
3116 /* Not UTF-8 mode */
3117 {
3118 for (fi = min;; fi++)
3119 {
3120 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3122 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3123 if (eptr >= md->end_subject)
3124 {
3125 SCHECK_PARTIAL();
3126 MRRETURN(MATCH_NOMATCH);
3127 }
3128 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3129 }
3130 }
3131 /* Control never gets here */
3132 }
3133
3134 /* Maximize case */
3135
3136 else
3137 {
3138 pp = eptr;
3139
3140 #ifdef SUPPORT_UTF8
3141 /* UTF-8 mode */
3142 if (utf8)
3143 {
3144 register unsigned int d;
3145 for (i = min; i < max; i++)
3146 {
3147 int len = 1;
3148 if (eptr >= md->end_subject)
3149 {
3150 SCHECK_PARTIAL();
3151 break;
3152 }
3153 GETCHARLEN(d, eptr, len);
3154 if (d < 256) d = md->lcc[d];
3155 if (fc == d) break;
3156 eptr += len;
3157 }
3158 if (possessive) continue;
3159 for(;;)
3160 {
3161 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3163 if (eptr-- == pp) break; /* Stop if tried at original pos */
3164 BACKCHAR(eptr);
3165 }
3166 }
3167 else
3168 #endif
3169 /* Not UTF-8 mode */
3170 {
3171 for (i = min; i < max; i++)
3172 {
3173 if (eptr >= md->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 break;
3177 }
3178 if (fc == md->lcc[*eptr]) break;
3179 eptr++;
3180 }
3181 if (possessive) continue;
3182 while (eptr >= pp)
3183 {
3184 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3185 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3186 eptr--;
3187 }
3188 }
3189
3190 MRRETURN(MATCH_NOMATCH);
3191 }
3192 /* Control never gets here */
3193 }
3194
3195 /* Caseful comparisons */
3196
3197 else
3198 {
3199 #ifdef SUPPORT_UTF8
3200 /* UTF-8 mode */
3201 if (utf8)
3202 {
3203 register unsigned int d;
3204 for (i = 1; i <= min; i++)
3205 {
3206 if (eptr >= md->end_subject)
3207 {
3208 SCHECK_PARTIAL();
3209 MRRETURN(MATCH_NOMATCH);
3210 }
3211 GETCHARINC(d, eptr);
3212 if (fc == d) MRRETURN(MATCH_NOMATCH);
3213 }
3214 }
3215 else
3216 #endif
3217 /* Not UTF-8 mode */
3218 {
3219 for (i = 1; i <= min; i++)
3220 {
3221 if (eptr >= md->end_subject)
3222 {
3223 SCHECK_PARTIAL();
3224 MRRETURN(MATCH_NOMATCH);
3225 }
3226 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3227 }
3228 }
3229
3230 if (min == max) continue;
3231
3232 if (minimize)
3233 {
3234 #ifdef SUPPORT_UTF8
3235 /* UTF-8 mode */
3236 if (utf8)
3237 {
3238 register unsigned int d;
3239 for (fi = min;; fi++)
3240 {
3241 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3243 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3244 if (eptr >= md->end_subject)
3245 {
3246 SCHECK_PARTIAL();
3247 MRRETURN(MATCH_NOMATCH);
3248 }
3249 GETCHARINC(d, eptr);
3250 if (fc == d) MRRETURN(MATCH_NOMATCH);
3251 }
3252 }
3253 else
3254 #endif
3255 /* Not UTF-8 mode */
3256 {
3257 for (fi = min;; fi++)
3258 {
3259 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3260 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3261 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3262 if (eptr >= md->end_subject)
3263 {
3264 SCHECK_PARTIAL();
3265 MRRETURN(MATCH_NOMATCH);
3266 }
3267 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3268 }
3269 }
3270 /* Control never gets here */
3271 }
3272
3273 /* Maximize case */
3274
3275 else
3276 {
3277 pp = eptr;
3278
3279 #ifdef SUPPORT_UTF8
3280 /* UTF-8 mode */
3281 if (utf8)
3282 {
3283 register unsigned int d;
3284 for (i = min; i < max; i++)
3285 {
3286 int len = 1;
3287 if (eptr >= md->end_subject)
3288 {
3289 SCHECK_PARTIAL();
3290 break;
3291 }
3292 GETCHARLEN(d, eptr, len);
3293 if (fc == d) break;
3294 eptr += len;
3295 }
3296 if (possessive) continue;
3297 for(;;)
3298 {
3299 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3300 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3301 if (eptr-- == pp) break; /* Stop if tried at original pos */
3302 BACKCHAR(eptr);
3303 }
3304 }
3305 else
3306 #endif
3307 /* Not UTF-8 mode */
3308 {
3309 for (i = min; i < max; i++)
3310 {
3311 if (eptr >= md->end_subject)
3312 {
3313 SCHECK_PARTIAL();
3314 break;
3315 }
3316 if (fc == *eptr) break;
3317 eptr++;
3318 }
3319 if (possessive) continue;
3320 while (eptr >= pp)
3321 {
3322 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3323 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3324 eptr--;
3325 }
3326 }
3327
3328 MRRETURN(MATCH_NOMATCH);
3329 }
3330 }
3331 /* Control never gets here */
3332
3333 /* Match a single character type repeatedly; several different opcodes
3334 share code. This is very similar to the code for single characters, but we
3335 repeat it in the interests of efficiency. */
3336
3337 case OP_TYPEEXACT:
3338 min = max = GET2(ecode, 1);
3339 minimize = TRUE;
3340 ecode += 3;
3341 goto REPEATTYPE;
3342
3343 case OP_TYPEUPTO:
3344 case OP_TYPEMINUPTO:
3345 min = 0;
3346 max = GET2(ecode, 1);
3347 minimize = *ecode == OP_TYPEMINUPTO;
3348 ecode += 3;
3349 goto REPEATTYPE;
3350
3351 case OP_TYPEPOSSTAR:
3352 possessive = TRUE;
3353 min = 0;
3354 max = INT_MAX;
3355 ecode++;
3356 goto REPEATTYPE;
3357
3358 case OP_TYPEPOSPLUS:
3359 possessive = TRUE;
3360 min = 1;
3361 max = INT_MAX;
3362 ecode++;
3363 goto REPEATTYPE;
3364
3365 case OP_TYPEPOSQUERY:
3366 possessive = TRUE;
3367 min = 0;
3368 max = 1;
3369 ecode++;
3370 goto REPEATTYPE;
3371
3372 case OP_TYPEPOSUPTO:
3373 possessive = TRUE;
3374 min = 0;
3375 max = GET2(ecode, 1);
3376 ecode += 3;
3377 goto REPEATTYPE;
3378
3379 case OP_TYPESTAR:
3380 case OP_TYPEMINSTAR:
3381 case OP_TYPEPLUS:
3382 case OP_TYPEMINPLUS:
3383 case OP_TYPEQUERY:
3384 case OP_TYPEMINQUERY:
3385 c = *ecode++ - OP_TYPESTAR;
3386 minimize = (c & 1) != 0;
3387 min = rep_min[c]; /* Pick up values from tables; */
3388 max = rep_max[c]; /* zero for max => infinity */
3389 if (max == 0) max = INT_MAX;
3390
3391 /* Common code for all repeated single character type matches. Note that
3392 in UTF-8 mode a match may be longer than one byte, not only for '.' but
3393 also for the negated types and for OP_HSPACE, OP_VSPACE, and OP_ANYNL. */
3394
3395 REPEATTYPE:
3396 ctype = *ecode++; /* Code for the character type */
3397
3398 #ifdef SUPPORT_UCP
3399 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3400 {
3401 prop_fail_result = ctype == OP_NOTPROP;
3402 prop_type = *ecode++;
3403 prop_value = *ecode++;
3404 }
3405 else prop_type = -1;
3406 #endif
3407
3408 /* First, ensure the minimum number of matches are present. Use inline
3409 code for maximizing the speed, and do the type test once at the start
3410 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3411 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3412 and single-bytes. */
3413
3414 if (min > 0)
3415 {
3416 #ifdef SUPPORT_UCP
3417 if (prop_type >= 0)
3418 {
3419 switch(prop_type)
3420 {
3421 case PT_ANY:
3422 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3423 for (i = 1; i <= min; i++)
3424 {
3425 if (eptr >= md->end_subject)
3426 {
3427 SCHECK_PARTIAL();
3428 MRRETURN(MATCH_NOMATCH);
3429 }
3430 GETCHARINCTEST(c, eptr);
3431 }
3432 break;
3433
3434 case PT_LAMP:
3435 for (i = 1; i <= min; i++)
3436 {
3437 if (eptr >= md->end_subject)
3438 {
3439 SCHECK_PARTIAL();
3440 MRRETURN(MATCH_NOMATCH);
3441 }
3442 GETCHARINCTEST(c, eptr);
3443 prop_chartype = UCD_CHARTYPE(c);
3444 if ((prop_chartype == ucp_Lu ||
3445 prop_chartype == ucp_Ll ||
3446 prop_chartype == ucp_Lt) == prop_fail_result)
3447 MRRETURN(MATCH_NOMATCH);
3448 }
3449 break;
3450
3451 case PT_GC:
3452 for (i = 1; i <= min; i++)
3453 {
3454 if (eptr >= md->end_subject)
3455 {
3456 SCHECK_PARTIAL();
3457 MRRETURN(MATCH_NOMATCH);
3458 }
3459 GETCHARINCTEST(c, eptr);
3460 prop_category = UCD_CATEGORY(c);
3461 if ((prop_category == prop_value) == prop_fail_result)
3462 MRRETURN(MATCH_NOMATCH);
3463 }
3464 break;
3465
3466 case PT_PC:
3467 for (i = 1; i <= min; i++)
3468 {
3469 if (eptr >= md->end_subject)
3470 {
3471 SCHECK_PARTIAL();
3472 MRRETURN(MATCH_NOMATCH);
3473 }
3474 GETCHARINCTEST(c, eptr);
3475 prop_chartype = UCD_CHARTYPE(c);
3476 if ((prop_chartype == prop_value) == prop_fail_result)
3477 MRRETURN(MATCH_NOMATCH);
3478 }
3479 break;
3480
3481 case PT_SC:
3482 for (i = 1; i <= min; i++)
3483 {
3484 if (eptr >= md->end_subject)
3485 {
3486 SCHECK_PARTIAL();
3487 MRRETURN(MATCH_NOMATCH);
3488 }
3489 GETCHARINCTEST(c, eptr);
3490 prop_script = UCD_SCRIPT(c);
3491 if ((prop_script == prop_value) == prop_fail_result)
3492 MRRETURN(MATCH_NOMATCH);
3493 }
3494 break;
3495
3496 default:
3497 RRETURN(PCRE_ERROR_INTERNAL);
3498 }
3499 }
3500
3501 /* Match extended Unicode sequences. We will get here only if the
3502 support is in the binary; otherwise a compile-time error occurs. */
3503
3504 else if (ctype == OP_EXTUNI)
3505 {
3506 for (i = 1; i <= min; i++)
3507 {
3508 if (eptr >= md->end_subject)
3509 {
3510 SCHECK_PARTIAL();
3511 MRRETURN(MATCH_NOMATCH);
3512 }
3513 GETCHARINCTEST(c, eptr);
3514 prop_category = UCD_CATEGORY(c);
3515 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3516 while (eptr < md->end_subject)
3517 {
3518 int len = 1;
3519 if (!utf8) c = *eptr;
3520 else { GETCHARLEN(c, eptr, len); }
3521 prop_category = UCD_CATEGORY(c);
3522 if (prop_category != ucp_M) break;
3523 eptr += len;
3524 }
3525 }
3526 }
3527
3528 else
3529 #endif /* SUPPORT_UCP */
3530
3531 /* Handle all other cases when the coding is UTF-8 */
3532
3533 #ifdef SUPPORT_UTF8
3534 if (utf8) switch(ctype)
3535 {
3536 case OP_ANY:
3537 for (i = 1; i <= min; i++)
3538 {
3539 if (eptr >= md->end_subject)
3540 {
3541 SCHECK_PARTIAL();
3542 MRRETURN(MATCH_NOMATCH);
3543 }
3544 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3545 eptr++;
3546 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3547 }
3548 break;
3549
3550 case OP_ALLANY:
3551 for (i = 1; i <= min; i++)
3552 {
3553 if (eptr >= md->end_subject)
3554 {
3555 SCHECK_PARTIAL();
3556 MRRETURN(MATCH_NOMATCH);
3557 }
3558 eptr++;
3559 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3560 }
3561 break;
3562
3563 case OP_ANYBYTE:
3564 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3565 eptr += min;
3566 break;
3567
3568 case OP_ANYNL:
3569 for (i = 1; i <= min; i++)
3570 {
3571 if (eptr >= md->end_subject)
3572 {
3573 SCHECK_PARTIAL();
3574 MRRETURN(MATCH_NOMATCH);
3575 }
3576 GETCHARINC(c, eptr);
3577 switch(c)
3578 {
3579 default: MRRETURN(MATCH_NOMATCH);
3580 case 0x000d:
3581 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3582 break;
3583
3584 case 0x000a:
3585 break;
3586
3587 case 0x000b:
3588 case 0x000c:
3589 case 0x0085:
3590 case 0x2028:
3591 case 0x2029:
3592 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3593 break;
3594 }
3595 }
3596 break;
3597
3598 case OP_NOT_HSPACE:
3599 for (i = 1; i <= min; i++)
3600 {
3601 if (eptr >= md->end_subject)
3602 {
3603 SCHECK_PARTIAL();
3604 MRRETURN(MATCH_NOMATCH);
3605 }
3606 GETCHARINC(c, eptr);
3607 switch(c)
3608 {
3609 default: break;
3610 case 0x09: /* HT */
3611 case 0x20: /* SPACE */
3612 case 0xa0: /* NBSP */
3613 case 0x1680: /* OGHAM SPACE MARK */
3614 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3615 case 0x2000: /* EN QUAD */
3616 case 0x2001: /* EM QUAD */
3617 case 0x2002: /* EN SPACE */
3618 case 0x2003: /* EM SPACE */
3619 case 0x2004: /* THREE-PER-EM SPACE */
3620 case 0x2005: /* FOUR-PER-EM SPACE */
3621 case 0x2006: /* SIX-PER-EM SPACE */
3622 case 0x2007: /* FIGURE SPACE */
3623 case 0x2008: /* PUNCTUATION SPACE */
3624 case 0x2009: /* THIN SPACE */
3625 case 0x200A: /* HAIR SPACE */
3626 case 0x202f: /* NARROW NO-BREAK SPACE */
3627 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3628 case 0x3000: /* IDEOGRAPHIC SPACE */
3629 MRRETURN(MATCH_NOMATCH);
3630 }
3631 }
3632 break;
3633
3634 case OP_HSPACE:
3635 for (i = 1; i <= min; i++)
3636 {
3637 if (eptr >= md->end_subject)
3638 {
3639 SCHECK_PARTIAL();
3640 MRRETURN(MATCH_NOMATCH);
3641 }
3642 GETCHARINC(c, eptr);
3643 switch(c)
3644 {
3645 default: MRRETURN(MATCH_NOMATCH);
3646 case 0x09: /* HT */
3647 case 0x20: /* SPACE */
3648 case 0xa0: /* NBSP */
3649 case 0x1680: /* OGHAM SPACE MARK */
3650 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3651 case 0x2000: /* EN QUAD */
3652 case 0x2001: /* EM QUAD */
3653 case 0x2002: /* EN SPACE */
3654 case 0x2003: /* EM SPACE */
3655 case 0x2004: /* THREE-PER-EM SPACE */
3656 case 0x2005: /* FOUR-PER-EM SPACE */
3657 case 0x2006: /* SIX-PER-EM SPACE */
3658 case 0x2007: /* FIGURE SPACE */
3659 case 0x2008: /* PUNCTUATION SPACE */
3660 case 0x2009: /* THIN SPACE */
3661 case 0x200A: /* HAIR SPACE */
3662 case 0x202f: /* NARROW NO-BREAK SPACE */
3663 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3664 case 0x3000: /* IDEOGRAPHIC SPACE */
3665 break;
3666 }
3667 }
3668 break;
3669
3670 case OP_NOT_VSPACE:
3671 for (i = 1; i <= min; i++)
3672 {
3673 if (eptr >= md->end_subject)
3674 {
3675 SCHECK_PARTIAL();
3676 MRRETURN(MATCH_NOMATCH);
3677 }
3678 GETCHARINC(c, eptr);
3679 switch(c)
3680 {
3681 default: break;
3682 case 0x0a: /* LF */
3683 case 0x0b: /* VT */
3684 case 0x0c: /* FF */
3685 case 0x0d: /* CR */
3686 case 0x85: /* NEL */
3687 case 0x2028: /* LINE SEPARATOR */
3688 case 0x2029: /* PARAGRAPH SEPARATOR */
3689 MRRETURN(MATCH_NOMATCH);
3690 }
3691 }
3692 break;
3693
3694 case OP_VSPACE:
3695 for (i = 1; i <= min; i++)
3696 {
3697 if (eptr >= md->end_subject)
3698 {
3699 SCHECK_PARTIAL();
3700 MRRETURN(MATCH_NOMATCH);
3701 }
3702 GETCHARINC(c, eptr);
3703 switch(c)
3704 {
3705 default: MRRETURN(MATCH_NOMATCH);
3706 case 0x0a: /* LF */
3707 case 0x0b: /* VT */
3708 case 0x0c: /* FF */
3709 case 0x0d: /* CR */
3710 case 0x85: /* NEL */
3711 case 0x2028: /* LINE SEPARATOR */
3712 case 0x2029: /* PARAGRAPH SEPARATOR */
3713 break;
3714 }
3715 }
3716 break;
3717
3718 case OP_NOT_DIGIT:
3719 for (i = 1; i <= min; i++)
3720 {
3721 if (eptr >= md->end_subject)
3722 {
3723 SCHECK_PARTIAL();
3724 MRRETURN(MATCH_NOMATCH);
3725 }
3726 GETCHARINC(c, eptr);
3727 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3728 MRRETURN(MATCH_NOMATCH);
3729 }
3730 break;
3731
3732 case OP_DIGIT:
3733 for (i = 1; i <= min; i++)
3734 {
3735 if (eptr >= md->end_subject)
3736 {
3737 SCHECK_PARTIAL();
3738 MRRETURN(MATCH_NOMATCH);
3739 }
3740 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3741 MRRETURN(MATCH_NOMATCH);
3742 /* No need to skip more bytes - we know it's a 1-byte character */
3743 }
3744 break;
3745
3746 case OP_NOT_WHITESPACE:
3747 for (i = 1; i <= min; i++)
3748 {
3749 if (eptr >= md->end_subject)
3750 {
3751 SCHECK_PARTIAL();
3752 MRRETURN(MATCH_NOMATCH);
3753 }
3754 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3755 MRRETURN(MATCH_NOMATCH);
3756 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3757 }
3758 break;
3759
3760 case OP_WHITESPACE:
3761 for (i = 1; i <= min; i++)
3762 {
3763 if (eptr >= md->end_subject)
3764 {
3765 SCHECK_PARTIAL();
3766 MRRETURN(MATCH_NOMATCH);
3767 }
3768 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3769 MRRETURN(MATCH_NOMATCH);
3770 /* No need to skip more bytes - we know it's a 1-byte character */
3771 }
3772 break;
3773
3774 case OP_NOT_WORDCHAR:
3775 for (i = 1; i <= min; i++)
3776 {
3777 if (eptr >= md->end_subject)
3778 {
3779 SCHECK_PARTIAL();
3780 MRRETURN(MATCH_NOMATCH);
3781 }
3782 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3783 MRRETURN(MATCH_NOMATCH);
3784 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3785 }
3786 break;
3787
3788 case OP_WORDCHAR:
3789 for (i = 1; i <= min; i++)
3790 {
3791 if (eptr >= md->end_subject)
3792 {
3793 SCHECK_PARTIAL();
3794 MRRETURN(MATCH_NOMATCH);
3795 }
3796 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3797 MRRETURN(MATCH_NOMATCH);
3798 /* No need to skip more bytes - we know it's a 1-byte character */
3799 }
3800 break;
3801
3802 default:
3803 RRETURN(PCRE_ERROR_INTERNAL);
3804 } /* End switch(ctype) */
3805
3806 else
3807 #endif /* SUPPORT_UTF8 */
3808
3809 /* Code for the non-UTF-8 case for minimum matching of operators other
3810 than OP_PROP and OP_NOTPROP. */
3811
3812 switch(ctype)
3813 {
3814 case OP_ANY:
3815 for (i = 1; i <= min; i++)
3816 {
3817 if (eptr >= md->end_subject)
3818 {
3819 SCHECK_PARTIAL();
3820 MRRETURN(MATCH_NOMATCH);
3821 }
3822 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3823 eptr++;
3824 }
3825 break;
3826
3827 case OP_ALLANY:
3828 if (eptr > md->end_subject - min)
3829 {
3830 SCHECK_PARTIAL();
3831 MRRETURN(MATCH_NOMATCH);
3832 }
3833 eptr += min;
3834 break;
3835
3836 case OP_ANYBYTE:
3837 if (eptr > md->end_subject - min)
3838 {
3839 SCHECK_PARTIAL();
3840 MRRETURN(MATCH_NOMATCH);
3841 }
3842 eptr += min;
3843 break;
3844
3845 case OP_ANYNL:
3846 for (i = 1; i <= min; i++)
3847 {
3848 if (eptr >= md->end_subject)
3849 {
3850 SCHECK_PARTIAL();
3851 MRRETURN(MATCH_NOMATCH);
3852 }
3853 switch(*eptr++)
3854 {
3855 default: MRRETURN(MATCH_NOMATCH);
3856 case 0x000d:
3857 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3858 break;
3859 case 0x000a:
3860 break;
3861
3862 case 0x000b:
3863 case 0x000c:
3864 case 0x0085:
3865 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3866 break;
3867 }
3868 }
3869 break;
3870
3871 case OP_NOT_HSPACE:
3872 for (i = 1; i <= min; i++)
3873 {
3874 if (eptr >= md->end_subject)
3875 {
3876 SCHECK_PARTIAL();
3877 MRRETURN(MATCH_NOMATCH);
3878 }
3879 switch(*eptr++)
3880 {
3881 default: break;
3882 case 0x09: /* HT */
3883 case 0x20: /* SPACE */
3884 case 0xa0: /* NBSP */
3885 MRRETURN(MATCH_NOMATCH);
3886 }
3887 }
3888 break;
3889
3890 case OP_HSPACE:
3891 for (i = 1; i <= min; i++)
3892 {
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 MRRETURN(MATCH_NOMATCH);
3897 }
3898 switch(*eptr++)
3899 {
3900 default: MRRETURN(MATCH_NOMATCH);
3901 case 0x09: /* HT */
3902 case 0x20: /* SPACE */
3903 case 0xa0: /* NBSP */
3904 break;
3905 }
3906 }
3907 break;
3908
3909 case OP_NOT_VSPACE:
3910 for (i = 1; i <= min; i++)
3911 {
3912 if (eptr >= md->end_subject)
3913 {
3914 SCHECK_PARTIAL();
3915 MRRETURN(MATCH_NOMATCH);
3916 }
3917 switch(*eptr++)
3918 {
3919 default: break;
3920 case 0x0a: /* LF */
3921 case 0x0b: /* VT */
3922 case 0x0c: /* FF */
3923 case 0x0d: /* CR */
3924 case 0x85: /* NEL */
3925 MRRETURN(MATCH_NOMATCH);
3926 }
3927 }
3928 break;
3929
3930 case OP_VSPACE:
3931 for (i = 1; i <= min; i++)
3932 {
3933 if (eptr >= md->end_subject)
3934 {
3935 SCHECK_PARTIAL();
3936 MRRETURN(MATCH_NOMATCH);
3937 }
3938 switch(*eptr++)
3939 {
3940 default: MRRETURN(MATCH_NOMATCH);
3941 case 0x0a: /* LF */
3942 case 0x0b: /* VT */
3943 case 0x0c: /* FF */
3944 case 0x0d: /* CR */
3945 case 0x85: /* NEL */
3946 break;
3947 }
3948 }
3949 break;
3950
3951 case OP_NOT_DIGIT:
3952 for (i = 1; i <= min; i++)
3953 {
3954 if (eptr >= md->end_subject)
3955 {
3956 SCHECK_PARTIAL();
3957 MRRETURN(MATCH_NOMATCH);
3958 }
3959 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
3960 }
3961 break;
3962
3963 case OP_DIGIT:
3964 for (i = 1; i <= min; i++)
3965 {
3966 if (eptr >= md->end_subject)
3967 {
3968 SCHECK_PARTIAL();
3969 MRRETURN(MATCH_NOMATCH);
3970 }
3971 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
3972 }
3973 break;
3974
3975 case OP_NOT_WHITESPACE:
3976 for (i = 1; i <= min; i++)
3977 {
3978 if (eptr >= md->end_subject)
3979 {
3980 SCHECK_PARTIAL();
3981 MRRETURN(MATCH_NOMATCH);
3982 }
3983 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
3984 }
3985 break;
3986
3987 case OP_WHITESPACE:
3988 for (i = 1; i <= min; i++)
3989 {
3990 if (eptr >= md->end_subject)
3991 {
3992 SCHECK_PARTIAL();
3993 MRRETURN(MATCH_NOMATCH);
3994 }
3995 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
3996 }
3997 break;
3998
3999 case OP_NOT_WORDCHAR:
4000 for (i = 1; i <= min; i++)
4001 {
4002 if (eptr >= md->end_subject)
4003 {
4004 SCHECK_PARTIAL();
4005 MRRETURN(MATCH_NOMATCH);
4006 }
4007 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4008 MRRETURN(MATCH_NOMATCH);
4009 }
4010 break;
4011
4012 case OP_WORDCHAR:
4013 for (i = 1; i <= min; i++)
4014 {
4015 if (eptr >= md->end_subject)
4016 {
4017 SCHECK_PARTIAL();
4018 MRRETURN(MATCH_NOMATCH);
4019 }
4020 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4021 MRRETURN(MATCH_NOMATCH);
4022 }
4023 break;
4024
4025 default:
4026 RRETURN(PCRE_ERROR_INTERNAL);
4027 }
4028 }
4029
4030 /* If min = max, continue at the same level without recursing */
4031
4032 if (min == max) continue;
4033
4034 /* If minimizing, we have to test the rest of the pattern before each
4035 subsequent match. Again, separate the UTF-8 case for speed, and also
4036 separate the UCP cases. */
4037
4038 if (minimize)
4039 {
4040 #ifdef SUPPORT_UCP
4041 if (prop_type >= 0)
4042 {
4043 switch(prop_type)
4044 {
4045 case PT_ANY:
4046 for (fi = min;; fi++)
4047 {
4048 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4050 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4051 if (eptr >= md->end_subject)
4052 {
4053 SCHECK_PARTIAL();
4054 MRRETURN(MATCH_NOMATCH);
4055 }
4056 GETCHARINC(c, eptr);
4057 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4058 }
4059 /* Control never gets here */
4060
4061 case PT_LAMP:
4062 for (fi = min;; fi++)
4063 {
4064 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4065 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4066 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4067 if (eptr >= md->end_subject)
4068 {
4069 SCHECK_PARTIAL();
4070 MRRETURN(MATCH_NOMATCH);
4071 }
4072 GETCHARINC(c, eptr);
4073 prop_chartype = UCD_CHARTYPE(c);
4074 if ((prop_chartype == ucp_Lu ||
4075 prop_chartype == ucp_Ll ||
4076 prop_chartype == ucp_Lt) == prop_fail_result)
4077 MRRETURN(MATCH_NOMATCH);
4078 }
4079 /* Control never gets here */
4080
4081 case PT_GC:
4082 for (fi = min;; fi++)
4083 {
4084 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4086 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4087 if (eptr >= md->end_subject)
4088 {
4089 SCHECK_PARTIAL();
4090 MRRETURN(MATCH_NOMATCH);
4091 }
4092 GETCHARINC(c, eptr);
4093 prop_category = UCD_CATEGORY(c);
4094 if ((prop_category == prop_value) == prop_fail_result)
4095 MRRETURN(MATCH_NOMATCH);
4096 }
4097 /* Control never gets here */
4098
4099 case PT_PC:
4100 for (fi = min;; fi++)
4101 {
4102 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4103 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4104 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4105 if (eptr >= md->end_subject)
4106 {
4107 SCHECK_PARTIAL();
4108 MRRETURN(MATCH_NOMATCH);
4109 }
4110 GETCHARINC(c, eptr);
4111 prop_chartype = UCD_CHARTYPE(c);
4112 if ((prop_chartype == prop_value) == prop_fail_result)
4113 MRRETURN(MATCH_NOMATCH);
4114 }
4115 /* Control never gets here */
4116
4117 case PT_SC:
4118 for (fi = min;; fi++)
4119 {
4120 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4122 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4123 if (eptr >= md->end_subject)
4124 {
4125 SCHECK_PARTIAL();
4126 MRRETURN(MATCH_NOMATCH);
4127 }
4128 GETCHARINC(c, eptr);
4129 prop_script = UCD_SCRIPT(c);
4130 if ((prop_script == prop_value) == prop_fail_result)
4131 MRRETURN(MATCH_NOMATCH);
4132 }
4133 /* Control never gets here */
4134
4135 default:
4136 RRETURN(PCRE_ERROR_INTERNAL);
4137 }
4138 }
4139
4140 /* Match extended Unicode sequences. We will get here only if the
4141 support is in the binary; otherwise a compile-time error occurs. */
4142
4143 else if (ctype == OP_EXTUNI)
4144 {
4145 for (fi = min;; fi++)
4146 {
4147 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4148 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4149 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4150 if (eptr >= md->end_subject)
4151 {
4152 SCHECK_PARTIAL();
4153 MRRETURN(MATCH_NOMATCH);
4154 }
4155 GETCHARINCTEST(c, eptr);
4156 prop_category = UCD_CATEGORY(c);
4157 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4158 while (eptr < md->end_subject)
4159 {
4160 int len = 1;
4161 if (!utf8) c = *eptr;
4162 else { GETCHARLEN(c, eptr, len); }
4163 prop_category = UCD_CATEGORY(c);
4164 if (prop_category != ucp_M) break;
4165 eptr += len;
4166 }
4167 }
4168 }
4169
4170 else
4171 #endif /* SUPPORT_UCP */
4172
4173 #ifdef SUPPORT_UTF8
4174 /* UTF-8 mode */
4175 if (utf8)
4176 {
4177 for (fi = min;; fi++)
4178 {
4179 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4180 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4181 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4182 if (eptr >= md->end_subject)
4183 {
4184 SCHECK_PARTIAL();
4185 MRRETURN(MATCH_NOMATCH);
4186 }
4187 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4188 MRRETURN(MATCH_NOMATCH);
4189 GETCHARINC(c, eptr);
4190 switch(ctype)
4191 {
4192 case OP_ANY: /* This is the non-NL case */
4193 case OP_ALLANY:
4194 case OP_ANYBYTE:
4195 break;
4196
4197 case OP_ANYNL:
4198 switch(c)
4199 {
4200 default: MRRETURN(MATCH_NOMATCH);
4201 case 0x000d:
4202 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4203 break;
4204 case 0x000a:
4205 break;
4206
4207 case 0x000b:
4208 case 0x000c:
4209 case 0x0085:
4210 case 0x2028:
4211 case 0x2029:
4212 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4213 break;
4214 }
4215 break;
4216
4217 case OP_NOT_HSPACE:
4218 switch(c)
4219 {
4220 default: break;
4221 case 0x09: /* HT */
4222 case 0x20: /* SPACE */
4223 case 0xa0: /* NBSP */
4224 case 0x1680: /* OGHAM SPACE MARK */
4225 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4226 case 0x2000: /* EN QUAD */
4227 case 0x2001: /* EM QUAD */
4228 case 0x2002: /* EN SPACE */
4229 case 0x2003: /* EM SPACE */
4230 case 0x2004: /* THREE-PER-EM SPACE */
4231 case 0x2005: /* FOUR-PER-EM SPACE */
4232 case 0x2006: /* SIX-PER-EM SPACE */
4233 case 0x2007: /* FIGURE SPACE */
4234 case 0x2008: /* PUNCTUATION SPACE */
4235 case 0x2009: /* THIN SPACE */
4236 case 0x200A: /* HAIR SPACE */
4237 case 0x202f: /* NARROW NO-BREAK SPACE */
4238 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4239 case 0x3000: /* IDEOGRAPHIC SPACE */
4240 MRRETURN(MATCH_NOMATCH);
4241 }
4242 break;
4243
4244 case OP_HSPACE:
4245 switch(c)
4246 {
4247 default: MRRETURN(MATCH_NOMATCH);
4248 case 0x09: /* HT */
4249 case 0x20: /* SPACE */
4250 case 0xa0: /* NBSP */
4251 case 0x1680: /* OGHAM SPACE MARK */
4252 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4253 case 0x2000: /* EN QUAD */
4254 case 0x2001: /* EM QUAD */
4255 case 0x2002: /* EN SPACE */
4256 case 0x2003: /* EM SPACE */
4257 case 0x2004: /* THREE-PER-EM SPACE */
4258 case 0x2005: /* FOUR-PER-EM SPACE */
4259 case 0x2006: /* SIX-PER-EM SPACE */
4260 case 0x2007: /* FIGURE SPACE */
4261 case 0x2008: /* PUNCTUATION SPACE */
4262 case 0x2009: /* THIN SPACE */
4263 case 0x200A: /* HAIR SPACE */
4264 case 0x202f: /* NARROW NO-BREAK SPACE */
4265 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4266 case 0x3000: /* IDEOGRAPHIC SPACE */
4267 break;
4268 }
4269 break;
4270
4271 case OP_NOT_VSPACE:
4272 switch(c)
4273 {
4274 default: break;
4275 case 0x0a: /* LF */
4276 case 0x0b: /* VT */
4277 case 0x0c: /* FF */
4278 case 0x0d: /* CR */
4279 case 0x85: /* NEL */
4280 case 0x2028: /* LINE SEPARATOR */
4281 case 0x2029: /* PARAGRAPH SEPARATOR */
4282 MRRETURN(MATCH_NOMATCH);
4283 }
4284 break;
4285
4286 case OP_VSPACE:
4287 switch(c)
4288 {
4289 default: MRRETURN(MATCH_NOMATCH);
4290 case 0x0a: /* LF */
4291 case 0x0b: /* VT */
4292 case 0x0c: /* FF */
4293 case 0x0d: /* CR */
4294 case 0x85: /* NEL */
4295 case 0x2028: /* LINE SEPARATOR */
4296 case 0x2029: /* PARAGRAPH SEPARATOR */
4297 break;
4298 }
4299 break;
4300
4301 case OP_NOT_DIGIT:
4302 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4303 MRRETURN(MATCH_NOMATCH);
4304 break;
4305
4306 case OP_DIGIT:
4307 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4308 MRRETURN(MATCH_NOMATCH);
4309 break;
4310
4311 case OP_NOT_WHITESPACE:
4312 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4313 MRRETURN(MATCH_NOMATCH);
4314 break;
4315
4316 case OP_WHITESPACE:
4317 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4318 MRRETURN(MATCH_NOMATCH);
4319 break;
4320
4321 case OP_NOT_WORDCHAR:
4322 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4323 MRRETURN(MATCH_NOMATCH);
4324 break;
4325
4326 case OP_WORDCHAR:
4327 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4328 MRRETURN(MATCH_NOMATCH);
4329 break;
4330
4331 default:
4332 RRETURN(PCRE_ERROR_INTERNAL);
4333 }
4334 }
4335 }
4336 else
4337 #endif
4338 /* Not UTF-8 mode */
4339 {
4340 for (fi = min;; fi++)
4341 {
4342 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4343 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4344 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4345 if (eptr >= md->end_subject)
4346 {
4347 SCHECK_PARTIAL();
4348 MRRETURN(MATCH_NOMATCH);
4349 }
4350 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4351 MRRETURN(MATCH_NOMATCH);
4352 c = *eptr++;
4353 switch(ctype)
4354 {
4355 case OP_ANY: /* This is the non-NL case */
4356 case OP_ALLANY:
4357 case OP_ANYBYTE:
4358 break;
4359
4360 case OP_ANYNL:
4361 switch(c)
4362 {
4363 default: MRRETURN(MATCH_NOMATCH);
4364 case 0x000d:
4365 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4366 break;
4367
4368 case 0x000a:
4369 break;
4370
4371 case 0x000b:
4372 case 0x000c:
4373 case 0x0085:
4374 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4375 break;
4376 }
4377 break;
4378
4379 case OP_NOT_HSPACE:
4380 switch(c)
4381 {
4382 default: break;
4383 case 0x09: /* HT */
4384 case 0x20: /* SPACE */
4385 case 0xa0: /* NBSP */
4386 MRRETURN(MATCH_NOMATCH);
4387 }
4388 break;
4389
4390 case OP_HSPACE:
4391 switch(c)
4392 {
4393 default: MRRETURN(MATCH_NOMATCH);
4394 case 0x09: /* HT */
4395 case 0x20: /* SPACE */
4396 case 0xa0: /* NBSP */
4397 break;
4398 }
4399 break;
4400
4401 case OP_NOT_VSPACE:
4402 switch(c)
4403 {
4404 default: break;
4405 case 0x0a: /* LF */
4406 case 0x0b: /* VT */
4407 case 0x0c: /* FF */
4408 case 0x0d: /* CR */
4409 case 0x85: /* NEL */
4410 MRRETURN(MATCH_NOMATCH);
4411 }
4412 break;
4413
4414 case OP_VSPACE:
4415 switch(c)
4416 {
4417 default: MRRETURN(MATCH_NOMATCH);
4418 case 0x0a: /* LF */
4419 case 0x0b: /* VT */
4420 case 0x0c: /* FF */
4421 case 0x0d: /* CR */
4422 case 0x85: /* NEL */
4423 break;
4424 }
4425 break;
4426
4427 case OP_NOT_DIGIT:
4428 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4429 break;
4430
4431 case OP_DIGIT:
4432 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4433 break;
4434
4435 case OP_NOT_WHITESPACE:
4436 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4437 break;
4438
4439 case OP_WHITESPACE:
4440 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4441 break;
4442
4443 case OP_NOT_WORDCHAR:
4444 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4445 break;
4446
4447 case OP_WORDCHAR:
4448 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4449 break;
4450
4451 default:
4452 RRETURN(PCRE_ERROR_INTERNAL);
4453 }
4454 }
4455 }
4456 /* Control never gets here */
4457 }
4458
4459 /* If maximizing, it is worth using inline code for speed, doing the type
4460 test once at the start (i.e. keep it out of the loop). Again, keep the
4461 UTF-8 and UCP stuff separate. */
4462
4463 else
4464 {
4465 pp = eptr; /* Remember where we started */
4466
4467 #ifdef SUPPORT_UCP
4468 if (prop_type >= 0)
4469 {
4470 switch(prop_type)
4471 {
4472 case PT_ANY:
4473 for (i = min; i < max; i++)
4474 {
4475 int len = 1;
4476 if (eptr >= md->end_subject)
4477 {
4478 SCHECK_PARTIAL();
4479 break;
4480 }
4481 GETCHARLEN(c, eptr, len);
4482 if (prop_fail_result) break;
4483 eptr+= len;
4484 }
4485 break;
4486
4487 case PT_LAMP:
4488 for (i = min; i < max; i++)
4489 {
4490 int len = 1;
4491 if (eptr >= md->end_subject)
4492 {
4493 SCHECK_PARTIAL();
4494 break;
4495 }
4496 GETCHARLEN(c, eptr, len);
4497 prop_chartype = UCD_CHARTYPE(c);
4498 if ((prop_chartype == ucp_Lu ||
4499 prop_chartype == ucp_Ll ||
4500 prop_chartype == ucp_Lt) == prop_fail_result)
4501 break;
4502 eptr+= len;
4503 }
4504 break;
4505
4506 case PT_GC:
4507 for (i = min; i < max; i++)
4508 {
4509 int len = 1;
4510 if (eptr >= md->end_subject)
4511 {
4512 SCHECK_PARTIAL();
4513 break;
4514 }
4515 GETCHARLEN(c, eptr, len);
4516 prop_category = UCD_CATEGORY(c);
4517 if ((prop_category == prop_value) == prop_fail_result)
4518 break;
4519 eptr+= len;
4520 }
4521 break;
4522
4523 case PT_PC:
4524 for (i = min; i < max; i++)
4525 {
4526 int len = 1;
4527 if (eptr >= md->end_subject)
4528 {
4529 SCHECK_PARTIAL();
4530 break;
4531 }
4532 GETCHARLEN(c, eptr, len);
4533 prop_chartype = UCD_CHARTYPE(c);
4534 if ((prop_chartype == prop_value) == prop_fail_result)
4535 break;
4536 eptr+= len;
4537 }
4538 break;
4539
4540 case PT_SC:
4541 for (i = min; i < max; i++)
4542 {
4543 int len = 1;
4544 if (eptr >= md->end_subject)
4545 {
4546 SCHECK_PARTIAL();
4547 break;
4548 }
4549 GETCHARLEN(c, eptr, len);
4550 prop_script = UCD_SCRIPT(c);
4551 if ((prop_script == prop_value) == prop_fail_result)
4552 break;
4553 eptr+= len;
4554 }
4555 break;
4556 }
4557
4558 /* eptr is now past the end of the maximum run */
4559
4560 if (possessive) continue;
4561 for(;;)
4562 {
4563 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4564 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4565 if (eptr-- == pp) break; /* Stop if tried at original pos */
4566 if (utf8) BACKCHAR(eptr);
4567 }
4568 }
4569
4570 /* Match extended Unicode sequences. We will get here only if the
4571 support is in the binary; otherwise a compile-time error occurs. */
4572
4573 else if (ctype == OP_EXTUNI)
4574 {
4575 for (i = min; i < max; i++)
4576 {
4577 if (eptr >= md->end_subject)
4578 {
4579 SCHECK_PARTIAL();
4580 break;
4581 }
4582 GETCHARINCTEST(c, eptr);
4583 prop_category = UCD_CATEGORY(c);
4584 if (prop_category == ucp_M) break;
4585 while (eptr < md->end_subject)
4586 {
4587 int len = 1;
4588 if (!utf8) c = *eptr; else
4589 {
4590 GETCHARLEN(c, eptr, len);
4591 }
4592 prop_category = UCD_CATEGORY(c);
4593 if (prop_category != ucp_M) break;
4594 eptr += len;
4595 }
4596 }
4597
4598 /* eptr is now past the end of the maximum run */
4599
4600 if (possessive) continue;
4601
4602 for(;;)
4603 {
4604 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4605 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4606 if (eptr-- == pp) break; /* Stop if tried at original pos */
4607 for (;;) /* Move back over one extended */
4608 {
4609 int len = 1;
4610 if (!utf8) c = *eptr; else
4611 {
4612 BACKCHAR(eptr);
4613 GETCHARLEN(c, eptr, len);
4614 }
4615 prop_category = UCD_CATEGORY(c);
4616 if (prop_category != ucp_M) break;
4617 eptr--;
4618 }
4619 }
4620 }
4621
4622 else
4623 #endif /* SUPPORT_UCP */
4624
4625 #ifdef SUPPORT_UTF8
4626 /* UTF-8 mode */
4627
4628 if (utf8)
4629 {
4630 switch(ctype)
4631 {
4632 case OP_ANY:
4633 if (max < INT_MAX)
4634 {
4635 for (i = min; i < max; i++)
4636 {
4637 if (eptr >= md->end_subject)
4638 {
4639 SCHECK_PARTIAL();
4640 break;
4641 }
4642 if (IS_NEWLINE(eptr)) break;
4643 eptr++;
4644 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4645 }
4646 }
4647
4648 /* Handle unlimited UTF-8 repeat */
4649
4650 else
4651 {
4652 for (i = min; i < max; i++)
4653 {
4654 if (eptr >= md->end_subject)
4655 {
4656 SCHECK_PARTIAL();
4657 break;
4658 }
4659 if (IS_NEWLINE(eptr)) break;
4660 eptr++;
4661 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4662 }
4663 }
4664 break;
4665
4666 case OP_ALLANY:
4667 if (max < INT_MAX)
4668 {
4669 for (i = min; i < max; i++)
4670 {
4671 if (eptr >= md->end_subject)
4672 {
4673 SCHECK_PARTIAL();
4674 break;
4675 }
4676 eptr++;
4677 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4678 }
4679 }
4680 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4681 break;
4682
4683 /* The byte case is the same as non-UTF8 */
4684
4685 case OP_ANYBYTE:
4686 c = max - min;
4687 if (c > (unsigned int)(md->end_subject - eptr))
4688 {
4689 eptr = md->end_subject;
4690 SCHECK_PARTIAL();
4691 }
4692 else eptr += c;
4693 break;
4694
4695 case OP_ANYNL:
4696 for (i = min; i < max; i++)
4697 {
4698 int len = 1;
4699 if (eptr >= md->end_subject)
4700 {
4701 SCHECK_PARTIAL();
4702 break;
4703 }
4704 GETCHARLEN(c, eptr, len);
4705 if (c == 0x000d)
4706 {
4707 if (++eptr >= md->end_subject) break;
4708 if (*eptr == 0x000a) eptr++;
4709 }
4710 else
4711 {
4712 if (c != 0x000a &&
4713 (md->bsr_anycrlf ||
4714 (c != 0x000b && c != 0x000c &&
4715 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4716 break;
4717 eptr += len;
4718 }
4719 }
4720 break;
4721
4722 case OP_NOT_HSPACE:
4723 case OP_HSPACE:
4724 for (i = min; i < max; i++)
4725 {
4726 BOOL gotspace;
4727 int len = 1;
4728 if (eptr >= md->end_subject)
4729 {
4730 SCHECK_PARTIAL();
4731 break;
4732 }
4733 GETCHARLEN(c, eptr, len);
4734 switch(c)
4735 {
4736 default: gotspace = FALSE; break;
4737 case 0x09: /* HT */
4738 case 0x20: /* SPACE */
4739 case 0xa0: /* NBSP */
4740 case 0x1680: /* OGHAM SPACE MARK */
4741 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4742 case 0x2000: /* EN QUAD */
4743 case 0x2001: /* EM QUAD */
4744 case 0x2002: /* EN SPACE */
4745 case 0x2003: /* EM SPACE */
4746 case 0x2004: /* THREE-PER-EM SPACE */
4747 case 0x2005: /* FOUR-PER-EM SPACE */
4748 case 0x2006: /* SIX-PER-EM SPACE */
4749 case 0x2007: /* FIGURE SPACE */
4750 case 0x2008: /* PUNCTUATION SPACE */
4751 case 0x2009: /* THIN SPACE */
4752 case 0x200A: /* HAIR SPACE */
4753 case 0x202f: /* NARROW NO-BREAK SPACE */
4754 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4755 case 0x3000: /* IDEOGRAPHIC SPACE */
4756 gotspace = TRUE;
4757 break;
4758 }
4759 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4760 eptr += len;
4761 }
4762 break;
4763
4764 case OP_NOT_VSPACE:
4765 case OP_VSPACE:
4766 for (i = min; i < max; i++)
4767 {
4768 BOOL gotspace;
4769 int len = 1;
4770 if (eptr >= md->end_subject)
4771 {
4772 SCHECK_PARTIAL();
4773 break;
4774 }
4775 GETCHARLEN(c, eptr, len);
4776 switch(c)
4777 {
4778 default: gotspace = FALSE; break;
4779 case 0x0a: /* LF */
4780 case 0x0b: /* VT */
4781 case 0x0c: /* FF */
4782 case 0x0d: /* CR */
4783 case 0x85: /* NEL */
4784 case 0x2028: /* LINE SEPARATOR */
4785 case 0x2029: /* PARAGRAPH SEPARATOR */
4786 gotspace = TRUE;
4787 break;
4788 }
4789 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4790 eptr += len;
4791 }
4792 break;
4793
4794 case OP_NOT_DIGIT:
4795 for (i = min; i < max; i++)
4796 {
4797 int len = 1;
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 break;
4802 }
4803 GETCHARLEN(c, eptr, len);
4804 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4805 eptr+= len;
4806 }
4807 break;
4808
4809 case OP_DIGIT:
4810 for (i = min; i < max; i++)
4811 {
4812 int len = 1;
4813 if (eptr >= md->end_subject)
4814 {
4815 SCHECK_PARTIAL();
4816 break;
4817 }
4818 GETCHARLEN(c, eptr, len);
4819 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4820 eptr+= len;
4821 }
4822 break;
4823
4824 case OP_NOT_WHITESPACE:
4825 for (i = min; i < max; i++)
4826 {
4827 int len = 1;
4828 if (eptr >= md->end_subject)
4829 {
4830 SCHECK_PARTIAL();
4831 break;
4832 }
4833 GETCHARLEN(c, eptr, len);
4834 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4835 eptr+= len;
4836 }
4837 break;
4838
4839 case OP_WHITESPACE:
4840 for (i = min; i < max; i++)
4841 {
4842 int len = 1;
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 break;
4847 }
4848 GETCHARLEN(c, eptr, len);
4849 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4850 eptr+= len;
4851 }
4852 break;
4853
4854 case OP_NOT_WORDCHAR:
4855 for (i = min; i < max; i++)
4856 {
4857 int len = 1;
4858 if (eptr >= md->end_subject)
4859 {
4860 SCHECK_PARTIAL();
4861 break;
4862 }
4863 GETCHARLEN(c, eptr, len);
4864 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4865 eptr+= len;
4866 }
4867 break;
4868
4869 case OP_WORDCHAR:
4870 for (i = min; i < max; i++)
4871 {
4872 int len = 1;
4873 if (eptr >= md->end_subject)
4874 {
4875 SCHECK_PARTIAL();
4876 break;
4877 }
4878 GETCHARLEN(c, eptr, len);
4879 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4880 eptr+= len;
4881 }
4882 break;
4883
4884 default:
4885 RRETURN(PCRE_ERROR_INTERNAL);
4886 }
4887
4888 /* eptr is now past the end of the maximum run */
4889
4890 if (possessive) continue;
4891 for(;;)
4892 {
4893 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4895 if (eptr-- == pp) break; /* Stop if tried at original pos */
4896 BACKCHAR(eptr);
4897 }
4898 }
4899 else
4900 #endif /* SUPPORT_UTF8 */
4901
4902 /* Not UTF-8 mode */
4903 {
4904 switch(ctype)
4905 {
4906 case OP_ANY:
4907 for (i = min; i < max; i++)
4908 {
4909 if (eptr >= md->end_subject)
4910 {
4911 SCHECK_PARTIAL();
4912 break;
4913 }
4914 if (IS_NEWLINE(eptr)) break;
4915 eptr++;
4916 }
4917 break;
4918
4919 case OP_ALLANY:
4920 case OP_ANYBYTE:
4921 c = max - min;
4922 if (c > (unsigned int)(md->end_subject - eptr))
4923 {
4924 eptr = md->end_subject;
4925 SCHECK_PARTIAL();
4926 }
4927 else eptr += c;
4928 break;
4929
4930 case OP_ANYNL:
4931 for (i = min; i < max; i++)
4932 {
4933 if (eptr >= md->end_subject)
4934 {
4935 SCHECK_PARTIAL();
4936 break;
4937 }
4938 c = *eptr;
4939 if (c == 0x000d)
4940 {
4941 if (++eptr >= md->end_subject) break;
4942 if (*eptr == 0x000a) eptr++;
4943 }
4944 else
4945 {
4946 if (c != 0x000a &&
4947 (md->bsr_anycrlf ||
4948 (c != 0x000b && c != 0x000c && c != 0x0085)))
4949 break;
4950 eptr++;
4951 }
4952 }
4953 break;
4954
4955 case OP_NOT_HSPACE:
4956 for (i = min; i < max; i++)
4957 {
4958 if (eptr >= md->end_subject)
4959 {
4960 SCHECK_PARTIAL();
4961 break;
4962 }
4963 c = *eptr;
4964 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4965 eptr++;
4966 }
4967 break;
4968
4969 case OP_HSPACE:
4970 for (i = min; i < max; i++)
4971 {
4972 if (eptr >= md->end_subject)
4973 {
4974 SCHECK_PARTIAL();
4975 break;
4976 }
4977 c = *eptr;
4978 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4979 eptr++;
4980 }
4981 break;
4982
4983 case OP_NOT_VSPACE:
4984 for (i = min; i < max; i++)
4985 {
4986 if (eptr >= md->end_subject)
4987 {
4988 SCHECK_PARTIAL();
4989 break;
4990 }
4991 c = *eptr;
4992 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4993 break;
4994 eptr++;
4995 }
4996 break;
4997
4998 case OP_VSPACE:
4999 for (i = min; i < max; i++)
5000 {
5001 if (eptr >= md->end_subject)
5002 {
5003 SCHECK_PARTIAL();
5004 break;
5005 }
5006 c = *eptr;
5007 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5008 break;
5009 eptr++;
5010 }
5011 break;
5012
5013 case OP_NOT_DIGIT:
5014 for (i = min; i < max; i++)
5015 {
5016 if (eptr >= md->end_subject)
5017 {
5018 SCHECK_PARTIAL();
5019 break;
5020 }
5021 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5022 eptr++;
5023 }
5024 break;
5025
5026 case OP_DIGIT:
5027 for (i = min; i < max; i++)
5028 {
5029 if (eptr >= md->end_subject)
5030 {
5031 SCHECK_PARTIAL();
5032 break;
5033 }
5034 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5035 eptr++;
5036 }
5037 break;
5038
5039 case OP_NOT_WHITESPACE:
5040 for (i = min; i < max; i++)
5041 {
5042 if (eptr >= md->end_subject)
5043 {
5044 SCHECK_PARTIAL();
5045 break;
5046 }
5047 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5048 eptr++;
5049 }
5050 break;
5051
5052 case OP_WHITESPACE:
5053 for (i = min; i < max; i++)
5054 {
5055 if (eptr >= md->end_subject)
5056 {
5057 SCHECK_PARTIAL();
5058 break;
5059 }
5060 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5061 eptr++;
5062 }
5063 break;
5064
5065 case OP_NOT_WORDCHAR:
5066 for (i = min; i < max; i++)
5067 {
5068 if (eptr >= md->end_subject)
5069 {
5070 SCHECK_PARTIAL();
5071 break;
5072 }
5073 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5074 eptr++;
5075 }
5076 break;
5077
5078 case OP_WORDCHAR:
5079 for (i = min; i < max; i++)
5080 {
5081 if (eptr >= md->end_subject)
5082 {
5083 SCHECK_PARTIAL();
5084 break;
5085 }
5086 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5087 eptr++;
5088 }
5089 break;
5090
5091 default:
5092 RRETURN(PCRE_ERROR_INTERNAL);
5093 }
5094
5095 /* eptr is now past the end of the maximum run */
5096
5097 if (possessive) continue;
5098 while (eptr >= pp)
5099 {
5100 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5101 eptr--;
5102 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5103 }
5104 }
5105
5106 /* Get here if we can't make it match with any permitted repetitions */
5107
5108 MRRETURN(MATCH_NOMATCH);
5109 }
5110 /* Control never gets here */
5111
5112 /* There's been some horrible disaster. Arrival here can only mean there is
5113 something seriously wrong in the code above or the OP_xxx definitions. */
5114
5115 default:
5116 DPRINTF(("Unknown opcode %d\n", *ecode));
5117 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5118 }
5119
5120 /* Do not stick any code in here without much thought; it is assumed
5121 that "continue" in the code above comes out to here to repeat the main
5122 loop. */
5123
5124 } /* End of main loop */
5125 /* Control never reaches here */
5126
5127
5128 /* When compiling to use the heap rather than the stack for recursive calls to
5129 match(), the RRETURN() macro jumps here. The number that is saved in
5130 frame->Xwhere indicates which label we actually want to return to. */
5131
5132 #ifdef NO_RECURSE
5133 #define LBL(val) case val: goto L_RM##val;
5134 HEAP_RETURN:
5135 switch (frame->Xwhere)
5136 {
5137 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5138 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5139 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5140 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5141 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5142 #ifdef SUPPORT_UTF8
5143 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5144 LBL(32) LBL(34) LBL(42) LBL(46)
5145 #ifdef SUPPORT_UCP
5146 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5147 #endif /* SUPPORT_UCP */
5148 #endif /* SUPPORT_UTF8 */
5149 default:
5150 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5151 return PCRE_ERROR_INTERNAL;
5152 }
5153 #undef LBL
5154 #endif /* NO_RECURSE */
5155 }
5156
5157
5158 /***************************************************************************
5159 ****************************************************************************
5160 RECURSION IN THE match() FUNCTION
5161
5162 Undefine all the macros that were defined above to handle this. */
5163
5164 #ifdef NO_RECURSE /* these names are macros only in the heap-based (NO_RECURSE) build of match() */
5165 #undef eptr
5166 #undef ecode
5167 #undef mstart
5168 #undef offset_top
5169 #undef ims /* pcre_exec() below declares its own local 'ims' */
5170 #undef eptrb
5171 #undef flags /* pcre_exec() below declares its own local 'flags' */
5172
5173 #undef callpat
5174 #undef charptr
5175 #undef data
5176 #undef next
5177 #undef pp /* pcre_exec() below declares its own local 'pp' */
5178 #undef prev
5179 #undef saved_eptr
5180
5181 #undef new_recursive
5182
5183 #undef cur_is_word
5184 #undef condition
5185 #undef prev_is_word
5186
5187 #undef original_ims
5188
5189 #undef ctype
5190 #undef length /* 'length' is a parameter name of pcre_exec() below */
5191 #undef max
5192 #undef min
5193 #undef number
5194 #undef offset
5195 #undef op
5196 #undef save_capture_last
5197 #undef save_offset1
5198 #undef save_offset2
5199 #undef save_offset3
5200 #undef stacksave
5201
5202 #undef newptrb
5203
5204 #endif /* NO_RECURSE */
5205
5206 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are always undefined here */
5207
5208 #undef fc
5209 #undef fi
5210
5211 /***************************************************************************
5212 ***************************************************************************/
5213
5214
5215
5216 /*************************************************
5217 * Execute a Regular Expression *
5218 *************************************************/
5219
5220 /* This function applies a compiled re to a subject string and picks out
5221 portions of the string if it matches. Two elements in the vector are set for
5222 each substring: the offsets to the start and end of the substring.
5223
5224 Arguments:
5225 argument_re points to the compiled expression
5226 extra_data points to extra data or is NULL
5227 subject points to the subject string
5228 length length of subject string (may contain binary zeros)
5229 start_offset where to start in the subject string
5230 options option bits
5231 offsets points to a vector of ints to be filled in with offsets
5232 offsetcount the number of elements in the vector
5233
5234 Returns: > 0 => success; value is the number of offset pairs filled in
5235 = 0 => success, but offsets is not big enough
5236 -1 => failed to match
5237 < -1 => some kind of unexpected problem
5238 */
5239
5240 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5241 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5242 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5243 int offsetcount)
5244 {
5245 int rc, resetcount, ocount;
5246 int first_byte = -1;
5247 int req_byte = -1;
5248 int req_byte2 = -1;
5249 int newline;
5250 unsigned long int ims;
5251 BOOL using_temporary_offsets = FALSE;
5252 BOOL anchored;
5253 BOOL startline;
5254 BOOL firstline;
5255 BOOL first_byte_caseless = FALSE;
5256 BOOL req_byte_caseless = FALSE;
5257 BOOL utf8;
5258 match_data match_block;
5259 match_data *md = &match_block;
5260 const uschar *tables;
5261 const uschar *start_bits = NULL;
5262 USPTR start_match = (USPTR)subject + start_offset;
5263 USPTR end_subject;
5264 USPTR start_partial = NULL;
5265 USPTR req_byte_ptr = start_match - 1;
5266
5267 pcre_study_data internal_study;
5268 const pcre_study_data *study;
5269
5270 real_pcre internal_re;
5271 const real_pcre *external_re = (const real_pcre *)argument_re;
5272 const real_pcre *re = external_re;
5273
5274 /* Plausibility checks */
5275
5276 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5277 if (re == NULL || subject == NULL ||
5278 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5279 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5280
5281 /* This information is for finding all the numbers associated with a given
5282 name, for condition testing. */
5283
5284 md->name_table = (uschar *)re + re->name_table_offset;
5285 md->name_count = re->name_count;
5286 md->name_entry_size = re->name_entry_size;
5287
5288 /* Fish out the optional data from the extra_data structure, first setting
5289 the default values. */
5290
5291 study = NULL;
5292 md->match_limit = MATCH_LIMIT;
5293 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5294 md->callout_data = NULL;
5295
5296 /* The table pointer is always in native byte order. */
5297
5298 tables = external_re->tables;
5299
5300 if (extra_data != NULL)
5301 {
5302 register unsigned int flags = extra_data->flags;
5303 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5304 study = (const pcre_study_data *)extra_data->study_data;
5305 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5306 md->match_limit = extra_data->match_limit;
5307 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5308 md->match_limit_recursion = extra_data->match_limit_recursion;
5309 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5310 md->callout_data = extra_data->callout_data;
5311 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5312 }
5313
5314 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5315 is a feature that makes it possible to save compiled regex and re-use them
5316 in other programs later. */
5317
5318 if (tables == NULL) tables = _pcre_default_tables;
5319
5320 /* Check that the first field in the block is the magic number. If it is not,
5321 test for a regex that was compiled on a host of opposite endianness. If this is
5322 the case, flipped values are put in internal_re and internal_study if there was
5323 study data too. */
5324
5325 if (re->magic_number != MAGIC_NUMBER)
5326 {
5327 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5328 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5329 if (study != NULL) study = &internal_study;
5330 }
5331
5332 /* Set up other data */
5333
5334 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5335 startline = (re->flags & PCRE_STARTLINE) != 0;
5336 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5337
5338 /* The code starts after the real_pcre block and the capture name table. */
5339
5340 md->start_code = (const uschar *)external_re + re->name_table_offset +
5341 re->name_count * re->name_entry_size;
5342
5343 md->start_subject = (USPTR)subject;
5344 md->start_offset = start_offset;
5345 md->end_subject = md->start_subject + length;
5346 end_subject = md->end_subject;
5347
5348 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5349 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5350 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5351
5352 md->notbol = (options & PCRE_NOTBOL) != 0;
5353 md->noteol = (options & PCRE_NOTEOL) != 0;
5354 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5355 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5356 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5357 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5358 md->hitend = FALSE;
5359 md->mark = NULL; /* In case never set */
5360
5361 md->recursive = NULL; /* No recursion at top level */
5362
5363 md->lcc = tables + lcc_offset;
5364 md->ctypes = tables + ctypes_offset;
5365
5366 /* Handle different \R options. */
5367
5368 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5369 {
5370 case 0:
5371 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5372 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5373 else
5374 #ifdef BSR_ANYCRLF
5375 md->bsr_anycrlf = TRUE;
5376 #else
5377 md->bsr_anycrlf = FALSE;
5378 #endif
5379 break;
5380
5381 case PCRE_BSR_ANYCRLF:
5382 md->bsr_anycrlf = TRUE;
5383 break;
5384
5385 case PCRE_BSR_UNICODE:
5386 md->bsr_anycrlf = FALSE;
5387 break;
5388
5389 default: return PCRE_ERROR_BADNEWLINE;
5390 }
5391
5392 /* Handle different types of newline. The three bits give eight cases. If
5393 nothing is set at run time, whatever was used at compile time applies. */
5394
5395 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5396 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5397 {
5398 case 0: newline = NEWLINE; break; /* Compile-time default */
5399 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5400 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5401 case PCRE_NEWLINE_CR+
5402 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5403 case PCRE_NEWLINE_ANY: newline = -1; break;
5404 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5405 default: return PCRE_ERROR_BADNEWLINE;
5406 }
5407
5408 if (newline == -2)
5409 {
5410 md->nltype = NLTYPE_ANYCRLF;
5411 }
5412 else if (newline < 0)
5413 {
5414 md->nltype = NLTYPE_ANY;
5415 }
5416 else
5417 {
5418 md->nltype = NLTYPE_FIXED;
5419 if (newline > 255)
5420 {
5421 md->nllen = 2;
5422 md->nl[0] = (newline >> 8) & 255;
5423 md->nl[1] = newline & 255;
5424 }
5425 else
5426 {
5427 md->nllen = 1;
5428 md->nl[0] = newline;
5429 }
5430 }
5431
5432 /* Partial matching was originally supported only for a restricted set of
5433 regexes; from release 8.00 there are no restrictions, but the bits are still
5434 defined (though never set). So there's no harm in leaving this code. */
5435
5436 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5437 return PCRE_ERROR_BADPARTIAL;
5438
5439 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5440 back the character offset. */
5441
5442 #ifdef SUPPORT_UTF8
5443 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5444 {
5445 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5446 return PCRE_ERROR_BADUTF8;
5447 if (start_offset > 0 && start_offset < length)
5448 {
5449 int tb = ((USPTR)subject)[start_offset];
5450 if (tb > 127)
5451 {
5452 tb &= 0xc0;
5453 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5454 }
5455 }
5456 }
5457 #endif
5458
5459 /* The ims options can vary during the matching as a result of the presence
5460 of (?ims) items in the pattern. They are kept in a local variable so that
5461 restoring at the exit of a group is easy. */
5462
5463 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5464
5465 /* If the expression has got more back references than the offsets supplied can
5466 hold, we get a temporary chunk of working store to use during the matching.
5467 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5468 of 3. */
5469
5470 ocount = offsetcount - (offsetcount % 3);
5471
5472 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5473 {
5474 ocount = re->top_backref * 3 + 3;
5475 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5476 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5477 using_temporary_offsets = TRUE;
5478 DPRINTF(("Got memory to hold back references\n"));
5479 }
5480 else md->offset_vector = offsets;
5481
5482 md->offset_end = ocount;
5483 md->offset_max = (2*ocount)/3;
5484 md->offset_overflow = FALSE;
5485 md->capture_last = -1;
5486
5487 /* Compute the minimum number of offsets that we need to reset each time. Doing
5488 this makes a huge difference to execution time when there aren't many brackets
5489 in the pattern. */
5490
5491 resetcount = 2 + re->top_bracket * 2;
5492 if (resetcount > offsetcount) resetcount = ocount;
5493
5494 /* Reset the working variable associated with each extraction. These should
5495 never be used unless previously set, but they get saved and restored, and so we
5496 initialize them to avoid reading uninitialized locations. */
5497
5498 if (md->offset_vector != NULL)
5499 {
5500 register int *iptr = md->offset_vector + ocount;
5501 register int *iend = iptr - resetcount/2 + 1;
5502 while (--iptr >= iend) *iptr = -1;
5503 }
5504
5505 /* Set up the first character to match, if available. The first_byte value is
5506 never set for an anchored regular expression, but the anchoring may be forced
5507 at run time, so we have to test for anchoring. The first char may be unset for
5508 an unanchored pattern, of course. If there's no first char and the pattern was
5509 studied, there may be a bitmap of possible first characters. */
5510
5511 if (!anchored)
5512 {
5513 if ((re->flags & PCRE_FIRSTSET) != 0)
5514 {
5515 first_byte = re->first_byte & 255;
5516 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5517 first_byte = md->lcc[first_byte];
5518 }
5519 else
5520 if (!startline && study != NULL &&
5521 (study->flags & PCRE_STUDY_MAPPED) != 0)
5522 start_bits = study->start_bits;
5523 }
5524
5525 /* For anchored or unanchored matches, there may be a "last known required
5526 character" set. */
5527
5528 if ((re->flags & PCRE_REQCHSET) != 0)
5529 {
5530 req_byte = re->req_byte & 255;
5531 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5532 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5533 }
5534
5535
5536 /* ==========================================================================*/
5537
5538 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5539 the loop runs just once. */
5540
5541 for(;;)
5542 {
5543 USPTR save_end_subject = end_subject;
5544 USPTR new_start_match;
5545
5546 /* Reset the maximum number of extractions we might see. */
5547
5548 if (md->offset_vector != NULL)
5549 {
5550 register int *iptr = md->offset_vector;
5551 register int *iend = iptr + resetcount;
5552 while (iptr < iend) *iptr++ = -1;
5553 }
5554
5555 /* If firstline is TRUE, the start of the match is constrained to the first
5556 line of a multiline string. That is, the match must be before or at the first
5557 newline. Implement this by temporarily adjusting end_subject so that we stop
5558 scanning at a newline. If the match fails at the newline, later code breaks
5559 this loop. */
5560
5561 if (firstline)
5562 {
5563 USPTR t = start_match;
5564 #ifdef SUPPORT_UTF8
5565 if (utf8)
5566 {
5567 while (t < md->end_subject && !IS_NEWLINE(t))
5568 {
5569 t++;
5570 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5571 }
5572 }
5573 else
5574 #endif
5575 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5576 end_subject = t;
5577 }
5578
5579 /* There are some optimizations that avoid running the match if a known
5580 starting point is not found, or if a known later character is not present.
5581 However, there is an option that disables these, for testing and for ensuring
5582 that all callouts do actually occur. */
5583
5584 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5585 {
5586 /* Advance to a unique first byte if there is one. */
5587
5588 if (first_byte >= 0)
5589 {
5590 if (first_byte_caseless)
5591 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5592 start_match++;
5593 else
5594 while (start_match < end_subject && *start_match != first_byte)
5595 start_match++;
5596 }
5597
5598 /* Or to just after a linebreak for a multiline match */
5599
5600 else if (startline)
5601 {
5602 if (start_match > md->start_subject + start_offset)
5603 {
5604 #ifdef SUPPORT_UTF8
5605 if (utf8)
5606 {
5607 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5608 {
5609 start_match++;
5610 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5611 start_match++;
5612 }
5613 }
5614 else
5615 #endif
5616 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5617 start_match++;
5618
5619 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5620 and we are now at a LF, advance the match position by one more character.
5621 */
5622
5623 if (start_match[-1] == CHAR_CR &&
5624 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5625 start_match < end_subject &&
5626 *start_match == CHAR_NL)
5627 start_match++;
5628 }
5629 }
5630
5631 /* Or to a non-unique first byte after study */
5632
5633 else if (start_bits != NULL)
5634 {
5635 while (start_match < end_subject)
5636 {
5637 register unsigned int c = *start_match;
5638 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5639 else break;
5640 }
5641 }
5642 } /* Starting optimizations */
5643
5644 /* Restore fudged end_subject */
5645
5646 end_subject = save_end_subject;
5647
5648 /* The following two optimizations are disabled for partial matching or if
5649 disabling is explicitly requested. */
5650
5651 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5652 {
5653 /* If the pattern was studied, a minimum subject length may be set. This is
5654 a lower bound; no actual string of that length may actually match the
5655 pattern. Although the value is, strictly, in characters, we treat it as
5656 bytes to avoid spending too much time in this optimization. */
5657
5658 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5659 (pcre_uint32)(end_subject - start_match) < study->minlength)
5660 {
5661 rc = MATCH_NOMATCH;
5662 break;
5663 }
5664
5665 /* If req_byte is set, we know that that character must appear in the
5666 subject for the match to succeed. If the first character is set, req_byte
5667 must be later in the subject; otherwise the test starts at the match point.
5668 This optimization can save a huge amount of backtracking in patterns with
5669 nested unlimited repeats that aren't going to match. Writing separate code
5670 for cased/caseless versions makes it go faster, as does using an
5671 autoincrement and backing off on a match.
5672
5673 HOWEVER: when the subject string is very, very long, searching to its end
5674 can take a long time, and give bad performance on quite ordinary patterns.
5675 This showed up when somebody was matching something like /^\d+C/ on a
5676 32-megabyte string... so we don't do this when the string is sufficiently
5677 long. */
5678
5679 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5680 {
5681 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5682
5683 /* We don't need to repeat the search if we haven't yet reached the
5684 place we found it at last time. */
5685
5686 if (p > req_byte_ptr)
5687 {
5688 if (req_byte_caseless)
5689 {
5690 while (p < end_subject)
5691 {
5692 register int pp = *p++;
5693 if (pp == req_byte || pp == req_byte2) { p--; break; }
5694 }
5695 }
5696 else
5697 {
5698 while (p < end_subject)
5699 {
5700 if (*p++ == req_byte) { p--; break; }
5701 }
5702 }
5703
5704 /* If we can't find the required character, break the matching loop,
5705 forcing a match failure. */
5706
5707 if (p >= end_subject)
5708 {
5709 rc = MATCH_NOMATCH;
5710 break;
5711 }
5712
5713 /* If we have found the required character, save the point where we
5714 found it, so that we don't search again next time round the loop if
5715 the start hasn't passed this character yet. */
5716
5717 req_byte_ptr = p;
5718 }
5719 }
5720 }
5721
5722 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5723 printf(">>>> Match against: ");
5724 pchars(start_match, end_subject - start_match, TRUE, md);
5725 printf("\n");
5726 #endif
5727
5728 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5729 first starting point for which a partial match was found. */
5730
5731 md->start_match_ptr = start_match;
5732 md->start_used_ptr = start_match;
5733 md->match_call_count = 0;
5734 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
5735 0, 0);
5736 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5737
5738 switch(rc)
5739 {
5740 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
5741 this level it means that a MARK that matched the SKIP's arg was not found.
5742 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
5743
5744 case MATCH_NOMATCH:
5745 case MATCH_PRUNE:
5746 case MATCH_SKIP_ARG:
5747 case MATCH_THEN:
5748 new_start_match = start_match + 1;
5749 #ifdef SUPPORT_UTF8
5750 if (utf8)
5751 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5752 new_start_match++;
5753 #endif
5754 break;
5755
5756 /* SKIP passes back the next starting point explicitly. */
5757
5758 case MATCH_SKIP:
5759 new_start_match = md->start_match_ptr;
5760 break;
5761
5762 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5763
5764 case MATCH_COMMIT:
5765 rc = MATCH_NOMATCH;
5766 goto ENDLOOP;
5767
5768 /* Any other return is either a match, or some kind of error. */
5769
5770 default:
5771 goto ENDLOOP;
5772 }
5773
5774 /* Control reaches here for the various types of "no match at this point"
5775 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5776
5777 rc = MATCH_NOMATCH;
5778
5779 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5780 newline in the subject (though it may continue over the newline). Therefore,
5781 if we have just failed to match, starting at a newline, do not continue. */
5782
5783 if (firstline && IS_NEWLINE(start_match)) break;
5784
5785 /* Advance to new matching position */
5786
5787 start_match = new_start_match;
5788
5789 /* Break the loop if the pattern is anchored or if we have passed the end of
5790 the subject. */
5791
5792 if (anchored || start_match > end_subject) break;
5793
5794 /* If we have just passed a CR and we are now at a LF, and the pattern does
5795 not contain any explicit matches for \r or \n, and the newline option is CRLF
5796 or ANY or ANYCRLF, advance the match position by one more character. */
5797
5798 if (start_match[-1] == CHAR_CR &&
5799 start_match < end_subject &&
5800 *start_match == CHAR_NL &&
5801 (re->flags & PCRE_HASCRORLF) == 0 &&
5802 (md->nltype == NLTYPE_ANY ||
5803 md->nltype == NLTYPE_ANYCRLF ||
5804 md->nllen == 2))
5805 start_match++;
5806
5807 md->mark = NULL; /* Reset for start of next match attempt */
5808 } /* End of for(;;) "bumpalong" loop */
5809
5810 /* ==========================================================================*/
5811
5812 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5813 conditions is true:
5814
5815 (1) The pattern is anchored or the match was failed by (*COMMIT);
5816
5817 (2) We are past the end of the subject;
5818
5819 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5820 this option requests that a match occur at or before the first newline in
5821 the subject.
5822
5823 When we have a match and the offset vector is big enough to deal with any
5824 backreferences, captured substring offsets will already be set up. In the case
5825 where we had to get some local store to hold offsets for backreference
5826 processing, copy those that we can. In this case there need not be overflow if
5827 certain parts of the pattern were not used, even though there are more
5828 capturing parentheses than vector slots. */
5829
5830 ENDLOOP:
5831
5832 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
5833 {
5834 if (using_temporary_offsets)
5835 {
5836 if (offsetcount >= 4)
5837 {
5838 memcpy(offsets + 2, md->offset_vector + 2,
5839 (offsetcount - 2) * sizeof(int));
5840 DPRINTF(("Copied offsets from temporary memory\n"));
5841 }
5842 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5843 DPRINTF(("Freeing temporary memory\n"));
5844 (pcre_free)(md->offset_vector);
5845 }
5846
5847 /* Set the return code to the number of captured strings, or 0 if there are
5848 too many to fit into the vector. */
5849
5850 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5851
5852 /* If there is space, set up the whole thing as substring 0. The value of
5853 md->start_match_ptr might be modified if \K was encountered on the successful
5854 matching path. */
5855
5856 if (offsetcount < 2) rc = 0; else
5857 {
5858 offsets[0] = md->start_match_ptr - md->start_subject;
5859 offsets[1] = md->end_match_ptr - md->start_subject;
5860 }
5861
5862 DPRINTF((">>>> returning %d\n", rc));
5863 goto RETURN_MARK;
5864 }
5865
5866 /* Control gets here if there has been an error, or if the overall match
5867 attempt has failed at all permitted starting positions. */
5868
5869 if (using_temporary_offsets)
5870 {
5871 DPRINTF(("Freeing temporary memory\n"));
5872 (pcre_free)(md->offset_vector);
5873 }
5874
5875 /* For anything other than nomatch or partial match, just return the code. */
5876
5877 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5878 {
5879 DPRINTF((">>>> error: returning %d\n", rc));
5880 return rc;
5881 }
5882
5883 /* Handle partial matches - disable any mark data */
5884
5885 if (start_partial != NULL)
5886 {
5887 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5888 md->mark = NULL;
5889 if (offsetcount > 1)
5890 {
5891 offsets[0] = start_partial - (USPTR)subject;
5892 offsets[1] = end_subject - (USPTR)subject;
5893 }
5894 rc = PCRE_ERROR_PARTIAL;
5895 }
5896
5897 /* This is the classic nomatch case */
5898
5899 else
5900 {
5901 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5902 rc = PCRE_ERROR_NOMATCH;
5903 }
5904
5905 /* Return the MARK data if it has been requested. */
5906
5907 RETURN_MARK:
5908
5909 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
5910 *(extra_data->mark) = (unsigned char *)(md->mark);
5911 return rc;
5912 }
5913
5914 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5