/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 381 - (show annotations)
Tue Mar 3 16:08:23 2009 UTC (10 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 152246 byte(s)
Fix bug with (?(?=.*b)b|^) thinking it must match at start of line; also fix 
bug causing a crash when auto-callout is used with a conditional assertion.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert   0x01  /* Called to check a condition assertion */
#define match_cbegroup     0x02  /* Could-be-empty unlimited repeat group */

/* Results that match() hands back when no error has occurred. Genuine errors
are reported with the externally defined PCRE_ERROR_xxx codes, every one of
which is negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1

/* Internal-only results of match(). They are chosen to be far enough below
zero that they cannot collide with the external error codes. */

#define MATCH_THEN         (-996)
#define MATCH_SKIP         (-997)
#define MATCH_PRUNE        (-998)
#define MATCH_COMMIT       (-999)

/* Upper bound on the number of offset-vector ints that are saved on the stack
for a recursive call; a bigger offset vector is saved using malloc instead.
Keep this a multiple of 3, because the length of the offset vector is always a
multiple of 3. */

#define REC_STACK_SAVE_MAX 30
84
/* Minimum and maximum repeat counts for the six common repeat forms. A
maximum of 0 stands for "no upper limit". The entries come in pairs; the values
are consistent with the order *, *?, +, +?, ?, ?? (NOTE(review): confirm the
indexing against the code that uses these tables). */

static const char rep_min[] = {
  0, 0,    /* zero or more */
  1, 1,    /* one or more */
  0, 0     /* zero or one */
};

static const char rep_max[] = {
  0, 0,    /* unlimited */
  0, 0,    /* unlimited */
  1, 1     /* at most one */
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Identifying numbers for the individual RMATCH calls, counting up from 1.
Any change to this list has to be mirrored in the code at HEAP_RETURN further
down. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
/* Production versions: RMATCH is an ordinary recursive call of match() whose
result lands in rrc, and RRETURN is an ordinary return. The "rw" argument is
accepted but not used. */

#ifndef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

/* Debugging versions: the same actions, with a trace of each call and each
return written to stdout. */

#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap version of RMATCH: simulate a recursive call of match() by obtaining a
new frame from pcre_stack_malloc(), copying the "arguments" into it, linking
it to the current frame, and jumping to HEAP_RECURSE. The current frame's
Xwhere field records which RMATCH this is (rw), and the label L_<rw> is the
place at which execution resumes afterwards. The "rd" argument is not used.

If the frame cannot be allocated, the current "call" gives up with
PCRE_ERROR_NOMEMORY by way of RRETURN, which releases the current frame and
passes the error to the frame below (or returns it from match() at the top
level). Without this check a failed allocation would be dereferenced. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* Heap version of RRETURN: discard the current frame and make its predecessor
current. If there is no predecessor, this was the top-level frame, so really
return from match(). Otherwise put the result in rrc and jump to HEAP_RETURN.
Note that "ra" is evaluated only after the frame has been switched. */

#define RRETURN(ra)\
  {\
  heapframe *finished = frame;\
  frame = finished->Xprevframe;\
  (pcre_stack_free)(finished);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 const uschar *Xeptr;
326 const uschar *Xecode;
327 const uschar *Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 const uschar *Xcallpat;
337 const uschar *Xcharptr;
338 const uschar *Xdata;
339 const uschar *Xnext;
340 const uschar *Xpp;
341 const uschar *Xprev;
342 const uschar *Xsaved_eptr;
343
344 recursion_info Xnew_recursive;
345
346 BOOL Xcur_is_word;
347 BOOL Xcondition;
348 BOOL Xprev_is_word;
349
350 unsigned long int Xoriginal_ims;
351
352 #ifdef SUPPORT_UCP
353 int Xprop_type;
354 int Xprop_value;
355 int Xprop_fail_result;
356 int Xprop_category;
357 int Xprop_chartype;
358 int Xprop_script;
359 int Xoclength;
360 uschar Xocchars[8];
361 #endif
362
363 int Xctype;
364 unsigned int Xfc;
365 int Xfi;
366 int Xlength;
367 int Xmax;
368 int Xmin;
369 int Xnumber;
370 int Xoffset;
371 int Xop;
372 int Xsave_capture_last;
373 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374 int Xstacksave[REC_STACK_SAVE_MAX];
375
376 eptrblock Xnewptrb;
377
378 /* Where to jump back to */
379
380 int Xwhere;
381
382 } heapframe;
383
384 #endif
385
386
387 /***************************************************************************
388 ***************************************************************************/
389
390
391
392 /*************************************************
393 * Match from current position *
394 *************************************************/
395
396 /* This function is called recursively in many circumstances. Whenever it
397 returns a negative (error) response, the outer incarnation must also return the
398 same response.
399
400 Performance note: It might be tempting to extract commonly used fields from the
401 md structure (e.g. utf8, end_subject) into individual variables to improve
402 performance. Tests using gcc on a SPARC disproved this; in the first case, it
403 made performance worse.
404
405 Arguments:
406 eptr pointer to current character in subject
407 ecode pointer to current position in compiled code
408 mstart pointer to the current match start position (can be modified
409 by encountering \K)
410 offset_top current top pointer
411 md pointer to "static" info for the match
412 ims current /i, /m, and /s options
413 eptrb pointer to chain of blocks containing eptr at start of
414 brackets - for testing for empty matches
415 flags can contain
416 match_condassert - this is an assertion condition
417 match_cbegroup - this is the start of an unlimited repeat
418 group that can match an empty string
419 rdepth the recursion depth
420
421 Returns: MATCH_MATCH if matched ) these values are >= 0
422 MATCH_NOMATCH if failed to match )
423 a negative PCRE_ERROR_xxx value if aborted by an error condition
424 (e.g. stopped by repeated call or recursion limit)
425 */
426
427 static int
428 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 int flags, unsigned int rdepth)
431 {
432 /* These variables do not need to be preserved over recursion in this function,
433 so they can be ordinary variables in all cases. Mark some of them with
434 "register" because they are used a lot in loops. */
435
436 register int rrc; /* Returns from recursive calls */
437 register int i; /* Used for loops not involving calls to RMATCH() */
438 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440
441 BOOL minimize, possessive; /* Quantifier options */
442
443 /* When recursion is not being used, all "local" variables that have to be
444 preserved over calls to RMATCH() are part of a "frame" which is obtained from
445 heap storage. Set up the top-level frame here; others are obtained from the
446 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447
448 #ifdef NO_RECURSE
449 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450 frame->Xprevframe = NULL; /* Marks the top level */
451
452 /* Copy in the original argument variables */
453
454 frame->Xeptr = eptr;
455 frame->Xecode = ecode;
456 frame->Xmstart = mstart;
457 frame->Xoffset_top = offset_top;
458 frame->Xims = ims;
459 frame->Xeptrb = eptrb;
460 frame->Xflags = flags;
461 frame->Xrdepth = rdepth;
462
463 /* This is where control jumps back to to effect "recursion" */
464
465 HEAP_RECURSE:
466
467 /* Macros make the argument variables come from the current frame */
468
469 #define eptr frame->Xeptr
470 #define ecode frame->Xecode
471 #define mstart frame->Xmstart
472 #define offset_top frame->Xoffset_top
473 #define ims frame->Xims
474 #define eptrb frame->Xeptrb
475 #define flags frame->Xflags
476 #define rdepth frame->Xrdepth
477
478 /* Ditto for the local variables */
479
480 #ifdef SUPPORT_UTF8
481 #define charptr frame->Xcharptr
482 #endif
483 #define callpat frame->Xcallpat
484 #define data frame->Xdata
485 #define next frame->Xnext
486 #define pp frame->Xpp
487 #define prev frame->Xprev
488 #define saved_eptr frame->Xsaved_eptr
489
490 #define new_recursive frame->Xnew_recursive
491
492 #define cur_is_word frame->Xcur_is_word
493 #define condition frame->Xcondition
494 #define prev_is_word frame->Xprev_is_word
495
496 #define original_ims frame->Xoriginal_ims
497
498 #ifdef SUPPORT_UCP
499 #define prop_type frame->Xprop_type
500 #define prop_value frame->Xprop_value
501 #define prop_fail_result frame->Xprop_fail_result
502 #define prop_category frame->Xprop_category
503 #define prop_chartype frame->Xprop_chartype
504 #define prop_script frame->Xprop_script
505 #define oclength frame->Xoclength
506 #define occhars frame->Xocchars
507 #endif
508
509 #define ctype frame->Xctype
510 #define fc frame->Xfc
511 #define fi frame->Xfi
512 #define length frame->Xlength
513 #define max frame->Xmax
514 #define min frame->Xmin
515 #define number frame->Xnumber
516 #define offset frame->Xoffset
517 #define op frame->Xop
518 #define save_capture_last frame->Xsave_capture_last
519 #define save_offset1 frame->Xsave_offset1
520 #define save_offset2 frame->Xsave_offset2
521 #define save_offset3 frame->Xsave_offset3
522 #define stacksave frame->Xstacksave
523
524 #define newptrb frame->Xnewptrb
525
526 /* When recursion is being used, local variables are allocated on the stack and
527 get preserved during recursion in the normal way. In this environment, fi and
528 i, and fc and c, can be the same variables. */
529
530 #else /* NO_RECURSE not defined */
531 #define fi i
532 #define fc c
533
534
535 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536 const uschar *charptr; /* in small blocks of the code. My normal */
537 #endif /* style of coding would have declared */
538 const uschar *callpat; /* them within each of those blocks. */
539 const uschar *data; /* However, in order to accommodate the */
540 const uschar *next; /* version of this code that uses an */
541 USPTR pp; /* external "stack" implemented on the */
542 const uschar *prev; /* heap, it is easier to declare them all */
543 USPTR saved_eptr; /* here, so the declarations can be cut */
544 /* out in a block. The only declarations */
545 recursion_info new_recursive; /* within blocks below are for variables */
546 /* that do not have to be preserved over */
547 BOOL cur_is_word; /* a recursive call to RMATCH(). */
548 BOOL condition;
549 BOOL prev_is_word;
550
551 unsigned long int original_ims;
552
553 #ifdef SUPPORT_UCP
554 int prop_type;
555 int prop_value;
556 int prop_fail_result;
557 int prop_category;
558 int prop_chartype;
559 int prop_script;
560 int oclength;
561 uschar occhars[8];
562 #endif
563
564 int ctype;
565 int length;
566 int max;
567 int min;
568 int number;
569 int offset;
570 int op;
571 int save_capture_last;
572 int save_offset1, save_offset2, save_offset3;
573 int stacksave[REC_STACK_SAVE_MAX];
574
575 eptrblock newptrb;
576 #endif /* NO_RECURSE */
577
578 /* These statements are here to stop the compiler complaining about uninitialized
579 variables. */
580
581 #ifdef SUPPORT_UCP
582 prop_value = 0;
583 prop_fail_result = 0;
584 #endif
585
586
587 /* This label is used for tail recursion, which is used in a few cases even
588 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589 used. Thanks to Ian Taylor for noticing this possibility and sending the
590 original patch. */
591
592 TAIL_RECURSE:
593
594 /* OK, now we can get on with the real code of the function. Recursive calls
595 are specified by the macro RMATCH and RRETURN is used to return. When
596 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597 and a "return", respectively (possibly with some debugging if DEBUG is
598 defined). However, RMATCH isn't like a function call because it's quite a
599 complicated macro. It has to be used in one particular way. This shouldn't,
600 however, impact performance when true recursion is being used. */
601
602 #ifdef SUPPORT_UTF8
603 utf8 = md->utf8; /* Local copy of the flag */
604 #else
605 utf8 = FALSE;
606 #endif
607
608 /* First check that we haven't called match() too many times, or that we
609 haven't exceeded the recursive call limit. */
610
611 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613
614 original_ims = ims; /* Save for resetting on ')' */
615
616 /* At the start of a group with an unlimited repeat that may match an empty
617 string, the match_cbegroup flag is set. When this is the case, add the current
618 subject pointer to the chain of such remembered pointers, to be checked when we
619 hit the closing ket, in order to break infinite loops that match no characters.
620 When match() is called in other circumstances, don't add to the chain. The
621 match_cbegroup flag must NOT be used with tail recursion, because the memory
622 block that is used is on the stack, so a new one may be required for each
623 match(). */
624
625 if ((flags & match_cbegroup) != 0)
626 {
627 newptrb.epb_saved_eptr = eptr;
628 newptrb.epb_prev = eptrb;
629 eptrb = &newptrb;
630 }
631
632 /* Now start processing the opcodes. */
633
634 for (;;)
635 {
636 minimize = possessive = FALSE;
637 op = *ecode;
638
639 /* For partial matching, remember if we ever hit the end of the subject after
640 matching at least one subject character. */
641
642 if (md->partial &&
643 eptr >= md->end_subject &&
644 eptr > mstart)
645 md->hitend = TRUE;
646
647 switch(op)
648 {
649 case OP_FAIL:
650 RRETURN(MATCH_NOMATCH);
651
652 case OP_PRUNE:
653 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654 ims, eptrb, flags, RM51);
655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656 RRETURN(MATCH_PRUNE);
657
658 case OP_COMMIT:
659 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660 ims, eptrb, flags, RM52);
661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 RRETURN(MATCH_COMMIT);
663
664 case OP_SKIP:
665 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666 ims, eptrb, flags, RM53);
667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 md->start_match_ptr = eptr; /* Pass back current position */
669 RRETURN(MATCH_SKIP);
670
671 case OP_THEN:
672 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673 ims, eptrb, flags, RM54);
674 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675 RRETURN(MATCH_THEN);
676
677 /* Handle a capturing bracket. If there is space in the offset vector, save
678 the current subject position in the working slot at the top of the vector.
679 We mustn't change the current values of the data slot, because they may be
680 set from a previous iteration of this group, and be referred to by a
681 reference inside the group.
682
683 If the bracket fails to match, we need to restore this value and also the
684 values of the final offsets, in case they were set by a previous iteration
685 of the same bracket.
686
687 If there isn't enough space in the offset vector, treat this as if it were
688 a non-capturing bracket. Don't worry about setting the flag for the error
689 case here; that is handled in the code for KET. */
690
691 case OP_CBRA:
692 case OP_SCBRA:
693 number = GET2(ecode, 1+LINK_SIZE);
694 offset = number << 1;
695
696 #ifdef DEBUG
697 printf("start bracket %d\n", number);
698 printf("subject=");
699 pchars(eptr, 16, TRUE, md);
700 printf("\n");
701 #endif
702
703 if (offset < md->offset_max)
704 {
705 save_offset1 = md->offset_vector[offset];
706 save_offset2 = md->offset_vector[offset+1];
707 save_offset3 = md->offset_vector[md->offset_end - number];
708 save_capture_last = md->capture_last;
709
710 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712
713 flags = (op == OP_SCBRA)? match_cbegroup : 0;
714 do
715 {
716 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717 ims, eptrb, flags, RM1);
718 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719 md->capture_last = save_capture_last;
720 ecode += GET(ecode, 1);
721 }
722 while (*ecode == OP_ALT);
723
724 DPRINTF(("bracket %d failed\n", number));
725
726 md->offset_vector[offset] = save_offset1;
727 md->offset_vector[offset+1] = save_offset2;
728 md->offset_vector[md->offset_end - number] = save_offset3;
729
730 RRETURN(MATCH_NOMATCH);
731 }
732
733 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734 as a non-capturing bracket. */
735
736 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738
739 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740
741 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743
744 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745 final alternative within the brackets, we would return the result of a
746 recursive call to match() whatever happened. We can reduce stack usage by
747 turning this into a tail recursion, except in the case when match_cbegroup
748 is set.*/
749
750 case OP_BRA:
751 case OP_SBRA:
752 DPRINTF(("start non-capturing bracket\n"));
753 flags = (op >= OP_SBRA)? match_cbegroup : 0;
754 for (;;)
755 {
756 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757 {
758 if (flags == 0) /* Not a possibly empty group */
759 {
760 ecode += _pcre_OP_lengths[*ecode];
761 DPRINTF(("bracket 0 tail recursion\n"));
762 goto TAIL_RECURSE;
763 }
764
765 /* Possibly empty group; can't use tail recursion. */
766
767 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768 eptrb, flags, RM48);
769 RRETURN(rrc);
770 }
771
772 /* For non-final alternatives, continue the loop for a NOMATCH result;
773 otherwise return. */
774
775 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776 eptrb, flags, RM2);
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 ecode += GET(ecode, 1);
779 }
780 /* Control never reaches here. */
781
782 /* Conditional group: compilation checked that there are no more than
783 two branches. If the condition is false, skipping the first branch takes us
784 past the end if there is only one branch, but that's OK because that is
785 exactly what going to the ket would do. As there is only one branch to be
786 obeyed, we can use tail recursion to avoid using another stack frame. */
787
788 case OP_COND:
789 case OP_SCOND:
790 /* Because of the way auto-callout works during compile, a callout item is
791 inserted between OP_COND and an assertion condition. */
792
793 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
794 {
795 if (pcre_callout != NULL)
796 {
797 pcre_callout_block cb;
798 cb.version = 1; /* Version 1 of the callout block */
799 cb.callout_number = ecode[LINK_SIZE+2];
800 cb.offset_vector = md->offset_vector;
801 cb.subject = (PCRE_SPTR)md->start_subject;
802 cb.subject_length = md->end_subject - md->start_subject;
803 cb.start_match = mstart - md->start_subject;
804 cb.current_position = eptr - md->start_subject;
805 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
806 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
807 cb.capture_top = offset_top/2;
808 cb.capture_last = md->capture_last;
809 cb.callout_data = md->callout_data;
810 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
811 if (rrc < 0) RRETURN(rrc);
812 }
813 ecode += _pcre_OP_lengths[OP_CALLOUT];
814 }
815
816 /* Now see what the actual condition is */
817
818 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
819 {
820 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
821 condition = md->recursive != NULL &&
822 (offset == RREF_ANY || offset == md->recursive->group_num);
823 ecode += condition? 3 : GET(ecode, 1);
824 }
825
826 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
827 {
828 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
829 condition = offset < offset_top && md->offset_vector[offset] >= 0;
830 ecode += condition? 3 : GET(ecode, 1);
831 }
832
833 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
834 {
835 condition = FALSE;
836 ecode += GET(ecode, 1);
837 }
838
839 /* The condition is an assertion. Call match() to evaluate it - setting
840 the final argument match_condassert causes it to stop at the end of an
841 assertion. */
842
843 else
844 {
845 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
846 match_condassert, RM3);
847 if (rrc == MATCH_MATCH)
848 {
849 condition = TRUE;
850 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
851 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
852 }
853 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
854 {
855 RRETURN(rrc); /* Need braces because of following else */
856 }
857 else
858 {
859 condition = FALSE;
860 ecode += GET(ecode, 1);
861 }
862 }
863
864 /* We are now at the branch that is to be obeyed. As there is only one,
865 we can use tail recursion to avoid using another stack frame, except when
866 match_cbegroup is required for an unlimited repeat of a possibly empty
867 group. If the second alternative doesn't exist, we can just plough on. */
868
869 if (condition || *ecode == OP_ALT)
870 {
871 ecode += 1 + LINK_SIZE;
872 if (op == OP_SCOND) /* Possibly empty group */
873 {
874 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
875 RRETURN(rrc);
876 }
877 else /* Group must match something */
878 {
879 flags = 0;
880 goto TAIL_RECURSE;
881 }
882 }
883 else /* Condition false & no 2nd alternative */
884 {
885 ecode += 1 + LINK_SIZE;
886 }
887 break;
888
889
890 /* End of the pattern, either real or forced. If we are in a top-level
891 recursion, we should restore the offsets appropriately and continue from
892 after the call. */
893
894 case OP_ACCEPT:
895 case OP_END:
896 if (md->recursive != NULL && md->recursive->group_num == 0)
897 {
898 recursion_info *rec = md->recursive;
899 DPRINTF(("End of pattern in a (?0) recursion\n"));
900 md->recursive = rec->prevrec;
901 memmove(md->offset_vector, rec->offset_save,
902 rec->saved_max * sizeof(int));
903 mstart = rec->save_start;
904 ims = original_ims;
905 ecode = rec->after_call;
906 break;
907 }
908
909 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
910 string - backtracking will then try other alternatives, if any. */
911
912 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
913 md->end_match_ptr = eptr; /* Record where we ended */
914 md->end_offset_top = offset_top; /* and how many extracts were taken */
915 md->start_match_ptr = mstart; /* and the start (\K can modify) */
916 RRETURN(MATCH_MATCH);
917
918 /* Change option settings */
919
920 case OP_OPT:
921 ims = ecode[1];
922 ecode += 2;
923 DPRINTF(("ims set to %02lx\n", ims));
924 break;
925
926 /* Assertion brackets. Check the alternative branches in turn - the
927 matching won't pass the KET for an assertion. If any one branch matches,
928 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
929 start of each branch to move the current point backwards, so the code at
930 this level is identical to the lookahead case. */
931
932 case OP_ASSERT:
933 case OP_ASSERTBACK:
934 do
935 {
936 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
937 RM4);
938 if (rrc == MATCH_MATCH) break;
939 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
940 ecode += GET(ecode, 1);
941 }
942 while (*ecode == OP_ALT);
943 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
944
945 /* If checking an assertion for a condition, return MATCH_MATCH. */
946
947 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
948
949 /* Continue from after the assertion, updating the offsets high water
950 mark, since extracts may have been taken during the assertion. */
951
952 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
953 ecode += 1 + LINK_SIZE;
954 offset_top = md->end_offset_top;
955 continue;
956
957 /* Negative assertion: all branches must fail to match */
958
959 case OP_ASSERT_NOT:
960 case OP_ASSERTBACK_NOT:
961 do
962 {
963 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
964 RM5);
965 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
966 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
967 ecode += GET(ecode,1);
968 }
969 while (*ecode == OP_ALT);
970
971 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972
973 ecode += 1 + LINK_SIZE;
974 continue;
975
976 /* Move the subject pointer back. This occurs only at the start of
977 each branch of a lookbehind assertion. If we are too close to the start to
978 move back, this match function fails. When working with UTF-8 we move
979 back a number of characters, not bytes. */
980
981 case OP_REVERSE:
982 #ifdef SUPPORT_UTF8
983 if (utf8)
984 {
985 i = GET(ecode, 1);
986 while (i-- > 0)
987 {
988 eptr--;
989 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
990 BACKCHAR(eptr);
991 }
992 }
993 else
994 #endif
995
996 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
997
998 {
999 eptr -= GET(ecode, 1);
1000 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* Skip to next op code */
1004
1005 ecode += 1 + LINK_SIZE;
1006 break;
1007
1008 /* The callout item calls an external function, if one is provided, passing
1009 details of the match so far. This is mainly for debugging, though the
1010 function is able to force a failure. */
1011
1012 case OP_CALLOUT:
1013 if (pcre_callout != NULL)
1014 {
1015 pcre_callout_block cb;
1016 cb.version = 1; /* Version 1 of the callout block */
1017 cb.callout_number = ecode[1];
1018 cb.offset_vector = md->offset_vector;
1019 cb.subject = (PCRE_SPTR)md->start_subject;
1020 cb.subject_length = md->end_subject - md->start_subject;
1021 cb.start_match = mstart - md->start_subject;
1022 cb.current_position = eptr - md->start_subject;
1023 cb.pattern_position = GET(ecode, 2);
1024 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1025 cb.capture_top = offset_top/2;
1026 cb.capture_last = md->capture_last;
1027 cb.callout_data = md->callout_data;
1028 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1029 if (rrc < 0) RRETURN(rrc);
1030 }
1031 ecode += 2 + 2*LINK_SIZE;
1032 break;
1033
1034 /* Recursion either matches the current regex, or some subexpression. The
1035 offset data is the offset to the starting bracket from the start of the
1036 whole pattern. (This is so that it works from duplicated subpatterns.)
1037
1038 If there are any capturing brackets started but not finished, we have to
1039 save their starting points and reinstate them after the recursion. However,
1040 we don't know how many such there are (offset_top records the completed
1041 total) so we just have to save all the potential data. There may be up to
1042 65535 such values, which is too large to put on the stack, but using malloc
1043 for small numbers seems expensive. As a compromise, the stack is used when
1044 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1045 is used. A problem is what to do if the malloc fails ... there is no way of
1046 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1047 values on the stack, and accept that the rest may be wrong.
1048
1049 There are also other values that have to be saved. We use a chained
1050 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1051 for the original version of this logic. */
1052
1053 case OP_RECURSE:
1054 {
1055 callpat = md->start_code + GET(ecode, 1);
1056 new_recursive.group_num = (callpat == md->start_code)? 0 :
1057 GET2(callpat, 1 + LINK_SIZE);
1058
1059 /* Add to "recursing stack" */
1060
1061 new_recursive.prevrec = md->recursive;
1062 md->recursive = &new_recursive;
1063
1064 /* Find where to continue from afterwards */
1065
1066 ecode += 1 + LINK_SIZE;
1067 new_recursive.after_call = ecode;
1068
1069 /* Now save the offset data. */
1070
1071 new_recursive.saved_max = md->offset_end;
1072 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1073 new_recursive.offset_save = stacksave;
1074 else
1075 {
1076 new_recursive.offset_save =
1077 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1078 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1079 }
1080
1081 memcpy(new_recursive.offset_save, md->offset_vector,
1082 new_recursive.saved_max * sizeof(int));
1083 new_recursive.save_start = mstart;
1084 mstart = eptr;
1085
1086 /* OK, now we can do the recursion. For each top-level alternative we
1087 restore the offset and recursion data. */
1088
1089 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1090 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1091 do
1092 {
1093 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1094 md, ims, eptrb, flags, RM6);
1095 if (rrc == MATCH_MATCH)
1096 {
1097 DPRINTF(("Recursion matched\n"));
1098 md->recursive = new_recursive.prevrec;
1099 if (new_recursive.offset_save != stacksave)
1100 (pcre_free)(new_recursive.offset_save);
1101 RRETURN(MATCH_MATCH);
1102 }
1103 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1104 {
1105 DPRINTF(("Recursion gave error %d\n", rrc));
1106 RRETURN(rrc);
1107 }
1108
1109 md->recursive = &new_recursive;
1110 memcpy(md->offset_vector, new_recursive.offset_save,
1111 new_recursive.saved_max * sizeof(int));
1112 callpat += GET(callpat, 1);
1113 }
1114 while (*callpat == OP_ALT);
1115
1116 DPRINTF(("Recursion didn't match\n"));
1117 md->recursive = new_recursive.prevrec;
1118 if (new_recursive.offset_save != stacksave)
1119 (pcre_free)(new_recursive.offset_save);
1120 RRETURN(MATCH_NOMATCH);
1121 }
1122 /* Control never reaches here */
1123
1124 /* "Once" brackets are like assertion brackets except that after a match,
1125 the point in the subject string is not moved back. Thus there can never be
1126 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1127 Check the alternative branches in turn - the matching won't pass the KET
1128 for this kind of subpattern. If any one branch matches, we carry on as at
1129 the end of a normal bracket, leaving the subject pointer. */
1130
1131 case OP_ONCE:
1132 prev = ecode;
1133 saved_eptr = eptr;
1134
1135 do
1136 {
1137 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1138 if (rrc == MATCH_MATCH) break;
1139 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1140 ecode += GET(ecode,1);
1141 }
1142 while (*ecode == OP_ALT);
1143
1144 /* If hit the end of the group (which could be repeated), fail */
1145
1146 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1147
1148 /* Continue as from after the assertion, updating the offsets high water
1149 mark, since extracts may have been taken. */
1150
1151 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1152
1153 offset_top = md->end_offset_top;
1154 eptr = md->end_match_ptr;
1155
1156 /* For a non-repeating ket, just continue at this level. This also
1157 happens for a repeating ket if no characters were matched in the group.
1158 This is the forcible breaking of infinite loops as implemented in Perl
1159 5.005. If there is an options reset, it will get obeyed in the normal
1160 course of events. */
1161
1162 if (*ecode == OP_KET || eptr == saved_eptr)
1163 {
1164 ecode += 1+LINK_SIZE;
1165 break;
1166 }
1167
1168 /* The repeating kets try the rest of the pattern or restart from the
1169 preceding bracket, in the appropriate order. The second "call" of match()
1170 uses tail recursion, to avoid using another stack frame. We need to reset
1171 any options that changed within the bracket before re-running it, so
1172 check the next opcode. */
1173
1174 if (ecode[1+LINK_SIZE] == OP_OPT)
1175 {
1176 ims = (ims & ~PCRE_IMS) | ecode[4];
1177 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1178 }
1179
1180 if (*ecode == OP_KETRMIN)
1181 {
1182 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1183 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1184 ecode = prev;
1185 flags = 0;
1186 goto TAIL_RECURSE;
1187 }
1188 else /* OP_KETRMAX */
1189 {
1190 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1191 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 ecode += 1 + LINK_SIZE;
1193 flags = 0;
1194 goto TAIL_RECURSE;
1195 }
1196 /* Control never gets here */
1197
1198 /* An alternation is the end of a branch; scan along to find the end of the
1199 bracketed group and go to there. */
1200
1201 case OP_ALT:
1202 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1203 break;
1204
1205 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1206 indicating that it may occur zero times. It may repeat infinitely, or not
1207 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1208 with fixed upper repeat limits are compiled as a number of copies, with the
1209 optional ones preceded by BRAZERO or BRAMINZERO. */
1210
1211 case OP_BRAZERO:
1212 {
1213 next = ecode+1;
1214 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1216 do next += GET(next,1); while (*next == OP_ALT);
1217 ecode = next + 1 + LINK_SIZE;
1218 }
1219 break;
1220
1221 case OP_BRAMINZERO:
1222 {
1223 next = ecode+1;
1224 do next += GET(next, 1); while (*next == OP_ALT);
1225 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1226 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1227 ecode++;
1228 }
1229 break;
1230
1231 case OP_SKIPZERO:
1232 {
1233 next = ecode+1;
1234 do next += GET(next,1); while (*next == OP_ALT);
1235 ecode = next + 1 + LINK_SIZE;
1236 }
1237 break;
1238
1239 /* End of a group, repeated or non-repeating. */
1240
1241 case OP_KET:
1242 case OP_KETRMIN:
1243 case OP_KETRMAX:
1244 prev = ecode - GET(ecode, 1);
1245
1246 /* If this was a group that remembered the subject start, in order to break
1247 infinite repeats of empty string matches, retrieve the subject start from
1248 the chain. Otherwise, set it NULL. */
1249
1250 if (*prev >= OP_SBRA)
1251 {
1252 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1253 eptrb = eptrb->epb_prev; /* Backup to previous group */
1254 }
1255 else saved_eptr = NULL;
1256
1257 /* If we are at the end of an assertion group, stop matching and return
1258 MATCH_MATCH, but record the current high water mark for use by positive
1259 assertions. Do this also for the "once" (atomic) groups. */
1260
1261 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1262 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1263 *prev == OP_ONCE)
1264 {
1265 md->end_match_ptr = eptr; /* For ONCE */
1266 md->end_offset_top = offset_top;
1267 RRETURN(MATCH_MATCH);
1268 }
1269
1270 /* For capturing groups we have to check the group number back at the start
1271 and if necessary complete handling an extraction by setting the offsets and
1272 bumping the high water mark. Note that whole-pattern recursion is coded as
1273 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1274 when the OP_END is reached. Other recursion is handled here. */
1275
1276 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1277 {
1278 number = GET2(prev, 1+LINK_SIZE);
1279 offset = number << 1;
1280
1281 #ifdef DEBUG
1282 printf("end bracket %d", number);
1283 printf("\n");
1284 #endif
1285
1286 md->capture_last = number;
1287 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1288 {
1289 md->offset_vector[offset] =
1290 md->offset_vector[md->offset_end - number];
1291 md->offset_vector[offset+1] = eptr - md->start_subject;
1292 if (offset_top <= offset) offset_top = offset + 2;
1293 }
1294
1295 /* Handle a recursively called group. Restore the offsets
1296 appropriately and continue from after the call. */
1297
1298 if (md->recursive != NULL && md->recursive->group_num == number)
1299 {
1300 recursion_info *rec = md->recursive;
1301 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1302 md->recursive = rec->prevrec;
1303 mstart = rec->save_start;
1304 memcpy(md->offset_vector, rec->offset_save,
1305 rec->saved_max * sizeof(int));
1306 ecode = rec->after_call;
1307 ims = original_ims;
1308 break;
1309 }
1310 }
1311
1312 /* For both capturing and non-capturing groups, reset the value of the ims
1313 flags, in case they got changed during the group. */
1314
1315 ims = original_ims;
1316 DPRINTF(("ims reset to %02lx\n", ims));
1317
1318 /* For a non-repeating ket, just continue at this level. This also
1319 happens for a repeating ket if no characters were matched in the group.
1320 This is the forcible breaking of infinite loops as implemented in Perl
1321 5.005. If there is an options reset, it will get obeyed in the normal
1322 course of events. */
1323
1324 if (*ecode == OP_KET || eptr == saved_eptr)
1325 {
1326 ecode += 1 + LINK_SIZE;
1327 break;
1328 }
1329
1330 /* The repeating kets try the rest of the pattern or restart from the
1331 preceding bracket, in the appropriate order. In the second case, we can use
1332 tail recursion to avoid using another stack frame, unless we have an
1333 unlimited repeat of a group that can match an empty string. */
1334
1335 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1336
1337 if (*ecode == OP_KETRMIN)
1338 {
1339 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1340 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1341 if (flags != 0) /* Could match an empty string */
1342 {
1343 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1344 RRETURN(rrc);
1345 }
1346 ecode = prev;
1347 goto TAIL_RECURSE;
1348 }
1349 else /* OP_KETRMAX */
1350 {
1351 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1352 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353 ecode += 1 + LINK_SIZE;
1354 flags = 0;
1355 goto TAIL_RECURSE;
1356 }
1357 /* Control never gets here */
1358
1359 /* Start of subject unless notbol, or after internal newline if multiline */
1360
1361 case OP_CIRC:
1362 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1363 if ((ims & PCRE_MULTILINE) != 0)
1364 {
1365 if (eptr != md->start_subject &&
1366 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1367 RRETURN(MATCH_NOMATCH);
1368 ecode++;
1369 break;
1370 }
1371 /* ... else fall through */
1372
1373 /* Start of subject assertion */
1374
1375 case OP_SOD:
1376 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1377 ecode++;
1378 break;
1379
1380 /* Start of match assertion */
1381
1382 case OP_SOM:
1383 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1384 ecode++;
1385 break;
1386
1387 /* Reset the start of match point */
1388
1389 case OP_SET_SOM:
1390 mstart = eptr;
1391 ecode++;
1392 break;
1393
1394 /* Assert before internal newline if multiline, or before a terminating
1395 newline unless endonly is set, else end of subject unless noteol is set. */
1396
1397 case OP_DOLL:
1398 if ((ims & PCRE_MULTILINE) != 0)
1399 {
1400 if (eptr < md->end_subject)
1401 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1402 else
1403 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1404 ecode++;
1405 break;
1406 }
1407 else
1408 {
1409 if (md->noteol) RRETURN(MATCH_NOMATCH);
1410 if (!md->endonly)
1411 {
1412 if (eptr != md->end_subject &&
1413 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1414 RRETURN(MATCH_NOMATCH);
1415 ecode++;
1416 break;
1417 }
1418 }
1419 /* ... else fall through for endonly */
1420
1421 /* End of subject assertion (\z) */
1422
1423 case OP_EOD:
1424 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1425 ecode++;
1426 break;
1427
1428 /* End of subject or ending \n assertion (\Z) */
1429
1430 case OP_EODN:
1431 if (eptr != md->end_subject &&
1432 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1433 RRETURN(MATCH_NOMATCH);
1434 ecode++;
1435 break;
1436
1437 /* Word boundary assertions */
1438
1439 case OP_NOT_WORD_BOUNDARY:
1440 case OP_WORD_BOUNDARY:
1441 {
1442
1443 /* Find out if the previous and current characters are "word" characters.
1444 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1445 be "non-word" characters. */
1446
1447 #ifdef SUPPORT_UTF8
1448 if (utf8)
1449 {
1450 if (eptr == md->start_subject) prev_is_word = FALSE; else
1451 {
1452 const uschar *lastptr = eptr - 1;
1453 while((*lastptr & 0xc0) == 0x80) lastptr--;
1454 GETCHAR(c, lastptr);
1455 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1456 }
1457 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1458 {
1459 GETCHAR(c, eptr);
1460 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1461 }
1462 }
1463 else
1464 #endif
1465
1466 /* More streamlined when not in UTF-8 mode */
1467
1468 {
1469 prev_is_word = (eptr != md->start_subject) &&
1470 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1471 cur_is_word = (eptr < md->end_subject) &&
1472 ((md->ctypes[*eptr] & ctype_word) != 0);
1473 }
1474
1475 /* Now see if the situation is what we want */
1476
1477 if ((*ecode++ == OP_WORD_BOUNDARY)?
1478 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1479 RRETURN(MATCH_NOMATCH);
1480 }
1481 break;
1482
1483 /* Match a single character type; inline for speed */
1484
1485 case OP_ANY:
1486 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1487 /* Fall through */
1488
1489 case OP_ALLANY:
1490 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1492 ecode++;
1493 break;
1494
1495 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1496 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1497
1498 case OP_ANYBYTE:
1499 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1500 ecode++;
1501 break;
1502
1503 case OP_NOT_DIGIT:
1504 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1505 GETCHARINCTEST(c, eptr);
1506 if (
1507 #ifdef SUPPORT_UTF8
1508 c < 256 &&
1509 #endif
1510 (md->ctypes[c] & ctype_digit) != 0
1511 )
1512 RRETURN(MATCH_NOMATCH);
1513 ecode++;
1514 break;
1515
1516 case OP_DIGIT:
1517 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1518 GETCHARINCTEST(c, eptr);
1519 if (
1520 #ifdef SUPPORT_UTF8
1521 c >= 256 ||
1522 #endif
1523 (md->ctypes[c] & ctype_digit) == 0
1524 )
1525 RRETURN(MATCH_NOMATCH);
1526 ecode++;
1527 break;
1528
1529 case OP_NOT_WHITESPACE:
1530 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1531 GETCHARINCTEST(c, eptr);
1532 if (
1533 #ifdef SUPPORT_UTF8
1534 c < 256 &&
1535 #endif
1536 (md->ctypes[c] & ctype_space) != 0
1537 )
1538 RRETURN(MATCH_NOMATCH);
1539 ecode++;
1540 break;
1541
1542 case OP_WHITESPACE:
1543 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1544 GETCHARINCTEST(c, eptr);
1545 if (
1546 #ifdef SUPPORT_UTF8
1547 c >= 256 ||
1548 #endif
1549 (md->ctypes[c] & ctype_space) == 0
1550 )
1551 RRETURN(MATCH_NOMATCH);
1552 ecode++;
1553 break;
1554
1555 case OP_NOT_WORDCHAR:
1556 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1557 GETCHARINCTEST(c, eptr);
1558 if (
1559 #ifdef SUPPORT_UTF8
1560 c < 256 &&
1561 #endif
1562 (md->ctypes[c] & ctype_word) != 0
1563 )
1564 RRETURN(MATCH_NOMATCH);
1565 ecode++;
1566 break;
1567
1568 case OP_WORDCHAR:
1569 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1570 GETCHARINCTEST(c, eptr);
1571 if (
1572 #ifdef SUPPORT_UTF8
1573 c >= 256 ||
1574 #endif
1575 (md->ctypes[c] & ctype_word) == 0
1576 )
1577 RRETURN(MATCH_NOMATCH);
1578 ecode++;
1579 break;
1580
1581 case OP_ANYNL:
1582 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1583 GETCHARINCTEST(c, eptr);
1584 switch(c)
1585 {
1586 default: RRETURN(MATCH_NOMATCH);
1587 case 0x000d:
1588 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1589 break;
1590
1591 case 0x000a:
1592 break;
1593
1594 case 0x000b:
1595 case 0x000c:
1596 case 0x0085:
1597 case 0x2028:
1598 case 0x2029:
1599 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1600 break;
1601 }
1602 ecode++;
1603 break;
1604
1605 case OP_NOT_HSPACE:
1606 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1607 GETCHARINCTEST(c, eptr);
1608 switch(c)
1609 {
1610 default: break;
1611 case 0x09: /* HT */
1612 case 0x20: /* SPACE */
1613 case 0xa0: /* NBSP */
1614 case 0x1680: /* OGHAM SPACE MARK */
1615 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1616 case 0x2000: /* EN QUAD */
1617 case 0x2001: /* EM QUAD */
1618 case 0x2002: /* EN SPACE */
1619 case 0x2003: /* EM SPACE */
1620 case 0x2004: /* THREE-PER-EM SPACE */
1621 case 0x2005: /* FOUR-PER-EM SPACE */
1622 case 0x2006: /* SIX-PER-EM SPACE */
1623 case 0x2007: /* FIGURE SPACE */
1624 case 0x2008: /* PUNCTUATION SPACE */
1625 case 0x2009: /* THIN SPACE */
1626 case 0x200A: /* HAIR SPACE */
1627 case 0x202f: /* NARROW NO-BREAK SPACE */
1628 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1629 case 0x3000: /* IDEOGRAPHIC SPACE */
1630 RRETURN(MATCH_NOMATCH);
1631 }
1632 ecode++;
1633 break;
1634
1635 case OP_HSPACE:
1636 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1637 GETCHARINCTEST(c, eptr);
1638 switch(c)
1639 {
1640 default: RRETURN(MATCH_NOMATCH);
1641 case 0x09: /* HT */
1642 case 0x20: /* SPACE */
1643 case 0xa0: /* NBSP */
1644 case 0x1680: /* OGHAM SPACE MARK */
1645 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1646 case 0x2000: /* EN QUAD */
1647 case 0x2001: /* EM QUAD */
1648 case 0x2002: /* EN SPACE */
1649 case 0x2003: /* EM SPACE */
1650 case 0x2004: /* THREE-PER-EM SPACE */
1651 case 0x2005: /* FOUR-PER-EM SPACE */
1652 case 0x2006: /* SIX-PER-EM SPACE */
1653 case 0x2007: /* FIGURE SPACE */
1654 case 0x2008: /* PUNCTUATION SPACE */
1655 case 0x2009: /* THIN SPACE */
1656 case 0x200A: /* HAIR SPACE */
1657 case 0x202f: /* NARROW NO-BREAK SPACE */
1658 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1659 case 0x3000: /* IDEOGRAPHIC SPACE */
1660 break;
1661 }
1662 ecode++;
1663 break;
1664
1665 case OP_NOT_VSPACE:
1666 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1667 GETCHARINCTEST(c, eptr);
1668 switch(c)
1669 {
1670 default: break;
1671 case 0x0a: /* LF */
1672 case 0x0b: /* VT */
1673 case 0x0c: /* FF */
1674 case 0x0d: /* CR */
1675 case 0x85: /* NEL */
1676 case 0x2028: /* LINE SEPARATOR */
1677 case 0x2029: /* PARAGRAPH SEPARATOR */
1678 RRETURN(MATCH_NOMATCH);
1679 }
1680 ecode++;
1681 break;
1682
1683 case OP_VSPACE:
1684 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1685 GETCHARINCTEST(c, eptr);
1686 switch(c)
1687 {
1688 default: RRETURN(MATCH_NOMATCH);
1689 case 0x0a: /* LF */
1690 case 0x0b: /* VT */
1691 case 0x0c: /* FF */
1692 case 0x0d: /* CR */
1693 case 0x85: /* NEL */
1694 case 0x2028: /* LINE SEPARATOR */
1695 case 0x2029: /* PARAGRAPH SEPARATOR */
1696 break;
1697 }
1698 ecode++;
1699 break;
1700
1701 #ifdef SUPPORT_UCP
1702 /* Check the next character by Unicode property. We will get here only
1703 if the support is in the binary; otherwise a compile-time error occurs. */
1704
1705 case OP_PROP:
1706 case OP_NOTPROP:
1707 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1708 GETCHARINCTEST(c, eptr);
1709 {
1710 const ucd_record * prop = GET_UCD(c);
1711
1712 switch(ecode[1])
1713 {
1714 case PT_ANY:
1715 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1716 break;
1717
1718 case PT_LAMP:
1719 if ((prop->chartype == ucp_Lu ||
1720 prop->chartype == ucp_Ll ||
1721 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1722 RRETURN(MATCH_NOMATCH);
1723 break;
1724
1725 case PT_GC:
1726 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1727 RRETURN(MATCH_NOMATCH);
1728 break;
1729
1730 case PT_PC:
1731 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1732 RRETURN(MATCH_NOMATCH);
1733 break;
1734
1735 case PT_SC:
1736 if ((ecode[2] != prop->script) == (op == OP_PROP))
1737 RRETURN(MATCH_NOMATCH);
1738 break;
1739
1740 default:
1741 RRETURN(PCRE_ERROR_INTERNAL);
1742 }
1743
1744 ecode += 3;
1745 }
1746 break;
1747
1748 /* Match an extended Unicode sequence. We will get here only if the support
1749 is in the binary; otherwise a compile-time error occurs. */
1750
1751 case OP_EXTUNI:
1752 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1753 GETCHARINCTEST(c, eptr);
1754 {
1755 int category = UCD_CATEGORY(c);
1756 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1757 while (eptr < md->end_subject)
1758 {
1759 int len = 1;
1760 if (!utf8) c = *eptr; else
1761 {
1762 GETCHARLEN(c, eptr, len);
1763 }
1764 category = UCD_CATEGORY(c);
1765 if (category != ucp_M) break;
1766 eptr += len;
1767 }
1768 }
1769 ecode++;
1770 break;
1771 #endif
1772
1773
1774 /* Match a back reference, possibly repeatedly. Look past the end of the
1775 item to see if there is repeat information following. The code is similar
1776 to that for character classes, but repeated for efficiency. Then obey
1777 similar code to character type repeats - written out again for speed.
1778 However, if the referenced string is the empty string, always treat
1779 it as matched, any number of times (otherwise there could be infinite
1780 loops). */
1781
1782 case OP_REF:
1783 {
1784 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1785 ecode += 3;
1786
1787 /* If the reference is unset, there are two possibilities:
1788
1789 (a) In the default, Perl-compatible state, set the length to be longer
1790 than the amount of subject left; this ensures that every attempt at a
1791 match fails. We can't just fail here, because of the possibility of
1792 quantifiers with zero minima.
1793
1794 (b) If the JavaScript compatibility flag is set, set the length to zero
1795 so that the back reference matches an empty string.
1796
1797 Otherwise, set the length to the length of what was matched by the
1798 referenced subpattern. */
1799
1800 if (offset >= offset_top || md->offset_vector[offset] < 0)
1801 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1802 else
1803 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1804
1805 /* Set up for repetition, or handle the non-repeated case */
1806
1807 switch (*ecode)
1808 {
1809 case OP_CRSTAR:
1810 case OP_CRMINSTAR:
1811 case OP_CRPLUS:
1812 case OP_CRMINPLUS:
1813 case OP_CRQUERY:
1814 case OP_CRMINQUERY:
1815 c = *ecode++ - OP_CRSTAR;
1816 minimize = (c & 1) != 0;
1817 min = rep_min[c]; /* Pick up values from tables; */
1818 max = rep_max[c]; /* zero for max => infinity */
1819 if (max == 0) max = INT_MAX;
1820 break;
1821
1822 case OP_CRRANGE:
1823 case OP_CRMINRANGE:
1824 minimize = (*ecode == OP_CRMINRANGE);
1825 min = GET2(ecode, 1);
1826 max = GET2(ecode, 3);
1827 if (max == 0) max = INT_MAX;
1828 ecode += 5;
1829 break;
1830
1831 default: /* No repeat follows */
1832 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1833 eptr += length;
1834 continue; /* With the main loop */
1835 }
1836
1837 /* If the length of the reference is zero, just continue with the
1838 main loop. */
1839
1840 if (length == 0) continue;
1841
1842 /* First, ensure the minimum number of matches are present. We get back
1843 the length of the reference string explicitly rather than passing the
1844 address of eptr, so that eptr can be a register variable. */
1845
1846 for (i = 1; i <= min; i++)
1847 {
1848 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1849 eptr += length;
1850 }
1851
1852 /* If min = max, continue at the same level without recursion.
1853 They are not both allowed to be zero. */
1854
1855 if (min == max) continue;
1856
1857 /* If minimizing, keep trying and advancing the pointer */
1858
1859 if (minimize)
1860 {
1861 for (fi = min;; fi++)
1862 {
1863 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1865 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1866 RRETURN(MATCH_NOMATCH);
1867 eptr += length;
1868 }
1869 /* Control never gets here */
1870 }
1871
1872 /* If maximizing, find the longest string and work backwards */
1873
1874 else
1875 {
1876 pp = eptr;
1877 for (i = min; i < max; i++)
1878 {
1879 if (!match_ref(offset, eptr, length, md, ims)) break;
1880 eptr += length;
1881 }
1882 while (eptr >= pp)
1883 {
1884 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886 eptr -= length;
1887 }
1888 RRETURN(MATCH_NOMATCH);
1889 }
1890 }
1891 /* Control never gets here */
1892
1893
1894
1895 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1896 used when all the characters in the class have values in the range 0-255,
1897 and either the matching is caseful, or the characters are in the range
1898 0-127 when UTF-8 processing is enabled. The only difference between
1899 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1900 encountered.
1901
1902 First, look past the end of the item to see if there is repeat information
1903 following. Then obey similar code to character type repeats - written out
1904 again for speed. */
1905
1906 case OP_NCLASS:
1907 case OP_CLASS:
1908 {
1909 data = ecode + 1; /* Save for matching */
1910 ecode += 33; /* Advance past the item */
1911
1912 switch (*ecode)
1913 {
1914 case OP_CRSTAR:
1915 case OP_CRMINSTAR:
1916 case OP_CRPLUS:
1917 case OP_CRMINPLUS:
1918 case OP_CRQUERY:
1919 case OP_CRMINQUERY:
1920 c = *ecode++ - OP_CRSTAR;
1921 minimize = (c & 1) != 0;
1922 min = rep_min[c]; /* Pick up values from tables; */
1923 max = rep_max[c]; /* zero for max => infinity */
1924 if (max == 0) max = INT_MAX;
1925 break;
1926
1927 case OP_CRRANGE:
1928 case OP_CRMINRANGE:
1929 minimize = (*ecode == OP_CRMINRANGE);
1930 min = GET2(ecode, 1);
1931 max = GET2(ecode, 3);
1932 if (max == 0) max = INT_MAX;
1933 ecode += 5;
1934 break;
1935
1936 default: /* No repeat follows */
1937 min = max = 1;
1938 break;
1939 }
1940
1941 /* First, ensure the minimum number of matches are present. */
1942
1943 #ifdef SUPPORT_UTF8
1944 /* UTF-8 mode */
1945 if (utf8)
1946 {
1947 for (i = 1; i <= min; i++)
1948 {
1949 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1950 GETCHARINC(c, eptr);
1951 if (c > 255)
1952 {
1953 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1954 }
1955 else
1956 {
1957 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1958 }
1959 }
1960 }
1961 else
1962 #endif
1963 /* Not UTF-8 mode */
1964 {
1965 for (i = 1; i <= min; i++)
1966 {
1967 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1968 c = *eptr++;
1969 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970 }
1971 }
1972
1973 /* If max == min we can continue with the main loop without the
1974 need to recurse. */
1975
1976 if (min == max) continue;
1977
1978 /* If minimizing, keep testing the rest of the expression and advancing
1979 the pointer while it matches the class. */
1980
1981 if (minimize)
1982 {
1983 #ifdef SUPPORT_UTF8
1984 /* UTF-8 mode */
1985 if (utf8)
1986 {
1987 for (fi = min;; fi++)
1988 {
1989 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1991 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1992 GETCHARINC(c, eptr);
1993 if (c > 255)
1994 {
1995 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1996 }
1997 else
1998 {
1999 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2000 }
2001 }
2002 }
2003 else
2004 #endif
2005 /* Not UTF-8 mode */
2006 {
2007 for (fi = min;; fi++)
2008 {
2009 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2012 c = *eptr++;
2013 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2014 }
2015 }
2016 /* Control never gets here */
2017 }
2018
2019 /* If maximizing, find the longest possible run, then work backwards. */
2020
2021 else
2022 {
2023 pp = eptr;
2024
2025 #ifdef SUPPORT_UTF8
2026 /* UTF-8 mode */
2027 if (utf8)
2028 {
2029 for (i = min; i < max; i++)
2030 {
2031 int len = 1;
2032 if (eptr >= md->end_subject) break;
2033 GETCHARLEN(c, eptr, len);
2034 if (c > 255)
2035 {
2036 if (op == OP_CLASS) break;
2037 }
2038 else
2039 {
2040 if ((data[c/8] & (1 << (c&7))) == 0) break;
2041 }
2042 eptr += len;
2043 }
2044 for (;;)
2045 {
2046 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 if (eptr-- == pp) break; /* Stop if tried at original pos */
2049 BACKCHAR(eptr);
2050 }
2051 }
2052 else
2053 #endif
2054 /* Not UTF-8 mode */
2055 {
2056 for (i = min; i < max; i++)
2057 {
2058 if (eptr >= md->end_subject) break;
2059 c = *eptr;
2060 if ((data[c/8] & (1 << (c&7))) == 0) break;
2061 eptr++;
2062 }
2063 while (eptr >= pp)
2064 {
2065 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 eptr--;
2068 }
2069 }
2070
2071 RRETURN(MATCH_NOMATCH);
2072 }
2073 }
2074 /* Control never gets here */
2075
2076
2077 /* Match an extended character class. This opcode is encountered only
2078 in UTF-8 mode, because that's the only time it is compiled. */
2079
2080 #ifdef SUPPORT_UTF8
2081 case OP_XCLASS:
2082 {
2083 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2084 ecode += GET(ecode, 1); /* Advance past the item */
2085
2086 switch (*ecode)
2087 {
2088 case OP_CRSTAR:
2089 case OP_CRMINSTAR:
2090 case OP_CRPLUS:
2091 case OP_CRMINPLUS:
2092 case OP_CRQUERY:
2093 case OP_CRMINQUERY:
2094 c = *ecode++ - OP_CRSTAR;
2095 minimize = (c & 1) != 0;
2096 min = rep_min[c]; /* Pick up values from tables; */
2097 max = rep_max[c]; /* zero for max => infinity */
2098 if (max == 0) max = INT_MAX;
2099 break;
2100
2101 case OP_CRRANGE:
2102 case OP_CRMINRANGE:
2103 minimize = (*ecode == OP_CRMINRANGE);
2104 min = GET2(ecode, 1);
2105 max = GET2(ecode, 3);
2106 if (max == 0) max = INT_MAX;
2107 ecode += 5;
2108 break;
2109
2110 default: /* No repeat follows */
2111 min = max = 1;
2112 break;
2113 }
2114
2115 /* First, ensure the minimum number of matches are present. */
2116
2117 for (i = 1; i <= min; i++)
2118 {
2119 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2120 GETCHARINC(c, eptr);
2121 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2122 }
2123
2124 /* If max == min we can continue with the main loop without the
2125 need to recurse. */
2126
2127 if (min == max) continue;
2128
2129 /* If minimizing, keep testing the rest of the expression and advancing
2130 the pointer while it matches the class. */
2131
2132 if (minimize)
2133 {
2134 for (fi = min;; fi++)
2135 {
2136 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2138 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2139 GETCHARINC(c, eptr);
2140 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2141 }
2142 /* Control never gets here */
2143 }
2144
2145 /* If maximizing, find the longest possible run, then work backwards. */
2146
2147 else
2148 {
2149 pp = eptr;
2150 for (i = min; i < max; i++)
2151 {
2152 int len = 1;
2153 if (eptr >= md->end_subject) break;
2154 GETCHARLEN(c, eptr, len);
2155 if (!_pcre_xclass(c, data)) break;
2156 eptr += len;
2157 }
2158 for(;;)
2159 {
2160 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2162 if (eptr-- == pp) break; /* Stop if tried at original pos */
2163 if (utf8) BACKCHAR(eptr);
2164 }
2165 RRETURN(MATCH_NOMATCH);
2166 }
2167
2168 /* Control never gets here */
2169 }
2170 #endif /* End of XCLASS */
2171
2172 /* Match a single character, casefully */
2173
2174 case OP_CHAR:
2175 #ifdef SUPPORT_UTF8
2176 if (utf8)
2177 {
2178 length = 1;
2179 ecode++;
2180 GETCHARLEN(fc, ecode, length);
2181 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2182 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2183 }
2184 else
2185 #endif
2186
2187 /* Non-UTF-8 mode */
2188 {
2189 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2190 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2191 ecode += 2;
2192 }
2193 break;
2194
2195 /* Match a single character, caselessly */
2196
2197 case OP_CHARNC:
2198 #ifdef SUPPORT_UTF8
2199 if (utf8)
2200 {
2201 length = 1;
2202 ecode++;
2203 GETCHARLEN(fc, ecode, length);
2204
2205 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2206
2207 /* If the pattern character's value is < 128, we have only one byte, and
2208 can use the fast lookup table. */
2209
2210 if (fc < 128)
2211 {
2212 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2213 }
2214
2215 /* Otherwise we must pick up the subject character */
2216
2217 else
2218 {
2219 unsigned int dc;
2220 GETCHARINC(dc, eptr);
2221 ecode += length;
2222
2223 /* If we have Unicode property support, we can use it to test the other
2224 case of the character, if there is one. */
2225
2226 if (fc != dc)
2227 {
2228 #ifdef SUPPORT_UCP
2229 if (dc != UCD_OTHERCASE(fc))
2230 #endif
2231 RRETURN(MATCH_NOMATCH);
2232 }
2233 }
2234 }
2235 else
2236 #endif /* SUPPORT_UTF8 */
2237
2238 /* Non-UTF-8 mode */
2239 {
2240 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2241 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2242 ecode += 2;
2243 }
2244 break;
2245
2246 /* Match a single character repeatedly. */
2247
2248 case OP_EXACT:
2249 min = max = GET2(ecode, 1);
2250 ecode += 3;
2251 goto REPEATCHAR;
2252
2253 case OP_POSUPTO:
2254 possessive = TRUE;
2255 /* Fall through */
2256
2257 case OP_UPTO:
2258 case OP_MINUPTO:
2259 min = 0;
2260 max = GET2(ecode, 1);
2261 minimize = *ecode == OP_MINUPTO;
2262 ecode += 3;
2263 goto REPEATCHAR;
2264
2265 case OP_POSSTAR:
2266 possessive = TRUE;
2267 min = 0;
2268 max = INT_MAX;
2269 ecode++;
2270 goto REPEATCHAR;
2271
2272 case OP_POSPLUS:
2273 possessive = TRUE;
2274 min = 1;
2275 max = INT_MAX;
2276 ecode++;
2277 goto REPEATCHAR;
2278
2279 case OP_POSQUERY:
2280 possessive = TRUE;
2281 min = 0;
2282 max = 1;
2283 ecode++;
2284 goto REPEATCHAR;
2285
2286 case OP_STAR:
2287 case OP_MINSTAR:
2288 case OP_PLUS:
2289 case OP_MINPLUS:
2290 case OP_QUERY:
2291 case OP_MINQUERY:
2292 c = *ecode++ - OP_STAR;
2293 minimize = (c & 1) != 0;
2294 min = rep_min[c]; /* Pick up values from tables; */
2295 max = rep_max[c]; /* zero for max => infinity */
2296 if (max == 0) max = INT_MAX;
2297
2298 /* Common code for all repeated single-character matches. We can give
2299 up quickly if there are fewer than the minimum number of characters left in
2300 the subject. */
2301
2302 REPEATCHAR:
2303 #ifdef SUPPORT_UTF8
2304 if (utf8)
2305 {
2306 length = 1;
2307 charptr = ecode;
2308 GETCHARLEN(fc, ecode, length);
2309 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2310 ecode += length;
2311
2312 /* Handle multibyte character matching specially here. There is
2313 support for caseless matching if UCP support is present. */
2314
2315 if (length > 1)
2316 {
2317 #ifdef SUPPORT_UCP
2318 unsigned int othercase;
2319 if ((ims & PCRE_CASELESS) != 0 &&
2320 (othercase = UCD_OTHERCASE(fc)) != fc)
2321 oclength = _pcre_ord2utf8(othercase, occhars);
2322 else oclength = 0;
2323 #endif /* SUPPORT_UCP */
2324
2325 for (i = 1; i <= min; i++)
2326 {
2327 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2328 #ifdef SUPPORT_UCP
2329 /* Need braces because of following else */
2330 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2331 else
2332 {
2333 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2334 eptr += oclength;
2335 }
2336 #else /* without SUPPORT_UCP */
2337 else { RRETURN(MATCH_NOMATCH); }
2338 #endif /* SUPPORT_UCP */
2339 }
2340
2341 if (min == max) continue;
2342
2343 if (minimize)
2344 {
2345 for (fi = min;; fi++)
2346 {
2347 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2349 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2350 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2351 #ifdef SUPPORT_UCP
2352 /* Need braces because of following else */
2353 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2354 else
2355 {
2356 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2357 eptr += oclength;
2358 }
2359 #else /* without SUPPORT_UCP */
2360 else { RRETURN (MATCH_NOMATCH); }
2361 #endif /* SUPPORT_UCP */
2362 }
2363 /* Control never gets here */
2364 }
2365
2366 else /* Maximize */
2367 {
2368 pp = eptr;
2369 for (i = min; i < max; i++)
2370 {
2371 if (eptr > md->end_subject - length) break;
2372 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2373 #ifdef SUPPORT_UCP
2374 else if (oclength == 0) break;
2375 else
2376 {
2377 if (memcmp(eptr, occhars, oclength) != 0) break;
2378 eptr += oclength;
2379 }
2380 #else /* without SUPPORT_UCP */
2381 else break;
2382 #endif /* SUPPORT_UCP */
2383 }
2384
2385 if (possessive) continue;
2386 for(;;)
2387 {
2388 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2389 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2390 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2391 #ifdef SUPPORT_UCP
2392 eptr--;
2393 BACKCHAR(eptr);
2394 #else /* without SUPPORT_UCP */
2395 eptr -= length;
2396 #endif /* SUPPORT_UCP */
2397 }
2398 }
2399 /* Control never gets here */
2400 }
2401
2402 /* If the length of a UTF-8 character is 1, we fall through here, and
2403 obey the code as for non-UTF-8 characters below, though in this case the
2404 value of fc will always be < 128. */
2405 }
2406 else
2407 #endif /* SUPPORT_UTF8 */
2408
2409 /* When not in UTF-8 mode, load a single-byte character. */
2410 {
2411 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2412 fc = *ecode++;
2413 }
2414
2415 /* The value of fc at this point is always less than 256, though we may or
2416 may not be in UTF-8 mode. The code is duplicated for the caseless and
2417 caseful cases, for speed, since matching characters is likely to be quite
2418 common. First, ensure the minimum number of matches are present. If min =
2419 max, continue at the same level without recursing. Otherwise, if
2420 minimizing, keep trying the rest of the expression and advancing one
2421 matching character if failing, up to the maximum. Alternatively, if
2422 maximizing, find the maximum number of characters and work backwards. */
2423
2424 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2425 max, eptr));
2426
2427 if ((ims & PCRE_CASELESS) != 0)
2428 {
2429 fc = md->lcc[fc];
2430 for (i = 1; i <= min; i++)
2431 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2432 if (min == max) continue;
2433 if (minimize)
2434 {
2435 for (fi = min;; fi++)
2436 {
2437 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2438 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2439 if (fi >= max || eptr >= md->end_subject ||
2440 fc != md->lcc[*eptr++])
2441 RRETURN(MATCH_NOMATCH);
2442 }
2443 /* Control never gets here */
2444 }
2445 else /* Maximize */
2446 {
2447 pp = eptr;
2448 for (i = min; i < max; i++)
2449 {
2450 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2451 eptr++;
2452 }
2453 if (possessive) continue;
2454 while (eptr >= pp)
2455 {
2456 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2457 eptr--;
2458 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2459 }
2460 RRETURN(MATCH_NOMATCH);
2461 }
2462 /* Control never gets here */
2463 }
2464
2465 /* Caseful comparisons (includes all multi-byte characters) */
2466
2467 else
2468 {
2469 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2470 if (min == max) continue;
2471 if (minimize)
2472 {
2473 for (fi = min;; fi++)
2474 {
2475 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2476 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2477 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2478 RRETURN(MATCH_NOMATCH);
2479 }
2480 /* Control never gets here */
2481 }
2482 else /* Maximize */
2483 {
2484 pp = eptr;
2485 for (i = min; i < max; i++)
2486 {
2487 if (eptr >= md->end_subject || fc != *eptr) break;
2488 eptr++;
2489 }
2490 if (possessive) continue;
2491 while (eptr >= pp)
2492 {
2493 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2494 eptr--;
2495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2496 }
2497 RRETURN(MATCH_NOMATCH);
2498 }
2499 }
2500 /* Control never gets here */
2501
2502 /* Match a negated single one-byte character. The character we are
2503 checking can be multibyte. */
2504
2505 case OP_NOT:
2506 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2507 ecode++;
2508 GETCHARINCTEST(c, eptr);
2509 if ((ims & PCRE_CASELESS) != 0)
2510 {
2511 #ifdef SUPPORT_UTF8
2512 if (c < 256)
2513 #endif
2514 c = md->lcc[c];
2515 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2516 }
2517 else
2518 {
2519 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2520 }
2521 break;
2522
2523 /* Match a negated single one-byte character repeatedly. This is almost a
2524 repeat of the code for a repeated single character, but I haven't found a
2525 nice way of commoning these up that doesn't require a test of the
2526 positive/negative option for each character match. Maybe that wouldn't add
2527 very much to the time taken, but character matching *is* what this is all
2528 about... */
2529
2530 case OP_NOTEXACT:
2531 min = max = GET2(ecode, 1);
2532 ecode += 3;
2533 goto REPEATNOTCHAR;
2534
2535 case OP_NOTUPTO:
2536 case OP_NOTMINUPTO:
2537 min = 0;
2538 max = GET2(ecode, 1);
2539 minimize = *ecode == OP_NOTMINUPTO;
2540 ecode += 3;
2541 goto REPEATNOTCHAR;
2542
2543 case OP_NOTPOSSTAR:
2544 possessive = TRUE;
2545 min = 0;
2546 max = INT_MAX;
2547 ecode++;
2548 goto REPEATNOTCHAR;
2549
2550 case OP_NOTPOSPLUS:
2551 possessive = TRUE;
2552 min = 1;
2553 max = INT_MAX;
2554 ecode++;
2555 goto REPEATNOTCHAR;
2556
2557 case OP_NOTPOSQUERY:
2558 possessive = TRUE;
2559 min = 0;
2560 max = 1;
2561 ecode++;
2562 goto REPEATNOTCHAR;
2563
2564 case OP_NOTPOSUPTO:
2565 possessive = TRUE;
2566 min = 0;
2567 max = GET2(ecode, 1);
2568 ecode += 3;
2569 goto REPEATNOTCHAR;
2570
2571 case OP_NOTSTAR:
2572 case OP_NOTMINSTAR:
2573 case OP_NOTPLUS:
2574 case OP_NOTMINPLUS:
2575 case OP_NOTQUERY:
2576 case OP_NOTMINQUERY:
2577 c = *ecode++ - OP_NOTSTAR;
2578 minimize = (c & 1) != 0;
2579 min = rep_min[c]; /* Pick up values from tables; */
2580 max = rep_max[c]; /* zero for max => infinity */
2581 if (max == 0) max = INT_MAX;
2582
2583 /* Common code for all repeated single-byte matches. We can give up quickly
2584 if there are fewer than the minimum number of bytes left in the
2585 subject. */
2586
2587 REPEATNOTCHAR:
2588 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2589 fc = *ecode++;
2590
2591 /* The code is duplicated for the caseless and caseful cases, for speed,
2592 since matching characters is likely to be quite common. First, ensure the
2593 minimum number of matches are present. If min = max, continue at the same
2594 level without recursing. Otherwise, if minimizing, keep trying the rest of
2595 the expression and advancing one matching character if failing, up to the
2596 maximum. Alternatively, if maximizing, find the maximum number of
2597 characters and work backwards. */
2598
2599 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2600 max, eptr));
2601
2602 if ((ims & PCRE_CASELESS) != 0)
2603 {
2604 fc = md->lcc[fc];
2605
2606 #ifdef SUPPORT_UTF8
2607 /* UTF-8 mode */
2608 if (utf8)
2609 {
2610 register unsigned int d;
2611 for (i = 1; i <= min; i++)
2612 {
2613 GETCHARINC(d, eptr);
2614 if (d < 256) d = md->lcc[d];
2615 if (fc == d) RRETURN(MATCH_NOMATCH);
2616 }
2617 }
2618 else
2619 #endif
2620
2621 /* Not UTF-8 mode */
2622 {
2623 for (i = 1; i <= min; i++)
2624 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2625 }
2626
2627 if (min == max) continue;
2628
2629 if (minimize)
2630 {
2631 #ifdef SUPPORT_UTF8
2632 /* UTF-8 mode */
2633 if (utf8)
2634 {
2635 register unsigned int d;
2636 for (fi = min;; fi++)
2637 {
2638 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2640 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2641 GETCHARINC(d, eptr);
2642 if (d < 256) d = md->lcc[d];
2643 if (fc == d) RRETURN(MATCH_NOMATCH);
2644
2645 }
2646 }
2647 else
2648 #endif
2649 /* Not UTF-8 mode */
2650 {
2651 for (fi = min;; fi++)
2652 {
2653 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2655 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2656 RRETURN(MATCH_NOMATCH);
2657 }
2658 }
2659 /* Control never gets here */
2660 }
2661
2662 /* Maximize case */
2663
2664 else
2665 {
2666 pp = eptr;
2667
2668 #ifdef SUPPORT_UTF8
2669 /* UTF-8 mode */
2670 if (utf8)
2671 {
2672 register unsigned int d;
2673 for (i = min; i < max; i++)
2674 {
2675 int len = 1;
2676 if (eptr >= md->end_subject) break;
2677 GETCHARLEN(d, eptr, len);
2678 if (d < 256) d = md->lcc[d];
2679 if (fc == d) break;
2680 eptr += len;
2681 }
2682 if (possessive) continue;
2683 for(;;)
2684 {
2685 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2687 if (eptr-- == pp) break; /* Stop if tried at original pos */
2688 BACKCHAR(eptr);
2689 }
2690 }
2691 else
2692 #endif
2693 /* Not UTF-8 mode */
2694 {
2695 for (i = min; i < max; i++)
2696 {
2697 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2698 eptr++;
2699 }
2700 if (possessive) continue;
2701 while (eptr >= pp)
2702 {
2703 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2704 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2705 eptr--;
2706 }
2707 }
2708
2709 RRETURN(MATCH_NOMATCH);
2710 }
2711 /* Control never gets here */
2712 }
2713
2714 /* Caseful comparisons */
2715
2716 else
2717 {
2718 #ifdef SUPPORT_UTF8
2719 /* UTF-8 mode */
2720 if (utf8)
2721 {
2722 register unsigned int d;
2723 for (i = 1; i <= min; i++)
2724 {
2725 GETCHARINC(d, eptr);
2726 if (fc == d) RRETURN(MATCH_NOMATCH);
2727 }
2728 }
2729 else
2730 #endif
2731 /* Not UTF-8 mode */
2732 {
2733 for (i = 1; i <= min; i++)
2734 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2735 }
2736
2737 if (min == max) continue;
2738
2739 if (minimize)
2740 {
2741 #ifdef SUPPORT_UTF8
2742 /* UTF-8 mode */
2743 if (utf8)
2744 {
2745 register unsigned int d;
2746 for (fi = min;; fi++)
2747 {
2748 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2749 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2750 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2751 GETCHARINC(d, eptr);
2752 if (fc == d) RRETURN(MATCH_NOMATCH);
2753 }
2754 }
2755 else
2756 #endif
2757 /* Not UTF-8 mode */
2758 {
2759 for (fi = min;; fi++)
2760 {
2761 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2762 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2764 RRETURN(MATCH_NOMATCH);
2765 }
2766 }
2767 /* Control never gets here */
2768 }
2769
2770 /* Maximize case */
2771
2772 else
2773 {
2774 pp = eptr;
2775
2776 #ifdef SUPPORT_UTF8
2777 /* UTF-8 mode */
2778 if (utf8)
2779 {
2780 register unsigned int d;
2781 for (i = min; i < max; i++)
2782 {
2783 int len = 1;
2784 if (eptr >= md->end_subject) break;
2785 GETCHARLEN(d, eptr, len);
2786 if (fc == d) break;
2787 eptr += len;
2788 }
2789 if (possessive) continue;
2790 for(;;)
2791 {
2792 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2794 if (eptr-- == pp) break; /* Stop if tried at original pos */
2795 BACKCHAR(eptr);
2796 }
2797 }
2798 else
2799 #endif
2800 /* Not UTF-8 mode */
2801 {
2802 for (i = min; i < max; i++)
2803 {
2804 if (eptr >= md->end_subject || fc == *eptr) break;
2805 eptr++;
2806 }
2807 if (possessive) continue;
2808 while (eptr >= pp)
2809 {
2810 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2812 eptr--;
2813 }
2814 }
2815
2816 RRETURN(MATCH_NOMATCH);
2817 }
2818 }
2819 /* Control never gets here */
2820
2821 /* Match a single character type repeatedly; several different opcodes
2822 share code. This is very similar to the code for single characters, but we
2823 repeat it in the interests of efficiency. */
2824
2825 case OP_TYPEEXACT:
2826 min = max = GET2(ecode, 1);
2827 minimize = TRUE;
2828 ecode += 3;
2829 goto REPEATTYPE;
2830
2831 case OP_TYPEUPTO:
2832 case OP_TYPEMINUPTO:
2833 min = 0;
2834 max = GET2(ecode, 1);
2835 minimize = *ecode == OP_TYPEMINUPTO;
2836 ecode += 3;
2837 goto REPEATTYPE;
2838
2839 case OP_TYPEPOSSTAR:
2840 possessive = TRUE;
2841 min = 0;
2842 max = INT_MAX;
2843 ecode++;
2844 goto REPEATTYPE;
2845
2846 case OP_TYPEPOSPLUS:
2847 possessive = TRUE;
2848 min = 1;
2849 max = INT_MAX;
2850 ecode++;
2851 goto REPEATTYPE;
2852
2853 case OP_TYPEPOSQUERY:
2854 possessive = TRUE;
2855 min = 0;
2856 max = 1;
2857 ecode++;
2858 goto REPEATTYPE;
2859
2860 case OP_TYPEPOSUPTO:
2861 possessive = TRUE;
2862 min = 0;
2863 max = GET2(ecode, 1);
2864 ecode += 3;
2865 goto REPEATTYPE;
2866
2867 case OP_TYPESTAR:
2868 case OP_TYPEMINSTAR:
2869 case OP_TYPEPLUS:
2870 case OP_TYPEMINPLUS:
2871 case OP_TYPEQUERY:
2872 case OP_TYPEMINQUERY:
2873 c = *ecode++ - OP_TYPESTAR;
2874 minimize = (c & 1) != 0;
2875 min = rep_min[c]; /* Pick up values from tables; */
2876 max = rep_max[c]; /* zero for max => infinity */
2877 if (max == 0) max = INT_MAX;
2878
2879 /* Common code for all repeated single character type matches. Note that
2880 in UTF-8 mode, '.' matches a character of any length, but for the other
2881 character types, the valid characters are all one-byte long. */
2882
2883 REPEATTYPE:
2884 ctype = *ecode++; /* Code for the character type */
2885
2886 #ifdef SUPPORT_UCP
2887 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2888 {
2889 prop_fail_result = ctype == OP_NOTPROP;
2890 prop_type = *ecode++;
2891 prop_value = *ecode++;
2892 }
2893 else prop_type = -1;
2894 #endif
2895
2896 /* First, ensure the minimum number of matches are present. Use inline
2897 code for maximizing the speed, and do the type test once at the start
2898 (i.e. keep it out of the loop). Also we can test that there are at least
2899 the minimum number of bytes before we start. This isn't as effective in
2900 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2901 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2902 and single-bytes. */
2903
2904 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2905 if (min > 0)
2906 {
2907 #ifdef SUPPORT_UCP
2908 if (prop_type >= 0)
2909 {
2910 switch(prop_type)
2911 {
2912 case PT_ANY:
2913 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2914 for (i = 1; i <= min; i++)
2915 {
2916 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2917 GETCHARINCTEST(c, eptr);
2918 }
2919 break;
2920
2921 case PT_LAMP:
2922 for (i = 1; i <= min; i++)
2923 {
2924 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2925 GETCHARINCTEST(c, eptr);
2926 prop_chartype = UCD_CHARTYPE(c);
2927 if ((prop_chartype == ucp_Lu ||
2928 prop_chartype == ucp_Ll ||
2929 prop_chartype == ucp_Lt) == prop_fail_result)
2930 RRETURN(MATCH_NOMATCH);
2931 }
2932 break;
2933
2934 case PT_GC:
2935 for (i = 1; i <= min; i++)
2936 {
2937 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938 GETCHARINCTEST(c, eptr);
2939 prop_category = UCD_CATEGORY(c);
2940 if ((prop_category == prop_value) == prop_fail_result)
2941 RRETURN(MATCH_NOMATCH);
2942 }
2943 break;
2944
2945 case PT_PC:
2946 for (i = 1; i <= min; i++)
2947 {
2948 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949 GETCHARINCTEST(c, eptr);
2950 prop_chartype = UCD_CHARTYPE(c);
2951 if ((prop_chartype == prop_value) == prop_fail_result)
2952 RRETURN(MATCH_NOMATCH);
2953 }
2954 break;
2955
2956 case PT_SC:
2957 for (i = 1; i <= min; i++)
2958 {
2959 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2960 GETCHARINCTEST(c, eptr);
2961 prop_script = UCD_SCRIPT(c);
2962 if ((prop_script == prop_value) == prop_fail_result)
2963 RRETURN(MATCH_NOMATCH);
2964 }
2965 break;
2966
2967 default:
2968 RRETURN(PCRE_ERROR_INTERNAL);
2969 }
2970 }
2971
2972 /* Match extended Unicode sequences. We will get here only if the
2973 support is in the binary; otherwise a compile-time error occurs. */
2974
2975 else if (ctype == OP_EXTUNI)
2976 {
2977 for (i = 1; i <= min; i++)
2978 {
2979 GETCHARINCTEST(c, eptr);
2980 prop_category = UCD_CATEGORY(c);
2981 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2982 while (eptr < md->end_subject)
2983 {
2984 int len = 1;
2985 if (!utf8) c = *eptr; else
2986 {
2987 GETCHARLEN(c, eptr, len);
2988 }
2989 prop_category = UCD_CATEGORY(c);
2990 if (prop_category != ucp_M) break;
2991 eptr += len;
2992 }
2993 }
2994 }
2995
2996 else
2997 #endif /* SUPPORT_UCP */
2998
2999 /* Handle all other cases when the coding is UTF-8 */
3000
3001 #ifdef SUPPORT_UTF8
3002 if (utf8) switch(ctype)
3003 {
3004 case OP_ANY:
3005 for (i = 1; i <= min; i++)
3006 {
3007 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3008 RRETURN(MATCH_NOMATCH);
3009 eptr++;
3010 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3011 }
3012 break;
3013
3014 case OP_ALLANY:
3015 for (i = 1; i <= min; i++)
3016 {
3017 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3018 eptr++;
3019 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3020 }
3021 break;
3022
3023 case OP_ANYBYTE:
3024 eptr += min;
3025 break;
3026
3027 case OP_ANYNL:
3028 for (i = 1; i <= min; i++)
3029 {
3030 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3031 GETCHARINC(c, eptr);
3032 switch(c)
3033 {
3034 default: RRETURN(MATCH_NOMATCH);
3035 case 0x000d:
3036 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3037 break;
3038
3039 case 0x000a:
3040 break;
3041
3042 case 0x000b:
3043 case 0x000c:
3044 case 0x0085:
3045 case 0x2028:
3046 case 0x2029:
3047 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3048 break;
3049 }
3050 }
3051 break;
3052
3053 case OP_NOT_HSPACE:
3054 for (i = 1; i <= min; i++)
3055 {
3056 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3057 GETCHARINC(c, eptr);
3058 switch(c)
3059 {
3060 default: break;
3061 case 0x09: /* HT */
3062 case 0x20: /* SPACE */
3063 case 0xa0: /* NBSP */
3064 case 0x1680: /* OGHAM SPACE MARK */
3065 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3066 case 0x2000: /* EN QUAD */
3067 case 0x2001: /* EM QUAD */
3068 case 0x2002: /* EN SPACE */
3069 case 0x2003: /* EM SPACE */
3070 case 0x2004: /* THREE-PER-EM SPACE */
3071 case 0x2005: /* FOUR-PER-EM SPACE */
3072 case 0x2006: /* SIX-PER-EM SPACE */
3073 case 0x2007: /* FIGURE SPACE */
3074 case 0x2008: /* PUNCTUATION SPACE */
3075 case 0x2009: /* THIN SPACE */
3076 case 0x200A: /* HAIR SPACE */
3077 case 0x202f: /* NARROW NO-BREAK SPACE */
3078 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3079 case 0x3000: /* IDEOGRAPHIC SPACE */
3080 RRETURN(MATCH_NOMATCH);
3081 }
3082 }
3083 break;
3084
3085 case OP_HSPACE:
3086 for (i = 1; i <= min; i++)
3087 {
3088 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3089 GETCHARINC(c, eptr);
3090 switch(c)
3091 {
3092 default: RRETURN(MATCH_NOMATCH);
3093 case 0x09: /* HT */
3094 case 0x20: /* SPACE */
3095 case 0xa0: /* NBSP */
3096 case 0x1680: /* OGHAM SPACE MARK */
3097 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3098 case 0x2000: /* EN QUAD */
3099 case 0x2001: /* EM QUAD */
3100 case 0x2002: /* EN SPACE */
3101 case 0x2003: /* EM SPACE */
3102 case 0x2004: /* THREE-PER-EM SPACE */
3103 case 0x2005: /* FOUR-PER-EM SPACE */
3104 case 0x2006: /* SIX-PER-EM SPACE */
3105 case 0x2007: /* FIGURE SPACE */
3106 case 0x2008: /* PUNCTUATION SPACE */
3107 case 0x2009: /* THIN SPACE */
3108 case 0x200A: /* HAIR SPACE */
3109 case 0x202f: /* NARROW NO-BREAK SPACE */
3110 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3111 case 0x3000: /* IDEOGRAPHIC SPACE */
3112 break;
3113 }
3114 }
3115 break;
3116
3117 case OP_NOT_VSPACE:
3118 for (i = 1; i <= min; i++)
3119 {
3120 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3121 GETCHARINC(c, eptr);
3122 switch(c)
3123 {
3124 default: break;
3125 case 0x0a: /* LF */
3126 case 0x0b: /* VT */
3127 case 0x0c: /* FF */
3128 case 0x0d: /* CR */
3129 case 0x85: /* NEL */
3130 case 0x2028: /* LINE SEPARATOR */
3131 case 0x2029: /* PARAGRAPH SEPARATOR */
3132 RRETURN(MATCH_NOMATCH);
3133 }
3134 }
3135 break;
3136
3137 case OP_VSPACE:
3138 for (i = 1; i <= min; i++)
3139 {
3140 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3141 GETCHARINC(c, eptr);
3142 switch(c)
3143 {
3144 default: RRETURN(MATCH_NOMATCH);
3145 case 0x0a: /* LF */
3146 case 0x0b: /* VT */
3147 case 0x0c: /* FF */
3148 case 0x0d: /* CR */
3149 case 0x85: /* NEL */
3150 case 0x2028: /* LINE SEPARATOR */
3151 case 0x2029: /* PARAGRAPH SEPARATOR */
3152 break;
3153 }
3154 }
3155 break;
3156
3157 case OP_NOT_DIGIT:
3158 for (i = 1; i <= min; i++)
3159 {
3160 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3161 GETCHARINC(c, eptr);
3162 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3163 RRETURN(MATCH_NOMATCH);
3164 }
3165 break;
3166
3167 case OP_DIGIT:
3168 for (i = 1; i <= min; i++)
3169 {
3170 if (eptr >= md->end_subject ||
3171 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3172 RRETURN(MATCH_NOMATCH);
3173 /* No need to skip more bytes - we know it's a 1-byte character */
3174 }
3175 break;
3176
3177 case OP_NOT_WHITESPACE:
3178 for (i = 1; i <= min; i++)
3179 {
3180 if (eptr >= md->end_subject ||
3181 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3182 RRETURN(MATCH_NOMATCH);
3183 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3184 }
3185 break;
3186
3187 case OP_WHITESPACE:
3188 for (i = 1; i <= min; i++)
3189 {
3190 if (eptr >= md->end_subject ||
3191 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3192 RRETURN(MATCH_NOMATCH);
3193 /* No need to skip more bytes - we know it's a 1-byte character */
3194 }
3195 break;
3196
3197 case OP_NOT_WORDCHAR:
3198 for (i = 1; i <= min; i++)
3199 {
3200 if (eptr >= md->end_subject ||
3201 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3202 RRETURN(MATCH_NOMATCH);
3203 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3204 }
3205 break;
3206
3207 case OP_WORDCHAR:
3208 for (i = 1; i <= min; i++)
3209 {
3210 if (eptr >= md->end_subject ||
3211 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3212 RRETURN(MATCH_NOMATCH);
3213 /* No need to skip more bytes - we know it's a 1-byte character */
3214 }
3215 break;
3216
3217 default:
3218 RRETURN(PCRE_ERROR_INTERNAL);
3219 } /* End switch(ctype) */
3220
3221 else
3222 #endif /* SUPPORT_UTF8 */
3223
3224 /* Code for the non-UTF-8 case for minimum matching of operators other
3225 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3226 number of bytes present, as this was tested above. */
3227
3228 switch(ctype)
3229 {
3230 case OP_ANY:
3231 for (i = 1; i <= min; i++)
3232 {
3233 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3234 eptr++;
3235 }
3236 break;
3237
3238 case OP_ALLANY:
3239 eptr += min;
3240 break;
3241
3242 case OP_ANYBYTE:
3243 eptr += min;
3244 break;
3245
3246 /* Because of the CRLF case, we can't assume the minimum number of
3247 bytes are present in this case. */
3248
3249 case OP_ANYNL:
3250 for (i = 1; i <= min; i++)
3251 {
3252 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3253 switch(*eptr++)
3254 {
3255 default: RRETURN(MATCH_NOMATCH);
3256 case 0x000d:
3257 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3258 break;
3259 case 0x000a:
3260 break;
3261
3262 case 0x000b:
3263 case 0x000c:
3264 case 0x0085:
3265 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3266 break;
3267 }
3268 }
3269 break;
3270
3271 case OP_NOT_HSPACE:
3272 for (i = 1; i <= min; i++)
3273 {
3274 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3275 switch(*eptr++)
3276 {
3277 default: break;
3278 case 0x09: /* HT */
3279 case 0x20: /* SPACE */
3280 case 0xa0: /* NBSP */
3281 RRETURN(MATCH_NOMATCH);
3282 }
3283 }
3284 break;
3285
3286 case OP_HSPACE:
3287 for (i = 1; i <= min; i++)
3288 {
3289 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3290 switch(*eptr++)
3291 {
3292 default: RRETURN(MATCH_NOMATCH);
3293 case 0x09: /* HT */
3294 case 0x20: /* SPACE */
3295 case 0xa0: /* NBSP */
3296 break;
3297 }
3298 }
3299 break;
3300
3301 case OP_NOT_VSPACE:
3302 for (i = 1; i <= min; i++)
3303 {
3304 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3305 switch(*eptr++)
3306 {
3307 default: break;
3308 case 0x0a: /* LF */
3309 case 0x0b: /* VT */
3310 case 0x0c: /* FF */
3311 case 0x0d: /* CR */
3312 case 0x85: /* NEL */
3313 RRETURN(MATCH_NOMATCH);
3314 }
3315 }
3316 break;
3317
3318 case OP_VSPACE:
3319 for (i = 1; i <= min; i++)
3320 {
3321 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3322 switch(*eptr++)
3323 {
3324 default: RRETURN(MATCH_NOMATCH);
3325 case 0x0a: /* LF */
3326 case 0x0b: /* VT */
3327 case 0x0c: /* FF */
3328 case 0x0d: /* CR */
3329 case 0x85: /* NEL */
3330 break;
3331 }
3332 }
3333 break;
3334
3335 case OP_NOT_DIGIT:
3336 for (i = 1; i <= min; i++)
3337 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3338 break;
3339
3340 case OP_DIGIT:
3341 for (i = 1; i <= min; i++)
3342 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3343 break;
3344
3345 case OP_NOT_WHITESPACE:
3346 for (i = 1; i <= min; i++)
3347 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3348 break;
3349
3350 case OP_WHITESPACE:
3351 for (i = 1; i <= min; i++)
3352 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3353 break;
3354
3355 case OP_NOT_WORDCHAR:
3356 for (i = 1; i <= min; i++)
3357 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3358 RRETURN(MATCH_NOMATCH);
3359 break;
3360
3361 case OP_WORDCHAR:
3362 for (i = 1; i <= min; i++)
3363 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3364 RRETURN(MATCH_NOMATCH);
3365 break;
3366
3367 default:
3368 RRETURN(PCRE_ERROR_INTERNAL);
3369 }
3370 }
3371
3372 /* If min = max, continue at the same level without recursing */
3373
3374 if (min == max) continue;
3375
3376 /* If minimizing, we have to test the rest of the pattern before each
3377 subsequent match. Again, separate the UTF-8 case for speed, and also
3378 separate the UCP cases. */
3379
3380 if (minimize)
3381 {
3382 #ifdef SUPPORT_UCP
3383 if (prop_type >= 0)
3384 {
3385 switch(prop_type)
3386 {
3387 case PT_ANY:
3388 for (fi = min;; fi++)
3389 {
3390 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3391 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3392 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3393 GETCHARINC(c, eptr);
3394 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3395 }
3396 /* Control never gets here */
3397
3398 case PT_LAMP:
3399 for (fi = min;; fi++)
3400 {
3401 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404 GETCHARINC(c, eptr);
3405 prop_chartype = UCD_CHARTYPE(c);
3406 if ((prop_chartype == ucp_Lu ||
3407 prop_chartype == ucp_Ll ||
3408 prop_chartype == ucp_Lt) == prop_fail_result)
3409 RRETURN(MATCH_NOMATCH);
3410 }
3411 /* Control never gets here */
3412
3413 case PT_GC:
3414 for (fi = min;; fi++)
3415 {
3416 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3417 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3418 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3419 GETCHARINC(c, eptr);
3420 prop_category = UCD_CATEGORY(c);
3421 if ((prop_category == prop_value) == prop_fail_result)
3422 RRETURN(MATCH_NOMATCH);
3423 }
3424 /* Control never gets here */
3425
3426 case PT_PC:
3427 for (fi = min;; fi++)
3428 {
3429 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3432 GETCHARINC(c, eptr);
3433 prop_chartype = UCD_CHARTYPE(c);
3434 if ((prop_chartype == prop_value) == prop_fail_result)
3435 RRETURN(MATCH_NOMATCH);
3436 }
3437 /* Control never gets here */
3438
3439 case PT_SC:
3440 for (fi = min;; fi++)
3441 {
3442 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3445 GETCHARINC(c, eptr);
3446 prop_script = UCD_SCRIPT(c);
3447 if ((prop_script == prop_value) == prop_fail_result)
3448 RRETURN(MATCH_NOMATCH);
3449 }
3450 /* Control never gets here */
3451
3452 default:
3453 RRETURN(PCRE_ERROR_INTERNAL);
3454 }
3455 }
3456
3457 /* Match extended Unicode sequences. We will get here only if the
3458 support is in the binary; otherwise a compile-time error occurs. */
3459
3460 else if (ctype == OP_EXTUNI)
3461 {
3462 for (fi = min;; fi++)
3463 {
3464 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3466 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3467 GETCHARINCTEST(c, eptr);
3468 prop_category = UCD_CATEGORY(c);
3469 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3470 while (eptr < md->end_subject)
3471 {
3472 int len = 1;
3473 if (!utf8) c = *eptr; else
3474 {
3475 GETCHARLEN(c, eptr, len);
3476 }
3477 prop_category = UCD_CATEGORY(c);
3478 if (prop_category != ucp_M) break;
3479 eptr += len;
3480 }
3481 }
3482 }
3483
3484 else
3485 #endif /* SUPPORT_UCP */
3486
3487 #ifdef SUPPORT_UTF8
3488 /* UTF-8 mode */
3489 if (utf8)
3490 {
3491 for (fi = min;; fi++)
3492 {
3493 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3495 if (fi >= max || eptr >= md->end_subject ||
3496 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3497 RRETURN(MATCH_NOMATCH);
3498
3499 GETCHARINC(c, eptr);
3500 switch(ctype)
3501 {
3502 case OP_ANY: /* This is the non-NL case */
3503 case OP_ALLANY:
3504 case OP_ANYBYTE:
3505 break;
3506
3507 case OP_ANYNL:
3508 switch(c)
3509 {
3510 default: RRETURN(MATCH_NOMATCH);
3511 case 0x000d:
3512 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3513 break;
3514 case 0x000a:
3515 break;
3516
3517 case 0x000b:
3518 case 0x000c:
3519 case 0x0085:
3520 case 0x2028:
3521 case 0x2029:
3522 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3523 break;
3524 }
3525 break;
3526
3527 case OP_NOT_HSPACE:
3528 switch(c)
3529 {
3530 default: break;
3531 case 0x09: /* HT */
3532 case 0x20: /* SPACE */
3533 case 0xa0: /* NBSP */
3534 case 0x1680: /* OGHAM SPACE MARK */
3535 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3536 case 0x2000: /* EN QUAD */
3537 case 0x2001: /* EM QUAD */
3538 case 0x2002: /* EN SPACE */
3539 case 0x2003: /* EM SPACE */
3540 case 0x2004: /* THREE-PER-EM SPACE */
3541 case 0x2005: /* FOUR-PER-EM SPACE */
3542 case 0x2006: /* SIX-PER-EM SPACE */
3543 case 0x2007: /* FIGURE SPACE */
3544 case 0x2008: /* PUNCTUATION SPACE */
3545 case 0x2009: /* THIN SPACE */
3546 case 0x200A: /* HAIR SPACE */
3547 case 0x202f: /* NARROW NO-BREAK SPACE */
3548 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3549 case 0x3000: /* IDEOGRAPHIC SPACE */
3550 RRETURN(MATCH_NOMATCH);
3551 }
3552 break;
3553
3554 case OP_HSPACE:
3555 switch(c)
3556 {
3557 default: RRETURN(MATCH_NOMATCH);
3558 case 0x09: /* HT */
3559 case 0x20: /* SPACE */
3560 case 0xa0: /* NBSP */
3561 case 0x1680: /* OGHAM SPACE MARK */
3562 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3563 case 0x2000: /* EN QUAD */
3564 case 0x2001: /* EM QUAD */
3565 case 0x2002: /* EN SPACE */
3566 case 0x2003: /* EM SPACE */
3567 case 0x2004: /* THREE-PER-EM SPACE */
3568 case 0x2005: /* FOUR-PER-EM SPACE */
3569 case 0x2006: /* SIX-PER-EM SPACE */
3570 case 0x2007: /* FIGURE SPACE */
3571 case 0x2008: /* PUNCTUATION SPACE */
3572 case 0x2009: /* THIN SPACE */
3573 case 0x200A: /* HAIR SPACE */
3574 case 0x202f: /* NARROW NO-BREAK SPACE */
3575 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3576 case 0x3000: /* IDEOGRAPHIC SPACE */
3577 break;
3578 }
3579 break;
3580
3581 case OP_NOT_VSPACE:
3582 switch(c)
3583 {
3584 default: break;
3585 case 0x0a: /* LF */
3586 case 0x0b: /* VT */
3587 case 0x0c: /* FF */
3588 case 0x0d: /* CR */
3589 case 0x85: /* NEL */
3590 case 0x2028: /* LINE SEPARATOR */
3591 case 0x2029: /* PARAGRAPH SEPARATOR */
3592 RRETURN(MATCH_NOMATCH);
3593 }
3594 break;
3595
3596 case OP_VSPACE:
3597 switch(c)
3598 {
3599 default: RRETURN(MATCH_NOMATCH);
3600 case 0x0a: /* LF */
3601 case 0x0b: /* VT */
3602 case 0x0c: /* FF */
3603 case 0x0d: /* CR */
3604 case 0x85: /* NEL */
3605 case 0x2028: /* LINE SEPARATOR */
3606 case 0x2029: /* PARAGRAPH SEPARATOR */
3607 break;
3608 }
3609 break;
3610
3611 case OP_NOT_DIGIT:
3612 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3613 RRETURN(MATCH_NOMATCH);
3614 break;
3615
3616 case OP_DIGIT:
3617 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3618 RRETURN(MATCH_NOMATCH);
3619 break;
3620
3621 case OP_NOT_WHITESPACE:
3622 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3623 RRETURN(MATCH_NOMATCH);
3624 break;
3625
3626 case OP_WHITESPACE:
3627 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3628 RRETURN(MATCH_NOMATCH);
3629 break;
3630
3631 case OP_NOT_WORDCHAR:
3632 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3633 RRETURN(MATCH_NOMATCH);
3634 break;
3635
3636 case OP_WORDCHAR:
3637 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3638 RRETURN(MATCH_NOMATCH);
3639 break;
3640
3641 default:
3642 RRETURN(PCRE_ERROR_INTERNAL);
3643 }
3644 }
3645 }
3646 else
3647 #endif
3648 /* Not UTF-8 mode */
3649 {
3650 for (fi = min;; fi++)
3651 {
3652 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3653 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3654 if (fi >= max || eptr >= md->end_subject ||
3655 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3656 RRETURN(MATCH_NOMATCH);
3657
3658 c = *eptr++;
3659 switch(ctype)
3660 {
3661 case OP_ANY: /* This is the non-NL case */
3662 case OP_ALLANY:
3663 case OP_ANYBYTE:
3664 break;
3665
3666 case OP_ANYNL:
3667 switch(c)
3668 {
3669 default: RRETURN(MATCH_NOMATCH);
3670 case 0x000d:
3671 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3672 break;
3673
3674 case 0x000a:
3675 break;
3676
3677 case 0x000b:
3678 case 0x000c:
3679 case 0x0085:
3680 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3681 break;
3682 }
3683 break;
3684
3685 case OP_NOT_HSPACE:
3686 switch(c)
3687 {
3688 default: break;
3689 case 0x09: /* HT */
3690 case 0x20: /* SPACE */
3691 case 0xa0: /* NBSP */
3692 RRETURN(MATCH_NOMATCH);
3693 }
3694 break;
3695
3696 case OP_HSPACE:
3697 switch(c)
3698 {
3699 default: RRETURN(MATCH_NOMATCH);
3700 case 0x09: /* HT */
3701 case 0x20: /* SPACE */
3702 case 0xa0: /* NBSP */
3703 break;
3704 }
3705 break;
3706
3707 case OP_NOT_VSPACE:
3708 switch(c)
3709 {
3710 default: break;
3711 case 0x0a: /* LF */
3712 case 0x0b: /* VT */
3713 case 0x0c: /* FF */
3714 case 0x0d: /* CR */
3715 case 0x85: /* NEL */
3716 RRETURN(MATCH_NOMATCH);
3717 }
3718 break;
3719
3720 case OP_VSPACE:
3721 switch(c)
3722 {
3723 default: RRETURN(MATCH_NOMATCH);
3724 case 0x0a: /* LF */
3725 case 0x0b: /* VT */
3726 case 0x0c: /* FF */
3727 case 0x0d: /* CR */
3728 case 0x85: /* NEL */
3729 break;
3730 }
3731 break;
3732
3733 case OP_NOT_DIGIT:
3734 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3735 break;
3736
3737 case OP_DIGIT:
3738 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3739 break;
3740
3741 case OP_NOT_WHITESPACE:
3742 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3743 break;
3744
3745 case OP_WHITESPACE:
3746 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3747 break;
3748
3749 case OP_NOT_WORDCHAR:
3750 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3751 break;
3752
3753 case OP_WORDCHAR:
3754 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3755 break;
3756
3757 default:
3758 RRETURN(PCRE_ERROR_INTERNAL);
3759 }
3760 }
3761 }
3762 /* Control never gets here */
3763 }
3764
3765 /* If maximizing, it is worth using inline code for speed, doing the type
3766 test once at the start (i.e. keep it out of the loop). Again, keep the
3767 UTF-8 and UCP stuff separate. */
3768
3769 else
3770 {
3771 pp = eptr; /* Remember where we started */
3772
3773 #ifdef SUPPORT_UCP
3774 if (prop_type >= 0)
3775 {
3776 switch(prop_type)
3777 {
3778 case PT_ANY:
3779 for (i = min; i < max; i++)
3780 {
3781 int len = 1;
3782 if (eptr >= md->end_subject) break;
3783 GETCHARLEN(c, eptr, len);
3784 if (prop_fail_result) break;
3785 eptr+= len;
3786 }
3787 break;
3788
3789 case PT_LAMP:
3790 for (i = min; i < max; i++)
3791 {
3792 int len = 1;
3793 if (eptr >= md->end_subject) break;
3794 GETCHARLEN(c, eptr, len);
3795 prop_chartype = UCD_CHARTYPE(c);
3796 if ((prop_chartype == ucp_Lu ||
3797 prop_chartype == ucp_Ll ||
3798 prop_chartype == ucp_Lt) == prop_fail_result)
3799 break;
3800 eptr+= len;
3801 }
3802 break;
3803
3804 case PT_GC:
3805 for (i = min; i < max; i++)
3806 {
3807 int len = 1;
3808 if (eptr >= md->end_subject) break;
3809 GETCHARLEN(c, eptr, len);
3810 prop_category = UCD_CATEGORY(c);
3811 if ((prop_category == prop_value) == prop_fail_result)
3812 break;
3813 eptr+= len;
3814 }
3815 break;
3816
3817 case PT_PC:
3818 for (i = min; i < max; i++)
3819 {
3820 int len = 1;
3821 if (eptr >= md->end_subject) break;
3822 GETCHARLEN(c, eptr, len);
3823 prop_chartype = UCD_CHARTYPE(c);
3824 if ((prop_chartype == prop_value) == prop_fail_result)
3825 break;
3826 eptr+= len;
3827 }
3828 break;
3829
3830 case PT_SC:
3831 for (i = min; i < max; i++)
3832 {
3833 int len = 1;
3834 if (eptr >= md->end_subject) break;
3835 GETCHARLEN(c, eptr, len);
3836 prop_script = UCD_SCRIPT(c);
3837 if ((prop_script == prop_value) == prop_fail_result)
3838 break;
3839 eptr+= len;
3840 }
3841 break;
3842 }
3843
3844 /* eptr is now past the end of the maximum run */
3845
3846 if (possessive) continue;
3847 for(;;)
3848 {
3849 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3851 if (eptr-- == pp) break; /* Stop if tried at original pos */
3852 if (utf8) BACKCHAR(eptr);
3853 }
3854 }
3855
3856 /* Match extended Unicode sequences. We will get here only if the
3857 support is in the binary; otherwise a compile-time error occurs. */
3858
3859 else if (ctype == OP_EXTUNI)
3860 {
3861 for (i = min; i < max; i++)
3862 {
3863 if (eptr >= md->end_subject) break;
3864 GETCHARINCTEST(c, eptr);
3865 prop_category = UCD_CATEGORY(c);
3866 if (prop_category == ucp_M) break;
3867 while (eptr < md->end_subject)
3868 {
3869 int len = 1;
3870 if (!utf8) c = *eptr; else
3871 {
3872 GETCHARLEN(c, eptr, len);
3873 }
3874 prop_category = UCD_CATEGORY(c);
3875 if (prop_category != ucp_M) break;
3876 eptr += len;
3877 }
3878 }
3879
3880 /* eptr is now past the end of the maximum run */
3881
3882 if (possessive) continue;
3883 for(;;)
3884 {
3885 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3887 if (eptr-- == pp) break; /* Stop if tried at original pos */
3888 for (;;) /* Move back over one extended */
3889 {
3890 int len = 1;
3891 if (!utf8) c = *eptr; else
3892 {
3893 BACKCHAR(eptr);
3894 GETCHARLEN(c, eptr, len);
3895 }
3896 prop_category = UCD_CATEGORY(c);
3897 if (prop_category != ucp_M) break;
3898 eptr--;
3899 }
3900 }
3901 }
3902
3903 else
3904 #endif /* SUPPORT_UCP */
3905
3906 #ifdef SUPPORT_UTF8
3907 /* UTF-8 mode */
3908
3909 if (utf8)
3910 {
3911 switch(ctype)
3912 {
3913 case OP_ANY:
3914 if (max < INT_MAX)
3915 {
3916 for (i = min; i < max; i++)
3917 {
3918 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3919 eptr++;
3920 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3921 }
3922 }
3923
3924 /* Handle unlimited UTF-8 repeat */
3925
3926 else
3927 {
3928 for (i = min; i < max; i++)
3929 {
3930 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3931 eptr++;
3932 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3933 }
3934 }
3935 break;
3936
3937 case OP_ALLANY:
3938 if (max < INT_MAX)
3939 {
3940 for (i = min; i < max; i++)
3941 {
3942 if (eptr >= md->end_subject) break;
3943 eptr++;
3944 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3945 }
3946 }
3947 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3948 break;
3949
3950 /* The byte case is the same as non-UTF8 */
3951
3952 case OP_ANYBYTE:
3953 c = max - min;
3954 if (c > (unsigned int)(md->end_subject - eptr))
3955 c = md->end_subject - eptr;
3956 eptr += c;
3957 break;
3958
3959 case OP_ANYNL:
3960 for (i = min; i < max; i++)
3961 {
3962 int len = 1;
3963 if (eptr >= md->end_subject) break;
3964 GETCHARLEN(c, eptr, len);
3965 if (c == 0x000d)
3966 {
3967 if (++eptr >= md->end_subject) break;
3968 if (*eptr == 0x000a) eptr++;
3969 }
3970 else
3971 {
3972 if (c != 0x000a &&
3973 (md->bsr_anycrlf ||
3974 (c != 0x000b && c != 0x000c &&
3975 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3976 break;
3977 eptr += len;
3978 }
3979 }
3980 break;
3981
3982 case OP_NOT_HSPACE:
3983 case OP_HSPACE:
3984 for (i = min; i < max; i++)
3985 {
3986 BOOL gotspace;
3987 int len = 1;
3988 if (eptr >= md->end_subject) break;
3989 GETCHARLEN(c, eptr, len);
3990 switch(c)
3991 {
3992 default: gotspace = FALSE; break;
3993 case 0x09: /* HT */
3994 case 0x20: /* SPACE */
3995 case 0xa0: /* NBSP */
3996 case 0x1680: /* OGHAM SPACE MARK */
3997 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3998 case 0x2000: /* EN QUAD */
3999 case 0x2001: /* EM QUAD */
4000 case 0x2002: /* EN SPACE */
4001 case 0x2003: /* EM SPACE */
4002 case 0x2004: /* THREE-PER-EM SPACE */
4003 case 0x2005: /* FOUR-PER-EM SPACE */
4004 case 0x2006: /* SIX-PER-EM SPACE */
4005 case 0x2007: /* FIGURE SPACE */
4006 case 0x2008: /* PUNCTUATION SPACE */
4007 case 0x2009: /* THIN SPACE */
4008 case 0x200A: /* HAIR SPACE */
4009 case 0x202f: /* NARROW NO-BREAK SPACE */
4010 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4011 case 0x3000: /* IDEOGRAPHIC SPACE */
4012 gotspace = TRUE;
4013 break;
4014 }
4015 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4016 eptr += len;
4017 }
4018 break;
4019
4020 case OP_NOT_VSPACE:
4021 case OP_VSPACE:
4022 for (i = min; i < max; i++)
4023 {
4024 BOOL gotspace;
4025 int len = 1;
4026 if (eptr >= md->end_subject) break;
4027 GETCHARLEN(c, eptr, len);
4028 switch(c)
4029 {
4030 default: gotspace = FALSE; break;
4031 case 0x0a: /* LF */
4032 case 0x0b: /* VT */
4033 case 0x0c: /* FF */
4034 case 0x0d: /* CR */
4035 case 0x85: /* NEL */
4036 case 0x2028: /* LINE SEPARATOR */
4037 case 0x2029: /* PARAGRAPH SEPARATOR */
4038 gotspace = TRUE;
4039 break;
4040 }
4041 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4042 eptr += len;
4043 }
4044 break;
4045
4046 case OP_NOT_DIGIT:
4047 for (i = min; i < max; i++)
4048 {
4049 int len = 1;
4050 if (eptr >= md->end_subject) break;
4051 GETCHARLEN(c, eptr, len);
4052 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4053 eptr+= len;
4054 }
4055 break;
4056
4057 case OP_DIGIT:
4058 for (i = min; i < max; i++)
4059 {
4060 int len = 1;
4061 if (eptr >= md->end_subject) break;
4062 GETCHARLEN(c, eptr, len);
4063 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4064 eptr+= len;
4065 }
4066 break;
4067
4068 case OP_NOT_WHITESPACE:
4069 for (i = min; i < max; i++)
4070 {
4071 int len = 1;
4072 if (eptr >= md->end_subject) break;
4073 GETCHARLEN(c, eptr, len);
4074 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4075 eptr+= len;
4076 }
4077 break;
4078
4079 case OP_WHITESPACE:
4080 for (i = min; i < max; i++)
4081 {
4082 int len = 1;
4083 if (eptr >= md->end_subject) break;
4084 GETCHARLEN(c, eptr, len);
4085 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4086 eptr+= len;
4087 }
4088 break;
4089
4090 case OP_NOT_WORDCHAR:
4091 for (i = min; i < max; i++)
4092 {
4093 int len = 1;
4094 if (eptr >= md->end_subject) break;
4095 GETCHARLEN(c, eptr, len);
4096 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4097 eptr+= len;
4098 }
4099 break;
4100
4101 case OP_WORDCHAR:
4102 for (i = min; i < max; i++)
4103 {
4104 int len = 1;
4105 if (eptr >= md->end_subject) break;
4106 GETCHARLEN(c, eptr, len);
4107 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4108 eptr+= len;
4109 }
4110 break;
4111
4112 default:
4113 RRETURN(PCRE_ERROR_INTERNAL);
4114 }
4115
4116 /* eptr is now past the end of the maximum run */
4117
4118 if (possessive) continue;
4119 for(;;)
4120 {
4121 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4122 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4123 if (eptr-- == pp) break; /* Stop if tried at original pos */
4124 BACKCHAR(eptr);
4125 }
4126 }
4127 else
4128 #endif /* SUPPORT_UTF8 */
4129
4130 /* Not UTF-8 mode */
4131 {
4132 switch(ctype)
4133 {
4134 case OP_ANY:
4135 for (i = min; i < max; i++)
4136 {
4137 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4138 eptr++;
4139 }
4140 break;
4141
4142 case OP_ALLANY:
4143 case OP_ANYBYTE:
4144 c = max - min;
4145 if (c > (unsigned int)(md->end_subject - eptr))
4146 c = md->end_subject - eptr;
4147 eptr += c;
4148 break;
4149
4150 case OP_ANYNL:
4151 for (i = min; i < max; i++)
4152 {
4153 if (eptr >= md->end_subject) break;
4154 c = *eptr;
4155 if (c == 0x000d)
4156 {
4157 if (++eptr >= md->end_subject) break;
4158 if (*eptr == 0x000a) eptr++;
4159 }
4160 else
4161 {
4162 if (c != 0x000a &&
4163 (md->bsr_anycrlf ||
4164 (c != 0x000b && c != 0x000c && c != 0x0085)))
4165 break;
4166 eptr++;
4167 }
4168 }
4169 break;
4170
4171 case OP_NOT_HSPACE:
4172 for (i = min; i < max; i++)
4173 {
4174 if (eptr >= md->end_subject) break;
4175 c = *eptr;
4176 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4177 eptr++;
4178 }
4179 break;
4180
4181 case OP_HSPACE:
4182 for (i = min; i < max; i++)
4183 {
4184 if (eptr >= md->end_subject) break;
4185 c = *eptr;
4186 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4187 eptr++;
4188 }
4189 break;
4190
4191 case OP_NOT_VSPACE:
4192 for (i = min; i < max; i++)
4193 {
4194 if (eptr >= md->end_subject) break;
4195 c = *eptr;
4196 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4197 break;
4198 eptr++;
4199 }
4200 break;
4201
4202 case OP_VSPACE:
4203 for (i = min; i < max; i++)
4204 {
4205 if (eptr >= md->end_subject) break;
4206 c = *eptr;
4207 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4208 break;
4209 eptr++;
4210 }
4211 break;
4212
4213 case OP_NOT_DIGIT:
4214 for (i = min; i < max; i++)
4215 {
4216 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4217 break;
4218 eptr++;
4219 }
4220 break;
4221
4222 case OP_DIGIT:
4223 for (i = min; i < max; i++)
4224 {
4225 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4226 break;
4227 eptr++;
4228 }
4229 break;
4230
4231 case OP_NOT_WHITESPACE:
4232 for (i = min; i < max; i++)
4233 {
4234 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4235 break;
4236 eptr++;
4237 }
4238 break;
4239
4240 case OP_WHITESPACE:
4241 for (i = min; i < max; i++)
4242 {
4243 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4244 break;
4245 eptr++;
4246 }
4247 break;
4248
4249 case OP_NOT_WORDCHAR:
4250 for (i = min; i < max; i++)
4251 {
4252 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4253 break;
4254 eptr++;
4255 }
4256 break;
4257
4258 case OP_WORDCHAR:
4259 for (i = min; i < max; i++)
4260 {
4261 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4262 break;
4263 eptr++;
4264 }
4265 break;
4266
4267 default:
4268 RRETURN(PCRE_ERROR_INTERNAL);
4269 }
4270
4271 /* eptr is now past the end of the maximum run */
4272
4273 if (possessive) continue;
4274 while (eptr >= pp)
4275 {
4276 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4277 eptr--;
4278 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4279 }
4280 }
4281
4282 /* Get here if we can't make it match with any permitted repetitions */
4283
4284 RRETURN(MATCH_NOMATCH);
4285 }
4286 /* Control never gets here */
4287
4288 /* There's been some horrible disaster. Arrival here can only mean there is
4289 something seriously wrong in the code above or the OP_xxx definitions. */
4290
4291 default:
4292 DPRINTF(("Unknown opcode %d\n", *ecode));
4293 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4294 }
4295
4296 /* Do not stick any code in here without much thought; it is assumed
4297 that "continue" in the code above comes out to here to repeat the main
4298 loop. */
4299
4300 } /* End of main loop */
4301 /* Control never reaches here */
4302
4303
4304 /* When compiling to use the heap rather than the stack for recursive calls to
4305 match(), the RRETURN() macro jumps here. The number that is saved in
4306 frame->Xwhere indicates which label we actually want to return to. */
4307
4308 #ifdef NO_RECURSE
4309 #define LBL(val) case val: goto L_RM##val;
4310 HEAP_RETURN:
4311 switch (frame->Xwhere)
4312 {
4313 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4314 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4315 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4316 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4317 LBL(53) LBL(54)
4318 #ifdef SUPPORT_UTF8
4319 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4320 LBL(32) LBL(34) LBL(42) LBL(46)
4321 #ifdef SUPPORT_UCP
4322 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4323 #endif /* SUPPORT_UCP */
4324 #endif /* SUPPORT_UTF8 */
4325 default:
4326 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4327 return PCRE_ERROR_INTERNAL;
4328 }
4329 #undef LBL
4330 #endif /* NO_RECURSE */
4331 }
4332
4333
4334 /***************************************************************************
4335 ****************************************************************************
4336 RECURSION IN THE match() FUNCTION
4337
4338 Undefine all the macros that were defined above to handle this. */
4339
4340 #ifdef NO_RECURSE /* Build that uses the heap rather than the stack for match() recursion: remove the macro names set up for it, now that match() has ended. NOTE(review): the matching #defines are earlier in the file - confirm this list stays in step with them. */
4341 #undef eptr
4342 #undef ecode
4343 #undef mstart
4344 #undef offset_top
4345 #undef ims
4346 #undef eptrb
4347 #undef flags
4348
4349 #undef callpat
4350 #undef charptr
4351 #undef data
4352 #undef next
4353 #undef pp
4354 #undef prev
4355 #undef saved_eptr
4356
4357 #undef new_recursive
4358
4359 #undef cur_is_word
4360 #undef condition
4361 #undef prev_is_word
4362
4363 #undef original_ims
4364
4365 #undef ctype
4366 #undef length
4367 #undef max
4368 #undef min
4369 #undef number
4370 #undef offset
4371 #undef op
4372 #undef save_capture_last
4373 #undef save_offset1
4374 #undef save_offset2
4375 #undef save_offset3
4376 #undef stacksave
4377
4378 #undef newptrb
4379
4380 #endif /* NO_RECURSE */
4381
4382 /* These two are defined as macros in both cases (presumably: with and without NO_RECURSE), so they are undefined here, outside the #ifdef, regardless of the build mode. */
4383
4384 #undef fc
4385 #undef fi
4386
4387 /***************************************************************************
4388 ***************************************************************************/
4389
4390
4391
4392 /*************************************************
4393 * Execute a Regular Expression *
4394 *************************************************/
4395
4396 /* This function applies a compiled re to a subject string and picks out
4397 portions of the string if it matches. Two elements in the vector are set for
4398 each substring: the offsets to the start and end of the substring.
4399
4400 Arguments:
4401 argument_re points to the compiled expression
4402 extra_data points to extra data or is NULL
4403 subject points to the subject string
4404 length length of subject string (may contain binary zeros)
4405 start_offset where to start in the subject string
4406 options option bits
4407 offsets points to a vector of ints to be filled in with offsets
4408 offsetcount the number of elements in the vector
4409
4410 Returns: > 0 => success; value is the number of elements filled in
4411 = 0 => success, but offsets is not big enough
4412 -1 => failed to match
4413 < -1 => some kind of unexpected problem
4414 */
4415
4416 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4417 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4418 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4419 int offsetcount)
4420 {
4421 int rc, resetcount, ocount;
4422 int first_byte = -1;
4423 int req_byte = -1;
4424 int req_byte2 = -1;
4425 int newline;
4426 unsigned long int ims;
4427 BOOL using_temporary_offsets = FALSE;
4428 BOOL anchored;
4429 BOOL startline;
4430 BOOL firstline;
4431 BOOL first_byte_caseless = FALSE;
4432 BOOL req_byte_caseless = FALSE;
4433 BOOL utf8;
4434 match_data match_block;
4435 match_data *md = &match_block;
4436 const uschar *tables;
4437 const uschar *start_bits = NULL;
4438 USPTR start_match = (USPTR)subject + start_offset;
4439 USPTR end_subject;
4440 USPTR req_byte_ptr = start_match - 1;
4441
4442 pcre_study_data internal_study;
4443 const pcre_study_data *study;
4444
4445 real_pcre internal_re;
4446 const real_pcre *external_re = (const real_pcre *)argument_re;
4447 const real_pcre *re = external_re;
4448
4449 /* Plausibility checks */
4450
4451 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4452 if (re == NULL || subject == NULL ||
4453 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4454 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4455
4456 /* Fish out the optional data from the extra_data structure, first setting
4457 the default values. */
4458
4459 study = NULL;
4460 md->match_limit = MATCH_LIMIT;
4461 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4462 md->callout_data = NULL;
4463
4464 /* The table pointer is always in native byte order. */
4465
4466 tables = external_re->tables;
4467
4468 if (extra_data != NULL)
4469 {
4470 register unsigned int flags = extra_data->flags;
4471 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4472 study = (const pcre_study_data *)extra_data->study_data;
4473 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4474 md->match_limit = extra_data->match_limit;
4475 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4476 md->match_limit_recursion = extra_data->match_limit_recursion;
4477 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4478 md->callout_data = extra_data->callout_data;
4479 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4480 }
4481
4482 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4483 is a feature that makes it possible to save compiled regex and re-use them
4484 in other programs later. */
4485
4486 if (tables == NULL) tables = _pcre_default_tables;
4487
4488 /* Check that the first field in the block is the magic number. If it is not,
4489 test for a regex that was compiled on a host of opposite endianness. If this is
4490 the case, flipped values are put in internal_re and internal_study if there was
4491 study data too. */
4492
4493 if (re->magic_number != MAGIC_NUMBER)
4494 {
4495 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4496 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4497 if (study != NULL) study = &internal_study;
4498 }
4499
4500 /* Set up other data */
4501
4502 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4503 startline = (re->flags & PCRE_STARTLINE) != 0;
4504 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4505
4506 /* The code starts after the real_pcre block and the capture name table. */
4507
4508 md->start_code = (const uschar *)external_re + re->name_table_offset +
4509 re->name_count * re->name_entry_size;
4510
4511 md->start_subject = (USPTR)subject;
4512 md->start_offset = start_offset;
4513 md->end_subject = md->start_subject + length;
4514 end_subject = md->end_subject;
4515
4516 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4517 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4518 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4519
4520 md->notbol = (options & PCRE_NOTBOL) != 0;
4521 md->noteol = (options & PCRE_NOTEOL) != 0;
4522 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4523 md->partial = (options & PCRE_PARTIAL) != 0;
4524 md->hitend = FALSE;
4525
4526 md->recursive = NULL; /* No recursion at top level */
4527
4528 md->lcc = tables + lcc_offset;
4529 md->ctypes = tables + ctypes_offset;
4530
4531 /* Handle different \R options. */
4532
4533 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4534 {
4535 case 0:
4536 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4537 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4538 else
4539 #ifdef BSR_ANYCRLF
4540 md->bsr_anycrlf = TRUE;
4541 #else
4542 md->bsr_anycrlf = FALSE;
4543 #endif
4544 break;
4545
4546 case PCRE_BSR_ANYCRLF:
4547 md->bsr_anycrlf = TRUE;
4548 break;
4549
4550 case PCRE_BSR_UNICODE:
4551 md->bsr_anycrlf = FALSE;
4552 break;
4553
4554 default: return PCRE_ERROR_BADNEWLINE;
4555 }
4556
4557 /* Handle different types of newline. The three bits give eight cases. If
4558 nothing is set at run time, whatever was used at compile time applies. */
4559
4560 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4561 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4562 {
4563 case 0: newline = NEWLINE; break; /* Compile-time default */
4564 case PCRE_NEWLINE_CR: newline = '\r'; break;
4565 case PCRE_NEWLINE_LF: newline = '\n'; break;
4566 case PCRE_NEWLINE_CR+
4567 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4568 case PCRE_NEWLINE_ANY: newline = -1; break;
4569 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4570 default: return PCRE_ERROR_BADNEWLINE;
4571 }
4572
4573 if (newline == -2)
4574 {
4575 md->nltype = NLTYPE_ANYCRLF;
4576 }
4577 else if (newline < 0)
4578 {
4579 md->nltype = NLTYPE_ANY;
4580 }
4581 else
4582 {
4583 md->nltype = NLTYPE_FIXED;
4584 if (newline > 255)
4585 {
4586 md->nllen = 2;
4587 md->nl[0] = (newline >> 8) & 255;
4588 md->nl[1] = newline & 255;
4589 }
4590 else
4591 {
4592 md->nllen = 1;
4593 md->nl[0] = newline;
4594 }
4595 }
4596
4597 /* Partial matching is supported only for a restricted set of regexes at the
4598 moment. */
4599
4600 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4601 return PCRE_ERROR_BADPARTIAL;
4602
4603 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4604 back the character offset. */
4605
4606 #ifdef SUPPORT_UTF8
4607 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4608 {
4609 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4610 return PCRE_ERROR_BADUTF8;
4611 if (start_offset > 0 && start_offset < length)
4612 {
4613 int tb = ((uschar *)subject)[start_offset];
4614 if (tb > 127)
4615 {
4616 tb &= 0xc0;
4617 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4618 }
4619 }
4620 }
4621 #endif
4622
4623 /* The ims options can vary during the matching as a result of the presence
4624 of (?ims) items in the pattern. They are kept in a local variable so that
4625 restoring at the exit of a group is easy. */
4626
4627 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4628
4629 /* If the expression has got more back references than the offsets supplied can
4630 hold, we get a temporary chunk of working store to use during the matching.
4631 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4632 of 3. */
4633
4634 ocount = offsetcount - (offsetcount % 3);
4635
4636 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4637 {
4638 ocount = re->top_backref * 3 + 3;
4639 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4640 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4641 using_temporary_offsets = TRUE;
4642 DPRINTF(("Got memory to hold back references\n"));
4643 }
4644 else md->offset_vector = offsets;
4645
4646 md->offset_end = ocount;
4647 md->offset_max = (2*ocount)/3;
4648 md->offset_overflow = FALSE;
4649 md->capture_last = -1;
4650
4651 /* Compute the minimum number of offsets that we need to reset each time. Doing
4652 this makes a huge difference to execution time when there aren't many brackets
4653 in the pattern. */
4654
4655 resetcount = 2 + re->top_bracket * 2;
4656 if (resetcount > offsetcount) resetcount = ocount;
4657
4658 /* Reset the working variable associated with each extraction. These should
4659 never be used unless previously set, but they get saved and restored, and so we
4660 initialize them to avoid reading uninitialized locations. */
4661
4662 if (md->offset_vector != NULL)
4663 {
4664 register int *iptr = md->offset_vector + ocount;
4665 register int *iend = iptr - resetcount/2 + 1;
4666 while (--iptr >= iend) *iptr = -1;
4667 }
4668
4669 /* Set up the first character to match, if available. The first_byte value is
4670 never set for an anchored regular expression, but the anchoring may be forced
4671 at run time, so we have to test for anchoring. The first char may be unset for
4672 an unanchored pattern, of course. If there's no first char and the pattern was
4673 studied, there may be a bitmap of possible first characters. */
4674
4675 if (!anchored)
4676 {
4677 if ((re->flags & PCRE_FIRSTSET) != 0)
4678 {
4679 first_byte = re->first_byte & 255;
4680 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4681 first_byte = md->lcc[first_byte];
4682 }
4683 else
4684 if (!startline && study != NULL &&
4685 (study->options & PCRE_STUDY_MAPPED) != 0)
4686 start_bits = study->start_bits;
4687 }
4688
4689 /* For anchored or unanchored matches, there may be a "last known required
4690 character" set. */
4691
4692 if ((re->flags & PCRE_REQCHSET) != 0)
4693 {
4694 req_byte = re->req_byte & 255;
4695 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4696 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4697 }
4698
4699
4700 /* ==========================================================================*/
4701
4702 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4703 the loop runs just once. */
4704
4705 for(;;)
4706 {
4707 USPTR save_end_subject = end_subject;
4708 USPTR new_start_match;
4709
4710 /* Reset the maximum number of extractions we might see. */
4711
4712 if (md->offset_vector != NULL)
4713 {
4714 register int *iptr = md->offset_vector;
4715 register int *iend = iptr + resetcount;
4716 while (iptr < iend) *iptr++ = -1;
4717 }
4718
4719 /* Advance to a unique first char if possible. If firstline is TRUE, the
4720 start of the match is constrained to the first line of a multiline string.
4721 That is, the match must be before or at the first newline. Implement this by
4722 temporarily adjusting end_subject so that we stop scanning at a newline. If
4723 the match fails at the newline, later code breaks this loop. */
4724
4725 if (firstline)
4726 {
4727 USPTR t = start_match;
4728 #ifdef SUPPORT_UTF8
4729 if (utf8)
4730 {
4731 while (t < md->end_subject && !IS_NEWLINE(t))
4732 {
4733 t++;
4734 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4735 }
4736 }
4737 else
4738 #endif
4739 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4740 end_subject = t;
4741 }
4742
4743 /* Now advance to a unique first byte if there is one. */
4744
4745 if (first_byte >= 0)
4746 {
4747 if (first_byte_caseless)
4748 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4749 start_match++;
4750 else
4751 while (start_match < end_subject && *start_match != first_byte)
4752 start_match++;
4753 }
4754
4755 /* Or to just after a linebreak for a multiline match */
4756
4757 else if (startline)
4758 {
4759 if (start_match > md->start_subject + start_offset)
4760 {
4761 #ifdef SUPPORT_UTF8
4762 if (utf8)
4763 {
4764 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4765 {
4766 start_match++;
4767 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4768 start_match++;
4769 }
4770 }
4771 else
4772 #endif
4773 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4774 start_match++;
4775
4776 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4777 and we are now at a LF, advance the match position by one more character.
4778 */
4779
4780 if (start_match[-1] == '\r' &&
4781 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4782 start_match < end_subject &&
4783 *start_match == '\n')
4784 start_match++;
4785 }
4786 }
4787
4788 /* Or to a non-unique first byte after study */
4789
4790 else if (start_bits != NULL)
4791 {
4792 while (start_match < end_subject)
4793 {
4794 register unsigned int c = *start_match;
4795 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4796 else break;
4797 }
4798 }
4799
4800 /* Restore fudged end_subject */
4801
4802 end_subject = save_end_subject;
4803
4804 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4805 printf(">>>> Match against: ");
4806 pchars(start_match, end_subject - start_match, TRUE, md);
4807 printf("\n");
4808 #endif
4809
4810 /* If req_byte is set, we know that that character must appear in the subject
4811 for the match to succeed. If the first character is set, req_byte must be
4812 later in the subject; otherwise the test starts at the match point. This
4813 optimization can save a huge amount of backtracking in patterns with nested
4814 unlimited repeats that aren't going to match. Writing separate code for
4815 cased/caseless versions makes it go faster, as does using an autoincrement
4816 and backing off on a match.
4817
4818 HOWEVER: when the subject string is very, very long, searching to its end can
4819 take a long time, and give bad performance on quite ordinary patterns. This
4820 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4821 string... so we don't do this when the string is sufficiently long.
4822
4823 ALSO: this processing is disabled when partial matching is requested.
4824 */
4825
4826 if (req_byte >= 0 &&
4827 end_subject - start_match < REQ_BYTE_MAX &&
4828 !md->partial)
4829 {
4830 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4831
4832 /* We don't need to repeat the search if we haven't yet reached the
4833 place we found it at last time. */
4834
4835 if (p > req_byte_ptr)
4836 {
4837 if (req_byte_caseless)
4838 {
4839 while (p < end_subject)
4840 {
4841 register int pp = *p++;
4842 if (pp == req_byte || pp == req_byte2) { p--; break; }
4843 }
4844 }
4845 else
4846 {
4847 while (p < end_subject)
4848 {
4849 if (*p++ == req_byte) { p--; break; }
4850 }
4851 }
4852
4853 /* If we can't find the required character, break the matching loop,
4854 forcing a match failure. */
4855
4856 if (p >= end_subject)
4857 {
4858 rc = MATCH_NOMATCH;
4859 break;
4860 }
4861
4862 /* If we have found the required character, save the point where we
4863 found it, so that we don't search again next time round the loop if
4864 the start hasn't passed this character yet. */
4865
4866 req_byte_ptr = p;
4867 }
4868 }
4869
4870 /* OK, we can now run the match. */
4871
4872 md->start_match_ptr = start_match;
4873 md->match_call_count = 0;
4874 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4875
4876 switch(rc)
4877 {
4878 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4879 exactly like PRUNE. */
4880
4881 case MATCH_NOMATCH:
4882 case MATCH_PRUNE:
4883 case MATCH_THEN:
4884 new_start_match = start_match + 1;
4885 #ifdef SUPPORT_UTF8
4886 if (utf8)
4887 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4888 new_start_match++;
4889 #endif
4890 break;
4891
4892 /* SKIP passes back the next starting point explicitly. */
4893
4894 case MATCH_SKIP:
4895 new_start_match = md->start_match_ptr;
4896 break;
4897
4898 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4899
4900 case MATCH_COMMIT:
4901 rc = MATCH_NOMATCH;
4902 goto ENDLOOP;
4903
4904 /* Any other return is some kind of error. */
4905
4906 default:
4907 goto ENDLOOP;
4908 }
4909
4910 /* Control reaches here for the various types of "no match at this point"
4911 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4912
4913 rc = MATCH_NOMATCH;
4914
4915 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4916 newline in the subject (though it may continue over the newline). Therefore,
4917 if we have just failed to match, starting at a newline, do not continue. */
4918
4919 if (firstline && IS_NEWLINE(start_match)) break;
4920
4921 /* Advance to new matching position */
4922
4923 start_match = new_start_match;
4924
4925 /* Break the loop if the pattern is anchored or if we have passed the end of
4926 the subject. */
4927
4928 if (anchored || start_match > end_subject) break;
4929
4930 /* If we have just passed a CR and we are now at a LF, and the pattern does
4931 not contain any explicit matches for \r or \n, and the newline option is CRLF
4932 or ANY or ANYCRLF, advance the match position by one more character. */
4933
4934 if (start_match[-1] == '\r' &&
4935 start_match < end_subject &&
4936 *start_match == '\n' &&
4937 (re->flags & PCRE_HASCRORLF) == 0 &&
4938 (md->nltype == NLTYPE_ANY ||
4939 md->nltype == NLTYPE_ANYCRLF ||
4940 md->nllen == 2))
4941 start_match++;
4942
4943 } /* End of for(;;) "bumpalong" loop */
4944
4945 /* ==========================================================================*/
4946
4947 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4948 conditions is true:
4949
4950 (1) The pattern is anchored or the match was failed by (*COMMIT);
4951
4952 (2) We are past the end of the subject;
4953
4954 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4955 this option requests that a match occur at or before the first newline in
4956 the subject.
4957
4958 When we have a match and the offset vector is big enough to deal with any
4959 backreferences, captured substring offsets will already be set up. In the case
4960 where we had to get some local store to hold offsets for backreference
4961 processing, copy those that we can. In this case there need not be overflow if
4962 certain parts of the pattern were not used, even though there are more
4963 capturing parentheses than vector slots. */
4964
4965 ENDLOOP:
4966
4967 if (rc == MATCH_MATCH)
4968 {
4969 if (using_temporary_offsets)
4970 {
4971 if (offsetcount >= 4)
4972 {
4973 memcpy(offsets + 2, md->offset_vector + 2,
4974 (offsetcount - 2) * sizeof(int));
4975 DPRINTF(("Copied offsets from temporary memory\n"));
4976 }
4977 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4978 DPRINTF(("Freeing temporary memory\n"));
4979 (pcre_free)(md->offset_vector);
4980 }
4981
4982 /* Set the return code to the number of captured strings, or 0 if there are
4983 too many to fit into the vector. */
4984
4985 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4986
4987 /* If there is space, set up the whole thing as substring 0. The value of
4988 md->start_match_ptr might be modified if \K was encountered on the successful
4989 matching path. */
4990
4991 if (offsetcount < 2) rc = 0; else
4992 {
4993 offsets[0] = md->start_match_ptr - md->start_subject;
4994 offsets[1] = md->end_match_ptr - md->start_subject;
4995 }
4996
4997 DPRINTF((">>>> returning %d\n", rc));
4998 return rc;
4999 }
5000
5001 /* Control gets here if there has been an error, or if the overall match
5002 attempt has failed at all permitted starting positions. */
5003
5004 if (using_temporary_offsets)
5005 {
5006 DPRINTF(("Freeing temporary memory\n"));
5007 (pcre_free)(md->offset_vector);
5008 }
5009
5010 if (rc != MATCH_NOMATCH)
5011 {
5012 DPRINTF((">>>> error: returning %d\n", rc));
5013 return rc;
5014 }
5015 else if (md->partial && md->hitend)
5016 {
5017 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5018 return PCRE_ERROR_PARTIAL;
5019 }
5020 else
5021 {
5022 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5023 return PCRE_ERROR_NOMATCH;
5024 }
5025 }
5026
5027 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5