/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 402 - (show annotations)
Sat Mar 21 17:26:03 2009 UTC (6 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 153064 byte(s)
Error occurred while calculating annotation data.
Add missing #ifdef SUPPORT_UTF8 round heapframe::Xcharptr.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
/* Lower and upper repeat bounds, one entry per common repeat kind. A zero in
rep_max means "no upper limit". NOTE(review): the six columns presumably
correspond to the repeat forms *, *?, +, +?, ?, ?? in opcode order - confirm
against the opcode list in pcre_internal.h. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
/* One identifier per RMATCH call site, numbered consecutively from 1. Each
row is anchored explicitly so that a miscounted row is easy to spot. */

enum {
  RM1  = 1,  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
#ifndef DEBUG

/* Production flavour: a recursive call is a genuine call to match() one level
deeper, and a return is a genuine return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#else

/* Debugging flavour: the same call and return, bracketed by trace output
giving the source line. The return trace deliberately has no newline; the
"to line" message printed after the call completes the line. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap-frame version of a "recursive call" to match(). A new frame is obtained
from pcre_stack_malloc(), loaded with the arguments of the inner call, and
chained to the current frame through Xprevframe; control then jumps to
HEAP_RECURSE. The rw value is stored in the calling frame's Xwhere field and a
label L_<rw> is planted at the end of the expansion, so each use of the macro
must supply a different rw. The rd argument (md) is not copied because it never
changes.

If the allocator cannot supply a frame, the current incarnation is abandoned
with PCRE_ERROR_NOMEMORY (via RRETURN, which releases the current frame)
instead of writing through a null pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* Heap-frame version of "return". The current frame is unlinked and handed
back to pcre_stack_free(). If that was the outermost frame, this is a real
return from match(); otherwise the result is left in rrc and control goes to
HEAP_RETURN to resume the calling frame. Note that ra is evaluated only after
the frame has been popped. */

#define RRETURN(ra)\
  {\
  heapframe *doneframe = frame;\
  frame = doneframe->Xprevframe;\
  (pcre_stack_free)(doneframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 const uschar *Xeptr;
326 const uschar *Xecode;
327 const uschar *Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 const uschar *Xcallpat;
337 #ifdef SUPPORT_UTF8
338 const uschar *Xcharptr;
339 #endif
340 const uschar *Xdata;
341 const uschar *Xnext;
342 const uschar *Xpp;
343 const uschar *Xprev;
344 const uschar *Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xctype;
366 unsigned int Xfc;
367 int Xfi;
368 int Xlength;
369 int Xmax;
370 int Xmin;
371 int Xnumber;
372 int Xoffset;
373 int Xop;
374 int Xsave_capture_last;
375 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
376 int Xstacksave[REC_STACK_SAVE_MAX];
377
378 eptrblock Xnewptrb;
379
380 /* Where to jump back to */
381
382 int Xwhere;
383
384 } heapframe;
385
386 #endif
387
388
389 /***************************************************************************
390 ***************************************************************************/
391
392
393
394 /*************************************************
395 * Match from current position *
396 *************************************************/
397
398 /* This function is called recursively in many circumstances. Whenever it
399 returns a negative (error) response, the outer incarnation must also return the
400 same response.
401
402 Performance note: It might be tempting to extract commonly used fields from the
403 md structure (e.g. utf8, end_subject) into individual variables to improve
404 performance. Tests using gcc on a SPARC disproved this; in the first case, it
405 made performance worse.
406
407 Arguments:
408 eptr pointer to current character in subject
409 ecode pointer to current position in compiled code
410 mstart pointer to the current match start position (can be modified
411 by encountering \K)
412 offset_top current top pointer
413 md pointer to "static" info for the match
414 ims current /i, /m, and /s options
415 eptrb pointer to chain of blocks containing eptr at start of
416 brackets - for testing for empty matches
417 flags can contain
418 match_condassert - this is an assertion condition
419 match_cbegroup - this is the start of an unlimited repeat
420 group that can match an empty string
421 rdepth the recursion depth
422
423 Returns: MATCH_MATCH if matched ) these values are >= 0
424 MATCH_NOMATCH if failed to match )
425 a negative PCRE_ERROR_xxx value if aborted by an error condition
426 (e.g. stopped by repeated call or recursion limit)
427 */
428
429 static int
430 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
431 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
432 int flags, unsigned int rdepth)
433 {
434 /* These variables do not need to be preserved over recursion in this function,
435 so they can be ordinary variables in all cases. Mark some of them with
436 "register" because they are used a lot in loops. */
437
438 register int rrc; /* Returns from recursive calls */
439 register int i; /* Used for loops not involving calls to RMATCH() */
440 register unsigned int c; /* Character values not kept over RMATCH() calls */
441 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
442
443 BOOL minimize, possessive; /* Quantifier options */
444
445 /* When recursion is not being used, all "local" variables that have to be
446 preserved over calls to RMATCH() are part of a "frame" which is obtained from
447 heap storage. Set up the top-level frame here; others are obtained from the
448 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
449
450 #ifdef NO_RECURSE
451 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
452 frame->Xprevframe = NULL; /* Marks the top level */
453
454 /* Copy in the original argument variables */
455
456 frame->Xeptr = eptr;
457 frame->Xecode = ecode;
458 frame->Xmstart = mstart;
459 frame->Xoffset_top = offset_top;
460 frame->Xims = ims;
461 frame->Xeptrb = eptrb;
462 frame->Xflags = flags;
463 frame->Xrdepth = rdepth;
464
465 /* This is where control jumps back to to effect "recursion" */
466
467 HEAP_RECURSE:
468
469 /* Macros make the argument variables come from the current frame */
470
471 #define eptr frame->Xeptr
472 #define ecode frame->Xecode
473 #define mstart frame->Xmstart
474 #define offset_top frame->Xoffset_top
475 #define ims frame->Xims
476 #define eptrb frame->Xeptrb
477 #define flags frame->Xflags
478 #define rdepth frame->Xrdepth
479
480 /* Ditto for the local variables */
481
482 #ifdef SUPPORT_UTF8
483 #define charptr frame->Xcharptr
484 #endif
485 #define callpat frame->Xcallpat
486 #define data frame->Xdata
487 #define next frame->Xnext
488 #define pp frame->Xpp
489 #define prev frame->Xprev
490 #define saved_eptr frame->Xsaved_eptr
491
492 #define new_recursive frame->Xnew_recursive
493
494 #define cur_is_word frame->Xcur_is_word
495 #define condition frame->Xcondition
496 #define prev_is_word frame->Xprev_is_word
497
498 #define original_ims frame->Xoriginal_ims
499
500 #ifdef SUPPORT_UCP
501 #define prop_type frame->Xprop_type
502 #define prop_value frame->Xprop_value
503 #define prop_fail_result frame->Xprop_fail_result
504 #define prop_category frame->Xprop_category
505 #define prop_chartype frame->Xprop_chartype
506 #define prop_script frame->Xprop_script
507 #define oclength frame->Xoclength
508 #define occhars frame->Xocchars
509 #endif
510
511 #define ctype frame->Xctype
512 #define fc frame->Xfc
513 #define fi frame->Xfi
514 #define length frame->Xlength
515 #define max frame->Xmax
516 #define min frame->Xmin
517 #define number frame->Xnumber
518 #define offset frame->Xoffset
519 #define op frame->Xop
520 #define save_capture_last frame->Xsave_capture_last
521 #define save_offset1 frame->Xsave_offset1
522 #define save_offset2 frame->Xsave_offset2
523 #define save_offset3 frame->Xsave_offset3
524 #define stacksave frame->Xstacksave
525
526 #define newptrb frame->Xnewptrb
527
528 /* When recursion is being used, local variables are allocated on the stack and
529 get preserved during recursion in the normal way. In this environment, fi and
530 i, and fc and c, can be the same variables. */
531
532 #else /* NO_RECURSE not defined */
533 #define fi i
534 #define fc c
535
536
537 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
538 const uschar *charptr; /* in small blocks of the code. My normal */
539 #endif /* style of coding would have declared */
540 const uschar *callpat; /* them within each of those blocks. */
541 const uschar *data; /* However, in order to accommodate the */
542 const uschar *next; /* version of this code that uses an */
543 USPTR pp; /* external "stack" implemented on the */
544 const uschar *prev; /* heap, it is easier to declare them all */
545 USPTR saved_eptr; /* here, so the declarations can be cut */
546 /* out in a block. The only declarations */
547 recursion_info new_recursive; /* within blocks below are for variables */
548 /* that do not have to be preserved over */
549 BOOL cur_is_word; /* a recursive call to RMATCH(). */
550 BOOL condition;
551 BOOL prev_is_word;
552
553 unsigned long int original_ims;
554
555 #ifdef SUPPORT_UCP
556 int prop_type;
557 int prop_value;
558 int prop_fail_result;
559 int prop_category;
560 int prop_chartype;
561 int prop_script;
562 int oclength;
563 uschar occhars[8];
564 #endif
565
566 int codelink;
567 int condcode;
568 int ctype;
569 int length;
570 int max;
571 int min;
572 int number;
573 int offset;
574 int op;
575 int save_capture_last;
576 int save_offset1, save_offset2, save_offset3;
577 int stacksave[REC_STACK_SAVE_MAX];
578
579 eptrblock newptrb;
580 #endif /* NO_RECURSE */
581
582 /* These statements are here to stop the compiler complaining about uninitialized
583 variables. */
584
585 #ifdef SUPPORT_UCP
586 prop_value = 0;
587 prop_fail_result = 0;
588 #endif
589
590
591 /* This label is used for tail recursion, which is used in a few cases even
592 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
593 used. Thanks to Ian Taylor for noticing this possibility and sending the
594 original patch. */
595
596 TAIL_RECURSE:
597
598 /* OK, now we can get on with the real code of the function. Recursive calls
599 are specified by the macro RMATCH and RRETURN is used to return. When
600 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
601 and a "return", respectively (possibly with some debugging if DEBUG is
602 defined). However, RMATCH isn't like a function call because it's quite a
603 complicated macro. It has to be used in one particular way. This shouldn't,
604 however, impact performance when true recursion is being used. */
605
606 #ifdef SUPPORT_UTF8
607 utf8 = md->utf8; /* Local copy of the flag */
608 #else
609 utf8 = FALSE;
610 #endif
611
612 /* First check that we haven't called match() too many times, or that we
613 haven't exceeded the recursive call limit. */
614
615 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
616 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
617
618 original_ims = ims; /* Save for resetting on ')' */
619
620 /* At the start of a group with an unlimited repeat that may match an empty
621 string, the match_cbegroup flag is set. When this is the case, add the current
622 subject pointer to the chain of such remembered pointers, to be checked when we
623 hit the closing ket, in order to break infinite loops that match no characters.
624 When match() is called in other circumstances, don't add to the chain. The
625 match_cbegroup flag must NOT be used with tail recursion, because the memory
626 block that is used is on the stack, so a new one may be required for each
627 match(). */
628
629 if ((flags & match_cbegroup) != 0)
630 {
631 newptrb.epb_saved_eptr = eptr;
632 newptrb.epb_prev = eptrb;
633 eptrb = &newptrb;
634 }
635
636 /* Now start processing the opcodes. */
637
638 for (;;)
639 {
640 minimize = possessive = FALSE;
641 op = *ecode;
642
643 /* For partial matching, remember if we ever hit the end of the subject after
644 matching at least one subject character. */
645
646 if (md->partial &&
647 eptr >= md->end_subject &&
648 eptr > mstart)
649 md->hitend = TRUE;
650
651 switch(op)
652 {
653 case OP_FAIL:
654 RRETURN(MATCH_NOMATCH);
655
656 case OP_PRUNE:
657 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
658 ims, eptrb, flags, RM51);
659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
660 RRETURN(MATCH_PRUNE);
661
662 case OP_COMMIT:
663 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
664 ims, eptrb, flags, RM52);
665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
666 RRETURN(MATCH_COMMIT);
667
668 case OP_SKIP:
669 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
670 ims, eptrb, flags, RM53);
671 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
672 md->start_match_ptr = eptr; /* Pass back current position */
673 RRETURN(MATCH_SKIP);
674
675 case OP_THEN:
676 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
677 ims, eptrb, flags, RM54);
678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
679 RRETURN(MATCH_THEN);
680
681 /* Handle a capturing bracket. If there is space in the offset vector, save
682 the current subject position in the working slot at the top of the vector.
683 We mustn't change the current values of the data slot, because they may be
684 set from a previous iteration of this group, and be referred to by a
685 reference inside the group.
686
687 If the bracket fails to match, we need to restore this value and also the
688 values of the final offsets, in case they were set by a previous iteration
689 of the same bracket.
690
691 If there isn't enough space in the offset vector, treat this as if it were
692 a non-capturing bracket. Don't worry about setting the flag for the error
693 case here; that is handled in the code for KET. */
694
695 case OP_CBRA:
696 case OP_SCBRA:
697 number = GET2(ecode, 1+LINK_SIZE);
698 offset = number << 1;
699
700 #ifdef DEBUG
701 printf("start bracket %d\n", number);
702 printf("subject=");
703 pchars(eptr, 16, TRUE, md);
704 printf("\n");
705 #endif
706
707 if (offset < md->offset_max)
708 {
709 save_offset1 = md->offset_vector[offset];
710 save_offset2 = md->offset_vector[offset+1];
711 save_offset3 = md->offset_vector[md->offset_end - number];
712 save_capture_last = md->capture_last;
713
714 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
715 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
716
717 flags = (op == OP_SCBRA)? match_cbegroup : 0;
718 do
719 {
720 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
721 ims, eptrb, flags, RM1);
722 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
723 md->capture_last = save_capture_last;
724 ecode += GET(ecode, 1);
725 }
726 while (*ecode == OP_ALT);
727
728 DPRINTF(("bracket %d failed\n", number));
729
730 md->offset_vector[offset] = save_offset1;
731 md->offset_vector[offset+1] = save_offset2;
732 md->offset_vector[md->offset_end - number] = save_offset3;
733
734 RRETURN(MATCH_NOMATCH);
735 }
736
737 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
738 as a non-capturing bracket. */
739
740 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
741 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742
743 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
744
745 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
746 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
747
748 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
749 final alternative within the brackets, we would return the result of a
750 recursive call to match() whatever happened. We can reduce stack usage by
751 turning this into a tail recursion, except in the case when match_cbegroup
752 is set.*/
753
754 case OP_BRA:
755 case OP_SBRA:
756 DPRINTF(("start non-capturing bracket\n"));
757 flags = (op >= OP_SBRA)? match_cbegroup : 0;
758 for (;;)
759 {
760 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
761 {
762 if (flags == 0) /* Not a possibly empty group */
763 {
764 ecode += _pcre_OP_lengths[*ecode];
765 DPRINTF(("bracket 0 tail recursion\n"));
766 goto TAIL_RECURSE;
767 }
768
769 /* Possibly empty group; can't use tail recursion. */
770
771 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
772 eptrb, flags, RM48);
773 RRETURN(rrc);
774 }
775
776 /* For non-final alternatives, continue the loop for a NOMATCH result;
777 otherwise return. */
778
779 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
780 eptrb, flags, RM2);
781 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
782 ecode += GET(ecode, 1);
783 }
784 /* Control never reaches here. */
785
786 /* Conditional group: compilation checked that there are no more than
787 two branches. If the condition is false, skipping the first branch takes us
788 past the end if there is only one branch, but that's OK because that is
789 exactly what going to the ket would do. As there is only one branch to be
790 obeyed, we can use tail recursion to avoid using another stack frame. */
791
792 case OP_COND:
793 case OP_SCOND:
794 codelink= GET(ecode, 1);
795
796 /* Because of the way auto-callout works during compile, a callout item is
797 inserted between OP_COND and an assertion condition. */
798
799 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
800 {
801 if (pcre_callout != NULL)
802 {
803 pcre_callout_block cb;
804 cb.version = 1; /* Version 1 of the callout block */
805 cb.callout_number = ecode[LINK_SIZE+2];
806 cb.offset_vector = md->offset_vector;
807 cb.subject = (PCRE_SPTR)md->start_subject;
808 cb.subject_length = md->end_subject - md->start_subject;
809 cb.start_match = mstart - md->start_subject;
810 cb.current_position = eptr - md->start_subject;
811 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
812 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
813 cb.capture_top = offset_top/2;
814 cb.capture_last = md->capture_last;
815 cb.callout_data = md->callout_data;
816 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
817 if (rrc < 0) RRETURN(rrc);
818 }
819 ecode += _pcre_OP_lengths[OP_CALLOUT];
820 }
821
822 condcode = ecode[LINK_SIZE+1];
823
824 /* Now see what the actual condition is */
825
826 if (condcode == OP_RREF) /* Recursion test */
827 {
828 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
829 condition = md->recursive != NULL &&
830 (offset == RREF_ANY || offset == md->recursive->group_num);
831 ecode += condition? 3 : GET(ecode, 1);
832 }
833
834 else if (condcode == OP_CREF) /* Group used test */
835 {
836 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
837 condition = offset < offset_top && md->offset_vector[offset] >= 0;
838 ecode += condition? 3 : GET(ecode, 1);
839 }
840
841 else if (condcode == OP_DEF) /* DEFINE - always false */
842 {
843 condition = FALSE;
844 ecode += GET(ecode, 1);
845 }
846
847 /* The condition is an assertion. Call match() to evaluate it - setting
848 the final argument match_condassert causes it to stop at the end of an
849 assertion. */
850
851 else
852 {
853 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
854 match_condassert, RM3);
855 if (rrc == MATCH_MATCH)
856 {
857 condition = TRUE;
858 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
859 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
860 }
861 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
862 {
863 RRETURN(rrc); /* Need braces because of following else */
864 }
865 else
866 {
867 condition = FALSE;
868 ecode += codelink;
869 }
870 }
871
872 /* We are now at the branch that is to be obeyed. As there is only one,
873 we can use tail recursion to avoid using another stack frame, except when
874 match_cbegroup is required for an unlimited repeat of a possibly empty
875 group. If the second alternative doesn't exist, we can just plough on. */
876
877 if (condition || *ecode == OP_ALT)
878 {
879 ecode += 1 + LINK_SIZE;
880 if (op == OP_SCOND) /* Possibly empty group */
881 {
882 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
883 RRETURN(rrc);
884 }
885 else /* Group must match something */
886 {
887 flags = 0;
888 goto TAIL_RECURSE;
889 }
890 }
891 else /* Condition false & no alternative */
892 {
893 ecode += 1 + LINK_SIZE;
894 }
895 break;
896
897
898 /* End of the pattern, either real or forced. If we are in a top-level
899 recursion, we should restore the offsets appropriately and continue from
900 after the call. */
901
902 case OP_ACCEPT:
903 case OP_END:
904 if (md->recursive != NULL && md->recursive->group_num == 0)
905 {
906 recursion_info *rec = md->recursive;
907 DPRINTF(("End of pattern in a (?0) recursion\n"));
908 md->recursive = rec->prevrec;
909 memmove(md->offset_vector, rec->offset_save,
910 rec->saved_max * sizeof(int));
911 mstart = rec->save_start;
912 ims = original_ims;
913 ecode = rec->after_call;
914 break;
915 }
916
917 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
918 string - backtracking will then try other alternatives, if any. */
919
920 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
921 md->end_match_ptr = eptr; /* Record where we ended */
922 md->end_offset_top = offset_top; /* and how many extracts were taken */
923 md->start_match_ptr = mstart; /* and the start (\K can modify) */
924 RRETURN(MATCH_MATCH);
925
926 /* Change option settings */
927
928 case OP_OPT:
929 ims = ecode[1];
930 ecode += 2;
931 DPRINTF(("ims set to %02lx\n", ims));
932 break;
933
934 /* Assertion brackets. Check the alternative branches in turn - the
935 matching won't pass the KET for an assertion. If any one branch matches,
936 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
937 start of each branch to move the current point backwards, so the code at
938 this level is identical to the lookahead case. */
939
940 case OP_ASSERT:
941 case OP_ASSERTBACK:
942 do
943 {
944 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
945 RM4);
946 if (rrc == MATCH_MATCH) break;
947 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
948 ecode += GET(ecode, 1);
949 }
950 while (*ecode == OP_ALT);
951 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
952
953 /* If checking an assertion for a condition, return MATCH_MATCH. */
954
955 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
956
957 /* Continue from after the assertion, updating the offsets high water
958 mark, since extracts may have been taken during the assertion. */
959
960 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
961 ecode += 1 + LINK_SIZE;
962 offset_top = md->end_offset_top;
963 continue;
964
965 /* Negative assertion: all branches must fail to match */
966
967 case OP_ASSERT_NOT:
968 case OP_ASSERTBACK_NOT:
969 do
970 {
971 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
972 RM5);
973 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
974 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
975 ecode += GET(ecode,1);
976 }
977 while (*ecode == OP_ALT);
978
979 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
980
981 ecode += 1 + LINK_SIZE;
982 continue;
983
984 /* Move the subject pointer back. This occurs only at the start of
985 each branch of a lookbehind assertion. If we are too close to the start to
986 move back, this match function fails. When working with UTF-8 we move
987 back a number of characters, not bytes. */
988
989 case OP_REVERSE:
990 #ifdef SUPPORT_UTF8
991 if (utf8)
992 {
993 i = GET(ecode, 1);
994 while (i-- > 0)
995 {
996 eptr--;
997 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
998 BACKCHAR(eptr);
999 }
1000 }
1001 else
1002 #endif
1003
1004 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1005
1006 {
1007 eptr -= GET(ecode, 1);
1008 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1009 }
1010
1011 /* Skip to next op code */
1012
1013 ecode += 1 + LINK_SIZE;
1014 break;
1015
1016 /* The callout item calls an external function, if one is provided, passing
1017 details of the match so far. This is mainly for debugging, though the
1018 function is able to force a failure. */
1019
1020 case OP_CALLOUT:
1021 if (pcre_callout != NULL)
1022 {
1023 pcre_callout_block cb;
1024 cb.version = 1; /* Version 1 of the callout block */
1025 cb.callout_number = ecode[1];
1026 cb.offset_vector = md->offset_vector;
1027 cb.subject = (PCRE_SPTR)md->start_subject;
1028 cb.subject_length = md->end_subject - md->start_subject;
1029 cb.start_match = mstart - md->start_subject;
1030 cb.current_position = eptr - md->start_subject;
1031 cb.pattern_position = GET(ecode, 2);
1032 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1033 cb.capture_top = offset_top/2;
1034 cb.capture_last = md->capture_last;
1035 cb.callout_data = md->callout_data;
1036 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1037 if (rrc < 0) RRETURN(rrc);
1038 }
1039 ecode += 2 + 2*LINK_SIZE;
1040 break;
1041
1042 /* Recursion either matches the current regex, or some subexpression. The
1043 offset data is the offset to the starting bracket from the start of the
1044 whole pattern. (This is so that it works from duplicated subpatterns.)
1045
1046 If there are any capturing brackets started but not finished, we have to
1047 save their starting points and reinstate them after the recursion. However,
1048 we don't know how many such there are (offset_top records the completed
1049 total) so we just have to save all the potential data. There may be up to
1050 65535 such values, which is too large to put on the stack, but using malloc
1051 for small numbers seems expensive. As a compromise, the stack is used when
1052 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1053 is used. If the malloc fails, no attempt is made to carry on with partial
1054 saved data: the code below returns PCRE_ERROR_NOMEMORY straight away, before
1055 anything has been copied.
1056
1057 There are also other values that have to be saved. We use a chained
1058 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1059 for the original version of this logic. */
1060
1061 case OP_RECURSE:
1062 {
1063 callpat = md->start_code + GET(ecode, 1);
1064 new_recursive.group_num = (callpat == md->start_code)? 0 :
1065 GET2(callpat, 1 + LINK_SIZE);
1066
1067 /* Add to "recursing stack" */
1068
1069 new_recursive.prevrec = md->recursive;
1070 md->recursive = &new_recursive;
1071
1072 /* Find where to continue from afterwards */
1073
1074 ecode += 1 + LINK_SIZE;
1075 new_recursive.after_call = ecode;
1076
1077 /* Now save the offset data. */
1078
1079 new_recursive.saved_max = md->offset_end;
1080 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1081 new_recursive.offset_save = stacksave;
1082 else
1083 {
1084 new_recursive.offset_save =
1085 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1086 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1087 }
1088
1089 memcpy(new_recursive.offset_save, md->offset_vector,
1090 new_recursive.saved_max * sizeof(int));
1091 new_recursive.save_start = mstart;
1092 mstart = eptr;
1093
1094 /* OK, now we can do the recursion. For each top-level alternative we
1095 restore the offset and recursion data. */
1096
1097 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1098 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1099 do
1100 {
1101 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1102 md, ims, eptrb, flags, RM6);
1103 if (rrc == MATCH_MATCH)
1104 {
1105 DPRINTF(("Recursion matched\n"));
1106 md->recursive = new_recursive.prevrec;
1107 if (new_recursive.offset_save != stacksave)
1108 (pcre_free)(new_recursive.offset_save);
1109 RRETURN(MATCH_MATCH);
1110 }
1111 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1112 {
1113 DPRINTF(("Recursion gave error %d\n", rrc));
1114 if (new_recursive.offset_save != stacksave)
1115 (pcre_free)(new_recursive.offset_save);
1116 RRETURN(rrc);
1117 }
1118
1119 md->recursive = &new_recursive;
1120 memcpy(md->offset_vector, new_recursive.offset_save,
1121 new_recursive.saved_max * sizeof(int));
1122 callpat += GET(callpat, 1);
1123 }
1124 while (*callpat == OP_ALT);
1125
1126 DPRINTF(("Recursion didn't match\n"));
1127 md->recursive = new_recursive.prevrec;
1128 if (new_recursive.offset_save != stacksave)
1129 (pcre_free)(new_recursive.offset_save);
1130 RRETURN(MATCH_NOMATCH);
1131 }
1132 /* Control never reaches here */
1133
1134 /* "Once" brackets are like assertion brackets except that after a match,
1135 the point in the subject string is not moved back. Thus there can never be
1136 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1137 Check the alternative branches in turn - the matching won't pass the KET
1138 for this kind of subpattern. If any one branch matches, we carry on as at
1139 the end of a normal bracket, leaving the subject pointer. */
1140
1141 case OP_ONCE:
1142 prev = ecode;
1143 saved_eptr = eptr;
1144
1145 do
1146 {
1147 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1148 if (rrc == MATCH_MATCH) break;
1149 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1150 ecode += GET(ecode,1);
1151 }
1152 while (*ecode == OP_ALT);
1153
1154 /* If we hit the end of the group (which could be repeated), fail */
1155
1156 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1157
1158 /* Continue as from after the assertion, updating the offsets high water
1159 mark, since extracts may have been taken. */
1160
1161 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1162
1163 offset_top = md->end_offset_top;
1164 eptr = md->end_match_ptr;
1165
1166 /* For a non-repeating ket, just continue at this level. This also
1167 happens for a repeating ket if no characters were matched in the group.
1168 This is the forcible breaking of infinite loops as implemented in Perl
1169 5.005. If there is an options reset, it will get obeyed in the normal
1170 course of events. */
1171
1172 if (*ecode == OP_KET || eptr == saved_eptr)
1173 {
1174 ecode += 1+LINK_SIZE;
1175 break;
1176 }
1177
1178 /* The repeating kets try the rest of the pattern or restart from the
1179 preceding bracket, in the appropriate order. The second "call" of match()
1180 uses tail recursion, to avoid using another stack frame. Options changed in
1181 the bracket must be reset before re-running it, so check the next opcode.
1182 NOTE(review): ecode[4] is the OP_OPT data byte only if LINK_SIZE is 2. */
1183
1184 if (ecode[1+LINK_SIZE] == OP_OPT)
1185 {
1186 ims = (ims & ~PCRE_IMS) | ecode[4];
1187 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1188 }
1189
1190 if (*ecode == OP_KETRMIN)
1191 {
1192 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1193 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1194 ecode = prev;
1195 flags = 0;
1196 goto TAIL_RECURSE;
1197 }
1198 else /* OP_KETRMAX */
1199 {
1200 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1202 ecode += 1 + LINK_SIZE;
1203 flags = 0;
1204 goto TAIL_RECURSE;
1205 }
1206 /* Control never gets here */
1207
1208 /* An alternation is the end of a branch; scan along to find the end of the
1209 bracketed group and go to there. */
1210
1211 case OP_ALT:
1212 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1213 break;
1214
1215 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1216 indicating that it may occur zero times. It may repeat infinitely, or not
1217 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1218 with fixed upper repeat limits are compiled as a number of copies, with the
1219 optional ones preceded by BRAZERO or BRAMINZERO. */
1220
1221 case OP_BRAZERO:
1222 {
1223 next = ecode+1;
1224 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1226 do next += GET(next,1); while (*next == OP_ALT);
1227 ecode = next + 1 + LINK_SIZE;
1228 }
1229 break;
1230
1231 case OP_BRAMINZERO:
1232 {
1233 next = ecode+1;
1234 do next += GET(next, 1); while (*next == OP_ALT);
1235 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1236 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1237 ecode++;
1238 }
1239 break;
1240
1241 case OP_SKIPZERO:
1242 {
1243 next = ecode+1;
1244 do next += GET(next,1); while (*next == OP_ALT);
1245 ecode = next + 1 + LINK_SIZE;
1246 }
1247 break;
1248
1249 /* End of a group, repeated or non-repeating. */
1250
1251 case OP_KET:
1252 case OP_KETRMIN:
1253 case OP_KETRMAX:
1254 prev = ecode - GET(ecode, 1);
1255
1256 /* If this was a group that remembered the subject start, in order to break
1257 infinite repeats of empty string matches, retrieve the subject start from
1258 the chain. Otherwise, set it NULL. */
1259
1260 if (*prev >= OP_SBRA)
1261 {
1262 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1263 eptrb = eptrb->epb_prev; /* Backup to previous group */
1264 }
1265 else saved_eptr = NULL;
1266
1267 /* If we are at the end of an assertion group, stop matching and return
1268 MATCH_MATCH, but record the current high water mark for use by positive
1269 assertions. Do this also for the "once" (atomic) groups. */
1270
1271 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1272 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1273 *prev == OP_ONCE)
1274 {
1275 md->end_match_ptr = eptr; /* For ONCE */
1276 md->end_offset_top = offset_top;
1277 RRETURN(MATCH_MATCH);
1278 }
1279
1280 /* For capturing groups we have to check the group number back at the start
1281 and if necessary complete handling an extraction by setting the offsets and
1282 bumping the high water mark. Note that whole-pattern recursion is coded as
1283 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1284 when the OP_END is reached. Other recursion is handled here. */
1285
1286 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1287 {
1288 number = GET2(prev, 1+LINK_SIZE);
1289 offset = number << 1;
1290
1291 #ifdef DEBUG
1292 printf("end bracket %d", number);
1293 printf("\n");
1294 #endif
1295
1296 md->capture_last = number;
1297 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1298 {
1299 md->offset_vector[offset] =
1300 md->offset_vector[md->offset_end - number];
1301 md->offset_vector[offset+1] = eptr - md->start_subject;
1302 if (offset_top <= offset) offset_top = offset + 2;
1303 }
1304
1305 /* Handle a recursively called group. Restore the offsets
1306 appropriately and continue from after the call. */
1307
1308 if (md->recursive != NULL && md->recursive->group_num == number)
1309 {
1310 recursion_info *rec = md->recursive;
1311 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1312 md->recursive = rec->prevrec;
1313 mstart = rec->save_start;
1314 memcpy(md->offset_vector, rec->offset_save,
1315 rec->saved_max * sizeof(int));
1316 ecode = rec->after_call;
1317 ims = original_ims;
1318 break;
1319 }
1320 }
1321
1322 /* For both capturing and non-capturing groups, reset the value of the ims
1323 flags, in case they got changed during the group. */
1324
1325 ims = original_ims;
1326 DPRINTF(("ims reset to %02lx\n", ims));
1327
1328 /* For a non-repeating ket, just continue at this level. This also
1329 happens for a repeating ket if no characters were matched in the group.
1330 This is the forcible breaking of infinite loops as implemented in Perl
1331 5.005. If there is an options reset, it will get obeyed in the normal
1332 course of events. */
1333
1334 if (*ecode == OP_KET || eptr == saved_eptr)
1335 {
1336 ecode += 1 + LINK_SIZE;
1337 break;
1338 }
1339
1340 /* The repeating kets try the rest of the pattern or restart from the
1341 preceding bracket, in the appropriate order. In the second case, we can use
1342 tail recursion to avoid using another stack frame, unless we have an
1343 unlimited repeat of a group that can match an empty string. */
1344
1345 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1346
1347 if (*ecode == OP_KETRMIN)
1348 {
1349 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1350 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1351 if (flags != 0) /* Could match an empty string */
1352 {
1353 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1354 RRETURN(rrc);
1355 }
1356 ecode = prev;
1357 goto TAIL_RECURSE;
1358 }
1359 else /* OP_KETRMAX */
1360 {
1361 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1362 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1363 ecode += 1 + LINK_SIZE;
1364 flags = 0;
1365 goto TAIL_RECURSE;
1366 }
1367 /* Control never gets here */
1368
1369 /* Start of subject unless notbol, or after internal newline if multiline */
1370
1371 case OP_CIRC:
1372 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1373 if ((ims & PCRE_MULTILINE) != 0)
1374 {
1375 if (eptr != md->start_subject &&
1376 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1377 RRETURN(MATCH_NOMATCH);
1378 ecode++;
1379 break;
1380 }
1381 /* ... else fall through */
1382
1383 /* Start of subject assertion */
1384
1385 case OP_SOD:
1386 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1387 ecode++;
1388 break;
1389
1390 /* Start of match assertion */
1391
1392 case OP_SOM:
1393 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1394 ecode++;
1395 break;
1396
1397 /* Reset the start of match point */
1398
1399 case OP_SET_SOM:
1400 mstart = eptr;
1401 ecode++;
1402 break;
1403
1404 /* Assert before internal newline if multiline, or before a terminating
1405 newline unless endonly is set, else end of subject unless noteol is set. */
1406
1407 case OP_DOLL:
1408 if ((ims & PCRE_MULTILINE) != 0)
1409 {
1410 if (eptr < md->end_subject)
1411 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1412 else
1413 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1414 ecode++;
1415 break;
1416 }
1417 else
1418 {
1419 if (md->noteol) RRETURN(MATCH_NOMATCH);
1420 if (!md->endonly)
1421 {
1422 if (eptr != md->end_subject &&
1423 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1424 RRETURN(MATCH_NOMATCH);
1425 ecode++;
1426 break;
1427 }
1428 }
1429 /* ... else fall through for endonly */
1430
1431 /* End of subject assertion (\z) */
1432
1433 case OP_EOD:
1434 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1435 ecode++;
1436 break;
1437
1438 /* End of subject or ending \n assertion (\Z) */
1439
1440 case OP_EODN:
1441 if (eptr != md->end_subject &&
1442 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1443 RRETURN(MATCH_NOMATCH);
1444 ecode++;
1445 break;
1446
1447 /* Word boundary assertions */
1448
1449 case OP_NOT_WORD_BOUNDARY:
1450 case OP_WORD_BOUNDARY:
1451 {
1452
1453 /* Find out if the previous and current characters are "word" characters.
1454 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1455 be "non-word" characters. */
1456
1457 #ifdef SUPPORT_UTF8
1458 if (utf8)
1459 {
1460 if (eptr == md->start_subject) prev_is_word = FALSE; else
1461 {
1462 const uschar *lastptr = eptr - 1;
1463 while((*lastptr & 0xc0) == 0x80) lastptr--;
1464 GETCHAR(c, lastptr);
1465 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1466 }
1467 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1468 {
1469 GETCHAR(c, eptr);
1470 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1471 }
1472 }
1473 else
1474 #endif
1475
1476 /* More streamlined when not in UTF-8 mode */
1477
1478 {
1479 prev_is_word = (eptr != md->start_subject) &&
1480 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1481 cur_is_word = (eptr < md->end_subject) &&
1482 ((md->ctypes[*eptr] & ctype_word) != 0);
1483 }
1484
1485 /* Now see if the situation is what we want */
1486
1487 if ((*ecode++ == OP_WORD_BOUNDARY)?
1488 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1489 RRETURN(MATCH_NOMATCH);
1490 }
1491 break;
1492
1493 /* Match a single character type; inline for speed */
1494
1495 case OP_ANY:
1496 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1497 /* Fall through */
1498
1499 case OP_ALLANY:
1500 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1501 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1502 ecode++;
1503 break;
1504
1505 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1506 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1507
1508 case OP_ANYBYTE:
1509 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1510 ecode++;
1511 break;
1512
1513 case OP_NOT_DIGIT:
1514 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1515 GETCHARINCTEST(c, eptr);
1516 if (
1517 #ifdef SUPPORT_UTF8
1518 c < 256 &&
1519 #endif
1520 (md->ctypes[c] & ctype_digit) != 0
1521 )
1522 RRETURN(MATCH_NOMATCH);
1523 ecode++;
1524 break;
1525
1526 case OP_DIGIT:
1527 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1528 GETCHARINCTEST(c, eptr);
1529 if (
1530 #ifdef SUPPORT_UTF8
1531 c >= 256 ||
1532 #endif
1533 (md->ctypes[c] & ctype_digit) == 0
1534 )
1535 RRETURN(MATCH_NOMATCH);
1536 ecode++;
1537 break;
1538
1539 case OP_NOT_WHITESPACE:
1540 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1541 GETCHARINCTEST(c, eptr);
1542 if (
1543 #ifdef SUPPORT_UTF8
1544 c < 256 &&
1545 #endif
1546 (md->ctypes[c] & ctype_space) != 0
1547 )
1548 RRETURN(MATCH_NOMATCH);
1549 ecode++;
1550 break;
1551
1552 case OP_WHITESPACE:
1553 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1554 GETCHARINCTEST(c, eptr);
1555 if (
1556 #ifdef SUPPORT_UTF8
1557 c >= 256 ||
1558 #endif
1559 (md->ctypes[c] & ctype_space) == 0
1560 )
1561 RRETURN(MATCH_NOMATCH);
1562 ecode++;
1563 break;
1564
1565 case OP_NOT_WORDCHAR:
1566 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1567 GETCHARINCTEST(c, eptr);
1568 if (
1569 #ifdef SUPPORT_UTF8
1570 c < 256 &&
1571 #endif
1572 (md->ctypes[c] & ctype_word) != 0
1573 )
1574 RRETURN(MATCH_NOMATCH);
1575 ecode++;
1576 break;
1577
1578 case OP_WORDCHAR:
1579 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1580 GETCHARINCTEST(c, eptr);
1581 if (
1582 #ifdef SUPPORT_UTF8
1583 c >= 256 ||
1584 #endif
1585 (md->ctypes[c] & ctype_word) == 0
1586 )
1587 RRETURN(MATCH_NOMATCH);
1588 ecode++;
1589 break;
1590
1591 case OP_ANYNL:
1592 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1593 GETCHARINCTEST(c, eptr);
1594 switch(c)
1595 {
1596 default: RRETURN(MATCH_NOMATCH);
1597 case 0x000d:
1598 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1599 break;
1600
1601 case 0x000a:
1602 break;
1603
1604 case 0x000b:
1605 case 0x000c:
1606 case 0x0085:
1607 case 0x2028:
1608 case 0x2029:
1609 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1610 break;
1611 }
1612 ecode++;
1613 break;
1614
1615 case OP_NOT_HSPACE:
1616 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1617 GETCHARINCTEST(c, eptr);
1618 switch(c)
1619 {
1620 default: break;
1621 case 0x09: /* HT */
1622 case 0x20: /* SPACE */
1623 case 0xa0: /* NBSP */
1624 case 0x1680: /* OGHAM SPACE MARK */
1625 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1626 case 0x2000: /* EN QUAD */
1627 case 0x2001: /* EM QUAD */
1628 case 0x2002: /* EN SPACE */
1629 case 0x2003: /* EM SPACE */
1630 case 0x2004: /* THREE-PER-EM SPACE */
1631 case 0x2005: /* FOUR-PER-EM SPACE */
1632 case 0x2006: /* SIX-PER-EM SPACE */
1633 case 0x2007: /* FIGURE SPACE */
1634 case 0x2008: /* PUNCTUATION SPACE */
1635 case 0x2009: /* THIN SPACE */
1636 case 0x200A: /* HAIR SPACE */
1637 case 0x202f: /* NARROW NO-BREAK SPACE */
1638 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1639 case 0x3000: /* IDEOGRAPHIC SPACE */
1640 RRETURN(MATCH_NOMATCH);
1641 }
1642 ecode++;
1643 break;
1644
1645 case OP_HSPACE:
1646 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1647 GETCHARINCTEST(c, eptr);
1648 switch(c)
1649 {
1650 default: RRETURN(MATCH_NOMATCH);
1651 case 0x09: /* HT */
1652 case 0x20: /* SPACE */
1653 case 0xa0: /* NBSP */
1654 case 0x1680: /* OGHAM SPACE MARK */
1655 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1656 case 0x2000: /* EN QUAD */
1657 case 0x2001: /* EM QUAD */
1658 case 0x2002: /* EN SPACE */
1659 case 0x2003: /* EM SPACE */
1660 case 0x2004: /* THREE-PER-EM SPACE */
1661 case 0x2005: /* FOUR-PER-EM SPACE */
1662 case 0x2006: /* SIX-PER-EM SPACE */
1663 case 0x2007: /* FIGURE SPACE */
1664 case 0x2008: /* PUNCTUATION SPACE */
1665 case 0x2009: /* THIN SPACE */
1666 case 0x200A: /* HAIR SPACE */
1667 case 0x202f: /* NARROW NO-BREAK SPACE */
1668 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1669 case 0x3000: /* IDEOGRAPHIC SPACE */
1670 break;
1671 }
1672 ecode++;
1673 break;
1674
1675 case OP_NOT_VSPACE:
1676 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1677 GETCHARINCTEST(c, eptr);
1678 switch(c)
1679 {
1680 default: break;
1681 case 0x0a: /* LF */
1682 case 0x0b: /* VT */
1683 case 0x0c: /* FF */
1684 case 0x0d: /* CR */
1685 case 0x85: /* NEL */
1686 case 0x2028: /* LINE SEPARATOR */
1687 case 0x2029: /* PARAGRAPH SEPARATOR */
1688 RRETURN(MATCH_NOMATCH);
1689 }
1690 ecode++;
1691 break;
1692
1693 case OP_VSPACE:
1694 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1695 GETCHARINCTEST(c, eptr);
1696 switch(c)
1697 {
1698 default: RRETURN(MATCH_NOMATCH);
1699 case 0x0a: /* LF */
1700 case 0x0b: /* VT */
1701 case 0x0c: /* FF */
1702 case 0x0d: /* CR */
1703 case 0x85: /* NEL */
1704 case 0x2028: /* LINE SEPARATOR */
1705 case 0x2029: /* PARAGRAPH SEPARATOR */
1706 break;
1707 }
1708 ecode++;
1709 break;
1710
1711 #ifdef SUPPORT_UCP
1712 /* Check the next character by Unicode property. We will get here only
1713 if the support is in the binary; otherwise a compile-time error occurs. */
1714
1715 case OP_PROP:
1716 case OP_NOTPROP:
1717 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1718 GETCHARINCTEST(c, eptr);
1719 {
1720 const ucd_record *prop = GET_UCD(c);
1721
1722 switch(ecode[1])
1723 {
1724 case PT_ANY:
1725 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1726 break;
1727
1728 case PT_LAMP:
1729 if ((prop->chartype == ucp_Lu ||
1730 prop->chartype == ucp_Ll ||
1731 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1732 RRETURN(MATCH_NOMATCH);
1733 break;
1734
1735 case PT_GC:
1736 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1737 RRETURN(MATCH_NOMATCH);
1738 break;
1739
1740 case PT_PC:
1741 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1742 RRETURN(MATCH_NOMATCH);
1743 break;
1744
1745 case PT_SC:
1746 if ((ecode[2] != prop->script) == (op == OP_PROP))
1747 RRETURN(MATCH_NOMATCH);
1748 break;
1749
1750 default:
1751 RRETURN(PCRE_ERROR_INTERNAL);
1752 }
1753
1754 ecode += 3;
1755 }
1756 break;
1757
1758 /* Match an extended Unicode sequence. We will get here only if the support
1759 is in the binary; otherwise a compile-time error occurs. */
1760
1761 case OP_EXTUNI:
1762 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1763 GETCHARINCTEST(c, eptr);
1764 {
1765 int category = UCD_CATEGORY(c);
1766 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1767 while (eptr < md->end_subject)
1768 {
1769 int len = 1;
1770 if (!utf8) c = *eptr; else
1771 {
1772 GETCHARLEN(c, eptr, len);
1773 }
1774 category = UCD_CATEGORY(c);
1775 if (category != ucp_M) break;
1776 eptr += len;
1777 }
1778 }
1779 ecode++;
1780 break;
1781 #endif
1782
1783
1784 /* Match a back reference, possibly repeatedly. Look past the end of the
1785 item to see if there is repeat information following. The code is similar
1786 to that for character classes, but repeated for efficiency. Then obey
1787 similar code to character type repeats - written out again for speed.
1788 However, if the referenced string is the empty string, always treat
1789 it as matched, any number of times (otherwise there could be infinite
1790 loops). */
1791
1792 case OP_REF:
1793 {
1794 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1795 ecode += 3;
1796
1797 /* If the reference is unset, there are two possibilities:
1798
1799 (a) In the default, Perl-compatible state, set the length to be longer
1800 than the amount of subject left; this ensures that every attempt at a
1801 match fails. We can't just fail here, because of the possibility of
1802 quantifiers with zero minima.
1803
1804 (b) If the JavaScript compatibility flag is set, set the length to zero
1805 so that the back reference matches an empty string.
1806
1807 Otherwise, set the length to the length of what was matched by the
1808 referenced subpattern. */
1809
1810 if (offset >= offset_top || md->offset_vector[offset] < 0)
1811 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1812 else
1813 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1814
1815 /* Set up for repetition, or handle the non-repeated case */
1816
1817 switch (*ecode)
1818 {
1819 case OP_CRSTAR:
1820 case OP_CRMINSTAR:
1821 case OP_CRPLUS:
1822 case OP_CRMINPLUS:
1823 case OP_CRQUERY:
1824 case OP_CRMINQUERY:
1825 c = *ecode++ - OP_CRSTAR;
1826 minimize = (c & 1) != 0;
1827 min = rep_min[c]; /* Pick up values from tables; */
1828 max = rep_max[c]; /* zero for max => infinity */
1829 if (max == 0) max = INT_MAX;
1830 break;
1831
1832 case OP_CRRANGE:
1833 case OP_CRMINRANGE:
1834 minimize = (*ecode == OP_CRMINRANGE);
1835 min = GET2(ecode, 1);
1836 max = GET2(ecode, 3);
1837 if (max == 0) max = INT_MAX;
1838 ecode += 5;
1839 break;
1840
1841 default: /* No repeat follows */
1842 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1843 eptr += length;
1844 continue; /* With the main loop */
1845 }
1846
1847 /* If the length of the reference is zero, just continue with the
1848 main loop. */
1849
1850 if (length == 0) continue;
1851
1852 /* First, ensure the minimum number of matches are present. We get back
1853 the length of the reference string explicitly rather than passing the
1854 address of eptr, so that eptr can be a register variable. */
1855
1856 for (i = 1; i <= min; i++)
1857 {
1858 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1859 eptr += length;
1860 }
1861
1862 /* If min = max, continue at the same level without recursion.
1863 They are not both allowed to be zero. */
1864
1865 if (min == max) continue;
1866
1867 /* If minimizing, keep trying and advancing the pointer */
1868
1869 if (minimize)
1870 {
1871 for (fi = min;; fi++)
1872 {
1873 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1875 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1876 RRETURN(MATCH_NOMATCH);
1877 eptr += length;
1878 }
1879 /* Control never gets here */
1880 }
1881
1882 /* If maximizing, find the longest string and work backwards */
1883
1884 else
1885 {
1886 pp = eptr;
1887 for (i = min; i < max; i++)
1888 {
1889 if (!match_ref(offset, eptr, length, md, ims)) break;
1890 eptr += length;
1891 }
1892 while (eptr >= pp)
1893 {
1894 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1896 eptr -= length;
1897 }
1898 RRETURN(MATCH_NOMATCH);
1899 }
1900 }
1901 /* Control never gets here */
1902
1903
1904
1905 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1906 used when all the characters in the class have values in the range 0-255,
1907 and either the matching is caseful, or the characters are in the range
1908 0-127 when UTF-8 processing is enabled. The only difference between
1909 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1910 encountered.
1911
1912 First, look past the end of the item to see if there is repeat information
1913 following. Then obey similar code to character type repeats - written out
1914 again for speed. */
1915
1916 case OP_NCLASS:
1917 case OP_CLASS:
1918 {
1919 data = ecode + 1; /* Save for matching */
1920 ecode += 33; /* Advance past the item */
1921
1922 switch (*ecode)
1923 {
1924 case OP_CRSTAR:
1925 case OP_CRMINSTAR:
1926 case OP_CRPLUS:
1927 case OP_CRMINPLUS:
1928 case OP_CRQUERY:
1929 case OP_CRMINQUERY:
1930 c = *ecode++ - OP_CRSTAR;
1931 minimize = (c & 1) != 0;
1932 min = rep_min[c]; /* Pick up values from tables; */
1933 max = rep_max[c]; /* zero for max => infinity */
1934 if (max == 0) max = INT_MAX;
1935 break;
1936
1937 case OP_CRRANGE:
1938 case OP_CRMINRANGE:
1939 minimize = (*ecode == OP_CRMINRANGE);
1940 min = GET2(ecode, 1);
1941 max = GET2(ecode, 3);
1942 if (max == 0) max = INT_MAX;
1943 ecode += 5;
1944 break;
1945
1946 default: /* No repeat follows */
1947 min = max = 1;
1948 break;
1949 }
1950
1951 /* First, ensure the minimum number of matches are present. */
1952
1953 #ifdef SUPPORT_UTF8
1954 /* UTF-8 mode */
1955 if (utf8)
1956 {
1957 for (i = 1; i <= min; i++)
1958 {
1959 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1960 GETCHARINC(c, eptr);
1961 if (c > 255)
1962 {
1963 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1964 }
1965 else
1966 {
1967 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1968 }
1969 }
1970 }
1971 else
1972 #endif
1973 /* Not UTF-8 mode */
1974 {
1975 for (i = 1; i <= min; i++)
1976 {
1977 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1978 c = *eptr++;
1979 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1980 }
1981 }
1982
1983 /* If max == min we can continue with the main loop without the
1984 need to recurse. */
1985
1986 if (min == max) continue;
1987
1988 /* If minimizing, keep testing the rest of the expression and advancing
1989 the pointer while it matches the class. */
1990
1991 if (minimize)
1992 {
1993 #ifdef SUPPORT_UTF8
1994 /* UTF-8 mode */
1995 if (utf8)
1996 {
1997 for (fi = min;; fi++)
1998 {
1999 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2001 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2002 GETCHARINC(c, eptr);
2003 if (c > 255)
2004 {
2005 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2006 }
2007 else
2008 {
2009 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2010 }
2011 }
2012 }
2013 else
2014 #endif
2015 /* Not UTF-8 mode */
2016 {
2017 for (fi = min;; fi++)
2018 {
2019 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2020 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2021 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2022 c = *eptr++;
2023 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2024 }
2025 }
2026 /* Control never gets here */
2027 }
2028
2029 /* If maximizing, find the longest possible run, then work backwards. */
2030
2031 else
2032 {
2033 pp = eptr;
2034
2035 #ifdef SUPPORT_UTF8
2036 /* UTF-8 mode */
2037 if (utf8)
2038 {
2039 for (i = min; i < max; i++)
2040 {
2041 int len = 1;
2042 if (eptr >= md->end_subject) break;
2043 GETCHARLEN(c, eptr, len);
2044 if (c > 255)
2045 {
2046 if (op == OP_CLASS) break;
2047 }
2048 else
2049 {
2050 if ((data[c/8] & (1 << (c&7))) == 0) break;
2051 }
2052 eptr += len;
2053 }
2054 for (;;)
2055 {
2056 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2058 if (eptr-- == pp) break; /* Stop if tried at original pos */
2059 BACKCHAR(eptr);
2060 }
2061 }
2062 else
2063 #endif
2064 /* Not UTF-8 mode */
2065 {
2066 for (i = min; i < max; i++)
2067 {
2068 if (eptr >= md->end_subject) break;
2069 c = *eptr;
2070 if ((data[c/8] & (1 << (c&7))) == 0) break;
2071 eptr++;
2072 }
2073 while (eptr >= pp)
2074 {
2075 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2076 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2077 eptr--;
2078 }
2079 }
2080
2081 RRETURN(MATCH_NOMATCH);
2082 }
2083 }
2084 /* Control never gets here */
2085
2086
2087 /* Match an extended character class. This opcode is encountered only
2088 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2089 mode, because Unicode properties are supported in non-UTF-8 mode. */
2090
2091 #ifdef SUPPORT_UTF8
2092 case OP_XCLASS:
2093 {
2094 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2095 ecode += GET(ecode, 1); /* Advance past the item */
2096
2097 switch (*ecode)
2098 {
2099 case OP_CRSTAR:
2100 case OP_CRMINSTAR:
2101 case OP_CRPLUS:
2102 case OP_CRMINPLUS:
2103 case OP_CRQUERY:
2104 case OP_CRMINQUERY:
2105 c = *ecode++ - OP_CRSTAR;
2106 minimize = (c & 1) != 0;
2107 min = rep_min[c]; /* Pick up values from tables; */
2108 max = rep_max[c]; /* zero for max => infinity */
2109 if (max == 0) max = INT_MAX;
2110 break;
2111
2112 case OP_CRRANGE:
2113 case OP_CRMINRANGE:
2114 minimize = (*ecode == OP_CRMINRANGE);
2115 min = GET2(ecode, 1);
2116 max = GET2(ecode, 3);
2117 if (max == 0) max = INT_MAX;
2118 ecode += 5;
2119 break;
2120
2121 default: /* No repeat follows */
2122 min = max = 1;
2123 break;
2124 }
2125
2126 /* First, ensure the minimum number of matches are present. */
2127
2128 for (i = 1; i <= min; i++)
2129 {
2130 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2131 GETCHARINCTEST(c, eptr);
2132 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2133 }
2134
2135 /* If max == min we can continue with the main loop without the
2136 need to recurse. */
2137
2138 if (min == max) continue;
2139
2140 /* If minimizing, keep testing the rest of the expression and advancing
2141 the pointer while it matches the class. */
2142
2143 if (minimize)
2144 {
2145 for (fi = min;; fi++)
2146 {
2147 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2148 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2149 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2150 GETCHARINCTEST(c, eptr);
2151 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2152 }
2153 /* Control never gets here */
2154 }
2155
2156 /* If maximizing, find the longest possible run, then work backwards. */
2157
2158 else
2159 {
2160 pp = eptr;
2161 for (i = min; i < max; i++)
2162 {
2163 int len = 1;
2164 if (eptr >= md->end_subject) break;
2165 GETCHARLENTEST(c, eptr, len);
2166 if (!_pcre_xclass(c, data)) break;
2167 eptr += len;
2168 }
2169 for(;;)
2170 {
2171 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2173 if (eptr-- == pp) break; /* Stop if tried at original pos */
2174 if (utf8) BACKCHAR(eptr);
2175 }
2176 RRETURN(MATCH_NOMATCH);
2177 }
2178
2179 /* Control never gets here */
2180 }
2181 #endif /* End of XCLASS */
2182
2183 /* Match a single character, casefully */
2184
2185 case OP_CHAR:
2186 #ifdef SUPPORT_UTF8
2187 if (utf8)
2188 {
2189 length = 1;
2190 ecode++;
2191 GETCHARLEN(fc, ecode, length);
2192 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2193 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2194 }
2195 else
2196 #endif
2197
2198 /* Non-UTF-8 mode */
2199 {
2200 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2201 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2202 ecode += 2;
2203 }
2204 break;
2205
2206 /* Match a single character, caselessly */
2207
2208 case OP_CHARNC:
2209 #ifdef SUPPORT_UTF8
2210 if (utf8)
2211 {
2212 length = 1;
2213 ecode++;
2214 GETCHARLEN(fc, ecode, length);
2215
2216 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2217
2218 /* If the pattern character's value is < 128, we have only one byte, and
2219 can use the fast lookup table. */
2220
2221 if (fc < 128)
2222 {
2223 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2224 }
2225
2226 /* Otherwise we must pick up the subject character */
2227
2228 else
2229 {
2230 unsigned int dc;
2231 GETCHARINC(dc, eptr);
2232 ecode += length;
2233
2234 /* If we have Unicode property support, we can use it to test the other
2235 case of the character, if there is one. */
2236
2237 if (fc != dc)
2238 {
2239 #ifdef SUPPORT_UCP
2240 if (dc != UCD_OTHERCASE(fc))
2241 #endif
2242 RRETURN(MATCH_NOMATCH);
2243 }
2244 }
2245 }
2246 else
2247 #endif /* SUPPORT_UTF8 */
2248
2249 /* Non-UTF-8 mode */
2250 {
2251 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2252 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2253 ecode += 2;
2254 }
2255 break;
2256
2257 /* Match a single character repeatedly. */
2258
2259 case OP_EXACT:
2260 min = max = GET2(ecode, 1);
2261 ecode += 3;
2262 goto REPEATCHAR;
2263
2264 case OP_POSUPTO:
2265 possessive = TRUE;
2266 /* Fall through */
2267
2268 case OP_UPTO:
2269 case OP_MINUPTO:
2270 min = 0;
2271 max = GET2(ecode, 1);
2272 minimize = *ecode == OP_MINUPTO;
2273 ecode += 3;
2274 goto REPEATCHAR;
2275
2276 case OP_POSSTAR:
2277 possessive = TRUE;
2278 min = 0;
2279 max = INT_MAX;
2280 ecode++;
2281 goto REPEATCHAR;
2282
2283 case OP_POSPLUS:
2284 possessive = TRUE;
2285 min = 1;
2286 max = INT_MAX;
2287 ecode++;
2288 goto REPEATCHAR;
2289
2290 case OP_POSQUERY:
2291 possessive = TRUE;
2292 min = 0;
2293 max = 1;
2294 ecode++;
2295 goto REPEATCHAR;
2296
2297 case OP_STAR:
2298 case OP_MINSTAR:
2299 case OP_PLUS:
2300 case OP_MINPLUS:
2301 case OP_QUERY:
2302 case OP_MINQUERY:
2303 c = *ecode++ - OP_STAR;
2304 minimize = (c & 1) != 0;
2305 min = rep_min[c]; /* Pick up values from tables; */
2306 max = rep_max[c]; /* zero for max => infinity */
2307 if (max == 0) max = INT_MAX;
2308
2309 /* Common code for all repeated single-character matches. We can give
2310 up quickly if there are fewer than the minimum number of characters left in
2311 the subject. */
2312
2313 REPEATCHAR:
2314 #ifdef SUPPORT_UTF8
2315 if (utf8)
2316 {
2317 length = 1;
2318 charptr = ecode;
2319 GETCHARLEN(fc, ecode, length);
2320 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2321 ecode += length;
2322
2323 /* Handle multibyte character matching specially here. There is
2324 support for caseless matching if UCP support is present. */
2325
2326 if (length > 1)
2327 {
2328 #ifdef SUPPORT_UCP
2329 unsigned int othercase;
2330 if ((ims & PCRE_CASELESS) != 0 &&
2331 (othercase = UCD_OTHERCASE(fc)) != fc)
2332 oclength = _pcre_ord2utf8(othercase, occhars);
2333 else oclength = 0;
2334 #endif /* SUPPORT_UCP */
2335
2336 for (i = 1; i <= min; i++)
2337 {
2338 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2339 #ifdef SUPPORT_UCP
2340 /* Need braces because of following else */
2341 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2342 else
2343 {
2344 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2345 eptr += oclength;
2346 }
2347 #else /* without SUPPORT_UCP */
2348 else { RRETURN(MATCH_NOMATCH); }
2349 #endif /* SUPPORT_UCP */
2350 }
2351
2352 if (min == max) continue;
2353
2354 if (minimize)
2355 {
2356 for (fi = min;; fi++)
2357 {
2358 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2361 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2362 #ifdef SUPPORT_UCP
2363 /* Need braces because of following else */
2364 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2365 else
2366 {
2367 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2368 eptr += oclength;
2369 }
2370 #else /* without SUPPORT_UCP */
2371 else { RRETURN (MATCH_NOMATCH); }
2372 #endif /* SUPPORT_UCP */
2373 }
2374 /* Control never gets here */
2375 }
2376
2377 else /* Maximize */
2378 {
2379 pp = eptr;
2380 for (i = min; i < max; i++)
2381 {
2382 if (eptr > md->end_subject - length) break;
2383 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2384 #ifdef SUPPORT_UCP
2385 else if (oclength == 0) break;
2386 else
2387 {
2388 if (memcmp(eptr, occhars, oclength) != 0) break;
2389 eptr += oclength;
2390 }
2391 #else /* without SUPPORT_UCP */
2392 else break;
2393 #endif /* SUPPORT_UCP */
2394 }
2395
2396 if (possessive) continue;
2397 for(;;)
2398 {
2399 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2400 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2401 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2402 #ifdef SUPPORT_UCP
2403 eptr--;
2404 BACKCHAR(eptr);
2405 #else /* without SUPPORT_UCP */
2406 eptr -= length;
2407 #endif /* SUPPORT_UCP */
2408 }
2409 }
2410 /* Control never gets here */
2411 }
2412
2413 /* If the length of a UTF-8 character is 1, we fall through here, and
2414 obey the code as for non-UTF-8 characters below, though in this case the
2415 value of fc will always be < 128. */
2416 }
2417 else
2418 #endif /* SUPPORT_UTF8 */
2419
2420 /* When not in UTF-8 mode, load a single-byte character. */
2421 {
2422 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2423 fc = *ecode++;
2424 }
2425
2426 /* The value of fc at this point is always less than 256, though we may or
2427 may not be in UTF-8 mode. The code is duplicated for the caseless and
2428 caseful cases, for speed, since matching characters is likely to be quite
2429 common. First, ensure the minimum number of matches are present. If min =
2430 max, continue at the same level without recursing. Otherwise, if
2431 minimizing, keep trying the rest of the expression and advancing one
2432 matching character if failing, up to the maximum. Alternatively, if
2433 maximizing, find the maximum number of characters and work backwards. */
2434
2435 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2436 max, eptr));
2437
2438 if ((ims & PCRE_CASELESS) != 0)
2439 {
2440 fc = md->lcc[fc];
2441 for (i = 1; i <= min; i++)
2442 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2443 if (min == max) continue;
2444 if (minimize)
2445 {
2446 for (fi = min;; fi++)
2447 {
2448 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2449 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2450 if (fi >= max || eptr >= md->end_subject ||
2451 fc != md->lcc[*eptr++])
2452 RRETURN(MATCH_NOMATCH);
2453 }
2454 /* Control never gets here */
2455 }
2456 else /* Maximize */
2457 {
2458 pp = eptr;
2459 for (i = min; i < max; i++)
2460 {
2461 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2462 eptr++;
2463 }
2464 if (possessive) continue;
2465 while (eptr >= pp)
2466 {
2467 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2468 eptr--;
2469 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2470 }
2471 RRETURN(MATCH_NOMATCH);
2472 }
2473 /* Control never gets here */
2474 }
2475
2476 /* Caseful comparisons (includes all multi-byte characters) */
2477
2478 else
2479 {
2480 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2481 if (min == max) continue;
2482 if (minimize)
2483 {
2484 for (fi = min;; fi++)
2485 {
2486 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2487 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2488 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2489 RRETURN(MATCH_NOMATCH);
2490 }
2491 /* Control never gets here */
2492 }
2493 else /* Maximize */
2494 {
2495 pp = eptr;
2496 for (i = min; i < max; i++)
2497 {
2498 if (eptr >= md->end_subject || fc != *eptr) break;
2499 eptr++;
2500 }
2501 if (possessive) continue;
2502 while (eptr >= pp)
2503 {
2504 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2505 eptr--;
2506 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2507 }
2508 RRETURN(MATCH_NOMATCH);
2509 }
2510 }
2511 /* Control never gets here */
2512
2513 /* Match a negated single one-byte character. The character we are
2514 checking can be multibyte. */
2515
2516 case OP_NOT:
2517 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2518 ecode++;
2519 GETCHARINCTEST(c, eptr);
2520 if ((ims & PCRE_CASELESS) != 0)
2521 {
2522 #ifdef SUPPORT_UTF8
2523 if (c < 256)
2524 #endif
2525 c = md->lcc[c];
2526 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2527 }
2528 else
2529 {
2530 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2531 }
2532 break;
2533
2534 /* Match a negated single one-byte character repeatedly. This is almost a
2535 repeat of the code for a repeated single character, but I haven't found a
2536 nice way of commoning these up that doesn't require a test of the
2537 positive/negative option for each character match. Maybe that wouldn't add
2538 very much to the time taken, but character matching *is* what this is all
2539 about... */
2540
2541 case OP_NOTEXACT:
2542 min = max = GET2(ecode, 1);
2543 ecode += 3;
2544 goto REPEATNOTCHAR;
2545
2546 case OP_NOTUPTO:
2547 case OP_NOTMINUPTO:
2548 min = 0;
2549 max = GET2(ecode, 1);
2550 minimize = *ecode == OP_NOTMINUPTO;
2551 ecode += 3;
2552 goto REPEATNOTCHAR;
2553
2554 case OP_NOTPOSSTAR:
2555 possessive = TRUE;
2556 min = 0;
2557 max = INT_MAX;
2558 ecode++;
2559 goto REPEATNOTCHAR;
2560
2561 case OP_NOTPOSPLUS:
2562 possessive = TRUE;
2563 min = 1;
2564 max = INT_MAX;
2565 ecode++;
2566 goto REPEATNOTCHAR;
2567
2568 case OP_NOTPOSQUERY:
2569 possessive = TRUE;
2570 min = 0;
2571 max = 1;
2572 ecode++;
2573 goto REPEATNOTCHAR;
2574
2575 case OP_NOTPOSUPTO:
2576 possessive = TRUE;
2577 min = 0;
2578 max = GET2(ecode, 1);
2579 ecode += 3;
2580 goto REPEATNOTCHAR;
2581
2582 case OP_NOTSTAR:
2583 case OP_NOTMINSTAR:
2584 case OP_NOTPLUS:
2585 case OP_NOTMINPLUS:
2586 case OP_NOTQUERY:
2587 case OP_NOTMINQUERY:
2588 c = *ecode++ - OP_NOTSTAR;
2589 minimize = (c & 1) != 0;
2590 min = rep_min[c]; /* Pick up values from tables; */
2591 max = rep_max[c]; /* zero for max => infinity */
2592 if (max == 0) max = INT_MAX;
2593
2594 /* Common code for all repeated single-byte matches. We can give up quickly
2595 if there are fewer than the minimum number of bytes left in the
2596 subject. */
2597
2598 REPEATNOTCHAR:
2599 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2600 fc = *ecode++;
2601
2602 /* The code is duplicated for the caseless and caseful cases, for speed,
2603 since matching characters is likely to be quite common. First, ensure the
2604 minimum number of matches are present. If min = max, continue at the same
2605 level without recursing. Otherwise, if minimizing, keep trying the rest of
2606 the expression and advancing one matching character if failing, up to the
2607 maximum. Alternatively, if maximizing, find the maximum number of
2608 characters and work backwards. */
2609
2610 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2611 max, eptr));
2612
2613 if ((ims & PCRE_CASELESS) != 0)
2614 {
2615 fc = md->lcc[fc];
2616
2617 #ifdef SUPPORT_UTF8
2618 /* UTF-8 mode */
2619 if (utf8)
2620 {
2621 register unsigned int d;
2622 for (i = 1; i <= min; i++)
2623 {
2624 GETCHARINC(d, eptr);
2625 if (d < 256) d = md->lcc[d];
2626 if (fc == d) RRETURN(MATCH_NOMATCH);
2627 }
2628 }
2629 else
2630 #endif
2631
2632 /* Not UTF-8 mode */
2633 {
2634 for (i = 1; i <= min; i++)
2635 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2636 }
2637
2638 if (min == max) continue;
2639
2640 if (minimize)
2641 {
2642 #ifdef SUPPORT_UTF8
2643 /* UTF-8 mode */
2644 if (utf8)
2645 {
2646 register unsigned int d;
2647 for (fi = min;; fi++)
2648 {
2649 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2652 GETCHARINC(d, eptr);
2653 if (d < 256) d = md->lcc[d];
2654 if (fc == d) RRETURN(MATCH_NOMATCH);
2655
2656 }
2657 }
2658 else
2659 #endif
2660 /* Not UTF-8 mode */
2661 {
2662 for (fi = min;; fi++)
2663 {
2664 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2666 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2667 RRETURN(MATCH_NOMATCH);
2668 }
2669 }
2670 /* Control never gets here */
2671 }
2672
2673 /* Maximize case */
2674
2675 else
2676 {
2677 pp = eptr;
2678
2679 #ifdef SUPPORT_UTF8
2680 /* UTF-8 mode */
2681 if (utf8)
2682 {
2683 register unsigned int d;
2684 for (i = min; i < max; i++)
2685 {
2686 int len = 1;
2687 if (eptr >= md->end_subject) break;
2688 GETCHARLEN(d, eptr, len);
2689 if (d < 256) d = md->lcc[d];
2690 if (fc == d) break;
2691 eptr += len;
2692 }
2693 if (possessive) continue;
2694 for(;;)
2695 {
2696 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2697 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2698 if (eptr-- == pp) break; /* Stop if tried at original pos */
2699 BACKCHAR(eptr);
2700 }
2701 }
2702 else
2703 #endif
2704 /* Not UTF-8 mode */
2705 {
2706 for (i = min; i < max; i++)
2707 {
2708 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2709 eptr++;
2710 }
2711 if (possessive) continue;
2712 while (eptr >= pp)
2713 {
2714 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2716 eptr--;
2717 }
2718 }
2719
2720 RRETURN(MATCH_NOMATCH);
2721 }
2722 /* Control never gets here */
2723 }
2724
2725 /* Caseful comparisons */
2726
2727 else
2728 {
2729 #ifdef SUPPORT_UTF8
2730 /* UTF-8 mode */
2731 if (utf8)
2732 {
2733 register unsigned int d;
2734 for (i = 1; i <= min; i++)
2735 {
2736 GETCHARINC(d, eptr);
2737 if (fc == d) RRETURN(MATCH_NOMATCH);
2738 }
2739 }
2740 else
2741 #endif
2742 /* Not UTF-8 mode */
2743 {
2744 for (i = 1; i <= min; i++)
2745 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2746 }
2747
2748 if (min == max) continue;
2749
2750 if (minimize)
2751 {
2752 #ifdef SUPPORT_UTF8
2753 /* UTF-8 mode */
2754 if (utf8)
2755 {
2756 register unsigned int d;
2757 for (fi = min;; fi++)
2758 {
2759 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2761 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2762 GETCHARINC(d, eptr);
2763 if (fc == d) RRETURN(MATCH_NOMATCH);
2764 }
2765 }
2766 else
2767 #endif
2768 /* Not UTF-8 mode */
2769 {
2770 for (fi = min;; fi++)
2771 {
2772 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2774 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2775 RRETURN(MATCH_NOMATCH);
2776 }
2777 }
2778 /* Control never gets here */
2779 }
2780
2781 /* Maximize case */
2782
2783 else
2784 {
2785 pp = eptr;
2786
2787 #ifdef SUPPORT_UTF8
2788 /* UTF-8 mode */
2789 if (utf8)
2790 {
2791 register unsigned int d;
2792 for (i = min; i < max; i++)
2793 {
2794 int len = 1;
2795 if (eptr >= md->end_subject) break;
2796 GETCHARLEN(d, eptr, len);
2797 if (fc == d) break;
2798 eptr += len;
2799 }
2800 if (possessive) continue;
2801 for(;;)
2802 {
2803 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2805 if (eptr-- == pp) break; /* Stop if tried at original pos */
2806 BACKCHAR(eptr);
2807 }
2808 }
2809 else
2810 #endif
2811 /* Not UTF-8 mode */
2812 {
2813 for (i = min; i < max; i++)
2814 {
2815 if (eptr >= md->end_subject || fc == *eptr) break;
2816 eptr++;
2817 }
2818 if (possessive) continue;
2819 while (eptr >= pp)
2820 {
2821 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2823 eptr--;
2824 }
2825 }
2826
2827 RRETURN(MATCH_NOMATCH);
2828 }
2829 }
2830 /* Control never gets here */
2831
2832 /* Match a single character type repeatedly; several different opcodes
2833 share code. This is very similar to the code for single characters, but we
2834 repeat it in the interests of efficiency. */
2835
2836 case OP_TYPEEXACT:
2837 min = max = GET2(ecode, 1);
2838 minimize = TRUE;
2839 ecode += 3;
2840 goto REPEATTYPE;
2841
2842 case OP_TYPEUPTO:
2843 case OP_TYPEMINUPTO:
2844 min = 0;
2845 max = GET2(ecode, 1);
2846 minimize = *ecode == OP_TYPEMINUPTO;
2847 ecode += 3;
2848 goto REPEATTYPE;
2849
2850 case OP_TYPEPOSSTAR:
2851 possessive = TRUE;
2852 min = 0;
2853 max = INT_MAX;
2854 ecode++;
2855 goto REPEATTYPE;
2856
2857 case OP_TYPEPOSPLUS:
2858 possessive = TRUE;
2859 min = 1;
2860 max = INT_MAX;
2861 ecode++;
2862 goto REPEATTYPE;
2863
2864 case OP_TYPEPOSQUERY:
2865 possessive = TRUE;
2866 min = 0;
2867 max = 1;
2868 ecode++;
2869 goto REPEATTYPE;
2870
2871 case OP_TYPEPOSUPTO:
2872 possessive = TRUE;
2873 min = 0;
2874 max = GET2(ecode, 1);
2875 ecode += 3;
2876 goto REPEATTYPE;
2877
2878 case OP_TYPESTAR:
2879 case OP_TYPEMINSTAR:
2880 case OP_TYPEPLUS:
2881 case OP_TYPEMINPLUS:
2882 case OP_TYPEQUERY:
2883 case OP_TYPEMINQUERY:
2884 c = *ecode++ - OP_TYPESTAR;
2885 minimize = (c & 1) != 0;
2886 min = rep_min[c]; /* Pick up values from tables; */
2887 max = rep_max[c]; /* zero for max => infinity */
2888 if (max == 0) max = INT_MAX;
2889
2890 /* Common code for all repeated single character type matches. Note that
2891 in UTF-8 mode, '.' matches a character of any length, but for the other
2892 character types, the valid characters are all one-byte long. */
2893
2894 REPEATTYPE:
2895 ctype = *ecode++; /* Code for the character type */
2896
2897 #ifdef SUPPORT_UCP
2898 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2899 {
2900 prop_fail_result = ctype == OP_NOTPROP;
2901 prop_type = *ecode++;
2902 prop_value = *ecode++;
2903 }
2904 else prop_type = -1;
2905 #endif
2906
2907 /* First, ensure the minimum number of matches are present. Use inline
2908 code for maximizing the speed, and do the type test once at the start
2909 (i.e. keep it out of the loop). Also we can test that there are at least
2910 the minimum number of bytes before we start. This isn't as effective in
2911 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2912 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2913 and single-bytes. */
2914
2915 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2916 if (min > 0)
2917 {
2918 #ifdef SUPPORT_UCP
2919 if (prop_type >= 0)
2920 {
2921 switch(prop_type)
2922 {
2923 case PT_ANY:
2924 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2925 for (i = 1; i <= min; i++)
2926 {
2927 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2928 GETCHARINCTEST(c, eptr);
2929 }
2930 break;
2931
2932 case PT_LAMP:
2933 for (i = 1; i <= min; i++)
2934 {
2935 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2936 GETCHARINCTEST(c, eptr);
2937 prop_chartype = UCD_CHARTYPE(c);
2938 if ((prop_chartype == ucp_Lu ||
2939 prop_chartype == ucp_Ll ||
2940 prop_chartype == ucp_Lt) == prop_fail_result)
2941 RRETURN(MATCH_NOMATCH);
2942 }
2943 break;
2944
2945 case PT_GC:
2946 for (i = 1; i <= min; i++)
2947 {
2948 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949 GETCHARINCTEST(c, eptr);
2950 prop_category = UCD_CATEGORY(c);
2951 if ((prop_category == prop_value) == prop_fail_result)
2952 RRETURN(MATCH_NOMATCH);
2953 }
2954 break;
2955
2956 case PT_PC:
2957 for (i = 1; i <= min; i++)
2958 {
2959 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2960 GETCHARINCTEST(c, eptr);
2961 prop_chartype = UCD_CHARTYPE(c);
2962 if ((prop_chartype == prop_value) == prop_fail_result)
2963 RRETURN(MATCH_NOMATCH);
2964 }
2965 break;
2966
2967 case PT_SC:
2968 for (i = 1; i <= min; i++)
2969 {
2970 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2971 GETCHARINCTEST(c, eptr);
2972 prop_script = UCD_SCRIPT(c);
2973 if ((prop_script == prop_value) == prop_fail_result)
2974 RRETURN(MATCH_NOMATCH);
2975 }
2976 break;
2977
2978 default:
2979 RRETURN(PCRE_ERROR_INTERNAL);
2980 }
2981 }
2982
2983 /* Match extended Unicode sequences. We will get here only if the
2984 support is in the binary; otherwise a compile-time error occurs. */
2985
2986 else if (ctype == OP_EXTUNI)
2987 {
2988 for (i = 1; i <= min; i++)
2989 {
2990 GETCHARINCTEST(c, eptr);
2991 prop_category = UCD_CATEGORY(c);
2992 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2993 while (eptr < md->end_subject)
2994 {
2995 int len = 1;
2996 if (!utf8) c = *eptr; else
2997 {
2998 GETCHARLEN(c, eptr, len);
2999 }
3000 prop_category = UCD_CATEGORY(c);
3001 if (prop_category != ucp_M) break;
3002 eptr += len;
3003 }
3004 }
3005 }
3006
3007 else
3008 #endif /* SUPPORT_UCP */
3009
3010 /* Handle all other cases when the coding is UTF-8 */
3011
3012 #ifdef SUPPORT_UTF8
3013 if (utf8) switch(ctype)
3014 {
3015 case OP_ANY:
3016 for (i = 1; i <= min; i++)
3017 {
3018 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3019 RRETURN(MATCH_NOMATCH);
3020 eptr++;
3021 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3022 }
3023 break;
3024
3025 case OP_ALLANY:
3026 for (i = 1; i <= min; i++)
3027 {
3028 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3029 eptr++;
3030 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3031 }
3032 break;
3033
3034 case OP_ANYBYTE:
3035 eptr += min;
3036 break;
3037
3038 case OP_ANYNL:
3039 for (i = 1; i <= min; i++)
3040 {
3041 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3042 GETCHARINC(c, eptr);
3043 switch(c)
3044 {
3045 default: RRETURN(MATCH_NOMATCH);
3046 case 0x000d:
3047 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3048 break;
3049
3050 case 0x000a:
3051 break;
3052
3053 case 0x000b:
3054 case 0x000c:
3055 case 0x0085:
3056 case 0x2028:
3057 case 0x2029:
3058 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3059 break;
3060 }
3061 }
3062 break;
3063
3064 case OP_NOT_HSPACE:
3065 for (i = 1; i <= min; i++)
3066 {
3067 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3068 GETCHARINC(c, eptr);
3069 switch(c)
3070 {
3071 default: break;
3072 case 0x09: /* HT */
3073 case 0x20: /* SPACE */
3074 case 0xa0: /* NBSP */
3075 case 0x1680: /* OGHAM SPACE MARK */
3076 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3077 case 0x2000: /* EN QUAD */
3078 case 0x2001: /* EM QUAD */
3079 case 0x2002: /* EN SPACE */
3080 case 0x2003: /* EM SPACE */
3081 case 0x2004: /* THREE-PER-EM SPACE */
3082 case 0x2005: /* FOUR-PER-EM SPACE */
3083 case 0x2006: /* SIX-PER-EM SPACE */
3084 case 0x2007: /* FIGURE SPACE */
3085 case 0x2008: /* PUNCTUATION SPACE */
3086 case 0x2009: /* THIN SPACE */
3087 case 0x200A: /* HAIR SPACE */
3088 case 0x202f: /* NARROW NO-BREAK SPACE */
3089 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3090 case 0x3000: /* IDEOGRAPHIC SPACE */
3091 RRETURN(MATCH_NOMATCH);
3092 }
3093 }
3094 break;
3095
3096 case OP_HSPACE:
3097 for (i = 1; i <= min; i++)
3098 {
3099 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3100 GETCHARINC(c, eptr);
3101 switch(c)
3102 {
3103 default: RRETURN(MATCH_NOMATCH);
3104 case 0x09: /* HT */
3105 case 0x20: /* SPACE */
3106 case 0xa0: /* NBSP */
3107 case 0x1680: /* OGHAM SPACE MARK */
3108 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3109 case 0x2000: /* EN QUAD */
3110 case 0x2001: /* EM QUAD */
3111 case 0x2002: /* EN SPACE */
3112 case 0x2003: /* EM SPACE */
3113 case 0x2004: /* THREE-PER-EM SPACE */
3114 case 0x2005: /* FOUR-PER-EM SPACE */
3115 case 0x2006: /* SIX-PER-EM SPACE */
3116 case 0x2007: /* FIGURE SPACE */
3117 case 0x2008: /* PUNCTUATION SPACE */
3118 case 0x2009: /* THIN SPACE */
3119 case 0x200A: /* HAIR SPACE */
3120 case 0x202f: /* NARROW NO-BREAK SPACE */
3121 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3122 case 0x3000: /* IDEOGRAPHIC SPACE */
3123 break;
3124 }
3125 }
3126 break;
3127
3128 case OP_NOT_VSPACE:
3129 for (i = 1; i <= min; i++)
3130 {
3131 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3132 GETCHARINC(c, eptr);
3133 switch(c)
3134 {
3135 default: break;
3136 case 0x0a: /* LF */
3137 case 0x0b: /* VT */
3138 case 0x0c: /* FF */
3139 case 0x0d: /* CR */
3140 case 0x85: /* NEL */
3141 case 0x2028: /* LINE SEPARATOR */
3142 case 0x2029: /* PARAGRAPH SEPARATOR */
3143 RRETURN(MATCH_NOMATCH);
3144 }
3145 }
3146 break;
3147
3148 case OP_VSPACE:
3149 for (i = 1; i <= min; i++)
3150 {
3151 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3152 GETCHARINC(c, eptr);
3153 switch(c)
3154 {
3155 default: RRETURN(MATCH_NOMATCH);
3156 case 0x0a: /* LF */
3157 case 0x0b: /* VT */
3158 case 0x0c: /* FF */
3159 case 0x0d: /* CR */
3160 case 0x85: /* NEL */
3161 case 0x2028: /* LINE SEPARATOR */
3162 case 0x2029: /* PARAGRAPH SEPARATOR */
3163 break;
3164 }
3165 }
3166 break;
3167
3168 case OP_NOT_DIGIT:
3169 for (i = 1; i <= min; i++)
3170 {
3171 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3172 GETCHARINC(c, eptr);
3173 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3174 RRETURN(MATCH_NOMATCH);
3175 }
3176 break;
3177
3178 case OP_DIGIT:
3179 for (i = 1; i <= min; i++)
3180 {
3181 if (eptr >= md->end_subject ||
3182 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3183 RRETURN(MATCH_NOMATCH);
3184 /* No need to skip more bytes - we know it's a 1-byte character */
3185 }
3186 break;
3187
3188 case OP_NOT_WHITESPACE:
3189 for (i = 1; i <= min; i++)
3190 {
3191 if (eptr >= md->end_subject ||
3192 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3193 RRETURN(MATCH_NOMATCH);
3194 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3195 }
3196 break;
3197
3198 case OP_WHITESPACE:
3199 for (i = 1; i <= min; i++)
3200 {
3201 if (eptr >= md->end_subject ||
3202 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3203 RRETURN(MATCH_NOMATCH);
3204 /* No need to skip more bytes - we know it's a 1-byte character */
3205 }
3206 break;
3207
3208 case OP_NOT_WORDCHAR:
3209 for (i = 1; i <= min; i++)
3210 {
3211 if (eptr >= md->end_subject ||
3212 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3213 RRETURN(MATCH_NOMATCH);
3214 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3215 }
3216 break;
3217
3218 case OP_WORDCHAR:
3219 for (i = 1; i <= min; i++)
3220 {
3221 if (eptr >= md->end_subject ||
3222 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3223 RRETURN(MATCH_NOMATCH);
3224 /* No need to skip more bytes - we know it's a 1-byte character */
3225 }
3226 break;
3227
3228 default:
3229 RRETURN(PCRE_ERROR_INTERNAL);
3230 } /* End switch(ctype) */
3231
3232 else
3233 #endif /* SUPPORT_UTF8 */
3234
3235 /* Code for the non-UTF-8 case for minimum matching of operators other
3236 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3237 number of bytes present, as this was tested above. */
3238
3239 switch(ctype)
3240 {
3241 case OP_ANY:
3242 for (i = 1; i <= min; i++)
3243 {
3244 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3245 eptr++;
3246 }
3247 break;
3248
3249 case OP_ALLANY:
3250 eptr += min;
3251 break;
3252
3253 case OP_ANYBYTE:
3254 eptr += min;
3255 break;
3256
3257 /* Because of the CRLF case, we can't assume the minimum number of
3258 bytes are present in this case. */
3259
3260 case OP_ANYNL:
3261 for (i = 1; i <= min; i++)
3262 {
3263 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3264 switch(*eptr++)
3265 {
3266 default: RRETURN(MATCH_NOMATCH);
3267 case 0x000d:
3268 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3269 break;
3270 case 0x000a:
3271 break;
3272
3273 case 0x000b:
3274 case 0x000c:
3275 case 0x0085:
3276 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3277 break;
3278 }
3279 }
3280 break;
3281
3282 case OP_NOT_HSPACE:
3283 for (i = 1; i <= min; i++)
3284 {
3285 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3286 switch(*eptr++)
3287 {
3288 default: break;
3289 case 0x09: /* HT */
3290 case 0x20: /* SPACE */
3291 case 0xa0: /* NBSP */
3292 RRETURN(MATCH_NOMATCH);
3293 }
3294 }
3295 break;
3296
3297 case OP_HSPACE:
3298 for (i = 1; i <= min; i++)
3299 {
3300 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3301 switch(*eptr++)
3302 {
3303 default: RRETURN(MATCH_NOMATCH);
3304 case 0x09: /* HT */
3305 case 0x20: /* SPACE */
3306 case 0xa0: /* NBSP */
3307 break;
3308 }
3309 }
3310 break;
3311
3312 case OP_NOT_VSPACE:
3313 for (i = 1; i <= min; i++)
3314 {
3315 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3316 switch(*eptr++)
3317 {
3318 default: break;
3319 case 0x0a: /* LF */
3320 case 0x0b: /* VT */
3321 case 0x0c: /* FF */
3322 case 0x0d: /* CR */
3323 case 0x85: /* NEL */
3324 RRETURN(MATCH_NOMATCH);
3325 }
3326 }
3327 break;
3328
3329 case OP_VSPACE:
3330 for (i = 1; i <= min; i++)
3331 {
3332 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3333 switch(*eptr++)
3334 {
3335 default: RRETURN(MATCH_NOMATCH);
3336 case 0x0a: /* LF */
3337 case 0x0b: /* VT */
3338 case 0x0c: /* FF */
3339 case 0x0d: /* CR */
3340 case 0x85: /* NEL */
3341 break;
3342 }
3343 }
3344 break;
3345
3346 case OP_NOT_DIGIT:
3347 for (i = 1; i <= min; i++)
3348 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3349 break;
3350
3351 case OP_DIGIT:
3352 for (i = 1; i <= min; i++)
3353 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3354 break;
3355
3356 case OP_NOT_WHITESPACE:
3357 for (i = 1; i <= min; i++)
3358 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3359 break;
3360
3361 case OP_WHITESPACE:
3362 for (i = 1; i <= min; i++)
3363 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3364 break;
3365
3366 case OP_NOT_WORDCHAR:
3367 for (i = 1; i <= min; i++)
3368 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3369 RRETURN(MATCH_NOMATCH);
3370 break;
3371
3372 case OP_WORDCHAR:
3373 for (i = 1; i <= min; i++)
3374 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3375 RRETURN(MATCH_NOMATCH);
3376 break;
3377
3378 default:
3379 RRETURN(PCRE_ERROR_INTERNAL);
3380 }
3381 }
3382
3383 /* If min = max, continue at the same level without recursing */
3384
3385 if (min == max) continue;
3386
3387 /* If minimizing, we have to test the rest of the pattern before each
3388 subsequent match. Again, separate the UTF-8 case for speed, and also
3389 separate the UCP cases. */
3390
3391 if (minimize)
3392 {
3393 #ifdef SUPPORT_UCP
3394 if (prop_type >= 0)
3395 {
3396 switch(prop_type)
3397 {
3398 case PT_ANY:
3399 for (fi = min;; fi++)
3400 {
3401 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404 GETCHARINC(c, eptr);
3405 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3406 }
3407 /* Control never gets here */
3408
3409 case PT_LAMP:
3410 for (fi = min;; fi++)
3411 {
3412 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3413 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3414 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3415 GETCHARINC(c, eptr);
3416 prop_chartype = UCD_CHARTYPE(c);
3417 if ((prop_chartype == ucp_Lu ||
3418 prop_chartype == ucp_Ll ||
3419 prop_chartype == ucp_Lt) == prop_fail_result)
3420 RRETURN(MATCH_NOMATCH);
3421 }
3422 /* Control never gets here */
3423
3424 case PT_GC:
3425 for (fi = min;; fi++)
3426 {
3427 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3430 GETCHARINC(c, eptr);
3431 prop_category = UCD_CATEGORY(c);
3432 if ((prop_category == prop_value) == prop_fail_result)
3433 RRETURN(MATCH_NOMATCH);
3434 }
3435 /* Control never gets here */
3436
3437 case PT_PC:
3438 for (fi = min;; fi++)
3439 {
3440 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3441 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3443 GETCHARINC(c, eptr);
3444 prop_chartype = UCD_CHARTYPE(c);
3445 if ((prop_chartype == prop_value) == prop_fail_result)
3446 RRETURN(MATCH_NOMATCH);
3447 }
3448 /* Control never gets here */
3449
3450 case PT_SC:
3451 for (fi = min;; fi++)
3452 {
3453 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3454 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3455 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3456 GETCHARINC(c, eptr);
3457 prop_script = UCD_SCRIPT(c);
3458 if ((prop_script == prop_value) == prop_fail_result)
3459 RRETURN(MATCH_NOMATCH);
3460 }
3461 /* Control never gets here */
3462
3463 default:
3464 RRETURN(PCRE_ERROR_INTERNAL);
3465 }
3466 }
3467
3468 /* Match extended Unicode sequences. We will get here only if the
3469 support is in the binary; otherwise a compile-time error occurs. */
3470
3471 else if (ctype == OP_EXTUNI)
3472 {
3473 for (fi = min;; fi++)
3474 {
3475 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3476 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3477 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3478 GETCHARINCTEST(c, eptr);
3479 prop_category = UCD_CATEGORY(c);
3480 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3481 while (eptr < md->end_subject)
3482 {
3483 int len = 1;
3484 if (!utf8) c = *eptr; else
3485 {
3486 GETCHARLEN(c, eptr, len);
3487 }
3488 prop_category = UCD_CATEGORY(c);
3489 if (prop_category != ucp_M) break;
3490 eptr += len;
3491 }
3492 }
3493 }
3494
3495 else
3496 #endif /* SUPPORT_UCP */
3497
3498 #ifdef SUPPORT_UTF8
3499 /* UTF-8 mode */
3500 if (utf8)
3501 {
3502 for (fi = min;; fi++)
3503 {
3504 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3505 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3506 if (fi >= max || eptr >= md->end_subject ||
3507 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3508 RRETURN(MATCH_NOMATCH);
3509
3510 GETCHARINC(c, eptr);
3511 switch(ctype)
3512 {
3513 case OP_ANY: /* This is the non-NL case */
3514 case OP_ALLANY:
3515 case OP_ANYBYTE:
3516 break;
3517
3518 case OP_ANYNL:
3519 switch(c)
3520 {
3521 default: RRETURN(MATCH_NOMATCH);
3522 case 0x000d:
3523 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3524 break;
3525 case 0x000a:
3526 break;
3527
3528 case 0x000b:
3529 case 0x000c:
3530 case 0x0085:
3531 case 0x2028:
3532 case 0x2029:
3533 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3534 break;
3535 }
3536 break;
3537
3538 case OP_NOT_HSPACE:
3539 switch(c)
3540 {
3541 default: break;
3542 case 0x09: /* HT */
3543 case 0x20: /* SPACE */
3544 case 0xa0: /* NBSP */
3545 case 0x1680: /* OGHAM SPACE MARK */
3546 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3547 case 0x2000: /* EN QUAD */
3548 case 0x2001: /* EM QUAD */
3549 case 0x2002: /* EN SPACE */
3550 case 0x2003: /* EM SPACE */
3551 case 0x2004: /* THREE-PER-EM SPACE */
3552 case 0x2005: /* FOUR-PER-EM SPACE */
3553 case 0x2006: /* SIX-PER-EM SPACE */
3554 case 0x2007: /* FIGURE SPACE */
3555 case 0x2008: /* PUNCTUATION SPACE */
3556 case 0x2009: /* THIN SPACE */
3557 case 0x200A: /* HAIR SPACE */
3558 case 0x202f: /* NARROW NO-BREAK SPACE */
3559 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3560 case 0x3000: /* IDEOGRAPHIC SPACE */
3561 RRETURN(MATCH_NOMATCH);
3562 }
3563 break;
3564
3565 case OP_HSPACE:
3566 switch(c)
3567 {
3568 default: RRETURN(MATCH_NOMATCH);
3569 case 0x09: /* HT */
3570 case 0x20: /* SPACE */
3571 case 0xa0: /* NBSP */
3572 case 0x1680: /* OGHAM SPACE MARK */
3573 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3574 case 0x2000: /* EN QUAD */
3575 case 0x2001: /* EM QUAD */
3576 case 0x2002: /* EN SPACE */
3577 case 0x2003: /* EM SPACE */
3578 case 0x2004: /* THREE-PER-EM SPACE */
3579 case 0x2005: /* FOUR-PER-EM SPACE */
3580 case 0x2006: /* SIX-PER-EM SPACE */
3581 case 0x2007: /* FIGURE SPACE */
3582 case 0x2008: /* PUNCTUATION SPACE */
3583 case 0x2009: /* THIN SPACE */
3584 case 0x200A: /* HAIR SPACE */
3585 case 0x202f: /* NARROW NO-BREAK SPACE */
3586 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3587 case 0x3000: /* IDEOGRAPHIC SPACE */
3588 break;
3589 }
3590 break;
3591
3592 case OP_NOT_VSPACE:
3593 switch(c)
3594 {
3595 default: break;
3596 case 0x0a: /* LF */
3597 case 0x0b: /* VT */
3598 case 0x0c: /* FF */
3599 case 0x0d: /* CR */
3600 case 0x85: /* NEL */
3601 case 0x2028: /* LINE SEPARATOR */
3602 case 0x2029: /* PARAGRAPH SEPARATOR */
3603 RRETURN(MATCH_NOMATCH);
3604 }
3605 break;
3606
3607 case OP_VSPACE:
3608 switch(c)
3609 {
3610 default: RRETURN(MATCH_NOMATCH);
3611 case 0x0a: /* LF */
3612 case 0x0b: /* VT */
3613 case 0x0c: /* FF */
3614 case 0x0d: /* CR */
3615 case 0x85: /* NEL */
3616 case 0x2028: /* LINE SEPARATOR */
3617 case 0x2029: /* PARAGRAPH SEPARATOR */
3618 break;
3619 }
3620 break;
3621
3622 case OP_NOT_DIGIT:
3623 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3624 RRETURN(MATCH_NOMATCH);
3625 break;
3626
3627 case OP_DIGIT:
3628 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3629 RRETURN(MATCH_NOMATCH);
3630 break;
3631
3632 case OP_NOT_WHITESPACE:
3633 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3634 RRETURN(MATCH_NOMATCH);
3635 break;
3636
3637 case OP_WHITESPACE:
3638 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3639 RRETURN(MATCH_NOMATCH);
3640 break;
3641
3642 case OP_NOT_WORDCHAR:
3643 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3644 RRETURN(MATCH_NOMATCH);
3645 break;
3646
3647 case OP_WORDCHAR:
3648 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3649 RRETURN(MATCH_NOMATCH);
3650 break;
3651
3652 default:
3653 RRETURN(PCRE_ERROR_INTERNAL);
3654 }
3655 }
3656 }
3657 else
3658 #endif
3659 /* Not UTF-8 mode */
3660 {
3661 for (fi = min;; fi++)
3662 {
3663 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3664 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3665 if (fi >= max || eptr >= md->end_subject ||
3666 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3667 RRETURN(MATCH_NOMATCH);
3668
3669 c = *eptr++;
3670 switch(ctype)
3671 {
3672 case OP_ANY: /* This is the non-NL case */
3673 case OP_ALLANY:
3674 case OP_ANYBYTE:
3675 break;
3676
3677 case OP_ANYNL:
3678 switch(c)
3679 {
3680 default: RRETURN(MATCH_NOMATCH);
3681 case 0x000d:
3682 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3683 break;
3684
3685 case 0x000a:
3686 break;
3687
3688 case 0x000b:
3689 case 0x000c:
3690 case 0x0085:
3691 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3692 break;
3693 }
3694 break;
3695
3696 case OP_NOT_HSPACE:
3697 switch(c)
3698 {
3699 default: break;
3700 case 0x09: /* HT */
3701 case 0x20: /* SPACE */
3702 case 0xa0: /* NBSP */
3703 RRETURN(MATCH_NOMATCH);
3704 }
3705 break;
3706
3707 case OP_HSPACE:
3708 switch(c)
3709 {
3710 default: RRETURN(MATCH_NOMATCH);
3711 case 0x09: /* HT */
3712 case 0x20: /* SPACE */
3713 case 0xa0: /* NBSP */
3714 break;
3715 }
3716 break;
3717
3718 case OP_NOT_VSPACE:
3719 switch(c)
3720 {
3721 default: break;
3722 case 0x0a: /* LF */
3723 case 0x0b: /* VT */
3724 case 0x0c: /* FF */
3725 case 0x0d: /* CR */
3726 case 0x85: /* NEL */
3727 RRETURN(MATCH_NOMATCH);
3728 }
3729 break;
3730
3731 case OP_VSPACE:
3732 switch(c)
3733 {
3734 default: RRETURN(MATCH_NOMATCH);
3735 case 0x0a: /* LF */
3736 case 0x0b: /* VT */
3737 case 0x0c: /* FF */
3738 case 0x0d: /* CR */
3739 case 0x85: /* NEL */
3740 break;
3741 }
3742 break;
3743
3744 case OP_NOT_DIGIT:
3745 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3746 break;
3747
3748 case OP_DIGIT:
3749 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3750 break;
3751
3752 case OP_NOT_WHITESPACE:
3753 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3754 break;
3755
3756 case OP_WHITESPACE:
3757 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3758 break;
3759
3760 case OP_NOT_WORDCHAR:
3761 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3762 break;
3763
3764 case OP_WORDCHAR:
3765 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3766 break;
3767
3768 default:
3769 RRETURN(PCRE_ERROR_INTERNAL);
3770 }
3771 }
3772 }
3773 /* Control never gets here */
3774 }
3775
3776 /* If maximizing, it is worth using inline code for speed, doing the type
3777 test once at the start (i.e. keep it out of the loop). Again, keep the
3778 UTF-8 and UCP stuff separate. */
3779
3780 else
3781 {
3782 pp = eptr; /* Remember where we started */
3783
3784 #ifdef SUPPORT_UCP
3785 if (prop_type >= 0)
3786 {
3787 switch(prop_type)
3788 {
3789 case PT_ANY:
3790 for (i = min; i < max; i++)
3791 {
3792 int len = 1;
3793 if (eptr >= md->end_subject) break;
3794 GETCHARLEN(c, eptr, len);
3795 if (prop_fail_result) break;
3796 eptr+= len;
3797 }
3798 break;
3799
3800 case PT_LAMP:
3801 for (i = min; i < max; i++)
3802 {
3803 int len = 1;
3804 if (eptr >= md->end_subject) break;
3805 GETCHARLEN(c, eptr, len);
3806 prop_chartype = UCD_CHARTYPE(c);
3807 if ((prop_chartype == ucp_Lu ||
3808 prop_chartype == ucp_Ll ||
3809 prop_chartype == ucp_Lt) == prop_fail_result)
3810 break;
3811 eptr+= len;
3812 }
3813 break;
3814
3815 case PT_GC:
3816 for (i = min; i < max; i++)
3817 {
3818 int len = 1;
3819 if (eptr >= md->end_subject) break;
3820 GETCHARLEN(c, eptr, len);
3821 prop_category = UCD_CATEGORY(c);
3822 if ((prop_category == prop_value) == prop_fail_result)
3823 break;
3824 eptr+= len;
3825 }
3826 break;
3827
3828 case PT_PC:
3829 for (i = min; i < max; i++)
3830 {
3831 int len = 1;
3832 if (eptr >= md->end_subject) break;
3833 GETCHARLEN(c, eptr, len);
3834 prop_chartype = UCD_CHARTYPE(c);
3835 if ((prop_chartype == prop_value) == prop_fail_result)
3836 break;
3837 eptr+= len;
3838 }
3839 break;
3840
3841 case PT_SC:
3842 for (i = min; i < max; i++)
3843 {
3844 int len = 1;
3845 if (eptr >= md->end_subject) break;
3846 GETCHARLEN(c, eptr, len);
3847 prop_script = UCD_SCRIPT(c);
3848 if ((prop_script == prop_value) == prop_fail_result)
3849 break;
3850 eptr+= len;
3851 }
3852 break;
3853 }
3854
3855 /* eptr is now past the end of the maximum run */
3856
3857 if (possessive) continue;
3858 for(;;)
3859 {
3860 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3862 if (eptr-- == pp) break; /* Stop if tried at original pos */
3863 if (utf8) BACKCHAR(eptr);
3864 }
3865 }
3866
3867 /* Match extended Unicode sequences. We will get here only if the
3868 support is in the binary; otherwise a compile-time error occurs. */
3869
3870 else if (ctype == OP_EXTUNI)
3871 {
3872 for (i = min; i < max; i++)
3873 {
3874 if (eptr >= md->end_subject) break;
3875 GETCHARINCTEST(c, eptr);
3876 prop_category = UCD_CATEGORY(c);
3877 if (prop_category == ucp_M) break;
3878 while (eptr < md->end_subject)
3879 {
3880 int len = 1;
3881 if (!utf8) c = *eptr; else
3882 {
3883 GETCHARLEN(c, eptr, len);
3884 }
3885 prop_category = UCD_CATEGORY(c);
3886 if (prop_category != ucp_M) break;
3887 eptr += len;
3888 }
3889 }
3890
3891 /* eptr is now past the end of the maximum run */
3892
3893 if (possessive) continue;
3894 for(;;)
3895 {
3896 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3898 if (eptr-- == pp) break; /* Stop if tried at original pos */
3899 for (;;) /* Move back over one extended */
3900 {
3901 int len = 1;
3902 if (!utf8) c = *eptr; else
3903 {
3904 BACKCHAR(eptr);
3905 GETCHARLEN(c, eptr, len);
3906 }
3907 prop_category = UCD_CATEGORY(c);
3908 if (prop_category != ucp_M) break;
3909 eptr--;
3910 }
3911 }
3912 }
3913
3914 else
3915 #endif /* SUPPORT_UCP */
3916
3917 #ifdef SUPPORT_UTF8
3918 /* UTF-8 mode */
3919
3920 if (utf8)
3921 {
3922 switch(ctype)
3923 {
3924 case OP_ANY:
3925 if (max < INT_MAX)
3926 {
3927 for (i = min; i < max; i++)
3928 {
3929 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3930 eptr++;
3931 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3932 }
3933 }
3934
3935 /* Handle unlimited UTF-8 repeat */
3936
3937 else
3938 {
3939 for (i = min; i < max; i++)
3940 {
3941 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3942 eptr++;
3943 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3944 }
3945 }
3946 break;
3947
3948 case OP_ALLANY:
3949 if (max < INT_MAX)
3950 {
3951 for (i = min; i < max; i++)
3952 {
3953 if (eptr >= md->end_subject) break;
3954 eptr++;
3955 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3956 }
3957 }
3958 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3959 break;
3960
3961 /* The byte case is the same as non-UTF8 */
3962
3963 case OP_ANYBYTE:
3964 c = max - min;
3965 if (c > (unsigned int)(md->end_subject - eptr))
3966 c = md->end_subject - eptr;
3967 eptr += c;
3968 break;
3969
3970 case OP_ANYNL:
3971 for (i = min; i < max; i++)
3972 {
3973 int len = 1;
3974 if (eptr >= md->end_subject) break;
3975 GETCHARLEN(c, eptr, len);
3976 if (c == 0x000d)
3977 {
3978 if (++eptr >= md->end_subject) break;
3979 if (*eptr == 0x000a) eptr++;
3980 }
3981 else
3982 {
3983 if (c != 0x000a &&
3984 (md->bsr_anycrlf ||
3985 (c != 0x000b && c != 0x000c &&
3986 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3987 break;
3988 eptr += len;
3989 }
3990 }
3991 break;
3992
3993 case OP_NOT_HSPACE:
3994 case OP_HSPACE:
3995 for (i = min; i < max; i++)
3996 {
3997 BOOL gotspace;
3998 int len = 1;
3999 if (eptr >= md->end_subject) break;
4000 GETCHARLEN(c, eptr, len);
4001 switch(c)
4002 {
4003 default: gotspace = FALSE; break;
4004 case 0x09: /* HT */
4005 case 0x20: /* SPACE */
4006 case 0xa0: /* NBSP */
4007 case 0x1680: /* OGHAM SPACE MARK */
4008 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4009 case 0x2000: /* EN QUAD */
4010 case 0x2001: /* EM QUAD */
4011 case 0x2002: /* EN SPACE */
4012 case 0x2003: /* EM SPACE */
4013 case 0x2004: /* THREE-PER-EM SPACE */
4014 case 0x2005: /* FOUR-PER-EM SPACE */
4015 case 0x2006: /* SIX-PER-EM SPACE */
4016 case 0x2007: /* FIGURE SPACE */
4017 case 0x2008: /* PUNCTUATION SPACE */
4018 case 0x2009: /* THIN SPACE */
4019 case 0x200A: /* HAIR SPACE */
4020 case 0x202f: /* NARROW NO-BREAK SPACE */
4021 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4022 case 0x3000: /* IDEOGRAPHIC SPACE */
4023 gotspace = TRUE;
4024 break;
4025 }
4026 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4027 eptr += len;
4028 }
4029 break;
4030
4031 case OP_NOT_VSPACE:
4032 case OP_VSPACE:
4033 for (i = min; i < max; i++)
4034 {
4035 BOOL gotspace;
4036 int len = 1;
4037 if (eptr >= md->end_subject) break;
4038 GETCHARLEN(c, eptr, len);
4039 switch(c)
4040 {
4041 default: gotspace = FALSE; break;
4042 case 0x0a: /* LF */
4043 case 0x0b: /* VT */
4044 case 0x0c: /* FF */
4045 case 0x0d: /* CR */
4046 case 0x85: /* NEL */
4047 case 0x2028: /* LINE SEPARATOR */
4048 case 0x2029: /* PARAGRAPH SEPARATOR */
4049 gotspace = TRUE;
4050 break;
4051 }
4052 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4053 eptr += len;
4054 }
4055 break;
4056
4057 case OP_NOT_DIGIT:
4058 for (i = min; i < max; i++)
4059 {
4060 int len = 1;
4061 if (eptr >= md->end_subject) break;
4062 GETCHARLEN(c, eptr, len);
4063 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4064 eptr+= len;
4065 }
4066 break;
4067
4068 case OP_DIGIT:
4069 for (i = min; i < max; i++)
4070 {
4071 int len = 1;
4072 if (eptr >= md->end_subject) break;
4073 GETCHARLEN(c, eptr, len);
4074 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4075 eptr+= len;
4076 }
4077 break;
4078
4079 case OP_NOT_WHITESPACE:
4080 for (i = min; i < max; i++)
4081 {
4082 int len = 1;
4083 if (eptr >= md->end_subject) break;
4084 GETCHARLEN(c, eptr, len);
4085 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4086 eptr+= len;
4087 }
4088 break;
4089
4090 case OP_WHITESPACE:
4091 for (i = min; i < max; i++)
4092 {
4093 int len = 1;
4094 if (eptr >= md->end_subject) break;
4095 GETCHARLEN(c, eptr, len);
4096 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4097 eptr+= len;
4098 }
4099 break;
4100
4101 case OP_NOT_WORDCHAR:
4102 for (i = min; i < max; i++)
4103 {
4104 int len = 1;
4105 if (eptr >= md->end_subject) break;
4106 GETCHARLEN(c, eptr, len);
4107 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4108 eptr+= len;
4109 }
4110 break;
4111
4112 case OP_WORDCHAR:
4113 for (i = min; i < max; i++)
4114 {
4115 int len = 1;
4116 if (eptr >= md->end_subject) break;
4117 GETCHARLEN(c, eptr, len);
4118 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4119 eptr+= len;
4120 }
4121 break;
4122
4123 default:
4124 RRETURN(PCRE_ERROR_INTERNAL);
4125 }
4126
4127 /* eptr is now past the end of the maximum run */
4128
4129 if (possessive) continue;
4130 for(;;)
4131 {
4132 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4133 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4134 if (eptr-- == pp) break; /* Stop if tried at original pos */
4135 BACKCHAR(eptr);
4136 }
4137 }
4138 else
4139 #endif /* SUPPORT_UTF8 */
4140
4141 /* Not UTF-8 mode */
4142 {
4143 switch(ctype)
4144 {
4145 case OP_ANY:
4146 for (i = min; i < max; i++)
4147 {
4148 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4149 eptr++;
4150 }
4151 break;
4152
4153 case OP_ALLANY:
4154 case OP_ANYBYTE:
4155 c = max - min;
4156 if (c > (unsigned int)(md->end_subject - eptr))
4157 c = md->end_subject - eptr;
4158 eptr += c;
4159 break;
4160
4161 case OP_ANYNL:
4162 for (i = min; i < max; i++)
4163 {
4164 if (eptr >= md->end_subject) break;
4165 c = *eptr;
4166 if (c == 0x000d)
4167 {
4168 if (++eptr >= md->end_subject) break;
4169 if (*eptr == 0x000a) eptr++;
4170 }
4171 else
4172 {
4173 if (c != 0x000a &&
4174 (md->bsr_anycrlf ||
4175 (c != 0x000b && c != 0x000c && c != 0x0085)))
4176 break;
4177 eptr++;
4178 }
4179 }
4180 break;
4181
4182 case OP_NOT_HSPACE:
4183 for (i = min; i < max; i++)
4184 {
4185 if (eptr >= md->end_subject) break;
4186 c = *eptr;
4187 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4188 eptr++;
4189 }
4190 break;
4191
4192 case OP_HSPACE:
4193 for (i = min; i < max; i++)
4194 {
4195 if (eptr >= md->end_subject) break;
4196 c = *eptr;
4197 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4198 eptr++;
4199 }
4200 break;
4201
4202 case OP_NOT_VSPACE:
4203 for (i = min; i < max; i++)
4204 {
4205 if (eptr >= md->end_subject) break;
4206 c = *eptr;
4207 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4208 break;
4209 eptr++;
4210 }
4211 break;
4212
4213 case OP_VSPACE:
4214 for (i = min; i < max; i++)
4215 {
4216 if (eptr >= md->end_subject) break;
4217 c = *eptr;
4218 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4219 break;
4220 eptr++;
4221 }
4222 break;
4223
4224 case OP_NOT_DIGIT:
4225 for (i = min; i < max; i++)
4226 {
4227 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4228 break;
4229 eptr++;
4230 }
4231 break;
4232
4233 case OP_DIGIT:
4234 for (i = min; i < max; i++)
4235 {
4236 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4237 break;
4238 eptr++;
4239 }
4240 break;
4241
4242 case OP_NOT_WHITESPACE:
4243 for (i = min; i < max; i++)
4244 {
4245 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4246 break;
4247 eptr++;
4248 }
4249 break;
4250
4251 case OP_WHITESPACE:
4252 for (i = min; i < max; i++)
4253 {
4254 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4255 break;
4256 eptr++;
4257 }
4258 break;
4259
4260 case OP_NOT_WORDCHAR:
4261 for (i = min; i < max; i++)
4262 {
4263 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4264 break;
4265 eptr++;
4266 }
4267 break;
4268
4269 case OP_WORDCHAR:
4270 for (i = min; i < max; i++)
4271 {
4272 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4273 break;
4274 eptr++;
4275 }
4276 break;
4277
4278 default:
4279 RRETURN(PCRE_ERROR_INTERNAL);
4280 }
4281
4282 /* eptr is now past the end of the maximum run */
4283
4284 if (possessive) continue;
4285 while (eptr >= pp)
4286 {
4287 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4288 eptr--;
4289 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4290 }
4291 }
4292
4293 /* Get here if we can't make it match with any permitted repetitions */
4294
4295 RRETURN(MATCH_NOMATCH);
4296 }
4297 /* Control never gets here */
4298
4299 /* There's been some horrible disaster. Arrival here can only mean there is
4300 something seriously wrong in the code above or the OP_xxx definitions. */
4301
4302 default:
4303 DPRINTF(("Unknown opcode %d\n", *ecode));
4304 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4305 }
4306
4307 /* Do not stick any code in here without much thought; it is assumed
4308 that "continue" in the code above comes out to here to repeat the main
4309 loop. */
4310
4311 } /* End of main loop */
4312 /* Control never reaches here */
4313
4314
4315 /* When compiling to use the heap rather than the stack for recursive calls to
4316 match(), the RRETURN() macro jumps here. The number that is saved in
4317 frame->Xwhere indicates which label we actually want to return to. */
4318
4319 #ifdef NO_RECURSE
4320 #define LBL(val) case val: goto L_RM##val;
4321 HEAP_RETURN:
4322 switch (frame->Xwhere)
4323 {
4324 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4325 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4326 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4327 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4328 LBL(53) LBL(54)
4329 #ifdef SUPPORT_UTF8
4330 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4331 LBL(32) LBL(34) LBL(42) LBL(46)
4332 #ifdef SUPPORT_UCP
4333 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4334 #endif /* SUPPORT_UCP */
4335 #endif /* SUPPORT_UTF8 */
4336 default:
4337 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4338 return PCRE_ERROR_INTERNAL;
4339 }
4340 #undef LBL
4341 #endif /* NO_RECURSE */
4342 }
4343
4344
4345 /***************************************************************************
4346 ****************************************************************************
4347 RECURSION IN THE match() FUNCTION
4348
4349 Undefine all the macros that were defined above to handle this. */
4350
4351 #ifdef NO_RECURSE /* these names are macros only in the NO_RECURSE (heap instead of stack) build */
4352 #undef eptr
4353 #undef ecode
4354 #undef mstart
4355 #undef offset_top
4356 #undef ims /* pcre_exec() below declares its own local variable named ims */
4357 #undef eptrb
4358 #undef flags /* pcre_exec() below declares its own local variable named flags */
4359
4360 #undef callpat
4361 #undef charptr
4362 #undef data
4363 #undef next
4364 #undef pp
4365 #undef prev
4366 #undef saved_eptr
4367
4368 #undef new_recursive
4369
4370 #undef cur_is_word
4371 #undef condition
4372 #undef prev_is_word
4373
4374 #undef original_ims
4375
4376 #undef ctype
4377 #undef length /* pcre_exec() below has a parameter named length */
4378 #undef max
4379 #undef min
4380 #undef number
4381 #undef offset
4382 #undef op
4383 #undef save_capture_last
4384 #undef save_offset1
4385 #undef save_offset2
4386 #undef save_offset3
4387 #undef stacksave
4388
4389 #undef newptrb
4390
4391 #endif /* NO_RECURSE */
4392
4393 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally */
4394
4395 #undef fc
4396 #undef fi /* fi is the repeat counter used by the minimizing loops in match() */
4397
4398 /***************************************************************************
4399 ***************************************************************************/
4400
4401
4402
4403 /*************************************************
4404 * Execute a Regular Expression *
4405 *************************************************/
4406
4407 /* This function applies a compiled re to a subject string and picks out
4408 portions of the string if it matches. Two elements in the vector are set for
4409 each substring: the offsets to the start and end of the substring.
4410
4411 Arguments:
4412 argument_re points to the compiled expression
4413 extra_data points to extra data or is NULL
4414 subject points to the subject string
4415 length length of subject string (may contain binary zeros)
4416 start_offset where to start in the subject string
4417 options option bits
4418 offsets points to a vector of ints to be filled in with offsets
4419 offsetcount the number of elements in the vector
4420
4421 Returns: > 0 => success; value is the number of elements filled in
4422 = 0 => success, but offsets is not big enough
4423 -1 => failed to match
4424 < -1 => some kind of unexpected problem
4425 */
4426
4427 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4428 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4429 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4430 int offsetcount)
4431 {
4432 int rc, resetcount, ocount;
4433 int first_byte = -1;
4434 int req_byte = -1;
4435 int req_byte2 = -1;
4436 int newline;
4437 unsigned long int ims;
4438 BOOL using_temporary_offsets = FALSE;
4439 BOOL anchored;
4440 BOOL startline;
4441 BOOL firstline;
4442 BOOL first_byte_caseless = FALSE;
4443 BOOL req_byte_caseless = FALSE;
4444 BOOL utf8;
4445 match_data match_block;
4446 match_data *md = &match_block;
4447 const uschar *tables;
4448 const uschar *start_bits = NULL;
4449 USPTR start_match = (USPTR)subject + start_offset;
4450 USPTR end_subject;
4451 USPTR req_byte_ptr = start_match - 1;
4452
4453 pcre_study_data internal_study;
4454 const pcre_study_data *study;
4455
4456 real_pcre internal_re;
4457 const real_pcre *external_re = (const real_pcre *)argument_re;
4458 const real_pcre *re = external_re;
4459
4460 /* Plausibility checks */
4461
4462 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4463 if (re == NULL || subject == NULL ||
4464 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4465 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4466
4467 /* Fish out the optional data from the extra_data structure, first setting
4468 the default values. */
4469
4470 study = NULL;
4471 md->match_limit = MATCH_LIMIT;
4472 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4473 md->callout_data = NULL;
4474
4475 /* The table pointer is always in native byte order. */
4476
4477 tables = external_re->tables;
4478
4479 if (extra_data != NULL)
4480 {
4481 register unsigned int flags = extra_data->flags;
4482 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4483 study = (const pcre_study_data *)extra_data->study_data;
4484 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4485 md->match_limit = extra_data->match_limit;
4486 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4487 md->match_limit_recursion = extra_data->match_limit_recursion;
4488 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4489 md->callout_data = extra_data->callout_data;
4490 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4491 }
4492
4493 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4494 is a feature that makes it possible to save compiled regex and re-use them
4495 in other programs later. */
4496
4497 if (tables == NULL) tables = _pcre_default_tables;
4498
4499 /* Check that the first field in the block is the magic number. If it is not,
4500 test for a regex that was compiled on a host of opposite endianness. If this is
4501 the case, flipped values are put in internal_re and internal_study if there was
4502 study data too. */
4503
4504 if (re->magic_number != MAGIC_NUMBER)
4505 {
4506 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4507 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4508 if (study != NULL) study = &internal_study;
4509 }
4510
4511 /* Set up other data */
4512
4513 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4514 startline = (re->flags & PCRE_STARTLINE) != 0;
4515 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4516
4517 /* The code starts after the real_pcre block and the capture name table. */
4518
4519 md->start_code = (const uschar *)external_re + re->name_table_offset +
4520 re->name_count * re->name_entry_size;
4521
4522 md->start_subject = (USPTR)subject;
4523 md->start_offset = start_offset;
4524 md->end_subject = md->start_subject + length;
4525 end_subject = md->end_subject;
4526
4527 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4528 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4529 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4530
4531 md->notbol = (options & PCRE_NOTBOL) != 0;
4532 md->noteol = (options & PCRE_NOTEOL) != 0;
4533 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4534 md->partial = (options & PCRE_PARTIAL) != 0;
4535 md->hitend = FALSE;
4536
4537 md->recursive = NULL; /* No recursion at top level */
4538
4539 md->lcc = tables + lcc_offset;
4540 md->ctypes = tables + ctypes_offset;
4541
4542 /* Handle different \R options. */
4543
4544 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4545 {
4546 case 0:
4547 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4548 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4549 else
4550 #ifdef BSR_ANYCRLF
4551 md->bsr_anycrlf = TRUE;
4552 #else
4553 md->bsr_anycrlf = FALSE;
4554 #endif
4555 break;
4556
4557 case PCRE_BSR_ANYCRLF:
4558 md->bsr_anycrlf = TRUE;
4559 break;
4560
4561 case PCRE_BSR_UNICODE:
4562 md->bsr_anycrlf = FALSE;
4563 break;
4564
4565 default: return PCRE_ERROR_BADNEWLINE;
4566 }
4567
4568 /* Handle different types of newline. The three bits give eight cases. If
4569 nothing is set at run time, whatever was used at compile time applies. */
4570
4571 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4572 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4573 {
4574 case 0: newline = NEWLINE; break; /* Compile-time default */
4575 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4576 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4577 case PCRE_NEWLINE_CR+
4578 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4579 case PCRE_NEWLINE_ANY: newline = -1; break;
4580 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4581 default: return PCRE_ERROR_BADNEWLINE;
4582 }
4583
4584 if (newline == -2)
4585 {
4586 md->nltype = NLTYPE_ANYCRLF;
4587 }
4588 else if (newline < 0)
4589 {
4590 md->nltype = NLTYPE_ANY;
4591 }
4592 else
4593 {
4594 md->nltype = NLTYPE_FIXED;
4595 if (newline > 255)
4596 {
4597 md->nllen = 2;
4598 md->nl[0] = (newline >> 8) & 255;
4599 md->nl[1] = newline & 255;
4600 }
4601 else
4602 {
4603 md->nllen = 1;
4604 md->nl[0] = newline;
4605 }
4606 }
4607
4608 /* Partial matching is supported only for a restricted set of regexes at the
4609 moment. */
4610
4611 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4612 return PCRE_ERROR_BADPARTIAL;
4613
4614 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4615 back the character offset. */
4616
4617 #ifdef SUPPORT_UTF8
4618 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4619 {
4620 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4621 return PCRE_ERROR_BADUTF8;
4622 if (start_offset > 0 && start_offset < length)
4623 {
4624 int tb = ((uschar *)subject)[start_offset];
4625 if (tb > 127)
4626 {
4627 tb &= 0xc0;
4628 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4629 }
4630 }
4631 }
4632 #endif
4633
4634 /* The ims options can vary during the matching as a result of the presence
4635 of (?ims) items in the pattern. They are kept in a local variable so that
4636 restoring at the exit of a group is easy. */
4637
4638 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4639
4640 /* If the expression has got more back references than the offsets supplied can
4641 hold, we get a temporary chunk of working store to use during the matching.
4642 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4643 of 3. */
4644
4645 ocount = offsetcount - (offsetcount % 3);
4646
4647 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4648 {
4649 ocount = re->top_backref * 3 + 3;
4650 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4651 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4652 using_temporary_offsets = TRUE;
4653 DPRINTF(("Got memory to hold back references\n"));
4654 }
4655 else md->offset_vector = offsets;
4656
4657 md->offset_end = ocount;
4658 md->offset_max = (2*ocount)/3;
4659 md->offset_overflow = FALSE;
4660 md->capture_last = -1;
4661
4662 /* Compute the minimum number of offsets that we need to reset each time. Doing
4663 this makes a huge difference to execution time when there aren't many brackets
4664 in the pattern. */
4665
4666 resetcount = 2 + re->top_bracket * 2;
4667 if (resetcount > offsetcount) resetcount = ocount;
4668
4669 /* Reset the working variable associated with each extraction. These should
4670 never be used unless previously set, but they get saved and restored, and so we
4671 initialize them to avoid reading uninitialized locations. */
4672
4673 if (md->offset_vector != NULL)
4674 {
4675 register int *iptr = md->offset_vector + ocount;
4676 register int *iend = iptr - resetcount/2 + 1;
4677 while (--iptr >= iend) *iptr = -1;
4678 }
4679
4680 /* Set up the first character to match, if available. The first_byte value is
4681 never set for an anchored regular expression, but the anchoring may be forced
4682 at run time, so we have to test for anchoring. The first char may be unset for
4683 an unanchored pattern, of course. If there's no first char and the pattern was
4684 studied, there may be a bitmap of possible first characters. */
4685
4686 if (!anchored)
4687 {
4688 if ((re->flags & PCRE_FIRSTSET) != 0)
4689 {
4690 first_byte = re->first_byte & 255;
4691 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4692 first_byte = md->lcc[first_byte];
4693 }
4694 else
4695 if (!startline && study != NULL &&
4696 (study->options & PCRE_STUDY_MAPPED) != 0)
4697 start_bits = study->start_bits;
4698 }
4699
4700 /* For anchored or unanchored matches, there may be a "last known required
4701 character" set. */
4702
4703 if ((re->flags & PCRE_REQCHSET) != 0)
4704 {
4705 req_byte = re->req_byte & 255;
4706 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4707 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4708 }
4709
4710
4711 /* ==========================================================================*/
4712
4713 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4714 the loop runs just once. */
4715
4716 for(;;)
4717 {
4718 USPTR save_end_subject = end_subject;
4719 USPTR new_start_match;
4720
4721 /* Reset the maximum number of extractions we might see. */
4722
4723 if (md->offset_vector != NULL)
4724 {
4725 register int *iptr = md->offset_vector;
4726 register int *iend = iptr + resetcount;
4727 while (iptr < iend) *iptr++ = -1;
4728 }
4729
4730 /* If firstline is TRUE, the start of the match is constrained to the first
4731 line of a multiline string. That is, the match must be before or at the first
4732 newline. Implement this by temporarily adjusting end_subject so that we stop
4733 scanning at a newline. If the match fails at the newline, later code breaks
4734 this loop. */
4735
4736 if (firstline)
4737 {
4738 USPTR t = start_match;
4739 #ifdef SUPPORT_UTF8
4740 if (utf8)
4741 {
4742 while (t < md->end_subject && !IS_NEWLINE(t))
4743 {
4744 t++;
4745 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4746 }
4747 }
4748 else
4749 #endif
4750 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4751 end_subject = t;
4752 }
4753
4754 /* There are some optimizations that avoid running the match if a known
4755 starting point is not found, or if a known later character is not present.
4756 However, there is an option that disables these, for testing and for ensuring
4757 that all callouts do actually occur. */
4758
4759 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4760 {
4761 /* Advance to a unique first byte if there is one. */
4762
4763 if (first_byte >= 0)
4764 {
4765 if (first_byte_caseless)
4766 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4767 start_match++;
4768 else
4769 while (start_match < end_subject && *start_match != first_byte)
4770 start_match++;
4771 }
4772
4773 /* Or to just after a linebreak for a multiline match */
4774
4775 else if (startline)
4776 {
4777 if (start_match > md->start_subject + start_offset)
4778 {
4779 #ifdef SUPPORT_UTF8
4780 if (utf8)
4781 {
4782 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4783 {
4784 start_match++;
4785 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4786 start_match++;
4787 }
4788 }
4789 else
4790 #endif
4791 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4792 start_match++;
4793
4794 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4795 and we are now at a LF, advance the match position by one more character.
4796 */
4797
4798 if (start_match[-1] == CHAR_CR &&
4799 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4800 start_match < end_subject &&
4801 *start_match == CHAR_NL)
4802 start_match++;
4803 }
4804 }
4805
4806 /* Or to a non-unique first byte after study */
4807
4808 else if (start_bits != NULL)
4809 {
4810 while (start_match < end_subject)
4811 {
4812 register unsigned int c = *start_match;
4813 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4814 else break;
4815 }
4816 }
4817 } /* Starting optimizations */
4818
4819 /* Restore fudged end_subject */
4820
4821 end_subject = save_end_subject;
4822
4823 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4824 printf(">>>> Match against: ");
4825 pchars(start_match, end_subject - start_match, TRUE, md);
4826 printf("\n");
4827 #endif
4828
4829 /* If req_byte is set, we know that that character must appear in the
4830 subject for the match to succeed. If the first character is set, req_byte
4831 must be later in the subject; otherwise the test starts at the match point.
4832 This optimization can save a huge amount of backtracking in patterns with
4833 nested unlimited repeats that aren't going to match. Writing separate code
4834 for cased/caseless versions makes it go faster, as does using an
4835 autoincrement and backing off on a match.
4836
4837 HOWEVER: when the subject string is very, very long, searching to its end
4838 can take a long time, and give bad performance on quite ordinary patterns.
4839 This showed up when somebody was matching something like /^\d+C/ on a
4840 32-megabyte string... so we don't do this when the string is sufficiently
4841 long.
4842
4843 ALSO: this processing is disabled when partial matching is requested, or if
4844 disabling is explicitly requested. */
4845
4846 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4847 req_byte >= 0 &&
4848 end_subject - start_match < REQ_BYTE_MAX &&
4849 !md->partial)
4850 {
4851 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4852
4853 /* We don't need to repeat the search if we haven't yet reached the
4854 place we found it at last time. */
4855
4856 if (p > req_byte_ptr)
4857 {
4858 if (req_byte_caseless)
4859 {
4860 while (p < end_subject)
4861 {
4862 register int pp = *p++;
4863 if (pp == req_byte || pp == req_byte2) { p--; break; }
4864 }
4865 }
4866 else
4867 {
4868 while (p < end_subject)
4869 {
4870 if (*p++ == req_byte) { p--; break; }
4871 }
4872 }
4873
4874 /* If we can't find the required character, break the matching loop,
4875 forcing a match failure. */
4876
4877 if (p >= end_subject)
4878 {
4879 rc = MATCH_NOMATCH;
4880 break;
4881 }
4882
4883 /* If we have found the required character, save the point where we
4884 found it, so that we don't search again next time round the loop if
4885 the start hasn't passed this character yet. */
4886
4887 req_byte_ptr = p;
4888 }
4889 }
4890
4891 /* OK, we can now run the match. */
4892
4893 md->start_match_ptr = start_match;
4894 md->match_call_count = 0;
4895 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4896
4897 switch(rc)
4898 {
4899 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4900 exactly like PRUNE. */
4901
4902 case MATCH_NOMATCH:
4903 case MATCH_PRUNE:
4904 case MATCH_THEN:
4905 new_start_match = start_match + 1;
4906 #ifdef SUPPORT_UTF8
4907 if (utf8)
4908 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4909 new_start_match++;
4910 #endif
4911 break;
4912
4913 /* SKIP passes back the next starting point explicitly. */
4914
4915 case MATCH_SKIP:
4916 new_start_match = md->start_match_ptr;
4917 break;
4918
4919 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4920
4921 case MATCH_COMMIT:
4922 rc = MATCH_NOMATCH;
4923 goto ENDLOOP;
4924
4925 /* Any other return is some kind of error. */
4926
4927 default:
4928 goto ENDLOOP;
4929 }
4930
4931 /* Control reaches here for the various types of "no match at this point"
4932 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4933
4934 rc = MATCH_NOMATCH;
4935
4936 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4937 newline in the subject (though it may continue over the newline). Therefore,
4938 if we have just failed to match, starting at a newline, do not continue. */
4939
4940 if (firstline && IS_NEWLINE(start_match)) break;
4941
4942 /* Advance to new matching position */
4943
4944 start_match = new_start_match;
4945
4946 /* Break the loop if the pattern is anchored or if we have passed the end of
4947 the subject. */
4948
4949 if (anchored || start_match > end_subject) break;
4950
4951 /* If we have just passed a CR and we are now at a LF, and the pattern does
4952 not contain any explicit matches for \r or \n, and the newline option is CRLF
4953 or ANY or ANYCRLF, advance the match position by one more character. */
4954
4955 if (start_match[-1] == CHAR_CR &&
4956 start_match < end_subject &&
4957 *start_match == CHAR_NL &&
4958 (re->flags & PCRE_HASCRORLF) == 0 &&
4959 (md->nltype == NLTYPE_ANY ||
4960 md->nltype == NLTYPE_ANYCRLF ||
4961 md->nllen == 2))
4962 start_match++;
4963
4964 } /* End of for(;;) "bumpalong" loop */
4965
4966 /* ==========================================================================*/
4967
4968 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4969 conditions is true:
4970
4971 (1) The pattern is anchored or the match was failed by (*COMMIT);
4972
4973 (2) We are past the end of the subject;
4974
4975 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4976 this option requests that a match occur at or before the first newline in
4977 the subject.
4978
4979 When we have a match and the offset vector is big enough to deal with any
4980 backreferences, captured substring offsets will already be set up. In the case
4981 where we had to get some local store to hold offsets for backreference
4982 processing, copy those that we can. In this case there need not be overflow if
4983 certain parts of the pattern were not used, even though there are more
4984 capturing parentheses than vector slots. */
4985
4986 ENDLOOP:
4987
4988 if (rc == MATCH_MATCH)
4989 {
4990 if (using_temporary_offsets)
4991 {
4992 if (offsetcount >= 4)
4993 {
4994 memcpy(offsets + 2, md->offset_vector + 2,
4995 (offsetcount - 2) * sizeof(int));
4996 DPRINTF(("Copied offsets from temporary memory\n"));
4997 }
4998 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4999 DPRINTF(("Freeing temporary memory\n"));
5000 (pcre_free)(md->offset_vector);
5001 }
5002
5003 /* Set the return code to the number of captured strings, or 0 if there are
5004 too many to fit into the vector. */
5005
5006 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5007
5008 /* If there is space, set up the whole thing as substring 0. The value of
5009 md->start_match_ptr might be modified if \K was encountered on the successful
5010 matching path. */
5011
5012 if (offsetcount < 2) rc = 0; else
5013 {
5014 offsets[0] = md->start_match_ptr - md->start_subject;
5015 offsets[1] = md->end_match_ptr - md->start_subject;
5016 }
5017
5018 DPRINTF((">>>> returning %d\n", rc));
5019 return rc;
5020 }
5021
5022 /* Control gets here if there has been an error, or if the overall match
5023 attempt has failed at all permitted starting positions. */
5024
5025 if (using_temporary_offsets)
5026 {
5027 DPRINTF(("Freeing temporary memory\n"));
5028 (pcre_free)(md->offset_vector);
5029 }
5030
5031 if (rc != MATCH_NOMATCH)
5032 {
5033 DPRINTF((">>>> error: returning %d\n", rc));
5034 return rc;
5035 }
5036 else if (md->partial && md->hitend)
5037 {
5038 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5039 return PCRE_ERROR_PARTIAL;
5040 }
5041 else
5042 {
5043 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5044 return PCRE_ERROR_NOMATCH;
5045 }
5046 }
5047
5048 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5