/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 435 - (show annotations)
Sat Sep 5 10:20:44 2009 UTC (10 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 161431 byte(s)
Further updates to partial matching.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match(). Each one occupies
its own bit so that they can be combined and tested with "&". */

#define match_condassert   (1 << 0)   /* Called to check a condition assertion */
#define match_cbegroup     (1 << 1)   /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1

/* Results that are private to match() and its callers in this file. They are
negative like the error codes, but far enough below zero to keep clear of the
externally defined values. */

#define MATCH_THEN         (-996)
#define MATCH_SKIP         (-997)
#define MATCH_PRUNE        (-998)
#define MATCH_COMMIT       (-999)

/* Upper limit on the number of offset-vector ints that are saved on the stack
for a recursive call; a larger offset vector is saved in malloc'd memory
instead. The offset vector always has a length that is a multiple of 3, so this
value should be a multiple of 3 as well - hence the way it is written. */

#define REC_STACK_SAVE_MAX (3 * 10)
84
/* Minimum and maximum repeat counts for the common repeats. The two tables are
indexed in parallel; in rep_max a value of 0 stands for "no upper limit". Read
as (min,max) pairs the six entries are (0,inf) (0,inf) (1,inf) (1,inf) (0,1)
(0,1) - presumably the greedy and minimizing forms of *, + and ? in opcode
order; TODO confirm against the opcode definitions. */

static const char rep_min[] = {
  0, 0,     /* zero or more */
  1, 1,     /* one or more */
  0, 0      /* zero or one */
};

static const char rep_max[] = {
  0, 0,     /* unlimited */
  0, 0,     /* unlimited */
  1, 1      /* at most one */
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Identifying numbers for the RMATCH calls: every use of RMATCH passes its own
RMnn value, numbered consecutively from 1. When this list is changed, the code
at HEAP_RETURN below must be updated in sync. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,  RM8,  RM9,  RM10, RM11, RM12,
  RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30,
  RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42,
  RM43, RM44, RM45, RM46, RM47, RM48,
  RM49, RM50, RM51, RM52, RM53, RM54
};
249
/* RMATCH and RRETURN are the "call match() again" and "return from match()"
primitives used inside match(). Two implementations are provided, selected by
NO_RECURSE.

These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef DEBUG

/* The debugging RRETURN deliberately prints no newline: the line is completed
by the "to line" message that RMATCH prints once match() has returned. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* "Call": obtain a new frame, record in the current frame where to resume
(rw), copy the arguments into the new frame, chain it to the current one, and
jump to HEAP_RECURSE. The label L_<rw> is where control comes back to.

If the frame cannot be obtained, give up with PCRE_ERROR_NOMEMORY by way of
RRETURN, so that the current frame is released and the error is handed back
like any other negative result, instead of writing through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL)\
    {\
    RRETURN(PCRE_ERROR_NOMEMORY);\
    }\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* "Return": release the current frame and make its predecessor current. If
there is a predecessor, pass the result back in rrc and jump to HEAP_RETURN;
otherwise this was the top-level frame, so really return from match(). */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. The field
names are the names of the corresponding match() arguments and variables with
an "X" prefix. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Calling frame; NULL in the top-level one */

  /* Function arguments that may change */

  USPTR Xeptr;
  const uschar *Xecode;
  USPTR Xmstart;
  int Xoffset_top;
  unsigned long int Xims;         /* Same type as the ims argument of match() */
  eptrblock *Xeptrb;
  int Xflags;
  unsigned int Xrdepth;

  /* Function local variables */

  USPTR Xcallpat;
#ifdef SUPPORT_UTF8
  USPTR Xcharptr;
#endif
  USPTR Xdata;
  USPTR Xnext;
  USPTR Xpp;
  USPTR Xprev;
  USPTR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_script;
  int Xoclength;
  uschar Xocchars[8];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to */

  int Xwhere;

} heapframe;

#endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. Both expect md, eptr and mstart to be in
scope and use RRETURN to leave match().

CHECK_PARTIAL sets the "hit end" flag if the pointer is at the end of the
subject and also past the start of the match (i.e. something has been matched).
For hard partial matching (md->partial greater than 1), it then returns
PCRE_ERROR_PARTIAL immediately.

SCHECK_PARTIAL does the same but without the end-of-subject test; it is for
use when we already know we are past the end of the subject. */

#define CHECK_PARTIAL()\
  if (eptr >= md->end_subject && eptr > mstart && md->partial != 0)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (eptr > mstart && md->partial != 0)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about uninitialized
607 variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF) /* Recursion test */
843 {
844 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845 condition = md->recursive != NULL &&
846 (offset == RREF_ANY || offset == md->recursive->group_num);
847 ecode += condition? 3 : GET(ecode, 1);
848 }
849
850 else if (condcode == OP_CREF) /* Group used test */
851 {
852 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854 ecode += condition? 3 : GET(ecode, 1);
855 }
856
857 else if (condcode == OP_DEF) /* DEFINE - always false */
858 {
859 condition = FALSE;
860 ecode += GET(ecode, 1);
861 }
862
863 /* The condition is an assertion. Call match() to evaluate it - setting
864 the final argument match_condassert causes it to stop at the end of an
865 assertion. */
866
867 else
868 {
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870 match_condassert, RM3);
871 if (rrc == MATCH_MATCH)
872 {
873 condition = TRUE;
874 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876 }
877 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 {
879 RRETURN(rrc); /* Need braces because of following else */
880 }
881 else
882 {
883 condition = FALSE;
884 ecode += codelink;
885 }
886 }
887
888 /* We are now at the branch that is to be obeyed. As there is only one,
889 we can use tail recursion to avoid using another stack frame, except when
890 match_cbegroup is required for an unlimited repeat of a possibly empty
891 group. If the second alternative doesn't exist, we can just plough on. */
892
893 if (condition || *ecode == OP_ALT)
894 {
895 ecode += 1 + LINK_SIZE;
896 if (op == OP_SCOND) /* Possibly empty group */
897 {
898 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899 RRETURN(rrc);
900 }
901 else /* Group must match something */
902 {
903 flags = 0;
904 goto TAIL_RECURSE;
905 }
906 }
907 else /* Condition false & no alternative */
908 {
909 ecode += 1 + LINK_SIZE;
910 }
911 break;
912
913
914 /* End of the pattern, either real or forced. If we are in a top-level
915 recursion, we should restore the offsets appropriately and continue from
916 after the call. */
917
918 case OP_ACCEPT:
919 case OP_END:
920 if (md->recursive != NULL && md->recursive->group_num == 0)
921 {
922 recursion_info *rec = md->recursive;
923 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 md->recursive = rec->prevrec;
925 memmove(md->offset_vector, rec->offset_save,
926 rec->saved_max * sizeof(int));
927 mstart = rec->save_start;
928 ims = original_ims;
929 ecode = rec->after_call;
930 break;
931 }
932
933 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
934 string - backtracking will then try other alternatives, if any. */
935
936 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
937 md->end_match_ptr = eptr; /* Record where we ended */
938 md->end_offset_top = offset_top; /* and how many extracts were taken */
939 md->start_match_ptr = mstart; /* and the start (\K can modify) */
940 RRETURN(MATCH_MATCH);
941
942 /* Change option settings */
943
944 case OP_OPT:
945 ims = ecode[1];
946 ecode += 2;
947 DPRINTF(("ims set to %02lx\n", ims));
948 break;
949
950 /* Assertion brackets. Check the alternative branches in turn - the
951 matching won't pass the KET for an assertion. If any one branch matches,
952 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
953 start of each branch to move the current point backwards, so the code at
954 this level is identical to the lookahead case. */
955
956 case OP_ASSERT:
957 case OP_ASSERTBACK:
958 do
959 {
960 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
961 RM4);
962 if (rrc == MATCH_MATCH) break;
963 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
964 ecode += GET(ecode, 1);
965 }
966 while (*ecode == OP_ALT);
967 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
968
969 /* If checking an assertion for a condition, return MATCH_MATCH. */
970
971 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972
973 /* Continue from after the assertion, updating the offsets high water
974 mark, since extracts may have been taken during the assertion. */
975
976 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
977 ecode += 1 + LINK_SIZE;
978 offset_top = md->end_offset_top;
979 continue;
980
981 /* Negative assertion: all branches must fail to match */
982
983 case OP_ASSERT_NOT:
984 case OP_ASSERTBACK_NOT:
985 do
986 {
987 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
988 RM5);
989 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
990 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
991 ecode += GET(ecode,1);
992 }
993 while (*ecode == OP_ALT);
994
995 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
996
997 ecode += 1 + LINK_SIZE;
998 continue;
999
1000 /* Move the subject pointer back. This occurs only at the start of
1001 each branch of a lookbehind assertion. If we are too close to the start to
1002 move back, this match function fails. When working with UTF-8 we move
1003 back a number of characters, not bytes. */
1004
1005 case OP_REVERSE:
1006 #ifdef SUPPORT_UTF8
1007 if (utf8)
1008 {
1009 i = GET(ecode, 1);
1010 while (i-- > 0)
1011 {
1012 eptr--;
1013 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1014 BACKCHAR(eptr);
1015 }
1016 }
1017 else
1018 #endif
1019
1020 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1021
1022 {
1023 eptr -= GET(ecode, 1);
1024 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1025 }
1026
1027 /* Save the earliest consulted character, then skip to next op code */
1028
1029 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1030 ecode += 1 + LINK_SIZE;
1031 break;
1032
1033 /* The callout item calls an external function, if one is provided, passing
1034 details of the match so far. This is mainly for debugging, though the
1035 function is able to force a failure. */
1036
1037 case OP_CALLOUT:
1038 if (pcre_callout != NULL)
1039 {
1040 pcre_callout_block cb;
1041 cb.version = 1; /* Version 1 of the callout block */
1042 cb.callout_number = ecode[1];
1043 cb.offset_vector = md->offset_vector;
1044 cb.subject = (PCRE_SPTR)md->start_subject;
1045 cb.subject_length = md->end_subject - md->start_subject;
1046 cb.start_match = mstart - md->start_subject;
1047 cb.current_position = eptr - md->start_subject;
1048 cb.pattern_position = GET(ecode, 2);
1049 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1050 cb.capture_top = offset_top/2;
1051 cb.capture_last = md->capture_last;
1052 cb.callout_data = md->callout_data;
1053 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1054 if (rrc < 0) RRETURN(rrc);
1055 }
1056 ecode += 2 + 2*LINK_SIZE;
1057 break;
1058
1059 /* Recursion either matches the current regex, or some subexpression. The
1060 offset data is the offset to the starting bracket from the start of the
1061 whole pattern. (This is so that it works from duplicated subpatterns.)
1062
1063 If there are any capturing brackets started but not finished, we have to
1064 save their starting points and reinstate them after the recursion. However,
1065 we don't know how many such there are (offset_top records the completed
1066 total) so we just have to save all the potential data. There may be up to
1067 65535 such values, which is too large to put on the stack, but using malloc
1068 for small numbers seems expensive. As a compromise, the stack is used when
1069 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1070 is used. If the malloc fails, the attempt is abandoned and the error
1071 PCRE_ERROR_NOMEMORY is returned, without performing the recursion or saving
1072 any offset values.
1073
1074 There are also other values that have to be saved. We use a chained
1075 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1076 for the original version of this logic. */
1077
1078 case OP_RECURSE:
1079 {
1080 callpat = md->start_code + GET(ecode, 1);
1081 new_recursive.group_num = (callpat == md->start_code)? 0 :
1082 GET2(callpat, 1 + LINK_SIZE);
1083
1084 /* Add to "recursing stack" */
1085
1086 new_recursive.prevrec = md->recursive;
1087 md->recursive = &new_recursive;
1088
1089 /* Find where to continue from afterwards */
1090
1091 ecode += 1 + LINK_SIZE;
1092 new_recursive.after_call = ecode;
1093
1094 /* Now save the offset data. */
1095
1096 new_recursive.saved_max = md->offset_end;
1097 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1098 new_recursive.offset_save = stacksave;
1099 else
1100 {
1101 new_recursive.offset_save =
1102 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1103 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1104 }
1105
1106 memcpy(new_recursive.offset_save, md->offset_vector,
1107 new_recursive.saved_max * sizeof(int));
1108 new_recursive.save_start = mstart;
1109 mstart = eptr;
1110
1111 /* OK, now we can do the recursion. For each top-level alternative we
1112 restore the offset and recursion data. */
1113
1114 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1115 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1116 do
1117 {
1118 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1119 md, ims, eptrb, flags, RM6);
1120 if (rrc == MATCH_MATCH)
1121 {
1122 DPRINTF(("Recursion matched\n"));
1123 md->recursive = new_recursive.prevrec;
1124 if (new_recursive.offset_save != stacksave)
1125 (pcre_free)(new_recursive.offset_save);
1126 RRETURN(MATCH_MATCH);
1127 }
1128 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1129 {
1130 DPRINTF(("Recursion gave error %d\n", rrc));
1131 if (new_recursive.offset_save != stacksave)
1132 (pcre_free)(new_recursive.offset_save);
1133 RRETURN(rrc);
1134 }
1135
1136 md->recursive = &new_recursive;
1137 memcpy(md->offset_vector, new_recursive.offset_save,
1138 new_recursive.saved_max * sizeof(int));
1139 callpat += GET(callpat, 1);
1140 }
1141 while (*callpat == OP_ALT);
1142
1143 DPRINTF(("Recursion didn't match\n"));
1144 md->recursive = new_recursive.prevrec;
1145 if (new_recursive.offset_save != stacksave)
1146 (pcre_free)(new_recursive.offset_save);
1147 RRETURN(MATCH_NOMATCH);
1148 }
1149 /* Control never reaches here */
1150
1151 /* "Once" brackets are like assertion brackets except that after a match,
1152 the point in the subject string is not moved back. Thus there can never be
1153 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1154 Check the alternative branches in turn - the matching won't pass the KET
1155 for this kind of subpattern. If any one branch matches, we carry on as at
1156 the end of a normal bracket, leaving the subject pointer. */
1157
1158 case OP_ONCE:
1159 prev = ecode;
1160 saved_eptr = eptr;
1161
1162 do
1163 {
1164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1165 if (rrc == MATCH_MATCH) break;
1166 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1167 ecode += GET(ecode,1);
1168 }
1169 while (*ecode == OP_ALT);
1170
1171 /* If we hit the end of the group (which could be repeated), fail */
1172
1173 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1174
1175 /* Continue as from after the assertion, updating the offsets high water
1176 mark, since extracts may have been taken. */
1177
1178 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1179
1180 offset_top = md->end_offset_top;
1181 eptr = md->end_match_ptr;
1182
1183 /* For a non-repeating ket, just continue at this level. This also
1184 happens for a repeating ket if no characters were matched in the group.
1185 This is the forcible breaking of infinite loops as implemented in Perl
1186 5.005. If there is an options reset, it will get obeyed in the normal
1187 course of events. */
1188
1189 if (*ecode == OP_KET || eptr == saved_eptr)
1190 {
1191 ecode += 1+LINK_SIZE;
1192 break;
1193 }
1194
1195 /* The repeating kets try the rest of the pattern or restart from the
1196 preceding bracket, in the appropriate order. The second "call" of match()
1197 uses tail recursion, to avoid using another stack frame. We need to reset
1198 any options that changed within the bracket before re-running it, so
1199 check the next opcode. */
1200
1201 if (ecode[1+LINK_SIZE] == OP_OPT)
1202 {
1203 ims = (ims & ~PCRE_IMS) | ecode[4];
1204 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1205 }
1206
1207 if (*ecode == OP_KETRMIN)
1208 {
1209 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1211 ecode = prev;
1212 flags = 0;
1213 goto TAIL_RECURSE;
1214 }
1215 else /* OP_KETRMAX */
1216 {
1217 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1218 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1219 ecode += 1 + LINK_SIZE;
1220 flags = 0;
1221 goto TAIL_RECURSE;
1222 }
1223 /* Control never gets here */
1224
1225 /* An alternation is the end of a branch; scan along to find the end of the
1226 bracketed group and go to there. */
1227
1228 case OP_ALT:
1229 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1230 break;
1231
1232 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1233 indicating that it may occur zero times. It may repeat infinitely, or not
1234 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1235 with fixed upper repeat limits are compiled as a number of copies, with the
1236 optional ones preceded by BRAZERO or BRAMINZERO. */
1237
1238 case OP_BRAZERO:
1239 {
1240 next = ecode+1;
1241 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1243 do next += GET(next,1); while (*next == OP_ALT);
1244 ecode = next + 1 + LINK_SIZE;
1245 }
1246 break;
1247
1248 case OP_BRAMINZERO:
1249 {
1250 next = ecode+1;
1251 do next += GET(next, 1); while (*next == OP_ALT);
1252 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1254 ecode++;
1255 }
1256 break;
1257
1258 case OP_SKIPZERO:
1259 {
1260 next = ecode+1;
1261 do next += GET(next,1); while (*next == OP_ALT);
1262 ecode = next + 1 + LINK_SIZE;
1263 }
1264 break;
1265
1266 /* End of a group, repeated or non-repeating. */
1267
1268 case OP_KET:
1269 case OP_KETRMIN:
1270 case OP_KETRMAX:
1271 prev = ecode - GET(ecode, 1);
1272
1273 /* If this was a group that remembered the subject start, in order to break
1274 infinite repeats of empty string matches, retrieve the subject start from
1275 the chain. Otherwise, set it NULL. */
1276
1277 if (*prev >= OP_SBRA)
1278 {
1279 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1280 eptrb = eptrb->epb_prev; /* Backup to previous group */
1281 }
1282 else saved_eptr = NULL;
1283
1284 /* If we are at the end of an assertion group, stop matching and return
1285 MATCH_MATCH, but record the current high water mark for use by positive
1286 assertions. Do this also for the "once" (atomic) groups. */
1287
1288 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1289 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1290 *prev == OP_ONCE)
1291 {
1292 md->end_match_ptr = eptr; /* For ONCE */
1293 md->end_offset_top = offset_top;
1294 RRETURN(MATCH_MATCH);
1295 }
1296
1297 /* For capturing groups we have to check the group number back at the start
1298 and if necessary complete handling an extraction by setting the offsets and
1299 bumping the high water mark. Note that whole-pattern recursion is coded as
1300 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1301 when the OP_END is reached. Other recursion is handled here. */
1302
1303 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1304 {
1305 number = GET2(prev, 1+LINK_SIZE);
1306 offset = number << 1;
1307
1308 #ifdef DEBUG
1309 printf("end bracket %d", number);
1310 printf("\n");
1311 #endif
1312
1313 md->capture_last = number;
1314 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1315 {
1316 md->offset_vector[offset] =
1317 md->offset_vector[md->offset_end - number];
1318 md->offset_vector[offset+1] = eptr - md->start_subject;
1319 if (offset_top <= offset) offset_top = offset + 2;
1320 }
1321
1322 /* Handle a recursively called group. Restore the offsets
1323 appropriately and continue from after the call. */
1324
1325 if (md->recursive != NULL && md->recursive->group_num == number)
1326 {
1327 recursion_info *rec = md->recursive;
1328 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1329 md->recursive = rec->prevrec;
1330 mstart = rec->save_start;
1331 memcpy(md->offset_vector, rec->offset_save,
1332 rec->saved_max * sizeof(int));
1333 ecode = rec->after_call;
1334 ims = original_ims;
1335 break;
1336 }
1337 }
1338
1339 /* For both capturing and non-capturing groups, reset the value of the ims
1340 flags, in case they got changed during the group. */
1341
1342 ims = original_ims;
1343 DPRINTF(("ims reset to %02lx\n", ims));
1344
1345 /* For a non-repeating ket, just continue at this level. This also
1346 happens for a repeating ket if no characters were matched in the group.
1347 This is the forcible breaking of infinite loops as implemented in Perl
1348 5.005. If there is an options reset, it will get obeyed in the normal
1349 course of events. */
1350
1351 if (*ecode == OP_KET || eptr == saved_eptr)
1352 {
1353 ecode += 1 + LINK_SIZE;
1354 break;
1355 }
1356
1357 /* The repeating kets try the rest of the pattern or restart from the
1358 preceding bracket, in the appropriate order. In the second case, we can use
1359 tail recursion to avoid using another stack frame, unless we have an
1360 unlimited repeat of a group that can match an empty string. */
1361
1362 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1363
1364 if (*ecode == OP_KETRMIN)
1365 {
1366 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1367 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1368 if (flags != 0) /* Could match an empty string */
1369 {
1370 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1371 RRETURN(rrc);
1372 }
1373 ecode = prev;
1374 goto TAIL_RECURSE;
1375 }
1376 else /* OP_KETRMAX */
1377 {
1378 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1379 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1380 ecode += 1 + LINK_SIZE;
1381 flags = 0;
1382 goto TAIL_RECURSE;
1383 }
1384 /* Control never gets here */
1385
1386 /* Start of subject unless notbol, or after internal newline if multiline */
1387
1388 case OP_CIRC:
1389 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1390 if ((ims & PCRE_MULTILINE) != 0)
1391 {
1392 if (eptr != md->start_subject &&
1393 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1394 RRETURN(MATCH_NOMATCH);
1395 ecode++;
1396 break;
1397 }
1398 /* ... else fall through */
1399
1400 /* Start of subject assertion */
1401
1402 case OP_SOD:
1403 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1404 ecode++;
1405 break;
1406
1407 /* Start of match assertion */
1408
1409 case OP_SOM:
1410 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1411 ecode++;
1412 break;
1413
1414 /* Reset the start of match point */
1415
1416 case OP_SET_SOM:
1417 mstart = eptr;
1418 ecode++;
1419 break;
1420
1421 /* Assert before internal newline if multiline, or before a terminating
1422 newline unless endonly is set, else end of subject unless noteol is set. */
1423
1424 case OP_DOLL:
1425 if ((ims & PCRE_MULTILINE) != 0)
1426 {
1427 if (eptr < md->end_subject)
1428 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1429 else
1430 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1431 ecode++;
1432 break;
1433 }
1434 else
1435 {
1436 if (md->noteol) RRETURN(MATCH_NOMATCH);
1437 if (!md->endonly)
1438 {
1439 if (eptr != md->end_subject &&
1440 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1441 RRETURN(MATCH_NOMATCH);
1442 ecode++;
1443 break;
1444 }
1445 }
1446 /* ... else fall through for endonly */
1447
1448 /* End of subject assertion (\z) */
1449
1450 case OP_EOD:
1451 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1452 ecode++;
1453 break;
1454
1455 /* End of subject or ending \n assertion (\Z) */
1456
1457 case OP_EODN:
1458 if (eptr != md->end_subject &&
1459 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1460 RRETURN(MATCH_NOMATCH);
1461 ecode++;
1462 break;
1463
1464 /* Word boundary assertions */
1465
1466 case OP_NOT_WORD_BOUNDARY:
1467 case OP_WORD_BOUNDARY:
1468 {
1469
1470 /* Find out if the previous and current characters are "word" characters.
1471 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1472 be "non-word" characters. Remember the earliest consulted character for
1473 partial matching. */
1474
1475 #ifdef SUPPORT_UTF8
1476 if (utf8)
1477 {
1478 if (eptr == md->start_subject) prev_is_word = FALSE; else
1479 {
1480 USPTR lastptr = eptr - 1;
1481 while((*lastptr & 0xc0) == 0x80) lastptr--;
1482 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1483 GETCHAR(c, lastptr);
1484 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1485 }
1486 if (eptr >= md->end_subject)
1487 {
1488 SCHECK_PARTIAL();
1489 cur_is_word = FALSE;
1490 }
1491 else
1492 {
1493 GETCHAR(c, eptr);
1494 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1495 }
1496 }
1497 else
1498 #endif
1499
1500 /* Not in UTF-8 mode */
1501
1502 {
1503 if (eptr == md->start_subject) prev_is_word = FALSE; else
1504 {
1505 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1506 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1507 }
1508 if (eptr >= md->end_subject)
1509 {
1510 SCHECK_PARTIAL();
1511 cur_is_word = FALSE;
1512 }
1513 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1514 }
1515
1516 /* Now see if the situation is what we want */
1517
1518 if ((*ecode++ == OP_WORD_BOUNDARY)?
1519 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1520 RRETURN(MATCH_NOMATCH);
1521 }
1522 break;
1523
1524 /* Match a single character type; inline for speed */
1525
1526 case OP_ANY:
1527 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1528 /* Fall through */
1529
1530 case OP_ALLANY:
1531 if (eptr++ >= md->end_subject)
1532 {
1533 SCHECK_PARTIAL();
1534 RRETURN(MATCH_NOMATCH);
1535 }
1536 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1537 ecode++;
1538 break;
1539
1540 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1541 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1542
1543 case OP_ANYBYTE:
1544 if (eptr++ >= md->end_subject)
1545 {
1546 SCHECK_PARTIAL();
1547 RRETURN(MATCH_NOMATCH);
1548 }
1549 ecode++;
1550 break;
1551
1552 case OP_NOT_DIGIT:
1553 if (eptr >= md->end_subject)
1554 {
1555 SCHECK_PARTIAL();
1556 RRETURN(MATCH_NOMATCH);
1557 }
1558 GETCHARINCTEST(c, eptr);
1559 if (
1560 #ifdef SUPPORT_UTF8
1561 c < 256 &&
1562 #endif
1563 (md->ctypes[c] & ctype_digit) != 0
1564 )
1565 RRETURN(MATCH_NOMATCH);
1566 ecode++;
1567 break;
1568
1569 case OP_DIGIT:
1570 if (eptr >= md->end_subject)
1571 {
1572 SCHECK_PARTIAL();
1573 RRETURN(MATCH_NOMATCH);
1574 }
1575 GETCHARINCTEST(c, eptr);
1576 if (
1577 #ifdef SUPPORT_UTF8
1578 c >= 256 ||
1579 #endif
1580 (md->ctypes[c] & ctype_digit) == 0
1581 )
1582 RRETURN(MATCH_NOMATCH);
1583 ecode++;
1584 break;
1585
1586 case OP_NOT_WHITESPACE:
1587 if (eptr >= md->end_subject)
1588 {
1589 SCHECK_PARTIAL();
1590 RRETURN(MATCH_NOMATCH);
1591 }
1592 GETCHARINCTEST(c, eptr);
1593 if (
1594 #ifdef SUPPORT_UTF8
1595 c < 256 &&
1596 #endif
1597 (md->ctypes[c] & ctype_space) != 0
1598 )
1599 RRETURN(MATCH_NOMATCH);
1600 ecode++;
1601 break;
1602
1603 case OP_WHITESPACE:
1604 if (eptr >= md->end_subject)
1605 {
1606 SCHECK_PARTIAL();
1607 RRETURN(MATCH_NOMATCH);
1608 }
1609 GETCHARINCTEST(c, eptr);
1610 if (
1611 #ifdef SUPPORT_UTF8
1612 c >= 256 ||
1613 #endif
1614 (md->ctypes[c] & ctype_space) == 0
1615 )
1616 RRETURN(MATCH_NOMATCH);
1617 ecode++;
1618 break;
1619
1620 case OP_NOT_WORDCHAR:
1621 if (eptr >= md->end_subject)
1622 {
1623 SCHECK_PARTIAL();
1624 RRETURN(MATCH_NOMATCH);
1625 }
1626 GETCHARINCTEST(c, eptr);
1627 if (
1628 #ifdef SUPPORT_UTF8
1629 c < 256 &&
1630 #endif
1631 (md->ctypes[c] & ctype_word) != 0
1632 )
1633 RRETURN(MATCH_NOMATCH);
1634 ecode++;
1635 break;
1636
1637 case OP_WORDCHAR:
1638 if (eptr >= md->end_subject)
1639 {
1640 SCHECK_PARTIAL();
1641 RRETURN(MATCH_NOMATCH);
1642 }
1643 GETCHARINCTEST(c, eptr);
1644 if (
1645 #ifdef SUPPORT_UTF8
1646 c >= 256 ||
1647 #endif
1648 (md->ctypes[c] & ctype_word) == 0
1649 )
1650 RRETURN(MATCH_NOMATCH);
1651 ecode++;
1652 break;
1653
1654 case OP_ANYNL:
1655 if (eptr >= md->end_subject)
1656 {
1657 SCHECK_PARTIAL();
1658 RRETURN(MATCH_NOMATCH);
1659 }
1660 GETCHARINCTEST(c, eptr);
1661 switch(c)
1662 {
1663 default: RRETURN(MATCH_NOMATCH);
1664 case 0x000d:
1665 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1666 break;
1667
1668 case 0x000a:
1669 break;
1670
1671 case 0x000b:
1672 case 0x000c:
1673 case 0x0085:
1674 case 0x2028:
1675 case 0x2029:
1676 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1677 break;
1678 }
1679 ecode++;
1680 break;
1681
1682 case OP_NOT_HSPACE:
1683 if (eptr >= md->end_subject)
1684 {
1685 SCHECK_PARTIAL();
1686 RRETURN(MATCH_NOMATCH);
1687 }
1688 GETCHARINCTEST(c, eptr);
1689 switch(c)
1690 {
1691 default: break;
1692 case 0x09: /* HT */
1693 case 0x20: /* SPACE */
1694 case 0xa0: /* NBSP */
1695 case 0x1680: /* OGHAM SPACE MARK */
1696 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1697 case 0x2000: /* EN QUAD */
1698 case 0x2001: /* EM QUAD */
1699 case 0x2002: /* EN SPACE */
1700 case 0x2003: /* EM SPACE */
1701 case 0x2004: /* THREE-PER-EM SPACE */
1702 case 0x2005: /* FOUR-PER-EM SPACE */
1703 case 0x2006: /* SIX-PER-EM SPACE */
1704 case 0x2007: /* FIGURE SPACE */
1705 case 0x2008: /* PUNCTUATION SPACE */
1706 case 0x2009: /* THIN SPACE */
1707 case 0x200A: /* HAIR SPACE */
1708 case 0x202f: /* NARROW NO-BREAK SPACE */
1709 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1710 case 0x3000: /* IDEOGRAPHIC SPACE */
1711 RRETURN(MATCH_NOMATCH);
1712 }
1713 ecode++;
1714 break;
1715
1716 case OP_HSPACE:
1717 if (eptr >= md->end_subject)
1718 {
1719 SCHECK_PARTIAL();
1720 RRETURN(MATCH_NOMATCH);
1721 }
1722 GETCHARINCTEST(c, eptr);
1723 switch(c)
1724 {
1725 default: RRETURN(MATCH_NOMATCH);
1726 case 0x09: /* HT */
1727 case 0x20: /* SPACE */
1728 case 0xa0: /* NBSP */
1729 case 0x1680: /* OGHAM SPACE MARK */
1730 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1731 case 0x2000: /* EN QUAD */
1732 case 0x2001: /* EM QUAD */
1733 case 0x2002: /* EN SPACE */
1734 case 0x2003: /* EM SPACE */
1735 case 0x2004: /* THREE-PER-EM SPACE */
1736 case 0x2005: /* FOUR-PER-EM SPACE */
1737 case 0x2006: /* SIX-PER-EM SPACE */
1738 case 0x2007: /* FIGURE SPACE */
1739 case 0x2008: /* PUNCTUATION SPACE */
1740 case 0x2009: /* THIN SPACE */
1741 case 0x200A: /* HAIR SPACE */
1742 case 0x202f: /* NARROW NO-BREAK SPACE */
1743 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1744 case 0x3000: /* IDEOGRAPHIC SPACE */
1745 break;
1746 }
1747 ecode++;
1748 break;
1749
1750 case OP_NOT_VSPACE:
1751 if (eptr >= md->end_subject)
1752 {
1753 SCHECK_PARTIAL();
1754 RRETURN(MATCH_NOMATCH);
1755 }
1756 GETCHARINCTEST(c, eptr);
1757 switch(c)
1758 {
1759 default: break;
1760 case 0x0a: /* LF */
1761 case 0x0b: /* VT */
1762 case 0x0c: /* FF */
1763 case 0x0d: /* CR */
1764 case 0x85: /* NEL */
1765 case 0x2028: /* LINE SEPARATOR */
1766 case 0x2029: /* PARAGRAPH SEPARATOR */
1767 RRETURN(MATCH_NOMATCH);
1768 }
1769 ecode++;
1770 break;
1771
1772 case OP_VSPACE:
1773 if (eptr >= md->end_subject)
1774 {
1775 SCHECK_PARTIAL();
1776 RRETURN(MATCH_NOMATCH);
1777 }
1778 GETCHARINCTEST(c, eptr);
1779 switch(c)
1780 {
1781 default: RRETURN(MATCH_NOMATCH);
1782 case 0x0a: /* LF */
1783 case 0x0b: /* VT */
1784 case 0x0c: /* FF */
1785 case 0x0d: /* CR */
1786 case 0x85: /* NEL */
1787 case 0x2028: /* LINE SEPARATOR */
1788 case 0x2029: /* PARAGRAPH SEPARATOR */
1789 break;
1790 }
1791 ecode++;
1792 break;
1793
1794 #ifdef SUPPORT_UCP
1795 /* Check the next character by Unicode property. We will get here only
1796 if the support is in the binary; otherwise a compile-time error occurs. */
1797
1798 case OP_PROP:
1799 case OP_NOTPROP:
1800 if (eptr >= md->end_subject)
1801 {
1802 SCHECK_PARTIAL();
1803 RRETURN(MATCH_NOMATCH);
1804 }
1805 GETCHARINCTEST(c, eptr);
1806 {
1807 const ucd_record *prop = GET_UCD(c);
1808
1809 switch(ecode[1])
1810 {
1811 case PT_ANY:
1812 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1813 break;
1814
1815 case PT_LAMP:
1816 if ((prop->chartype == ucp_Lu ||
1817 prop->chartype == ucp_Ll ||
1818 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1819 RRETURN(MATCH_NOMATCH);
1820 break;
1821
1822 case PT_GC:
1823 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1824 RRETURN(MATCH_NOMATCH);
1825 break;
1826
1827 case PT_PC:
1828 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1829 RRETURN(MATCH_NOMATCH);
1830 break;
1831
1832 case PT_SC:
1833 if ((ecode[2] != prop->script) == (op == OP_PROP))
1834 RRETURN(MATCH_NOMATCH);
1835 break;
1836
1837 default:
1838 RRETURN(PCRE_ERROR_INTERNAL);
1839 }
1840
1841 ecode += 3;
1842 }
1843 break;
1844
1845 /* Match an extended Unicode sequence. We will get here only if the support
1846 is in the binary; otherwise a compile-time error occurs. */
1847
1848 case OP_EXTUNI:
1849 if (eptr >= md->end_subject)
1850 {
1851 SCHECK_PARTIAL();
1852 RRETURN(MATCH_NOMATCH);
1853 }
1854 GETCHARINCTEST(c, eptr);
1855 {
1856 int category = UCD_CATEGORY(c);
1857 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1858 while (eptr < md->end_subject)
1859 {
1860 int len = 1;
1861 if (!utf8) c = *eptr; else
1862 {
1863 GETCHARLEN(c, eptr, len);
1864 }
1865 category = UCD_CATEGORY(c);
1866 if (category != ucp_M) break;
1867 eptr += len;
1868 }
1869 }
1870 ecode++;
1871 break;
1872 #endif
1873
1874
1875 /* Match a back reference, possibly repeatedly. Look past the end of the
1876 item to see if there is repeat information following. The code is similar
1877 to that for character classes, but repeated for efficiency. Then obey
1878 similar code to character type repeats - written out again for speed.
1879 However, if the referenced string is the empty string, always treat
1880 it as matched, any number of times (otherwise there could be infinite
1881 loops). */
1882
1883 case OP_REF:
1884 {
1885 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1886 ecode += 3;
1887
1888 /* If the reference is unset, there are two possibilities:
1889
1890 (a) In the default, Perl-compatible state, set the length to be longer
1891 than the amount of subject left; this ensures that every attempt at a
1892 match fails. We can't just fail here, because of the possibility of
1893 quantifiers with zero minima.
1894
1895 (b) If the JavaScript compatibility flag is set, set the length to zero
1896 so that the back reference matches an empty string.
1897
1898 Otherwise, set the length to the length of what was matched by the
1899 referenced subpattern. */
1900
1901 if (offset >= offset_top || md->offset_vector[offset] < 0)
1902 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1903 else
1904 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1905
1906 /* Set up for repetition, or handle the non-repeated case */
1907
1908 switch (*ecode)
1909 {
1910 case OP_CRSTAR:
1911 case OP_CRMINSTAR:
1912 case OP_CRPLUS:
1913 case OP_CRMINPLUS:
1914 case OP_CRQUERY:
1915 case OP_CRMINQUERY:
1916 c = *ecode++ - OP_CRSTAR;
1917 minimize = (c & 1) != 0;
1918 min = rep_min[c]; /* Pick up values from tables; */
1919 max = rep_max[c]; /* zero for max => infinity */
1920 if (max == 0) max = INT_MAX;
1921 break;
1922
1923 case OP_CRRANGE:
1924 case OP_CRMINRANGE:
1925 minimize = (*ecode == OP_CRMINRANGE);
1926 min = GET2(ecode, 1);
1927 max = GET2(ecode, 3);
1928 if (max == 0) max = INT_MAX;
1929 ecode += 5;
1930 break;
1931
1932 default: /* No repeat follows */
1933 if (!match_ref(offset, eptr, length, md, ims))
1934 {
1935 CHECK_PARTIAL();
1936 RRETURN(MATCH_NOMATCH);
1937 }
1938 eptr += length;
1939 continue; /* With the main loop */
1940 }
1941
1942 /* If the length of the reference is zero, just continue with the
1943 main loop. */
1944
1945 if (length == 0) continue;
1946
1947 /* First, ensure the minimum number of matches are present. We get back
1948 the length of the reference string explicitly rather than passing the
1949 address of eptr, so that eptr can be a register variable. */
1950
1951 for (i = 1; i <= min; i++)
1952 {
1953 if (!match_ref(offset, eptr, length, md, ims))
1954 {
1955 CHECK_PARTIAL();
1956 RRETURN(MATCH_NOMATCH);
1957 }
1958 eptr += length;
1959 }
1960
1961 /* If min = max, continue at the same level without recursion.
1962 They are not both allowed to be zero. */
1963
1964 if (min == max) continue;
1965
1966 /* If minimizing, keep trying and advancing the pointer */
1967
1968 if (minimize)
1969 {
1970 for (fi = min;; fi++)
1971 {
1972 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974 if (fi >= max) RRETURN(MATCH_NOMATCH);
1975 if (!match_ref(offset, eptr, length, md, ims))
1976 {
1977 CHECK_PARTIAL();
1978 RRETURN(MATCH_NOMATCH);
1979 }
1980 eptr += length;
1981 }
1982 /* Control never gets here */
1983 }
1984
1985 /* If maximizing, find the longest string and work backwards */
1986
1987 else
1988 {
1989 pp = eptr;
1990 for (i = min; i < max; i++)
1991 {
1992 if (!match_ref(offset, eptr, length, md, ims)) break;
1993 eptr += length;
1994 }
1995 while (eptr >= pp)
1996 {
1997 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1999 eptr -= length;
2000 }
2001 RRETURN(MATCH_NOMATCH);
2002 }
2003 }
2004 /* Control never gets here */
2005
2006 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2007 used when all the characters in the class have values in the range 0-255,
2008 and either the matching is caseful, or the characters are in the range
2009 0-127 when UTF-8 processing is enabled. The only difference between
2010 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2011 encountered.
2012
2013 First, look past the end of the item to see if there is repeat information
2014 following. Then obey similar code to character type repeats - written out
2015 again for speed. */
2016
2017 case OP_NCLASS:
2018 case OP_CLASS:
2019 {
2020 data = ecode + 1; /* Save for matching */
2021 ecode += 33; /* Advance past the item */
2022
2023 switch (*ecode)
2024 {
2025 case OP_CRSTAR:
2026 case OP_CRMINSTAR:
2027 case OP_CRPLUS:
2028 case OP_CRMINPLUS:
2029 case OP_CRQUERY:
2030 case OP_CRMINQUERY:
2031 c = *ecode++ - OP_CRSTAR;
2032 minimize = (c & 1) != 0;
2033 min = rep_min[c]; /* Pick up values from tables; */
2034 max = rep_max[c]; /* zero for max => infinity */
2035 if (max == 0) max = INT_MAX;
2036 break;
2037
2038 case OP_CRRANGE:
2039 case OP_CRMINRANGE:
2040 minimize = (*ecode == OP_CRMINRANGE);
2041 min = GET2(ecode, 1);
2042 max = GET2(ecode, 3);
2043 if (max == 0) max = INT_MAX;
2044 ecode += 5;
2045 break;
2046
2047 default: /* No repeat follows */
2048 min = max = 1;
2049 break;
2050 }
2051
2052 /* First, ensure the minimum number of matches are present. */
2053
2054 #ifdef SUPPORT_UTF8
2055 /* UTF-8 mode */
2056 if (utf8)
2057 {
2058 for (i = 1; i <= min; i++)
2059 {
2060 if (eptr >= md->end_subject)
2061 {
2062 SCHECK_PARTIAL();
2063 RRETURN(MATCH_NOMATCH);
2064 }
2065 GETCHARINC(c, eptr);
2066 if (c > 255)
2067 {
2068 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2069 }
2070 else
2071 {
2072 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2073 }
2074 }
2075 }
2076 else
2077 #endif
2078 /* Not UTF-8 mode */
2079 {
2080 for (i = 1; i <= min; i++)
2081 {
2082 if (eptr >= md->end_subject)
2083 {
2084 SCHECK_PARTIAL();
2085 RRETURN(MATCH_NOMATCH);
2086 }
2087 c = *eptr++;
2088 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2089 }
2090 }
2091
2092 /* If max == min we can continue with the main loop without the
2093 need to recurse. */
2094
2095 if (min == max) continue;
2096
2097 /* If minimizing, keep testing the rest of the expression and advancing
2098 the pointer while it matches the class. */
2099
2100 if (minimize)
2101 {
2102 #ifdef SUPPORT_UTF8
2103 /* UTF-8 mode */
2104 if (utf8)
2105 {
2106 for (fi = min;; fi++)
2107 {
2108 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2109 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2110 if (fi >= max) RRETURN(MATCH_NOMATCH);
2111 if (eptr >= md->end_subject)
2112 {
2113 SCHECK_PARTIAL();
2114 RRETURN(MATCH_NOMATCH);
2115 }
2116 GETCHARINC(c, eptr);
2117 if (c > 255)
2118 {
2119 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2120 }
2121 else
2122 {
2123 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2124 }
2125 }
2126 }
2127 else
2128 #endif
2129 /* Not UTF-8 mode */
2130 {
2131 for (fi = min;; fi++)
2132 {
2133 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2134 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2135 if (fi >= max) RRETURN(MATCH_NOMATCH);
2136 if (eptr >= md->end_subject)
2137 {
2138 SCHECK_PARTIAL();
2139 RRETURN(MATCH_NOMATCH);
2140 }
2141 c = *eptr++;
2142 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2143 }
2144 }
2145 /* Control never gets here */
2146 }
2147
2148 /* If maximizing, find the longest possible run, then work backwards. */
2149
2150 else
2151 {
2152 pp = eptr;
2153
2154 #ifdef SUPPORT_UTF8
2155 /* UTF-8 mode */
2156 if (utf8)
2157 {
2158 for (i = min; i < max; i++)
2159 {
2160 int len = 1;
2161 if (eptr >= md->end_subject) break;
2162 GETCHARLEN(c, eptr, len);
2163 if (c > 255)
2164 {
2165 if (op == OP_CLASS) break;
2166 }
2167 else
2168 {
2169 if ((data[c/8] & (1 << (c&7))) == 0) break;
2170 }
2171 eptr += len;
2172 }
2173 for (;;)
2174 {
2175 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2176 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2177 if (eptr-- == pp) break; /* Stop if tried at original pos */
2178 BACKCHAR(eptr);
2179 }
2180 }
2181 else
2182 #endif
2183 /* Not UTF-8 mode */
2184 {
2185 for (i = min; i < max; i++)
2186 {
2187 if (eptr >= md->end_subject) break;
2188 c = *eptr;
2189 if ((data[c/8] & (1 << (c&7))) == 0) break;
2190 eptr++;
2191 }
2192 while (eptr >= pp)
2193 {
2194 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2195 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2196 eptr--;
2197 }
2198 }
2199
2200 RRETURN(MATCH_NOMATCH);
2201 }
2202 }
2203 /* Control never gets here */
2204
2205
2206 /* Match an extended character class. This opcode is encountered only
2207 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2208 mode, because Unicode properties are supported in non-UTF-8 mode. */
2209
2210 #ifdef SUPPORT_UTF8
2211 case OP_XCLASS:
2212 {
2213 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2214 ecode += GET(ecode, 1); /* Advance past the item */
2215
2216 switch (*ecode)
2217 {
2218 case OP_CRSTAR:
2219 case OP_CRMINSTAR:
2220 case OP_CRPLUS:
2221 case OP_CRMINPLUS:
2222 case OP_CRQUERY:
2223 case OP_CRMINQUERY:
2224 c = *ecode++ - OP_CRSTAR;
2225 minimize = (c & 1) != 0;
2226 min = rep_min[c]; /* Pick up values from tables; */
2227 max = rep_max[c]; /* zero for max => infinity */
2228 if (max == 0) max = INT_MAX;
2229 break;
2230
2231 case OP_CRRANGE:
2232 case OP_CRMINRANGE:
2233 minimize = (*ecode == OP_CRMINRANGE);
2234 min = GET2(ecode, 1);
2235 max = GET2(ecode, 3);
2236 if (max == 0) max = INT_MAX;
2237 ecode += 5;
2238 break;
2239
2240 default: /* No repeat follows */
2241 min = max = 1;
2242 break;
2243 }
2244
2245 /* First, ensure the minimum number of matches are present. */
2246
2247 for (i = 1; i <= min; i++)
2248 {
2249 if (eptr >= md->end_subject)
2250 {
2251 SCHECK_PARTIAL();
2252 RRETURN(MATCH_NOMATCH);
2253 }
2254 GETCHARINCTEST(c, eptr);
2255 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2256 }
2257
2258 /* If max == min we can continue with the main loop without the
2259 need to recurse. */
2260
2261 if (min == max) continue;
2262
2263 /* If minimizing, keep testing the rest of the expression and advancing
2264 the pointer while it matches the class. */
2265
2266 if (minimize)
2267 {
2268 for (fi = min;; fi++)
2269 {
2270 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2271 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2272 if (fi >= max) RRETURN(MATCH_NOMATCH);
2273 if (eptr >= md->end_subject)
2274 {
2275 SCHECK_PARTIAL();
2276 RRETURN(MATCH_NOMATCH);
2277 }
2278 GETCHARINCTEST(c, eptr);
2279 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2280 }
2281 /* Control never gets here */
2282 }
2283
2284 /* If maximizing, find the longest possible run, then work backwards. */
2285
2286 else
2287 {
2288 pp = eptr;
2289 for (i = min; i < max; i++)
2290 {
2291 int len = 1;
2292 if (eptr >= md->end_subject) break;
2293 GETCHARLENTEST(c, eptr, len);
2294 if (!_pcre_xclass(c, data)) break;
2295 eptr += len;
2296 }
2297 for(;;)
2298 {
2299 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2300 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2301 if (eptr-- == pp) break; /* Stop if tried at original pos */
2302 if (utf8) BACKCHAR(eptr);
2303 }
2304 RRETURN(MATCH_NOMATCH);
2305 }
2306
2307 /* Control never gets here */
2308 }
2309 #endif /* End of XCLASS */
2310
2311 /* Match a single character, casefully */
2312
2313 case OP_CHAR:
2314 #ifdef SUPPORT_UTF8
2315 if (utf8)
2316 {
2317 length = 1;
2318 ecode++;
2319 GETCHARLEN(fc, ecode, length);
2320 if (length > md->end_subject - eptr)
2321 {
2322 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2323 RRETURN(MATCH_NOMATCH);
2324 }
2325 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2326 }
2327 else
2328 #endif
2329
2330 /* Non-UTF-8 mode */
2331 {
2332 if (md->end_subject - eptr < 1)
2333 {
2334 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2335 RRETURN(MATCH_NOMATCH);
2336 }
2337 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2338 ecode += 2;
2339 }
2340 break;
2341
2342 /* Match a single character, caselessly */
2343
2344 case OP_CHARNC:
2345 #ifdef SUPPORT_UTF8
2346 if (utf8)
2347 {
2348 length = 1;
2349 ecode++;
2350 GETCHARLEN(fc, ecode, length);
2351
2352 if (length > md->end_subject - eptr)
2353 {
2354 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2355 RRETURN(MATCH_NOMATCH);
2356 }
2357
2358 /* If the pattern character's value is < 128, we have only one byte, and
2359 can use the fast lookup table. */
2360
2361 if (fc < 128)
2362 {
2363 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2364 }
2365
2366 /* Otherwise we must pick up the subject character */
2367
2368 else
2369 {
2370 unsigned int dc;
2371 GETCHARINC(dc, eptr);
2372 ecode += length;
2373
2374 /* If we have Unicode property support, we can use it to test the other
2375 case of the character, if there is one. */
2376
2377 if (fc != dc)
2378 {
2379 #ifdef SUPPORT_UCP
2380 if (dc != UCD_OTHERCASE(fc))
2381 #endif
2382 RRETURN(MATCH_NOMATCH);
2383 }
2384 }
2385 }
2386 else
2387 #endif /* SUPPORT_UTF8 */
2388
2389 /* Non-UTF-8 mode */
2390 {
2391 if (md->end_subject - eptr < 1)
2392 {
2393 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2397 ecode += 2;
2398 }
2399 break;
2400
2401 /* Match a single character repeatedly. */
2402
2403 case OP_EXACT:
2404 min = max = GET2(ecode, 1);
2405 ecode += 3;
2406 goto REPEATCHAR;
2407
2408 case OP_POSUPTO:
2409 possessive = TRUE;
2410 /* Fall through */
2411
2412 case OP_UPTO:
2413 case OP_MINUPTO:
2414 min = 0;
2415 max = GET2(ecode, 1);
2416 minimize = *ecode == OP_MINUPTO;
2417 ecode += 3;
2418 goto REPEATCHAR;
2419
2420 case OP_POSSTAR:
2421 possessive = TRUE;
2422 min = 0;
2423 max = INT_MAX;
2424 ecode++;
2425 goto REPEATCHAR;
2426
2427 case OP_POSPLUS:
2428 possessive = TRUE;
2429 min = 1;
2430 max = INT_MAX;
2431 ecode++;
2432 goto REPEATCHAR;
2433
2434 case OP_POSQUERY:
2435 possessive = TRUE;
2436 min = 0;
2437 max = 1;
2438 ecode++;
2439 goto REPEATCHAR;
2440
2441 case OP_STAR:
2442 case OP_MINSTAR:
2443 case OP_PLUS:
2444 case OP_MINPLUS:
2445 case OP_QUERY:
2446 case OP_MINQUERY:
2447 c = *ecode++ - OP_STAR;
2448 minimize = (c & 1) != 0;
2449
2450 min = rep_min[c]; /* Pick up values from tables; */
2451 max = rep_max[c]; /* zero for max => infinity */
2452 if (max == 0) max = INT_MAX;
2453
2454 /* Common code for all repeated single-character matches. */
2455
2456 REPEATCHAR:
2457 #ifdef SUPPORT_UTF8
2458 if (utf8)
2459 {
2460 length = 1;
2461 charptr = ecode;
2462 GETCHARLEN(fc, ecode, length);
2463 ecode += length;
2464
2465 /* Handle multibyte character matching specially here. There is
2466 support for caseless matching if UCP support is present. */
2467
2468 if (length > 1)
2469 {
2470 #ifdef SUPPORT_UCP
2471 unsigned int othercase;
2472 if ((ims & PCRE_CASELESS) != 0 &&
2473 (othercase = UCD_OTHERCASE(fc)) != fc)
2474 oclength = _pcre_ord2utf8(othercase, occhars);
2475 else oclength = 0;
2476 #endif /* SUPPORT_UCP */
2477
2478 for (i = 1; i <= min; i++)
2479 {
2480 if (eptr <= md->end_subject - length &&
2481 memcmp(eptr, charptr, length) == 0) eptr += length;
2482 #ifdef SUPPORT_UCP
2483 else if (oclength > 0 &&
2484 eptr <= md->end_subject - oclength &&
2485 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2486 #endif /* SUPPORT_UCP */
2487 else
2488 {
2489 CHECK_PARTIAL();
2490 RRETURN(MATCH_NOMATCH);
2491 }
2492 }
2493
2494 if (min == max) continue;
2495
2496 if (minimize)
2497 {
2498 for (fi = min;; fi++)
2499 {
2500 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2501 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2502 if (fi >= max) RRETURN(MATCH_NOMATCH);
2503 if (eptr <= md->end_subject - length &&
2504 memcmp(eptr, charptr, length) == 0) eptr += length;
2505 #ifdef SUPPORT_UCP
2506 else if (oclength > 0 &&
2507 eptr <= md->end_subject - oclength &&
2508 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2509 #endif /* SUPPORT_UCP */
2510 else
2511 {
2512 CHECK_PARTIAL();
2513 RRETURN(MATCH_NOMATCH);
2514 }
2515 }
2516 /* Control never gets here */
2517 }
2518
2519 else /* Maximize */
2520 {
2521 pp = eptr;
2522 for (i = min; i < max; i++)
2523 {
2524 if (eptr <= md->end_subject - length &&
2525 memcmp(eptr, charptr, length) == 0) eptr += length;
2526 #ifdef SUPPORT_UCP
2527 else if (oclength > 0 &&
2528 eptr <= md->end_subject - oclength &&
2529 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2530 #endif /* SUPPORT_UCP */
2531 else break;
2532 }
2533
2534 if (possessive) continue;
2535
2536 for(;;)
2537 {
2538 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2540 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2541 #ifdef SUPPORT_UCP
2542 eptr--;
2543 BACKCHAR(eptr);
2544 #else /* without SUPPORT_UCP */
2545 eptr -= length;
2546 #endif /* SUPPORT_UCP */
2547 }
2548 }
2549 /* Control never gets here */
2550 }
2551
2552 /* If the length of a UTF-8 character is 1, we fall through here, and
2553 obey the code as for non-UTF-8 characters below, though in this case the
2554 value of fc will always be < 128. */
2555 }
2556 else
2557 #endif /* SUPPORT_UTF8 */
2558
2559 /* When not in UTF-8 mode, load a single-byte character. */
2560
2561 fc = *ecode++;
2562
2563 /* The value of fc at this point is always less than 256, though we may or
2564 may not be in UTF-8 mode. The code is duplicated for the caseless and
2565 caseful cases, for speed, since matching characters is likely to be quite
2566 common. First, ensure the minimum number of matches are present. If min =
2567 max, continue at the same level without recursing. Otherwise, if
2568 minimizing, keep trying the rest of the expression and advancing one
2569 matching character if failing, up to the maximum. Alternatively, if
2570 maximizing, find the maximum number of characters and work backwards. */
2571
2572 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2573 max, eptr));
2574
2575 if ((ims & PCRE_CASELESS) != 0)
2576 {
2577 fc = md->lcc[fc];
2578 for (i = 1; i <= min; i++)
2579 {
2580 if (eptr >= md->end_subject)
2581 {
2582 SCHECK_PARTIAL();
2583 RRETURN(MATCH_NOMATCH);
2584 }
2585 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2586 }
2587 if (min == max) continue;
2588 if (minimize)
2589 {
2590 for (fi = min;; fi++)
2591 {
2592 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2593 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2594 if (fi >= max) RRETURN(MATCH_NOMATCH);
2595 if (eptr >= md->end_subject)
2596 {
2597 SCHECK_PARTIAL();
2598 RRETURN(MATCH_NOMATCH);
2599 }
2600 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2601 }
2602 /* Control never gets here */
2603 }
2604 else /* Maximize */
2605 {
2606 pp = eptr;
2607 for (i = min; i < max; i++)
2608 {
2609 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2610 eptr++;
2611 }
2612
2613 if (possessive) continue;
2614
2615 while (eptr >= pp)
2616 {
2617 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2618 eptr--;
2619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2620 }
2621 RRETURN(MATCH_NOMATCH);
2622 }
2623 /* Control never gets here */
2624 }
2625
2626 /* Caseful comparisons (includes all multi-byte characters) */
2627
2628 else
2629 {
2630 for (i = 1; i <= min; i++)
2631 {
2632 if (eptr >= md->end_subject)
2633 {
2634 SCHECK_PARTIAL();
2635 RRETURN(MATCH_NOMATCH);
2636 }
2637 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2638 }
2639
2640 if (min == max) continue;
2641
2642 if (minimize)
2643 {
2644 for (fi = min;; fi++)
2645 {
2646 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2647 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2648 if (fi >= max) RRETURN(MATCH_NOMATCH);
2649 if (eptr >= md->end_subject)
2650 {
2651 SCHECK_PARTIAL();
2652 RRETURN(MATCH_NOMATCH);
2653 }
2654 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2655 }
2656 /* Control never gets here */
2657 }
2658 else /* Maximize */
2659 {
2660 pp = eptr;
2661 for (i = min; i < max; i++)
2662 {
2663 if (eptr >= md->end_subject || fc != *eptr) break;
2664 eptr++;
2665 }
2666 if (possessive) continue;
2667
2668 while (eptr >= pp)
2669 {
2670 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2671 eptr--;
2672 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2673 }
2674 RRETURN(MATCH_NOMATCH);
2675 }
2676 }
2677 /* Control never gets here */
2678
2679 /* Match a negated single one-byte character. The character we are
2680 checking can be multibyte. */
2681
2682 case OP_NOT:
2683 if (eptr >= md->end_subject)
2684 {
2685 SCHECK_PARTIAL();
2686 RRETURN(MATCH_NOMATCH);
2687 }
2688 ecode++;
2689 GETCHARINCTEST(c, eptr);
2690 if ((ims & PCRE_CASELESS) != 0)
2691 {
2692 #ifdef SUPPORT_UTF8
2693 if (c < 256)
2694 #endif
2695 c = md->lcc[c];
2696 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2697 }
2698 else
2699 {
2700 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2701 }
2702 break;
2703
2704 /* Match a negated single one-byte character repeatedly. This is almost a
2705 repeat of the code for a repeated single character, but I haven't found a
2706 nice way of commoning these up that doesn't require a test of the
2707 positive/negative option for each character match. Maybe that wouldn't add
2708 very much to the time taken, but character matching *is* what this is all
2709 about... */
2710
2711 case OP_NOTEXACT:
2712 min = max = GET2(ecode, 1);
2713 ecode += 3;
2714 goto REPEATNOTCHAR;
2715
2716 case OP_NOTUPTO:
2717 case OP_NOTMINUPTO:
2718 min = 0;
2719 max = GET2(ecode, 1);
2720 minimize = *ecode == OP_NOTMINUPTO;
2721 ecode += 3;
2722 goto REPEATNOTCHAR;
2723
2724 case OP_NOTPOSSTAR:
2725 possessive = TRUE;
2726 min = 0;
2727 max = INT_MAX;
2728 ecode++;
2729 goto REPEATNOTCHAR;
2730
2731 case OP_NOTPOSPLUS:
2732 possessive = TRUE;
2733 min = 1;
2734 max = INT_MAX;
2735 ecode++;
2736 goto REPEATNOTCHAR;
2737
2738 case OP_NOTPOSQUERY:
2739 possessive = TRUE;
2740 min = 0;
2741 max = 1;
2742 ecode++;
2743 goto REPEATNOTCHAR;
2744
2745 case OP_NOTPOSUPTO:
2746 possessive = TRUE;
2747 min = 0;
2748 max = GET2(ecode, 1);
2749 ecode += 3;
2750 goto REPEATNOTCHAR;
2751
2752 case OP_NOTSTAR:
2753 case OP_NOTMINSTAR:
2754 case OP_NOTPLUS:
2755 case OP_NOTMINPLUS:
2756 case OP_NOTQUERY:
2757 case OP_NOTMINQUERY:
2758 c = *ecode++ - OP_NOTSTAR;
2759 minimize = (c & 1) != 0;
2760 min = rep_min[c]; /* Pick up values from tables; */
2761 max = rep_max[c]; /* zero for max => infinity */
2762 if (max == 0) max = INT_MAX;
2763
2764 /* Common code for all repeated single-byte matches. */
2765
2766 REPEATNOTCHAR:
2767 fc = *ecode++;
2768
2769 /* The code is duplicated for the caseless and caseful cases, for speed,
2770 since matching characters is likely to be quite common. First, ensure the
2771 minimum number of matches are present. If min = max, continue at the same
2772 level without recursing. Otherwise, if minimizing, keep trying the rest of
2773 the expression and advancing one matching character if failing, up to the
2774 maximum. Alternatively, if maximizing, find the maximum number of
2775 characters and work backwards. */
2776
2777 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2778 max, eptr));
2779
2780 if ((ims & PCRE_CASELESS) != 0)
2781 {
2782 fc = md->lcc[fc];
2783
2784 #ifdef SUPPORT_UTF8
2785 /* UTF-8 mode */
2786 if (utf8)
2787 {
2788 register unsigned int d;
2789 for (i = 1; i <= min; i++)
2790 {
2791 if (eptr >= md->end_subject)
2792 {
2793 SCHECK_PARTIAL();
2794 RRETURN(MATCH_NOMATCH);
2795 }
2796 GETCHARINC(d, eptr);
2797 if (d < 256) d = md->lcc[d];
2798 if (fc == d) RRETURN(MATCH_NOMATCH);
2799 }
2800 }
2801 else
2802 #endif
2803
2804 /* Not UTF-8 mode */
2805 {
2806 for (i = 1; i <= min; i++)
2807 {
2808 if (eptr >= md->end_subject)
2809 {
2810 SCHECK_PARTIAL();
2811 RRETURN(MATCH_NOMATCH);
2812 }
2813 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2814 }
2815 }
2816
2817 if (min == max) continue;
2818
2819 if (minimize)
2820 {
2821 #ifdef SUPPORT_UTF8
2822 /* UTF-8 mode */
2823 if (utf8)
2824 {
2825 register unsigned int d;
2826 for (fi = min;; fi++)
2827 {
2828 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2830 if (fi >= max) RRETURN(MATCH_NOMATCH);
2831 if (eptr >= md->end_subject)
2832 {
2833 SCHECK_PARTIAL();
2834 RRETURN(MATCH_NOMATCH);
2835 }
2836 GETCHARINC(d, eptr);
2837 if (d < 256) d = md->lcc[d];
2838 if (fc == d) RRETURN(MATCH_NOMATCH);
2839 }
2840 }
2841 else
2842 #endif
2843 /* Not UTF-8 mode */
2844 {
2845 for (fi = min;; fi++)
2846 {
2847 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2849 if (fi >= max) RRETURN(MATCH_NOMATCH);
2850 if (eptr >= md->end_subject)
2851 {
2852 SCHECK_PARTIAL();
2853 RRETURN(MATCH_NOMATCH);
2854 }
2855 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2856 }
2857 }
2858 /* Control never gets here */
2859 }
2860
2861 /* Maximize case */
2862
2863 else
2864 {
2865 pp = eptr;
2866
2867 #ifdef SUPPORT_UTF8
2868 /* UTF-8 mode */
2869 if (utf8)
2870 {
2871 register unsigned int d;
2872 for (i = min; i < max; i++)
2873 {
2874 int len = 1;
2875 if (eptr >= md->end_subject) break;
2876 GETCHARLEN(d, eptr, len);
2877 if (d < 256) d = md->lcc[d];
2878 if (fc == d) break;
2879 eptr += len;
2880 }
2881 if (possessive) continue;
2882 for(;;)
2883 {
2884 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2886 if (eptr-- == pp) break; /* Stop if tried at original pos */
2887 BACKCHAR(eptr);
2888 }
2889 }
2890 else
2891 #endif
2892 /* Not UTF-8 mode */
2893 {
2894 for (i = min; i < max; i++)
2895 {
2896 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2897 eptr++;
2898 }
2899 if (possessive) continue;
2900 while (eptr >= pp)
2901 {
2902 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2904 eptr--;
2905 }
2906 }
2907
2908 RRETURN(MATCH_NOMATCH);
2909 }
2910 /* Control never gets here */
2911 }
2912
2913 /* Caseful comparisons */
2914
2915 else
2916 {
2917 #ifdef SUPPORT_UTF8
2918 /* UTF-8 mode */
2919 if (utf8)
2920 {
2921 register unsigned int d;
2922 for (i = 1; i <= min; i++)
2923 {
2924 if (eptr >= md->end_subject)
2925 {
2926 SCHECK_PARTIAL();
2927 RRETURN(MATCH_NOMATCH);
2928 }
2929 GETCHARINC(d, eptr);
2930 if (fc == d) RRETURN(MATCH_NOMATCH);
2931 }
2932 }
2933 else
2934 #endif
2935 /* Not UTF-8 mode */
2936 {
2937 for (i = 1; i <= min; i++)
2938 {
2939 if (eptr >= md->end_subject)
2940 {
2941 SCHECK_PARTIAL();
2942 RRETURN(MATCH_NOMATCH);
2943 }
2944 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2945 }
2946 }
2947
2948 if (min == max) continue;
2949
2950 if (minimize)
2951 {
2952 #ifdef SUPPORT_UTF8
2953 /* UTF-8 mode */
2954 if (utf8)
2955 {
2956 register unsigned int d;
2957 for (fi = min;; fi++)
2958 {
2959 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2961 if (fi >= max) RRETURN(MATCH_NOMATCH);
2962 if (eptr >= md->end_subject)
2963 {
2964 SCHECK_PARTIAL();
2965 RRETURN(MATCH_NOMATCH);
2966 }
2967 GETCHARINC(d, eptr);
2968 if (fc == d) RRETURN(MATCH_NOMATCH);
2969 }
2970 }
2971 else
2972 #endif
2973 /* Not UTF-8 mode */
2974 {
2975 for (fi = min;; fi++)
2976 {
2977 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2979 if (fi >= max) RRETURN(MATCH_NOMATCH);
2980 if (eptr >= md->end_subject)
2981 {
2982 SCHECK_PARTIAL();
2983 RRETURN(MATCH_NOMATCH);
2984 }
2985 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2986 }
2987 }
2988 /* Control never gets here */
2989 }
2990
2991 /* Maximize case */
2992
2993 else
2994 {
2995 pp = eptr;
2996
2997 #ifdef SUPPORT_UTF8
2998 /* UTF-8 mode */
2999 if (utf8)
3000 {
3001 register unsigned int d;
3002 for (i = min; i < max; i++)
3003 {
3004 int len = 1;
3005 if (eptr >= md->end_subject) break;
3006 GETCHARLEN(d, eptr, len);
3007 if (fc == d) break;
3008 eptr += len;
3009 }
3010 if (possessive) continue;
3011 for(;;)
3012 {
3013 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015 if (eptr-- == pp) break; /* Stop if tried at original pos */
3016 BACKCHAR(eptr);
3017 }
3018 }
3019 else
3020 #endif
3021 /* Not UTF-8 mode */
3022 {
3023 for (i = min; i < max; i++)
3024 {
3025 if (eptr >= md->end_subject || fc == *eptr) break;
3026 eptr++;
3027 }
3028 if (possessive) continue;
3029 while (eptr >= pp)
3030 {
3031 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3033 eptr--;
3034 }
3035 }
3036
3037 RRETURN(MATCH_NOMATCH);
3038 }
3039 }
3040 /* Control never gets here */
3041
3042 /* Match a single character type repeatedly; several different opcodes
3043 share code. This is very similar to the code for single characters, but we
3044 repeat it in the interests of efficiency. */
3045
3046 case OP_TYPEEXACT:
3047 min = max = GET2(ecode, 1);
3048 minimize = TRUE;
3049 ecode += 3;
3050 goto REPEATTYPE;
3051
3052 case OP_TYPEUPTO:
3053 case OP_TYPEMINUPTO:
3054 min = 0;
3055 max = GET2(ecode, 1);
3056 minimize = *ecode == OP_TYPEMINUPTO;
3057 ecode += 3;
3058 goto REPEATTYPE;
3059
3060 case OP_TYPEPOSSTAR:
3061 possessive = TRUE;
3062 min = 0;
3063 max = INT_MAX;
3064 ecode++;
3065 goto REPEATTYPE;
3066
3067 case OP_TYPEPOSPLUS:
3068 possessive = TRUE;
3069 min = 1;
3070 max = INT_MAX;
3071 ecode++;
3072 goto REPEATTYPE;
3073
3074 case OP_TYPEPOSQUERY:
3075 possessive = TRUE;
3076 min = 0;
3077 max = 1;
3078 ecode++;
3079 goto REPEATTYPE;
3080
3081 case OP_TYPEPOSUPTO:
3082 possessive = TRUE;
3083 min = 0;
3084 max = GET2(ecode, 1);
3085 ecode += 3;
3086 goto REPEATTYPE;
3087
3088 case OP_TYPESTAR:
3089 case OP_TYPEMINSTAR:
3090 case OP_TYPEPLUS:
3091 case OP_TYPEMINPLUS:
3092 case OP_TYPEQUERY:
3093 case OP_TYPEMINQUERY:
3094 c = *ecode++ - OP_TYPESTAR;
3095 minimize = (c & 1) != 0;
3096 min = rep_min[c]; /* Pick up values from tables; */
3097 max = rep_max[c]; /* zero for max => infinity */
3098 if (max == 0) max = INT_MAX;
3099
3100 /* Common code for all repeated single character type matches. Note that
3101 in UTF-8 mode, '.' matches a character of any length, but for the other
3102 character types, the valid characters are all one-byte long. */
3103
3104 REPEATTYPE:
3105 ctype = *ecode++; /* Code for the character type */
3106
3107 #ifdef SUPPORT_UCP
3108 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3109 {
3110 prop_fail_result = ctype == OP_NOTPROP;
3111 prop_type = *ecode++;
3112 prop_value = *ecode++;
3113 }
3114 else prop_type = -1;
3115 #endif
3116
3117 /* First, ensure the minimum number of matches are present. Use inline
3118 code for maximizing the speed, and do the type test once at the start
3119 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3120 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3121 and single-bytes. */
3122
3123 if (min > 0)
3124 {
3125 #ifdef SUPPORT_UCP
3126 if (prop_type >= 0)
3127 {
3128 switch(prop_type)
3129 {
3130 case PT_ANY:
3131 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3132 for (i = 1; i <= min; i++)
3133 {
3134 if (eptr >= md->end_subject)
3135 {
3136 SCHECK_PARTIAL();
3137 RRETURN(MATCH_NOMATCH);
3138 }
3139 GETCHARINCTEST(c, eptr);
3140 }
3141 break;
3142
3143 case PT_LAMP:
3144 for (i = 1; i <= min; i++)
3145 {
3146 if (eptr >= md->end_subject)
3147 {
3148 SCHECK_PARTIAL();
3149 RRETURN(MATCH_NOMATCH);
3150 }
3151 GETCHARINCTEST(c, eptr);
3152 prop_chartype = UCD_CHARTYPE(c);
3153 if ((prop_chartype == ucp_Lu ||
3154 prop_chartype == ucp_Ll ||
3155 prop_chartype == ucp_Lt) == prop_fail_result)
3156 RRETURN(MATCH_NOMATCH);
3157 }
3158 break;
3159
3160 case PT_GC:
3161 for (i = 1; i <= min; i++)
3162 {
3163 if (eptr >= md->end_subject)
3164 {
3165 SCHECK_PARTIAL();
3166 RRETURN(MATCH_NOMATCH);
3167 }
3168 GETCHARINCTEST(c, eptr);
3169 prop_category = UCD_CATEGORY(c);
3170 if ((prop_category == prop_value) == prop_fail_result)
3171 RRETURN(MATCH_NOMATCH);
3172 }
3173 break;
3174
3175 case PT_PC:
3176 for (i = 1; i <= min; i++)
3177 {
3178 if (eptr >= md->end_subject)
3179 {
3180 SCHECK_PARTIAL();
3181 RRETURN(MATCH_NOMATCH);
3182 }
3183 GETCHARINCTEST(c, eptr);
3184 prop_chartype = UCD_CHARTYPE(c);
3185 if ((prop_chartype == prop_value) == prop_fail_result)
3186 RRETURN(MATCH_NOMATCH);
3187 }
3188 break;
3189
3190 case PT_SC:
3191 for (i = 1; i <= min; i++)
3192 {
3193 if (eptr >= md->end_subject)
3194 {
3195 SCHECK_PARTIAL();
3196 RRETURN(MATCH_NOMATCH);
3197 }
3198 GETCHARINCTEST(c, eptr);
3199 prop_script = UCD_SCRIPT(c);
3200 if ((prop_script == prop_value) == prop_fail_result)
3201 RRETURN(MATCH_NOMATCH);
3202 }
3203 break;
3204
3205 default:
3206 RRETURN(PCRE_ERROR_INTERNAL);
3207 }
3208 }
3209
3210 /* Match extended Unicode sequences. We will get here only if the
3211 support is in the binary; otherwise a compile-time error occurs. */
3212
3213 else if (ctype == OP_EXTUNI)
3214 {
3215 for (i = 1; i <= min; i++)
3216 {
3217 if (eptr >= md->end_subject)
3218 {
3219 SCHECK_PARTIAL();
3220 RRETURN(MATCH_NOMATCH);
3221 }
3222 GETCHARINCTEST(c, eptr);
3223 prop_category = UCD_CATEGORY(c);
3224 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3225 while (eptr < md->end_subject)
3226 {
3227 int len = 1;
3228 if (!utf8) c = *eptr;
3229 else { GETCHARLEN(c, eptr, len); }
3230 prop_category = UCD_CATEGORY(c);
3231 if (prop_category != ucp_M) break;
3232 eptr += len;
3233 }
3234 }
3235 }
3236
3237 else
3238 #endif /* SUPPORT_UCP */
3239
3240 /* Handle all other cases when the coding is UTF-8 */
3241
3242 #ifdef SUPPORT_UTF8
3243 if (utf8) switch(ctype)
3244 {
3245 case OP_ANY:
3246 for (i = 1; i <= min; i++)
3247 {
3248 if (eptr >= md->end_subject)
3249 {
3250 SCHECK_PARTIAL();
3251 RRETURN(MATCH_NOMATCH);
3252 }
3253 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3254 eptr++;
3255 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3256 }
3257 break;
3258
3259 case OP_ALLANY:
3260 for (i = 1; i <= min; i++)
3261 {
3262 if (eptr >= md->end_subject)
3263 {
3264 SCHECK_PARTIAL();
3265 RRETURN(MATCH_NOMATCH);
3266 }
3267 eptr++;
3268 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3269 }
3270 break;
3271
3272 case OP_ANYBYTE:
3273 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3274 eptr += min;
3275 break;
3276
3277 case OP_ANYNL:
3278 for (i = 1; i <= min; i++)
3279 {
3280 if (eptr >= md->end_subject)
3281 {
3282 SCHECK_PARTIAL();
3283 RRETURN(MATCH_NOMATCH);
3284 }
3285 GETCHARINC(c, eptr);
3286 switch(c)
3287 {
3288 default: RRETURN(MATCH_NOMATCH);
3289 case 0x000d:
3290 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3291 break;
3292
3293 case 0x000a:
3294 break;
3295
3296 case 0x000b:
3297 case 0x000c:
3298 case 0x0085:
3299 case 0x2028:
3300 case 0x2029:
3301 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3302 break;
3303 }
3304 }
3305 break;
3306
3307 case OP_NOT_HSPACE:
3308 for (i = 1; i <= min; i++)
3309 {
3310 if (eptr >= md->end_subject)
3311 {
3312 SCHECK_PARTIAL();
3313 RRETURN(MATCH_NOMATCH);
3314 }
3315 GETCHARINC(c, eptr);
3316 switch(c)
3317 {
3318 default: break;
3319 case 0x09: /* HT */
3320 case 0x20: /* SPACE */
3321 case 0xa0: /* NBSP */
3322 case 0x1680: /* OGHAM SPACE MARK */
3323 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3324 case 0x2000: /* EN QUAD */
3325 case 0x2001: /* EM QUAD */
3326 case 0x2002: /* EN SPACE */
3327 case 0x2003: /* EM SPACE */
3328 case 0x2004: /* THREE-PER-EM SPACE */
3329 case 0x2005: /* FOUR-PER-EM SPACE */
3330 case 0x2006: /* SIX-PER-EM SPACE */
3331 case 0x2007: /* FIGURE SPACE */
3332 case 0x2008: /* PUNCTUATION SPACE */
3333 case 0x2009: /* THIN SPACE */
3334 case 0x200A: /* HAIR SPACE */
3335 case 0x202f: /* NARROW NO-BREAK SPACE */
3336 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3337 case 0x3000: /* IDEOGRAPHIC SPACE */
3338 RRETURN(MATCH_NOMATCH);
3339 }
3340 }
3341 break;
3342
3343 case OP_HSPACE:
3344 for (i = 1; i <= min; i++)
3345 {
3346 if (eptr >= md->end_subject)
3347 {
3348 SCHECK_PARTIAL();
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 GETCHARINC(c, eptr);
3352 switch(c)
3353 {
3354 default: RRETURN(MATCH_NOMATCH);
3355 case 0x09: /* HT */
3356 case 0x20: /* SPACE */
3357 case 0xa0: /* NBSP */
3358 case 0x1680: /* OGHAM SPACE MARK */
3359 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3360 case 0x2000: /* EN QUAD */
3361 case 0x2001: /* EM QUAD */
3362 case 0x2002: /* EN SPACE */
3363 case 0x2003: /* EM SPACE */
3364 case 0x2004: /* THREE-PER-EM SPACE */
3365 case 0x2005: /* FOUR-PER-EM SPACE */
3366 case 0x2006: /* SIX-PER-EM SPACE */
3367 case 0x2007: /* FIGURE SPACE */
3368 case 0x2008: /* PUNCTUATION SPACE */
3369 case 0x2009: /* THIN SPACE */
3370 case 0x200A: /* HAIR SPACE */
3371 case 0x202f: /* NARROW NO-BREAK SPACE */
3372 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3373 case 0x3000: /* IDEOGRAPHIC SPACE */
3374 break;
3375 }
3376 }
3377 break;
3378
3379 case OP_NOT_VSPACE:
3380 for (i = 1; i <= min; i++)
3381 {
3382 if (eptr >= md->end_subject)
3383 {
3384 SCHECK_PARTIAL();
3385 RRETURN(MATCH_NOMATCH);
3386 }
3387 GETCHARINC(c, eptr);
3388 switch(c)
3389 {
3390 default: break;
3391 case 0x0a: /* LF */
3392 case 0x0b: /* VT */
3393 case 0x0c: /* FF */
3394 case 0x0d: /* CR */
3395 case 0x85: /* NEL */
3396 case 0x2028: /* LINE SEPARATOR */
3397 case 0x2029: /* PARAGRAPH SEPARATOR */
3398 RRETURN(MATCH_NOMATCH);
3399 }
3400 }
3401 break;
3402
3403 case OP_VSPACE:
3404 for (i = 1; i <= min; i++)
3405 {
3406 if (eptr >= md->end_subject)
3407 {
3408 SCHECK_PARTIAL();
3409 RRETURN(MATCH_NOMATCH);
3410 }
3411 GETCHARINC(c, eptr);
3412 switch(c)
3413 {
3414 default: RRETURN(MATCH_NOMATCH);
3415 case 0x0a: /* LF */
3416 case 0x0b: /* VT */
3417 case 0x0c: /* FF */
3418 case 0x0d: /* CR */
3419 case 0x85: /* NEL */
3420 case 0x2028: /* LINE SEPARATOR */
3421 case 0x2029: /* PARAGRAPH SEPARATOR */
3422 break;
3423 }
3424 }
3425 break;
3426
3427 case OP_NOT_DIGIT:
3428 for (i = 1; i <= min; i++)
3429 {
3430 if (eptr >= md->end_subject)
3431 {
3432 SCHECK_PARTIAL();
3433 RRETURN(MATCH_NOMATCH);
3434 }
3435 GETCHARINC(c, eptr);
3436 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3437 RRETURN(MATCH_NOMATCH);
3438 }
3439 break;
3440
3441 case OP_DIGIT:
3442 for (i = 1; i <= min; i++)
3443 {
3444 if (eptr >= md->end_subject)
3445 {
3446 SCHECK_PARTIAL();
3447 RRETURN(MATCH_NOMATCH);
3448 }
3449 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3450 RRETURN(MATCH_NOMATCH);
3451 /* No need to skip more bytes - we know it's a 1-byte character */
3452 }
3453 break;
3454
3455 case OP_NOT_WHITESPACE:
3456 for (i = 1; i <= min; i++)
3457 {
3458 if (eptr >= md->end_subject)
3459 {
3460 SCHECK_PARTIAL();
3461 RRETURN(MATCH_NOMATCH);
3462 }
3463 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3464 RRETURN(MATCH_NOMATCH);
3465 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3466 }
3467 break;
3468
3469 case OP_WHITESPACE:
3470 for (i = 1; i <= min; i++)
3471 {
3472 if (eptr >= md->end_subject)
3473 {
3474 SCHECK_PARTIAL();
3475 RRETURN(MATCH_NOMATCH);
3476 }
3477 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3478 RRETURN(MATCH_NOMATCH);
3479 /* No need to skip more bytes - we know it's a 1-byte character */
3480 }
3481 break;
3482
3483 case OP_NOT_WORDCHAR:
3484 for (i = 1; i <= min; i++)
3485 {
3486 if (eptr >= md->end_subject ||
3487 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3488 RRETURN(MATCH_NOMATCH);
3489 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3490 }
3491 break;
3492
3493 case OP_WORDCHAR:
3494 for (i = 1; i <= min; i++)
3495 {
3496 if (eptr >= md->end_subject)
3497 {
3498 SCHECK_PARTIAL();
3499 RRETURN(MATCH_NOMATCH);
3500 }
3501 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3502 RRETURN(MATCH_NOMATCH);
3503 /* No need to skip more bytes - we know it's a 1-byte character */
3504 }
3505 break;
3506
3507 default:
3508 RRETURN(PCRE_ERROR_INTERNAL);
3509 } /* End switch(ctype) */
3510
3511 else
3512 #endif /* SUPPORT_UTF8 */
3513
3514 /* Code for the non-UTF-8 case for minimum matching of operators other
3515 than OP_PROP and OP_NOTPROP. */
3516
3517 switch(ctype)
3518 {
3519 case OP_ANY:
3520 for (i = 1; i <= min; i++)
3521 {
3522 if (eptr >= md->end_subject)
3523 {
3524 SCHECK_PARTIAL();
3525 RRETURN(MATCH_NOMATCH);
3526 }
3527 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3528 eptr++;
3529 }
3530 break;
3531
3532 case OP_ALLANY:
3533 if (eptr > md->end_subject - min)
3534 {
3535 SCHECK_PARTIAL();
3536 RRETURN(MATCH_NOMATCH);
3537 }
3538 eptr += min;
3539 break;
3540
3541 case OP_ANYBYTE:
3542 if (eptr > md->end_subject - min)
3543 {
3544 SCHECK_PARTIAL();
3545 RRETURN(MATCH_NOMATCH);
3546 }
3547 eptr += min;
3548 break;
3549
3550 case OP_ANYNL:
3551 for (i = 1; i <= min; i++)
3552 {
3553 if (eptr >= md->end_subject)
3554 {
3555 SCHECK_PARTIAL();
3556 RRETURN(MATCH_NOMATCH);
3557 }
3558 switch(*eptr++)
3559 {
3560 default: RRETURN(MATCH_NOMATCH);
3561 case 0x000d:
3562 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3563 break;
3564 case 0x000a:
3565 break;
3566
3567 case 0x000b:
3568 case 0x000c:
3569 case 0x0085:
3570 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3571 break;
3572 }
3573 }
3574 break;
3575
3576 case OP_NOT_HSPACE:
3577 for (i = 1; i <= min; i++)
3578 {
3579 if (eptr >= md->end_subject)
3580 {
3581 SCHECK_PARTIAL();
3582 RRETURN(MATCH_NOMATCH);
3583 }
3584 switch(*eptr++)
3585 {
3586 default: break;
3587 case 0x09: /* HT */
3588 case 0x20: /* SPACE */
3589 case 0xa0: /* NBSP */
3590 RRETURN(MATCH_NOMATCH);
3591 }
3592 }
3593 break;
3594
3595 case OP_HSPACE:
3596 for (i = 1; i <= min; i++)
3597 {
3598 if (eptr >= md->end_subject)
3599 {
3600 SCHECK_PARTIAL();
3601 RRETURN(MATCH_NOMATCH);
3602 }
3603 switch(*eptr++)
3604 {
3605 default: RRETURN(MATCH_NOMATCH);
3606 case 0x09: /* HT */
3607 case 0x20: /* SPACE */
3608 case 0xa0: /* NBSP */
3609 break;
3610 }
3611 }
3612 break;
3613
3614 case OP_NOT_VSPACE:
3615 for (i = 1; i <= min; i++)
3616 {
3617 if (eptr >= md->end_subject)
3618 {
3619 SCHECK_PARTIAL();
3620 RRETURN(MATCH_NOMATCH);
3621 }
3622 switch(*eptr++)
3623 {
3624 default: break;
3625 case 0x0a: /* LF */
3626 case 0x0b: /* VT */
3627 case 0x0c: /* FF */
3628 case 0x0d: /* CR */
3629 case 0x85: /* NEL */
3630 RRETURN(MATCH_NOMATCH);
3631 }
3632 }
3633 break;
3634
3635 case OP_VSPACE:
3636 for (i = 1; i <= min; i++)
3637 {
3638 if (eptr >= md->end_subject)
3639 {
3640 SCHECK_PARTIAL();
3641 RRETURN(MATCH_NOMATCH);
3642 }
3643 switch(*eptr++)
3644 {
3645 default: RRETURN(MATCH_NOMATCH);
3646 case 0x0a: /* LF */
3647 case 0x0b: /* VT */
3648 case 0x0c: /* FF */
3649 case 0x0d: /* CR */
3650 case 0x85: /* NEL */
3651 break;
3652 }
3653 }
3654 break;
3655
3656 case OP_NOT_DIGIT:
3657 for (i = 1; i <= min; i++)
3658 {
3659 if (eptr >= md->end_subject)
3660 {
3661 SCHECK_PARTIAL();
3662 RRETURN(MATCH_NOMATCH);
3663 }
3664 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3665 }
3666 break;
3667
3668 case OP_DIGIT:
3669 for (i = 1; i <= min; i++)
3670 {
3671 if (eptr >= md->end_subject)
3672 {
3673 SCHECK_PARTIAL();
3674 RRETURN(MATCH_NOMATCH);
3675 }
3676 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3677 }
3678 break;
3679
3680 case OP_NOT_WHITESPACE:
3681 for (i = 1; i <= min; i++)
3682 {
3683 if (eptr >= md->end_subject)
3684 {
3685 SCHECK_PARTIAL();
3686 RRETURN(MATCH_NOMATCH);
3687 }
3688 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3689 }
3690 break;
3691
3692 case OP_WHITESPACE:
3693 for (i = 1; i <= min; i++)
3694 {
3695 if (eptr >= md->end_subject)
3696 {
3697 SCHECK_PARTIAL();
3698 RRETURN(MATCH_NOMATCH);
3699 }
3700 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3701 }
3702 break;
3703
3704 case OP_NOT_WORDCHAR:
3705 for (i = 1; i <= min; i++)
3706 {
3707 if (eptr >= md->end_subject)
3708 {
3709 SCHECK_PARTIAL();
3710 RRETURN(MATCH_NOMATCH);
3711 }
3712 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3713 RRETURN(MATCH_NOMATCH);
3714 }
3715 break;
3716
3717 case OP_WORDCHAR:
3718 for (i = 1; i <= min; i++)
3719 {
3720 if (eptr >= md->end_subject)
3721 {
3722 SCHECK_PARTIAL();
3723 RRETURN(MATCH_NOMATCH);
3724 }
3725 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3726 RRETURN(MATCH_NOMATCH);
3727 }
3728 break;
3729
3730 default:
3731 RRETURN(PCRE_ERROR_INTERNAL);
3732 }
3733 }
3734
3735 /* If min = max, continue at the same level without recursing */
3736
3737 if (min == max) continue;
3738
3739 /* If minimizing, we have to test the rest of the pattern before each
3740 subsequent match. Again, separate the UTF-8 case for speed, and also
3741 separate the UCP cases. */
3742
3743 if (minimize)
3744 {
3745 #ifdef SUPPORT_UCP
3746 if (prop_type >= 0)
3747 {
3748 switch(prop_type)
3749 {
3750 case PT_ANY:
3751 for (fi = min;; fi++)
3752 {
3753 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3754 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3755 if (fi >= max) RRETURN(MATCH_NOMATCH);
3756 if (eptr >= md->end_subject)
3757 {
3758 SCHECK_PARTIAL();
3759 RRETURN(MATCH_NOMATCH);
3760 }
3761 GETCHARINC(c, eptr);
3762 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3763 }
3764 /* Control never gets here */
3765
3766 case PT_LAMP:
3767 for (fi = min;; fi++)
3768 {
3769 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3770 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3772 if (eptr >= md->end_subject)
3773 {
3774 SCHECK_PARTIAL();
3775 RRETURN(MATCH_NOMATCH);
3776 }
3777 GETCHARINC(c, eptr);
3778 prop_chartype = UCD_CHARTYPE(c);
3779 if ((prop_chartype == ucp_Lu ||
3780 prop_chartype == ucp_Ll ||
3781 prop_chartype == ucp_Lt) == prop_fail_result)
3782 RRETURN(MATCH_NOMATCH);
3783 }
3784 /* Control never gets here */
3785
3786 case PT_GC:
3787 for (fi = min;; fi++)
3788 {
3789 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3791 if (fi >= max) RRETURN(MATCH_NOMATCH);
3792 if (eptr >= md->end_subject)
3793 {
3794 SCHECK_PARTIAL();
3795 RRETURN(MATCH_NOMATCH);
3796 }
3797 GETCHARINC(c, eptr);
3798 prop_category = UCD_CATEGORY(c);
3799 if ((prop_category == prop_value) == prop_fail_result)
3800 RRETURN(MATCH_NOMATCH);
3801 }
3802 /* Control never gets here */
3803
3804 case PT_PC:
3805 for (fi = min;; fi++)
3806 {
3807 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3809 if (fi >= max) RRETURN(MATCH_NOMATCH);
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 RRETURN(MATCH_NOMATCH);
3814 }
3815 GETCHARINC(c, eptr);
3816 prop_chartype = UCD_CHARTYPE(c);
3817 if ((prop_chartype == prop_value) == prop_fail_result)
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 /* Control never gets here */
3821
3822 case PT_SC:
3823 for (fi = min;; fi++)
3824 {
3825 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3827 if (fi >= max) RRETURN(MATCH_NOMATCH);
3828 if (eptr >= md->end_subject)
3829 {
3830 SCHECK_PARTIAL();
3831 RRETURN(MATCH_NOMATCH);
3832 }
3833 GETCHARINC(c, eptr);
3834 prop_script = UCD_SCRIPT(c);
3835 if ((prop_script == prop_value) == prop_fail_result)
3836 RRETURN(MATCH_NOMATCH);
3837 }
3838 /* Control never gets here */
3839
3840 default:
3841 RRETURN(PCRE_ERROR_INTERNAL);
3842 }
3843 }
3844
3845 /* Match extended Unicode sequences. We will get here only if the
3846 support is in the binary; otherwise a compile-time error occurs. */
3847
3848 else if (ctype == OP_EXTUNI)
3849 {
3850 for (fi = min;; fi++)
3851 {
3852 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3854 if (fi >= max) RRETURN(MATCH_NOMATCH);
3855 if (eptr >= md->end_subject)
3856 {
3857 SCHECK_PARTIAL();
3858 RRETURN(MATCH_NOMATCH);
3859 }
3860 GETCHARINCTEST(c, eptr);
3861 prop_category = UCD_CATEGORY(c);
3862 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3863 while (eptr < md->end_subject)
3864 {
3865 int len = 1;
3866 if (!utf8) c = *eptr;
3867 else { GETCHARLEN(c, eptr, len); }
3868 prop_category = UCD_CATEGORY(c);
3869 if (prop_category != ucp_M) break;
3870 eptr += len;
3871 }
3872 }
3873 }
3874
3875 else
3876 #endif /* SUPPORT_UCP */
3877
3878 #ifdef SUPPORT_UTF8
3879 /* UTF-8 mode */
3880 if (utf8)
3881 {
3882 for (fi = min;; fi++)
3883 {
3884 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3886 if (fi >= max) RRETURN(MATCH_NOMATCH);
3887 if (eptr >= md->end_subject)
3888 {
3889 SCHECK_PARTIAL();
3890 RRETURN(MATCH_NOMATCH);
3891 }
3892 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3893 RRETURN(MATCH_NOMATCH);
3894 GETCHARINC(c, eptr);
3895 switch(ctype)
3896 {
3897 case OP_ANY: /* This is the non-NL case */
3898 case OP_ALLANY:
3899 case OP_ANYBYTE:
3900 break;
3901
3902 case OP_ANYNL:
3903 switch(c)
3904 {
3905 default: RRETURN(MATCH_NOMATCH);
3906 case 0x000d:
3907 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3908 break;
3909 case 0x000a:
3910 break;
3911
3912 case 0x000b:
3913 case 0x000c:
3914 case 0x0085:
3915 case 0x2028:
3916 case 0x2029:
3917 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3918 break;
3919 }
3920 break;
3921
3922 case OP_NOT_HSPACE:
3923 switch(c)
3924 {
3925 default: break;
3926 case 0x09: /* HT */
3927 case 0x20: /* SPACE */
3928 case 0xa0: /* NBSP */
3929 case 0x1680: /* OGHAM SPACE MARK */
3930 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3931 case 0x2000: /* EN QUAD */
3932 case 0x2001: /* EM QUAD */
3933 case 0x2002: /* EN SPACE */
3934 case 0x2003: /* EM SPACE */
3935 case 0x2004: /* THREE-PER-EM SPACE */
3936 case 0x2005: /* FOUR-PER-EM SPACE */
3937 case 0x2006: /* SIX-PER-EM SPACE */
3938 case 0x2007: /* FIGURE SPACE */
3939 case 0x2008: /* PUNCTUATION SPACE */
3940 case 0x2009: /* THIN SPACE */
3941 case 0x200A: /* HAIR SPACE */
3942 case 0x202f: /* NARROW NO-BREAK SPACE */
3943 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3944 case 0x3000: /* IDEOGRAPHIC SPACE */
3945 RRETURN(MATCH_NOMATCH);
3946 }
3947 break;
3948
3949 case OP_HSPACE:
3950 switch(c)
3951 {
3952 default: RRETURN(MATCH_NOMATCH);
3953 case 0x09: /* HT */
3954 case 0x20: /* SPACE */
3955 case 0xa0: /* NBSP */
3956 case 0x1680: /* OGHAM SPACE MARK */
3957 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3958 case 0x2000: /* EN QUAD */
3959 case 0x2001: /* EM QUAD */
3960 case 0x2002: /* EN SPACE */
3961 case 0x2003: /* EM SPACE */
3962 case 0x2004: /* THREE-PER-EM SPACE */
3963 case 0x2005: /* FOUR-PER-EM SPACE */
3964 case 0x2006: /* SIX-PER-EM SPACE */
3965 case 0x2007: /* FIGURE SPACE */
3966 case 0x2008: /* PUNCTUATION SPACE */
3967 case 0x2009: /* THIN SPACE */
3968 case 0x200A: /* HAIR SPACE */
3969 case 0x202f: /* NARROW NO-BREAK SPACE */
3970 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3971 case 0x3000: /* IDEOGRAPHIC SPACE */
3972 break;
3973 }
3974 break;
3975
3976 case OP_NOT_VSPACE:
3977 switch(c)
3978 {
3979 default: break;
3980 case 0x0a: /* LF */
3981 case 0x0b: /* VT */
3982 case 0x0c: /* FF */
3983 case 0x0d: /* CR */
3984 case 0x85: /* NEL */
3985 case 0x2028: /* LINE SEPARATOR */
3986 case 0x2029: /* PARAGRAPH SEPARATOR */
3987 RRETURN(MATCH_NOMATCH);
3988 }
3989 break;
3990
3991 case OP_VSPACE:
3992 switch(c)
3993 {
3994 default: RRETURN(MATCH_NOMATCH);
3995 case 0x0a: /* LF */
3996 case 0x0b: /* VT */
3997 case 0x0c: /* FF */
3998 case 0x0d: /* CR */
3999 case 0x85: /* NEL */
4000 case 0x2028: /* LINE SEPARATOR */
4001 case 0x2029: /* PARAGRAPH SEPARATOR */
4002 break;
4003 }
4004 break;
4005
4006 case OP_NOT_DIGIT:
4007 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4008 RRETURN(MATCH_NOMATCH);
4009 break;
4010
4011 case OP_DIGIT:
4012 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4013 RRETURN(MATCH_NOMATCH);
4014 break;
4015
4016 case OP_NOT_WHITESPACE:
4017 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4018 RRETURN(MATCH_NOMATCH);
4019 break;
4020
4021 case OP_WHITESPACE:
4022 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4023 RRETURN(MATCH_NOMATCH);
4024 break;
4025
4026 case OP_NOT_WORDCHAR:
4027 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4028 RRETURN(MATCH_NOMATCH);
4029 break;
4030
4031 case OP_WORDCHAR:
4032 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4033 RRETURN(MATCH_NOMATCH);
4034 break;
4035
4036 default:
4037 RRETURN(PCRE_ERROR_INTERNAL);
4038 }
4039 }
4040 }
4041 else
4042 #endif
4043 /* Not UTF-8 mode */
4044 {
4045 for (fi = min;; fi++)
4046 {
4047 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4049 if (fi >= max) RRETURN(MATCH_NOMATCH);
4050 if (eptr >= md->end_subject)
4051 {
4052 SCHECK_PARTIAL();
4053 RRETURN(MATCH_NOMATCH);
4054 }
4055 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4056 RRETURN(MATCH_NOMATCH);
4057 c = *eptr++;
4058 switch(ctype)
4059 {
4060 case OP_ANY: /* This is the non-NL case */
4061 case OP_ALLANY:
4062 case OP_ANYBYTE:
4063 break;
4064
4065 case OP_ANYNL:
4066 switch(c)
4067 {
4068 default: RRETURN(MATCH_NOMATCH);
4069 case 0x000d:
4070 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4071 break;
4072
4073 case 0x000a:
4074 break;
4075
4076 case 0x000b:
4077 case 0x000c:
4078 case 0x0085:
4079 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4080 break;
4081 }
4082 break;
4083
4084 case OP_NOT_HSPACE:
4085 switch(c)
4086 {
4087 default: break;
4088 case 0x09: /* HT */
4089 case 0x20: /* SPACE */
4090 case 0xa0: /* NBSP */
4091 RRETURN(MATCH_NOMATCH);
4092 }
4093 break;
4094
4095 case OP_HSPACE:
4096 switch(c)
4097 {
4098 default: RRETURN(MATCH_NOMATCH);
4099 case 0x09: /* HT */
4100 case 0x20: /* SPACE */
4101 case 0xa0: /* NBSP */
4102 break;
4103 }
4104 break;
4105
4106 case OP_NOT_VSPACE:
4107 switch(c)
4108 {
4109 default: break;
4110 case 0x0a: /* LF */
4111 case 0x0b: /* VT */
4112 case 0x0c: /* FF */
4113 case 0x0d: /* CR */
4114 case 0x85: /* NEL */
4115 RRETURN(MATCH_NOMATCH);
4116 }
4117 break;
4118
4119 case OP_VSPACE:
4120 switch(c)
4121 {
4122 default: RRETURN(MATCH_NOMATCH);
4123 case 0x0a: /* LF */
4124 case 0x0b: /* VT */
4125 case 0x0c: /* FF */
4126 case 0x0d: /* CR */
4127 case 0x85: /* NEL */
4128 break;
4129 }
4130 break;
4131
4132 case OP_NOT_DIGIT:
4133 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4134 break;
4135
4136 case OP_DIGIT:
4137 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4138 break;
4139
4140 case OP_NOT_WHITESPACE:
4141 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4142 break;
4143
4144 case OP_WHITESPACE:
4145 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4146 break;
4147
4148 case OP_NOT_WORDCHAR:
4149 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4150 break;
4151
4152 case OP_WORDCHAR:
4153 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4154 break;
4155
4156 default:
4157 RRETURN(PCRE_ERROR_INTERNAL);
4158 }
4159 }
4160 }
4161 /* Control never gets here */
4162 }
4163
4164 /* If maximizing, it is worth using inline code for speed, doing the type
4165 test once at the start (i.e. keep it out of the loop). Again, keep the
4166 UTF-8 and UCP stuff separate. */
4167
4168 else
4169 {
4170 pp = eptr; /* Remember where we started */
4171
4172 #ifdef SUPPORT_UCP
4173 if (prop_type >= 0)
4174 {
4175 switch(prop_type)
4176 {
4177 case PT_ANY:
4178 for (i = min; i < max; i++)
4179 {
4180 int len = 1;
4181 if (eptr >= md->end_subject) break;
4182 GETCHARLEN(c, eptr, len);
4183 if (prop_fail_result) break;
4184 eptr+= len;
4185 }
4186 break;
4187
4188 case PT_LAMP:
4189 for (i = min; i < max; i++)
4190 {
4191 int len = 1;
4192 if (eptr >= md->end_subject) break;
4193 GETCHARLEN(c, eptr, len);
4194 prop_chartype = UCD_CHARTYPE(c);
4195 if ((prop_chartype == ucp_Lu ||
4196 prop_chartype == ucp_Ll ||
4197 prop_chartype == ucp_Lt) == prop_fail_result)
4198 break;
4199 eptr+= len;
4200 }
4201 break;
4202
4203 case PT_GC:
4204 for (i = min; i < max; i++)
4205 {
4206 int len = 1;
4207 if (eptr >= md->end_subject) break;
4208 GETCHARLEN(c, eptr, len);
4209 prop_category = UCD_CATEGORY(c);
4210 if ((prop_category == prop_value) == prop_fail_result)
4211 break;
4212 eptr+= len;
4213 }
4214 break;
4215
4216 case PT_PC:
4217 for (i = min; i < max; i++)
4218 {
4219 int len = 1;
4220 if (eptr >= md->end_subject) break;
4221 GETCHARLEN(c, eptr, len);
4222 prop_chartype = UCD_CHARTYPE(c);
4223 if ((prop_chartype == prop_value) == prop_fail_result)
4224 break;
4225 eptr+= len;
4226 }
4227 break;
4228
4229 case PT_SC:
4230 for (i = min; i < max; i++)
4231 {
4232 int len = 1;
4233 if (eptr >= md->end_subject) break;
4234 GETCHARLEN(c, eptr, len);
4235 prop_script = UCD_SCRIPT(c);
4236 if ((prop_script == prop_value) == prop_fail_result)
4237 break;
4238 eptr+= len;
4239 }
4240 break;
4241 }
4242
4243 /* eptr is now past the end of the maximum run */
4244
4245 if (possessive) continue;
4246 for(;;)
4247 {
4248 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4249 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4250 if (eptr-- == pp) break; /* Stop if tried at original pos */
4251 if (utf8) BACKCHAR(eptr);
4252 }
4253 }
4254
4255 /* Match extended Unicode sequences. We will get here only if the
4256 support is in the binary; otherwise a compile-time error occurs. */
4257
4258 else if (ctype == OP_EXTUNI)
4259 {
4260 for (i = min; i < max; i++)
4261 {
4262 if (eptr >= md->end_subject) break;
4263 GETCHARINCTEST(c, eptr);
4264 prop_category = UCD_CATEGORY(c);
4265 if (prop_category == ucp_M) break;
4266 while (eptr < md->end_subject)
4267 {
4268 int len = 1;
4269 if (!utf8) c = *eptr; else
4270 {
4271 GETCHARLEN(c, eptr, len);
4272 }
4273 prop_category = UCD_CATEGORY(c);
4274 if (prop_category != ucp_M) break;
4275 eptr += len;
4276 }
4277 }
4278
4279 /* eptr is now past the end of the maximum run */
4280
4281 if (possessive) continue;
4282 for(;;)
4283 {
4284 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4285 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4286 if (eptr-- == pp) break; /* Stop if tried at original pos */
4287 for (;;) /* Move back over one extended */
4288 {
4289 int len = 1;
4290 if (!utf8) c = *eptr; else
4291 {
4292 BACKCHAR(eptr);
4293 GETCHARLEN(c, eptr, len);
4294 }
4295 prop_category = UCD_CATEGORY(c);
4296 if (prop_category != ucp_M) break;
4297 eptr--;
4298 }
4299 }
4300 }
4301
4302 else
4303 #endif /* SUPPORT_UCP */
4304
4305 #ifdef SUPPORT_UTF8
4306 /* UTF-8 mode */
4307
4308 if (utf8)
4309 {
4310 switch(ctype)
4311 {
4312 case OP_ANY:
4313 if (max < INT_MAX)
4314 {
4315 for (i = min; i < max; i++)
4316 {
4317 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4318 eptr++;
4319 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4320 }
4321 }
4322
4323 /* Handle unlimited UTF-8 repeat */
4324
4325 else
4326 {
4327 for (i = min; i < max; i++)
4328 {
4329 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4330 eptr++;
4331 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4332 }
4333 }
4334 break;
4335
4336 case OP_ALLANY:
4337 if (max < INT_MAX)
4338 {
4339 for (i = min; i < max; i++)
4340 {
4341 if (eptr >= md->end_subject) break;
4342 eptr++;
4343 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4344 }
4345 }
4346 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4347 break;
4348
4349 /* The byte case is the same as non-UTF8 */
4350
4351 case OP_ANYBYTE:
4352 c = max - min;
4353 if (c > (unsigned int)(md->end_subject - eptr))
4354 c = md->end_subject - eptr;
4355 eptr += c;
4356 break;
4357
4358 case OP_ANYNL:
4359 for (i = min; i < max; i++)
4360 {
4361 int len = 1;
4362 if (eptr >= md->end_subject) break;
4363 GETCHARLEN(c, eptr, len);
4364 if (c == 0x000d)
4365 {
4366 if (++eptr >= md->end_subject) break;
4367 if (*eptr == 0x000a) eptr++;
4368 }
4369 else
4370 {
4371 if (c != 0x000a &&
4372 (md->bsr_anycrlf ||
4373 (c != 0x000b && c != 0x000c &&
4374 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4375 break;
4376 eptr += len;
4377 }
4378 }
4379 break;
4380
4381 case OP_NOT_HSPACE:
4382 case OP_HSPACE:
4383 for (i = min; i < max; i++)
4384 {
4385 BOOL gotspace;
4386 int len = 1;
4387 if (eptr >= md->end_subject) break;
4388 GETCHARLEN(c, eptr, len);
4389 switch(c)
4390 {
4391 default: gotspace = FALSE; break;
4392 case 0x09: /* HT */
4393 case 0x20: /* SPACE */
4394 case 0xa0: /* NBSP */
4395 case 0x1680: /* OGHAM SPACE MARK */
4396 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4397 case 0x2000: /* EN QUAD */
4398 case 0x2001: /* EM QUAD */
4399 case 0x2002: /* EN SPACE */
4400 case 0x2003: /* EM SPACE */
4401 case 0x2004: /* THREE-PER-EM SPACE */
4402 case 0x2005: /* FOUR-PER-EM SPACE */
4403 case 0x2006: /* SIX-PER-EM SPACE */
4404 case 0x2007: /* FIGURE SPACE */
4405 case 0x2008: /* PUNCTUATION SPACE */
4406 case 0x2009: /* THIN SPACE */
4407 case 0x200A: /* HAIR SPACE */
4408 case 0x202f: /* NARROW NO-BREAK SPACE */
4409 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4410 case 0x3000: /* IDEOGRAPHIC SPACE */
4411 gotspace = TRUE;
4412 break;
4413 }
4414 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4415 eptr += len;
4416 }
4417 break;
4418
4419 case OP_NOT_VSPACE:
4420 case OP_VSPACE:
4421 for (i = min; i < max; i++)
4422 {
4423 BOOL gotspace;
4424 int len = 1;
4425 if (eptr >= md->end_subject) break;
4426 GETCHARLEN(c, eptr, len);
4427 switch(c)
4428 {
4429 default: gotspace = FALSE; break;
4430 case 0x0a: /* LF */
4431 case 0x0b: /* VT */
4432 case 0x0c: /* FF */
4433 case 0x0d: /* CR */
4434 case 0x85: /* NEL */
4435 case 0x2028: /* LINE SEPARATOR */
4436 case 0x2029: /* PARAGRAPH SEPARATOR */
4437 gotspace = TRUE;
4438 break;
4439 }
4440 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4441 eptr += len;
4442 }
4443 break;
4444
4445 case OP_NOT_DIGIT:
4446 for (i = min; i < max; i++)
4447 {
4448 int len = 1;
4449 if (eptr >= md->end_subject) break;
4450 GETCHARLEN(c, eptr, len);
4451 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4452 eptr+= len;
4453 }
4454 break;
4455
4456 case OP_DIGIT:
4457 for (i = min; i < max; i++)
4458 {
4459 int len = 1;
4460 if (eptr >= md->end_subject) break;
4461 GETCHARLEN(c, eptr, len);
4462 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4463 eptr+= len;
4464 }
4465 break;
4466
4467 case OP_NOT_WHITESPACE:
4468 for (i = min; i < max; i++)
4469 {
4470 int len = 1;
4471 if (eptr >= md->end_subject) break;
4472 GETCHARLEN(c, eptr, len);
4473 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4474 eptr+= len;
4475 }
4476 break;
4477
4478 case OP_WHITESPACE:
4479 for (i = min; i < max; i++)
4480 {
4481 int len = 1;
4482 if (eptr >= md->end_subject) break;
4483 GETCHARLEN(c, eptr, len);
4484 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4485 eptr+= len;
4486 }
4487 break;
4488
4489 case OP_NOT_WORDCHAR:
4490 for (i = min; i < max; i++)
4491 {
4492 int len = 1;
4493 if (eptr >= md->end_subject) break;
4494 GETCHARLEN(c, eptr, len);
4495 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4496 eptr+= len;
4497 }
4498 break;
4499
4500 case OP_WORDCHAR:
4501 for (i = min; i < max; i++)
4502 {
4503 int len = 1;
4504 if (eptr >= md->end_subject) break;
4505 GETCHARLEN(c, eptr, len);
4506 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4507 eptr+= len;
4508 }
4509 break;
4510
4511 default:
4512 RRETURN(PCRE_ERROR_INTERNAL);
4513 }
4514
4515 /* eptr is now past the end of the maximum run */
4516
4517 if (possessive) continue;
4518 for(;;)
4519 {
4520 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4521 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4522 if (eptr-- == pp) break; /* Stop if tried at original pos */
4523 BACKCHAR(eptr);
4524 }
4525 }
4526 else
4527 #endif /* SUPPORT_UTF8 */
4528
4529 /* Not UTF-8 mode */
4530 {
4531 switch(ctype)
4532 {
4533 case OP_ANY:
4534 for (i = min; i < max; i++)
4535 {
4536 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4537 eptr++;
4538 }
4539 break;
4540
4541 case OP_ALLANY:
4542 case OP_ANYBYTE:
4543 c = max - min;
4544 if (c > (unsigned int)(md->end_subject - eptr))
4545 c = md->end_subject - eptr;
4546 eptr += c;
4547 break;
4548
4549 case OP_ANYNL:
4550 for (i = min; i < max; i++)
4551 {
4552 if (eptr >= md->end_subject) break;
4553 c = *eptr;
4554 if (c == 0x000d)
4555 {
4556 if (++eptr >= md->end_subject) break;
4557 if (*eptr == 0x000a) eptr++;
4558 }
4559 else
4560 {
4561 if (c != 0x000a &&
4562 (md->bsr_anycrlf ||
4563 (c != 0x000b && c != 0x000c && c != 0x0085)))
4564 break;
4565 eptr++;
4566 }
4567 }
4568 break;
4569
4570 case OP_NOT_HSPACE:
4571 for (i = min; i < max; i++)
4572 {
4573 if (eptr >= md->end_subject) break;
4574 c = *eptr;
4575 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4576 eptr++;
4577 }
4578 break;
4579
4580 case OP_HSPACE:
4581 for (i = min; i < max; i++)
4582 {
4583 if (eptr >= md->end_subject) break;
4584 c = *eptr;
4585 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4586 eptr++;
4587 }
4588 break;
4589
4590 case OP_NOT_VSPACE:
4591 for (i = min; i < max; i++)
4592 {
4593 if (eptr >= md->end_subject) break;
4594 c = *eptr;
4595 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4596 break;
4597 eptr++;
4598 }
4599 break;
4600
4601 case OP_VSPACE:
4602 for (i = min; i < max; i++)
4603 {
4604 if (eptr >= md->end_subject) break;
4605 c = *eptr;
4606 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4607 break;
4608 eptr++;
4609 }
4610 break;
4611
4612 case OP_NOT_DIGIT:
4613 for (i = min; i < max; i++)
4614 {
4615 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4616 break;
4617 eptr++;
4618 }
4619 break;
4620
4621 case OP_DIGIT:
4622 for (i = min; i < max; i++)
4623 {
4624 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4625 break;
4626 eptr++;
4627 }
4628 break;
4629
4630 case OP_NOT_WHITESPACE:
4631 for (i = min; i < max; i++)
4632 {
4633 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4634 break;
4635 eptr++;
4636 }
4637 break;
4638
4639 case OP_WHITESPACE:
4640 for (i = min; i < max; i++)
4641 {
4642 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4643 break;
4644 eptr++;
4645 }
4646 break;
4647
4648 case OP_NOT_WORDCHAR:
4649 for (i = min; i < max; i++)
4650 {
4651 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4652 break;
4653 eptr++;
4654 }
4655 break;
4656
4657 case OP_WORDCHAR:
4658 for (i = min; i < max; i++)
4659 {
4660 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4661 break;
4662 eptr++;
4663 }
4664 break;
4665
4666 default:
4667 RRETURN(PCRE_ERROR_INTERNAL);
4668 }
4669
4670 /* eptr is now past the end of the maximum run */
4671
4672 if (possessive) continue;
4673 while (eptr >= pp)
4674 {
4675 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4676 eptr--;
4677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4678 }
4679 }
4680
4681 /* Get here if we can't make it match with any permitted repetitions */
4682
4683 RRETURN(MATCH_NOMATCH);
4684 }
4685 /* Control never gets here */
4686
4687 /* There's been some horrible disaster. Arrival here can only mean there is
4688 something seriously wrong in the code above or the OP_xxx definitions. */
4689
4690 default:
4691 DPRINTF(("Unknown opcode %d\n", *ecode));
4692 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4693 }
4694
4695 /* Do not stick any code in here without much thought; it is assumed
4696 that "continue" in the code above comes out to here to repeat the main
4697 loop. */
4698
4699 } /* End of main loop */
4700 /* Control never reaches here */
4701
4702
4703 /* When compiling to use the heap rather than the stack for recursive calls to
4704 match(), the RRETURN() macro jumps here. The number that is saved in
4705 frame->Xwhere indicates which label we actually want to return to. */
4706
4707 #ifdef NO_RECURSE
4708 #define LBL(val) case val: goto L_RM##val;
4709 HEAP_RETURN:
4710 switch (frame->Xwhere)
4711 {
4712 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4713 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4714 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4715 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4716 LBL(53) LBL(54)
4717 #ifdef SUPPORT_UTF8
4718 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4719 LBL(32) LBL(34) LBL(42) LBL(46)
4720 #ifdef SUPPORT_UCP
4721 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4722 #endif /* SUPPORT_UCP */
4723 #endif /* SUPPORT_UTF8 */
4724 default:
4725 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4726 return PCRE_ERROR_INTERNAL;
4727 }
4728 #undef LBL
4729 #endif /* NO_RECURSE */
4730 }
4731
4732
4733 /***************************************************************************
4734 ****************************************************************************
4735 RECURSION IN THE match() FUNCTION
4736
4737 Undefine all the macros that were defined above to handle this. */
4738
/* Remove the macro names that were in force for the body of match() so that
they cannot affect the code that follows. The names inside the #ifdef exist as
macros only when NO_RECURSE is defined. NOTE(review): presumably each one
aliases a field of the heap-allocated frame (compare frame->Xwhere in the
HEAP_RETURN code) - confirm against the matching #define list earlier in the
file, which has to be kept in step with this list. */
4739 #ifdef NO_RECURSE
4740 #undef eptr
4741 #undef ecode
4742 #undef mstart
4743 #undef offset_top
4744 #undef ims
4745 #undef eptrb
4746 #undef flags
4747
4748 #undef callpat
4749 #undef charptr
4750 #undef data
4751 #undef next
4752 #undef pp
4753 #undef prev
4754 #undef saved_eptr
4755
4756 #undef new_recursive
4757
4758 #undef cur_is_word
4759 #undef condition
4760 #undef prev_is_word
4761
4762 #undef original_ims
4763
4764 #undef ctype
4765 #undef length
4766 #undef max
4767 #undef min
4768 #undef number
4769 #undef offset
4770 #undef op
4771 #undef save_capture_last
4772 #undef save_offset1
4773 #undef save_offset2
4774 #undef save_offset3
4775 #undef stacksave
4776
4777 #undef newptrb
4778
4779 #endif /* NO_RECURSE */
4780
4781 /* These two are defined as macros in both cases (that is, whether or not
NO_RECURSE is defined), so they are undefined unconditionally. */
4782
4783 #undef fc
4784 #undef fi
4785
4786 /***************************************************************************
4787 ***************************************************************************/
4788
4789
4790
4791 /*************************************************
4792 * Execute a Regular Expression *
4793 *************************************************/
4794
4795 /* This function applies a compiled re to a subject string and picks out
4796 portions of the string if it matches. Two elements in the vector are set for
4797 each substring: the offsets to the start and end of the substring.
4798
The function validates its arguments, sets up a match_data block, and then
calls match() at successive starting positions in the subject (the "bumpalong"
loop), beginning at start_offset. For an anchored pattern only one attempt is
made. Unless PCRE_NO_START_OPTIMIZE is set, each attempt is preceded by cheap
scans (known first byte, start of line, studied start bitmap, required later
byte) that can skip hopeless starting positions or abandon the search early.

If the supplied vector is too small to hold the highest back reference in the
pattern, a temporary working vector is obtained with pcre_malloc; as much as
fits is copied back on success and the temporary vector is freed before
returning.

4799 Arguments:
4800 argument_re points to the compiled expression
4801 extra_data points to extra data or is NULL
4802 subject points to the subject string
4803 length length of subject string (may contain binary zeros)
4804 start_offset where to start in the subject string
4805 options option bits
4806 offsets points to a vector of ints to be filled in with offsets
4807 offsetcount the number of elements in the vector
4808
4809 Returns: > 0 => success; value is the number of elements filled in
4810 = 0 => success, but offsets is not big enough
4811 -1 => failed to match
4812 < -1 => some kind of unexpected problem
PCRE_ERROR_PARTIAL => no complete match, but an attempt
reported hitting the end of the subject (md->hitend);
if offsetcount > 1, offsets[0] is the start of the
first such attempt and offsets[1] the subject length
other negative values returned directly by this function:
PCRE_ERROR_BADOPTION, PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT,
PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADNEWLINE,
PCRE_ERROR_BADPARTIAL, PCRE_ERROR_BADUTF8,
PCRE_ERROR_BADUTF8_OFFSET, PCRE_ERROR_NOMEMORY; any other
error code produced by match() is passed through unchanged
4813 */
4814
4815 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4816 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4817 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4818 int offsetcount)
4819 {
4820 int rc, resetcount, ocount;
4821 int first_byte = -1; /* -1 => no known first byte */
4822 int req_byte = -1; /* -1 => no known required byte */
4823 int req_byte2 = -1; /* other-case version of req_byte */
4824 int newline;
4825 unsigned long int ims;
4826 BOOL using_temporary_offsets = FALSE;
4827 BOOL anchored;
4828 BOOL startline;
4829 BOOL firstline;
4830 BOOL first_byte_caseless = FALSE;
4831 BOOL req_byte_caseless = FALSE;
4832 BOOL utf8;
4833 match_data match_block;
4834 match_data *md = &match_block;
4835 const uschar *tables;
4836 const uschar *start_bits = NULL;
4837 USPTR start_match = (USPTR)subject + start_offset;
4838 USPTR end_subject;
4839 USPTR start_partial = NULL; /* start of first attempt that set md->hitend */
4840 USPTR req_byte_ptr = start_match - 1; /* where req_byte was last found; initially before any search start */
4841
4842 pcre_study_data internal_study;
4843 const pcre_study_data *study;
4844
4845 real_pcre internal_re;
4846 const real_pcre *external_re = (const real_pcre *)argument_re;
4847 const real_pcre *re = external_re;
4848
4849 /* Plausibility checks */
4850
4851 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4852 if (re == NULL || subject == NULL ||
4853 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4854 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4855
4856 /* Fish out the optional data from the extra_data structure, first setting
4857 the default values. */
4858
4859 study = NULL;
4860 md->match_limit = MATCH_LIMIT;
4861 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4862 md->callout_data = NULL;
4863
4864 /* The table pointer is always in native byte order. */
4865
4866 tables = external_re->tables;
4867
4868 if (extra_data != NULL)
4869 {
4870 register unsigned int flags = extra_data->flags;
4871 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4872 study = (const pcre_study_data *)extra_data->study_data;
4873 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4874 md->match_limit = extra_data->match_limit;
4875 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4876 md->match_limit_recursion = extra_data->match_limit_recursion;
4877 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4878 md->callout_data = extra_data->callout_data;
4879 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4880 }
4881
4882 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4883 is a feature that makes it possible to save compiled regex and re-use them
4884 in other programs later. */
4885
4886 if (tables == NULL) tables = _pcre_default_tables;
4887
4888 /* Check that the first field in the block is the magic number. If it is not,
4889 test for a regex that was compiled on a host of opposite endianness. If this is
4890 the case, flipped values are put in internal_re and internal_study if there was
4891 study data too. */
4892
4893 if (re->magic_number != MAGIC_NUMBER)
4894 {
4895 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4896 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4897 if (study != NULL) study = &internal_study;
4898 }
4899
4900 /* Set up other data */
4901
4902 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4903 startline = (re->flags & PCRE_STARTLINE) != 0;
4904 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4905
4906 /* The code starts after the real_pcre block and the capture name table. */
4907
4908 md->start_code = (const uschar *)external_re + re->name_table_offset +
4909 re->name_count * re->name_entry_size;
4910
4911 md->start_subject = (USPTR)subject;
4912 md->start_offset = start_offset;
4913 md->end_subject = md->start_subject + length;
4914 end_subject = md->end_subject;
4915
4916 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4917 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4918 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4919
4920 md->notbol = (options & PCRE_NOTBOL) != 0;
4921 md->noteol = (options & PCRE_NOTEOL) != 0;
4922 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4923 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 : /* 2 = hard, 1 = soft, 0 = off */
4924 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
4925 md->hitend = FALSE;
4926
4927 md->recursive = NULL; /* No recursion at top level */
4928
4929 md->lcc = tables + lcc_offset;
4930 md->ctypes = tables + ctypes_offset;
4931
4932 /* Handle different \R options. */
/* A run-time setting takes precedence; with none, the compile-time setting
applies, and failing that the build-time default selected by BSR_ANYCRLF.
Giving both run-time bits at once is rejected. */
4933
4934 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4935 {
4936 case 0:
4937 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4938 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4939 else
4940 #ifdef BSR_ANYCRLF
4941 md->bsr_anycrlf = TRUE;
4942 #else
4943 md->bsr_anycrlf = FALSE;
4944 #endif
4945 break;
4946
4947 case PCRE_BSR_ANYCRLF:
4948 md->bsr_anycrlf = TRUE;
4949 break;
4950
4951 case PCRE_BSR_UNICODE:
4952 md->bsr_anycrlf = FALSE;
4953 break;
4954
4955 default: return PCRE_ERROR_BADNEWLINE;
4956 }
4957
4958 /* Handle different types of newline. The three bits give eight cases. If
4959 nothing is set at run time, whatever was used at compile time applies. */
/* The local "newline" encodes the choice: -1 means any newline, -2 means
any of CR, LF, CRLF, a value above 255 holds a two-character sequence (first
character in the high byte), and any other value is a single character. */
4960
4961 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4962 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4963 {
4964 case 0: newline = NEWLINE; break; /* Compile-time default */
4965 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4966 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4967 case PCRE_NEWLINE_CR+
4968 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4969 case PCRE_NEWLINE_ANY: newline = -1; break;
4970 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4971 default: return PCRE_ERROR_BADNEWLINE;
4972 }
4973
4974 if (newline == -2)
4975 {
4976 md->nltype = NLTYPE_ANYCRLF;
4977 }
4978 else if (newline < 0)
4979 {
4980 md->nltype = NLTYPE_ANY;
4981 }
4982 else
4983 {
4984 md->nltype = NLTYPE_FIXED;
4985 if (newline > 255)
4986 {
4987 md->nllen = 2;
4988 md->nl[0] = (newline >> 8) & 255;
4989 md->nl[1] = newline & 255;
4990 }
4991 else
4992 {
4993 md->nllen = 1;
4994 md->nl[0] = newline;
4995 }
4996 }
4997
4998 /* Partial matching was originally supported only for a restricted set of
4999 regexes; from release 8.00 there are no restrictions, but the bits are still
5000 defined (though never set). So there's no harm in leaving this code. */
5001
5002 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5003 return PCRE_ERROR_BADPARTIAL;
5004
5005 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5006 back the character offset. */
/* In addition, a start_offset strictly inside the subject must not point at a
UTF-8 continuation byte (top two bits 10). */
5007
5008 #ifdef SUPPORT_UTF8
5009 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5010 {
5011 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5012 return PCRE_ERROR_BADUTF8;
5013 if (start_offset > 0 && start_offset < length)
5014 {
5015 int tb = ((USPTR)subject)[start_offset];
5016 if (tb > 127)
5017 {
5018 tb &= 0xc0;
5019 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5020 }
5021 }
5022 }
5023 #endif
5024
5025 /* The ims options can vary during the matching as a result of the presence
5026 of (?ims) items in the pattern. They are kept in a local variable so that
5027 restoring at the exit of a group is easy. */
5028
5029 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5030
5031 /* If the expression has got more back references than the offsets supplied can
5032 hold, we get a temporary chunk of working store to use during the matching.
5033 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5034 of 3. */
/* No failure return is taken between this allocation and the ENDLOOP code, so
both exit paths below are responsible for freeing the temporary vector. */
5035
5036 ocount = offsetcount - (offsetcount % 3);
5037
5038 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5039 {
5040 ocount = re->top_backref * 3 + 3;
5041 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5042 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5043 using_temporary_offsets = TRUE;
5044 DPRINTF(("Got memory to hold back references\n"));
5045 }
5046 else md->offset_vector = offsets;
5047
5048 md->offset_end = ocount;
5049 md->offset_max = (2*ocount)/3;
5050 md->offset_overflow = FALSE;
5051 md->capture_last = -1;
5052
5053 /* Compute the minimum number of offsets that we need to reset each time. Doing
5054 this makes a huge difference to execution time when there aren't many brackets
5055 in the pattern. */
5056
5057 resetcount = 2 + re->top_bracket * 2;
5058 if (resetcount > offsetcount) resetcount = ocount;
5059
5060 /* Reset the working variable associated with each extraction. These should
5061 never be used unless previously set, but they get saved and restored, and so we
5062 initialize them to avoid reading uninitialized locations. */
/* The slots concerned are the last resetcount/2 - 1 entries of the vector;
the NULL test covers the case of offsets == NULL with offsetcount == 0. */
5063
5064 if (md->offset_vector != NULL)
5065 {
5066 register int *iptr = md->offset_vector + ocount;
5067 register int *iend = iptr - resetcount/2 + 1;
5068 while (--iptr >= iend) *iptr = -1;
5069 }
5070
5071 /* Set up the first character to match, if available. The first_byte value is
5072 never set for an anchored regular expression, but the anchoring may be forced
5073 at run time, so we have to test for anchoring. The first char may be unset for
5074 an unanchored pattern, of course. If there's no first char and the pattern was
5075 studied, there may be a bitmap of possible first characters. */
5076
5077 if (!anchored)
5078 {
5079 if ((re->flags & PCRE_FIRSTSET) != 0)
5080 {
5081 first_byte = re->first_byte & 255;
5082 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5083 first_byte = md->lcc[first_byte];
5084 }
5085 else
5086 if (!startline && study != NULL &&
5087 (study->options & PCRE_STUDY_MAPPED) != 0)
5088 start_bits = study->start_bits;
5089 }
5090
5091 /* For anchored or unanchored matches, there may be a "last known required
5092 character" set. */
5093
5094 if ((re->flags & PCRE_REQCHSET) != 0)
5095 {
5096 req_byte = re->req_byte & 255;
5097 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5098 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5099 }
5100
5101
5102 /* ==========================================================================*/
5103
5104 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5105 the loop runs just once. */
5106
5107 for(;;)
5108 {
5109 USPTR save_end_subject = end_subject;
5110 USPTR new_start_match;
5111
5112 /* Reset the maximum number of extractions we might see. */
5113
5114 if (md->offset_vector != NULL)
5115 {
5116 register int *iptr = md->offset_vector;
5117 register int *iend = iptr + resetcount;
5118 while (iptr < iend) *iptr++ = -1;
5119 }
5120
5121 /* If firstline is TRUE, the start of the match is constrained to the first
5122 line of a multiline string. That is, the match must be before or at the first
5123 newline. Implement this by temporarily adjusting end_subject so that we stop
5124 scanning at a newline. If the match fails at the newline, later code breaks
5125 this loop. */
5126
5127 if (firstline)
5128 {
5129 USPTR t = start_match;
5130 #ifdef SUPPORT_UTF8
5131 if (utf8)
5132 {
5133 while (t < md->end_subject && !IS_NEWLINE(t))
5134 {
5135 t++;
5136 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5137 }
5138 }
5139 else
5140 #endif
5141 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5142 end_subject = t;
5143 }
5144
5145 /* There are some optimizations that avoid running the match if a known
5146 starting point is not found, or if a known later character is not present.
5147 However, there is an option that disables these, for testing and for ensuring
5148 that all callouts do actually occur. */
5149
5150 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5151 {
5152 /* Advance to a unique first byte if there is one. */
5153
5154 if (first_byte >= 0)
5155 {
5156 if (first_byte_caseless)
5157 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5158 start_match++;
5159 else
5160 while (start_match < end_subject && *start_match != first_byte)
5161 start_match++;
5162 }
5163
5164 /* Or to just after a linebreak for a multiline match */
/* The first attempt, at start_offset itself, is left alone; the outer test
also guarantees that start_match[-1] below lies inside the subject. */
5165
5166 else if (startline)
5167 {
5168 if (start_match > md->start_subject + start_offset)
5169 {
5170 #ifdef SUPPORT_UTF8
5171 if (utf8)
5172 {
5173 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5174 {
5175 start_match++;
5176 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5177 start_match++;
5178 }
5179 }
5180 else
5181 #endif
5182 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5183 start_match++;
5184
5185 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5186 and we are now at a LF, advance the match position by one more character.
5187 */
5188
5189 if (start_match[-1] == CHAR_CR &&
5190 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5191 start_match < end_subject &&
5192 *start_match == CHAR_NL)
5193 start_match++;
5194 }
5195 }
5196
5197 /* Or to a non-unique first byte after study */
5198
5199 else if (start_bits != NULL)
5200 {
5201 while (start_match < end_subject)
5202 {
5203 register unsigned int c = *start_match;
5204 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5205 else break;
5206 }
5207 }
5208 } /* Starting optimizations */
5209
5210 /* Restore fudged end_subject */
5211
5212 end_subject = save_end_subject;
5213
5214 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5215 printf(">>>> Match against: ");
5216 pchars(start_match, end_subject - start_match, TRUE, md);
5217 printf("\n");
5218 #endif
5219
5220 /* If req_byte is set, we know that that character must appear in the
5221 subject for the match to succeed. If the first character is set, req_byte
5222 must be later in the subject; otherwise the test starts at the match point.
5223 This optimization can save a huge amount of backtracking in patterns with
5224 nested unlimited repeats that aren't going to match. Writing separate code
5225 for cased/caseless versions makes it go faster, as does using an
5226 autoincrement and backing off on a match.
5227
5228 HOWEVER: when the subject string is very, very long, searching to its end
5229 can take a long time, and give bad performance on quite ordinary patterns.
5230 This showed up when somebody was matching something like /^\d+C/ on a
5231 32-megabyte string... so we don't do this when the string is sufficiently
5232 long.
5233
5234 ALSO: this processing is disabled when partial matching is requested, or if
5235 disabling is explicitly requested. */
5236
5237 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
5238 req_byte >= 0 &&
5239 end_subject - start_match < REQ_BYTE_MAX &&
5240 !md->partial)
5241 {
5242 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5243
5244 /* We don't need to repeat the search if we haven't yet reached the
5245 place we found it at last time. */
5246
5247 if (p > req_byte_ptr)
5248 {
5249 if (req_byte_caseless)
5250 {
5251 while (p < end_subject)
5252 {
5253 register int pp = *p++;
5254 if (pp == req_byte || pp == req_byte2) { p--; break; }
5255 }
5256 }
5257 else
5258 {
5259 while (p < end_subject)
5260 {
5261 if (*p++ == req_byte) { p--; break; }
5262 }
5263 }
5264
5265 /* If we can't find the required character, break the matching loop,
5266 forcing a match failure. */
5267
5268 if (p >= end_subject)
5269 {
5270 rc = MATCH_NOMATCH;
5271 break;
5272 }
5273
5274 /* If we have found the required character, save the point where we
5275 found it, so that we don't search again next time round the loop if
5276 the start hasn't passed this character yet. */
5277
5278 req_byte_ptr = p;
5279 }
5280 }
5281
5282 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5283 first starting point for which a partial match was found. */
5284
5285 md->start_match_ptr = start_match;
5286 md->start_used_ptr = start_match;
5287 md->match_call_count = 0;
5288 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5289 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5290
/* A SKIP result is only usable if it moves the starting point forwards. If
the position passed back in md->start_match_ptr is not beyond the position
that has just been tried, restarting there would repeat the same attempt (or
an earlier one) indefinitely, and the start_match[-1] test further down could
look before the start of the subject. Such a result is therefore handled as an
ordinary NOMATCH, which advances by one character. */

if (rc == MATCH_SKIP && md->start_match_ptr <= start_match) rc = MATCH_NOMATCH;

5291 switch(rc)
5292 {
5293 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5294 exactly like PRUNE. */
5295
5296 case MATCH_NOMATCH:
5297 case MATCH_PRUNE:
5298 case MATCH_THEN:
5299 new_start_match = start_match + 1;
5300 #ifdef SUPPORT_UTF8
5301 if (utf8)
5302 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5303 new_start_match++;
5304 #endif
5305 break;
5306
5307 /* SKIP passes back the next starting point explicitly. The check before
the switch ensures that it is beyond the current starting point. */
5308
5309 case MATCH_SKIP:
5310 new_start_match = md->start_match_ptr;
5311 break;
5312
5313 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5314
5315 case MATCH_COMMIT:
5316 rc = MATCH_NOMATCH;
5317 goto ENDLOOP;
5318
5319 /* Any other return is some kind of error. */
/* (A successful MATCH_MATCH also leaves the loop by this route.) */
5320
5321 default:
5322 goto ENDLOOP;
5323 }
5324
5325 /* Control reaches here for the various types of "no match at this point"
5326 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5327
5328 rc = MATCH_NOMATCH;
5329
5330 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5331 newline in the subject (though it may continue over the newline). Therefore,
5332 if we have just failed to match, starting at a newline, do not continue. */
5333
5334 if (firstline && IS_NEWLINE(start_match)) break;
5335
5336 /* Advance to new matching position */
5337
5338 start_match = new_start_match;
5339
5340 /* Break the loop if the pattern is anchored or if we have passed the end of
5341 the subject. */
5342
5343 if (anchored || start_match > end_subject) break;
5344
5345 /* If we have just passed a CR and we are now at a LF, and the pattern does
5346 not contain any explicit matches for \r or \n, and the newline option is CRLF
5347 or ANY or ANYCRLF, advance the match position by one more character. */
5348
5349 if (start_match[-1] == CHAR_CR &&
5350 start_match < end_subject &&
5351 *start_match == CHAR_NL &&
5352 (re->flags & PCRE_HASCRORLF) == 0 &&
5353 (md->nltype == NLTYPE_ANY ||
5354 md->nltype == NLTYPE_ANYCRLF ||
5355 md->nllen == 2))
5356 start_match++;
5357
5358 } /* End of for(;;) "bumpalong" loop */
5359
5360 /* ==========================================================================*/
5361
5362 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5363 conditions is true:
5364
5365 (1) The pattern is anchored or the match was failed by (*COMMIT);
5366
5367 (2) We are past the end of the subject;
5368
5369 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5370 this option requests that a match occur at or before the first newline in
5371 the subject.
5372
5373 When we have a match and the offset vector is big enough to deal with any
5374 backreferences, captured substring offsets will already be set up. In the case
5375 where we had to get some local store to hold offsets for backreference
5376 processing, copy those that we can. In this case there need not be overflow if
5377 certain parts of the pattern were not used, even though there are more
5378 capturing parentheses than vector slots. */
5379
5380 ENDLOOP:
5381
5382 if (rc == MATCH_MATCH)
5383 {
5384 if (using_temporary_offsets)
5385 {
5386 if (offsetcount >= 4)
5387 {
5388 memcpy(offsets + 2, md->offset_vector + 2,
5389 (offsetcount - 2) * sizeof(int));
5390 DPRINTF(("Copied offsets from temporary memory\n"));
5391 }
5392 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5393 DPRINTF(("Freeing temporary memory\n"));
5394 (pcre_free)(md->offset_vector);
5395 }
5396
5397 /* Set the return code to the number of captured strings, or 0 if there are
5398 too many to fit into the vector. */
5399
5400 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5401
5402 /* If there is space, set up the whole thing as substring 0. The value of
5403 md->start_match_ptr might be modified if \K was encountered on the success
5404 matching path. */
5405
5406 if (offsetcount < 2) rc = 0; else
5407 {
5408 offsets[0] = md->start_match_ptr - md->start_subject;
5409 offsets[1] = md->end_match_ptr - md->start_subject;
5410 }
5411
5412 DPRINTF((">>>> returning %d\n", rc));
5413 return rc;
5414 }
5415
5416 /* Control gets here if there has been an error, or if the overall match
5417 attempt has failed at all permitted starting positions. */
5418
5419 if (using_temporary_offsets)
5420 {
5421 DPRINTF(("Freeing temporary memory\n"));
5422 (pcre_free)(md->offset_vector);
5423 }
5424
/* Result priority: an error code other than PCRE_ERROR_PARTIAL is returned
as it stands; otherwise a recorded partial match is reported, with its extent
placed in the first two offsets if there is room; otherwise the overall result
is PCRE_ERROR_NOMATCH. */

5425 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5426 {
5427 DPRINTF((">>>> error: returning %d\n", rc));
5428 return rc;
5429 }
5430 else if (start_partial != NULL)
5431 {
5432 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5433 if (offsetcount > 1)
5434 {
5435 offsets[0] = start_partial - (USPTR)subject;
5436 offsets[1] = end_subject - (USPTR)subject;
5437 }
5438 return PCRE_ERROR_PARTIAL;
5439 }
5440 else
5441 {
5442 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5443 return PCRE_ERROR_NOMATCH;
5444 }
5445 }
5446
5447 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5