/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 550 - (show annotations)
Sun Oct 10 16:24:11 2010 UTC (9 years ago) by ph10
File MIME type: text/plain
File size: 186301 byte(s)
Error occurred while calculating annotation data.
Fix problem with (*THEN) not backing up far enough.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
#define match_condassert   0x01   /* match() is evaluating an assertion used as a condition */
#define match_cbegroup     0x02   /* entering an unlimited repeat group that could match empty */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Internal-only results of match(). They are kept far below zero so that they
cannot collide with the external (negative) error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_PRUNE        (-997)
#define MATCH_SKIP         (-996)
#define MATCH_SKIP_ARG     (-995)
#define MATCH_THEN         (-994)
80
81 /* This is a convenience macro for code that occurs many times. */
82
#define MRRETURN(ra) \
  { \
  md->mark = markptr;   /* hand the current mark back via the match data */ \
  RRETURN(ra);          /* then leave through the normal return macro */ \
  }
88
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
92
#define REC_STACK_SAVE_MAX   30   /* ints of offset data kept on the stack; a multiple of 3 */
94
95 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
96
static const char rep_min[] = {
  0, 0,     /* entries 0-1: minimum of zero */
  1, 1,     /* entries 2-3: minimum of one */
  0, 0 };   /* entries 4-5: minimum of zero */

static const char rep_max[] = {
  0, 0,     /* entries 0-1: 0 means no upper limit */
  0, 0,     /* entries 2-3: 0 means no upper limit */
  1, 1 };   /* entries 4-5: at most one */
99
100
101
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
106
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if requested.
109
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE
115
116 Returns: nothing
117 */
118
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121 {
122 unsigned int c;
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126 }
127 #endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
144
145 Returns: TRUE if matched
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if not enough characters left */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length;
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188 }
189 }
190 else
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE;
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
250 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
251 below must be updated in sync. */
252
enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62
};
260
261 /* These versions of the macros use the stack, as normal. There are debugging
262 versions and production versions. Note that the "rw" argument of RMATCH isn't
263 actually used in this definition. */
264
265 #ifndef NO_RECURSE
#define REGISTER register

#ifndef PCRE_DEBUG

/* Production versions: a plain recursive call and a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#else  /* PCRE_DEBUG */

/* Debugging versions: the same call and return, with trace output that
reports the source line of each use. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif  /* PCRE_DEBUG */
285
286 #else
287
288
289 /* These versions of the macros manage a private stack on the heap. Note that
290 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
291 argument of match(), which never changes. */
292
#define REGISTER

/* Simulated call. A new frame is obtained, the "arguments" are copied into it
in the same order as before, the resume point (rw) is recorded in the calling
frame, and control jumps to HEAP_RECURSE. The label L_<rw> is where the caller
is resumed. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  newframe->Xprevframe = frame;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  frame->Xwhere = rw;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Simulated return. The current frame is released; if there is a calling
frame, the result is left in rrc and control jumps to HEAP_RETURN, otherwise
this is the top level and match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
329
330
331 /* Structure for remembering the local variables in a private frame */
332
333 typedef struct heapframe {
334 struct heapframe *Xprevframe;
335
336 /* Function arguments that may change */
337
338 USPTR Xeptr;
339 const uschar *Xecode;
340 USPTR Xmstart;
341 USPTR Xmarkptr;
342 int Xoffset_top;
343 long int Xims;
344 eptrblock *Xeptrb;
345 int Xflags;
346 unsigned int Xrdepth;
347
348 /* Function local variables */
349
350 USPTR Xcallpat;
351 #ifdef SUPPORT_UTF8
352 USPTR Xcharptr;
353 #endif
354 USPTR Xdata;
355 USPTR Xnext;
356 USPTR Xpp;
357 USPTR Xprev;
358 USPTR Xsaved_eptr;
359
360 recursion_info Xnew_recursive;
361
362 BOOL Xcur_is_word;
363 BOOL Xcondition;
364 BOOL Xprev_is_word;
365
366 unsigned long int Xoriginal_ims;
367
368 #ifdef SUPPORT_UCP
369 int Xprop_type;
370 int Xprop_value;
371 int Xprop_fail_result;
372 int Xprop_category;
373 int Xprop_chartype;
374 int Xprop_script;
375 int Xoclength;
376 uschar Xocchars[8];
377 #endif
378
379 int Xcodelink;
380 int Xctype;
381 unsigned int Xfc;
382 int Xfi;
383 int Xlength;
384 int Xmax;
385 int Xmin;
386 int Xnumber;
387 int Xoffset;
388 int Xop;
389 int Xsave_capture_last;
390 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
391 int Xstacksave[REC_STACK_SAVE_MAX];
392
393 eptrblock Xnewptrb;
394
395 /* Where to jump back to */
396
397 int Xwhere;
398
399 } heapframe;
400
401 #endif
402
403
404 /***************************************************************************
405 ***************************************************************************/
406
407
408
409 /*************************************************
410 * Match from current position *
411 *************************************************/
412
413 /* This function is called recursively in many circumstances. Whenever it
414 returns a negative (error) response, the outer incarnation must also return the
415 same response. */
416
417 /* These macros pack up tests that are used for partial matching, and which
418 appear several times in the code. We set the "hit end" flag if the pointer is
419 at the end of the subject and also past the start of the subject (i.e.
420 something has been matched). For hard partial matching, we then return
421 immediately. The second one is used when we already know we are past the end of
422 the subject. */
423
/* Partial-match test for use when eptr may or may not be at the end of the
subject: it acts only when partial matching is enabled, eptr has reached the
end, and eptr is past the match start. */
#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
    }
430
/* Partial-match test for use where the end of the subject is already known to
have been reached, so only the "past the match start" check is made. */
#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
    }
437
438
439 /* Performance note: It might be tempting to extract commonly used fields from
440 the md structure (e.g. utf8, end_subject) into individual variables to improve
441 performance. Tests using gcc on a SPARC disproved this; in the first case, it
442 made performance worse.
443
444 Arguments:
445 eptr pointer to current character in subject
446 ecode pointer to current position in compiled code
447 mstart pointer to the current match start position (can be modified
448 by encountering \K)
449 markptr pointer to the most recent MARK name, or NULL
450 offset_top current top pointer
451 md pointer to "static" info for the match
452 ims current /i, /m, and /s options
453 eptrb pointer to chain of blocks containing eptr at start of
454 brackets - for testing for empty matches
455 flags can contain
456 match_condassert - this is an assertion condition
457 match_cbegroup - this is the start of an unlimited repeat
458 group that can match an empty string
459 rdepth the recursion depth
460
461 Returns: MATCH_MATCH if matched ) these values are >= 0
462 MATCH_NOMATCH if failed to match )
463 a negative MATCH_xxx value for PRUNE, SKIP, etc
464 a negative PCRE_ERROR_xxx value if aborted by an error condition
465 (e.g. stopped by repeated call or recursion limit)
466 */
467
468 static int
469 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
470 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
471 eptrblock *eptrb, int flags, unsigned int rdepth)
472 {
473 /* These variables do not need to be preserved over recursion in this function,
474 so they can be ordinary variables in all cases. Mark some of them with
475 "register" because they are used a lot in loops. */
476
477 register int rrc; /* Returns from recursive calls */
478 register int i; /* Used for loops not involving calls to RMATCH() */
479 register unsigned int c; /* Character values not kept over RMATCH() calls */
480 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
481
482 BOOL minimize, possessive; /* Quantifier options */
483 int condcode;
484
485 /* When recursion is not being used, all "local" variables that have to be
486 preserved over calls to RMATCH() are part of a "frame" which is obtained from
487 heap storage. Set up the top-level frame here; others are obtained from the
488 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
489
490 #ifdef NO_RECURSE
491 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
492 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
493 frame->Xprevframe = NULL; /* Marks the top level */
494
495 /* Copy in the original argument variables */
496
497 frame->Xeptr = eptr;
498 frame->Xecode = ecode;
499 frame->Xmstart = mstart;
500 frame->Xmarkptr = markptr;
501 frame->Xoffset_top = offset_top;
502 frame->Xims = ims;
503 frame->Xeptrb = eptrb;
504 frame->Xflags = flags;
505 frame->Xrdepth = rdepth;
506
507 /* This is where control jumps back to to effect "recursion" */
508
509 HEAP_RECURSE:
510
511 /* Macros make the argument variables come from the current frame */
512
513 #define eptr frame->Xeptr
514 #define ecode frame->Xecode
515 #define mstart frame->Xmstart
516 #define markptr frame->Xmarkptr
517 #define offset_top frame->Xoffset_top
518 #define ims frame->Xims
519 #define eptrb frame->Xeptrb
520 #define flags frame->Xflags
521 #define rdepth frame->Xrdepth
522
523 /* Ditto for the local variables */
524
525 #ifdef SUPPORT_UTF8
526 #define charptr frame->Xcharptr
527 #endif
528 #define callpat frame->Xcallpat
529 #define codelink frame->Xcodelink
530 #define data frame->Xdata
531 #define next frame->Xnext
532 #define pp frame->Xpp
533 #define prev frame->Xprev
534 #define saved_eptr frame->Xsaved_eptr
535
536 #define new_recursive frame->Xnew_recursive
537
538 #define cur_is_word frame->Xcur_is_word
539 #define condition frame->Xcondition
540 #define prev_is_word frame->Xprev_is_word
541
542 #define original_ims frame->Xoriginal_ims
543
544 #ifdef SUPPORT_UCP
545 #define prop_type frame->Xprop_type
546 #define prop_value frame->Xprop_value
547 #define prop_fail_result frame->Xprop_fail_result
548 #define prop_category frame->Xprop_category
549 #define prop_chartype frame->Xprop_chartype
550 #define prop_script frame->Xprop_script
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580
581 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
582 const uschar *charptr; /* in small blocks of the code. My normal */
583 #endif /* style of coding would have declared */
584 const uschar *callpat; /* them within each of those blocks. */
585 const uschar *data; /* However, in order to accommodate the */
586 const uschar *next; /* version of this code that uses an */
587 USPTR pp; /* external "stack" implemented on the */
588 const uschar *prev; /* heap, it is easier to declare them all */
589 USPTR saved_eptr; /* here, so the declarations can be cut */
590 /* out in a block. The only declarations */
591 recursion_info new_recursive; /* within blocks below are for variables */
592 /* that do not have to be preserved over */
593 BOOL cur_is_word; /* a recursive call to RMATCH(). */
594 BOOL condition;
595 BOOL prev_is_word;
596
597 unsigned long int original_ims;
598
599 #ifdef SUPPORT_UCP
600 int prop_type;
601 int prop_value;
602 int prop_fail_result;
603 int prop_category;
604 int prop_chartype;
605 int prop_script;
606 int oclength;
607 uschar occhars[8];
608 #endif
609
610 int codelink;
611 int ctype;
612 int length;
613 int max;
614 int min;
615 int number;
616 int offset;
617 int op;
618 int save_capture_last;
619 int save_offset1, save_offset2, save_offset3;
620 int stacksave[REC_STACK_SAVE_MAX];
621
622 eptrblock newptrb;
623 #endif /* NO_RECURSE */
624
625 /* These statements are here to stop the compiler complaining about unitialized
626 variables. */
627
628 #ifdef SUPPORT_UCP
629 prop_value = 0;
630 prop_fail_result = 0;
631 #endif
632
633
634 /* This label is used for tail recursion, which is used in a few cases even
635 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
636 used. Thanks to Ian Taylor for noticing this possibility and sending the
637 original patch. */
638
639 TAIL_RECURSE:
640
641 /* OK, now we can get on with the real code of the function. Recursive calls
642 are specified by the macro RMATCH and RRETURN is used to return. When
643 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
644 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
645 defined). However, RMATCH isn't like a function call because it's quite a
646 complicated macro. It has to be used in one particular way. This shouldn't,
647 however, impact performance when true recursion is being used. */
648
649 #ifdef SUPPORT_UTF8
650 utf8 = md->utf8; /* Local copy of the flag */
651 #else
652 utf8 = FALSE;
653 #endif
654
655 /* First check that we haven't called match() too many times, or that we
656 haven't exceeded the recursive call limit. */
657
658 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
659 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
660
661 original_ims = ims; /* Save for resetting on ')' */
662
663 /* At the start of a group with an unlimited repeat that may match an empty
664 string, the match_cbegroup flag is set. When this is the case, add the current
665 subject pointer to the chain of such remembered pointers, to be checked when we
666 hit the closing ket, in order to break infinite loops that match no characters.
667 When match() is called in other circumstances, don't add to the chain. The
668 match_cbegroup flag must NOT be used with tail recursion, because the memory
669 block that is used is on the stack, so a new one may be required for each
670 match(). */
671
672 if ((flags & match_cbegroup) != 0)
673 {
674 newptrb.epb_saved_eptr = eptr;
675 newptrb.epb_prev = eptrb;
676 eptrb = &newptrb;
677 }
678
679 /* Now start processing the opcodes. */
680
681 for (;;)
682 {
683 minimize = possessive = FALSE;
684 op = *ecode;
685
686 switch(op)
687 {
688 case OP_MARK:
689 markptr = ecode + 2;
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
691 ims, eptrb, flags, RM55);
692
693 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
694 argument, and we must check whether that argument matches this MARK's
695 argument. It is passed back in md->start_match_ptr (an overloading of that
696 variable). If it does match, we reset that variable to the current subject
697 position and return MATCH_SKIP. Otherwise, pass back the return code
698 unaltered. */
699
700 if (rrc == MATCH_SKIP_ARG &&
701 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
702 {
703 md->start_match_ptr = eptr;
704 RRETURN(MATCH_SKIP);
705 }
706
707 if (md->mark == NULL) md->mark = markptr;
708 RRETURN(rrc);
709
710 case OP_FAIL:
711 MRRETURN(MATCH_NOMATCH);
712
713 case OP_COMMIT:
714 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
715 ims, eptrb, flags, RM52);
716 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
717 MRRETURN(MATCH_COMMIT);
718
719 case OP_PRUNE:
720 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
721 ims, eptrb, flags, RM51);
722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
723 MRRETURN(MATCH_PRUNE);
724
725 case OP_PRUNE_ARG:
726 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
727 ims, eptrb, flags, RM56);
728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
729 md->mark = ecode + 2;
730 RRETURN(MATCH_PRUNE);
731
732 case OP_SKIP:
733 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
734 ims, eptrb, flags, RM53);
735 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
736 md->start_match_ptr = eptr; /* Pass back current position */
737 MRRETURN(MATCH_SKIP);
738
739 case OP_SKIP_ARG:
740 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
741 ims, eptrb, flags, RM57);
742 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
743
744 /* Pass back the current skip name by overloading md->start_match_ptr and
745 returning the special MATCH_SKIP_ARG return code. This will either be
746 caught by a matching MARK, or get to the top, where it is treated the same
747 as PRUNE. */
748
749 md->start_match_ptr = ecode + 2;
750 RRETURN(MATCH_SKIP_ARG);
751
752 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
753 the alt that is at the start of the current branch. This makes it possible
754 to skip back past alternatives that precede the THEN within the current
755 branch. */
756
757 case OP_THEN:
758 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
759 ims, eptrb, flags, RM54);
760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
761 md->start_match_ptr = ecode - GET(ecode, 1);
762 MRRETURN(MATCH_THEN);
763
764 case OP_THEN_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
766 offset_top, md, ims, eptrb, flags, RM58);
767 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
768 md->start_match_ptr = ecode - GET(ecode, 1);
769 md->mark = ecode + LINK_SIZE + 2;
770 RRETURN(MATCH_THEN);
771
772 /* Handle a capturing bracket. If there is space in the offset vector, save
773 the current subject position in the working slot at the top of the vector.
774 We mustn't change the current values of the data slot, because they may be
775 set from a previous iteration of this group, and be referred to by a
776 reference inside the group.
777
778 If the bracket fails to match, we need to restore this value and also the
779 values of the final offsets, in case they were set by a previous iteration
780 of the same bracket.
781
782 If there isn't enough space in the offset vector, treat this as if it were
783 a non-capturing bracket. Don't worry about setting the flag for the error
784 case here; that is handled in the code for KET. */
785
786 case OP_CBRA:
787 case OP_SCBRA:
788 number = GET2(ecode, 1+LINK_SIZE);
789 offset = number << 1;
790
791 #ifdef PCRE_DEBUG
792 printf("start bracket %d\n", number);
793 printf("subject=");
794 pchars(eptr, 16, TRUE, md);
795 printf("\n");
796 #endif
797
798 if (offset < md->offset_max)
799 {
800 save_offset1 = md->offset_vector[offset];
801 save_offset2 = md->offset_vector[offset+1];
802 save_offset3 = md->offset_vector[md->offset_end - number];
803 save_capture_last = md->capture_last;
804
805 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
806 md->offset_vector[md->offset_end - number] =
807 (int)(eptr - md->start_subject);
808
809 flags = (op == OP_SCBRA)? match_cbegroup : 0;
810 do
811 {
812 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
813 ims, eptrb, flags, RM1);
814 if (rrc != MATCH_NOMATCH &&
815 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
816 RRETURN(rrc);
817 md->capture_last = save_capture_last;
818 ecode += GET(ecode, 1);
819 }
820 while (*ecode == OP_ALT);
821
822 DPRINTF(("bracket %d failed\n", number));
823
824 md->offset_vector[offset] = save_offset1;
825 md->offset_vector[offset+1] = save_offset2;
826 md->offset_vector[md->offset_end - number] = save_offset3;
827
828 if (rrc != MATCH_THEN) md->mark = markptr;
829 RRETURN(MATCH_NOMATCH);
830 }
831
832 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
833 as a non-capturing bracket. */
834
835 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
836 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
837
838 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
839
840 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
841 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
842
843 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
844 final alternative within the brackets, we would return the result of a
845 recursive call to match() whatever happened. We can reduce stack usage by
846 turning this into a tail recursion, except in the case when match_cbegroup
847 is set.*/
848
849 case OP_BRA:
850 case OP_SBRA:
851 DPRINTF(("start non-capturing bracket\n"));
852 flags = (op >= OP_SBRA)? match_cbegroup : 0;
853 for (;;)
854 {
855 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
856 {
857 if (flags == 0) /* Not a possibly empty group */
858 {
859 ecode += _pcre_OP_lengths[*ecode];
860 DPRINTF(("bracket 0 tail recursion\n"));
861 goto TAIL_RECURSE;
862 }
863
864 /* Possibly empty group; can't use tail recursion. */
865
866 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
867 eptrb, flags, RM48);
868 if (rrc == MATCH_NOMATCH) md->mark = markptr;
869 RRETURN(rrc);
870 }
871
872 /* For non-final alternatives, continue the loop for a NOMATCH result;
873 otherwise return. */
874
875 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
876 eptrb, flags, RM2);
877 if (rrc != MATCH_NOMATCH &&
878 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
879 RRETURN(rrc);
880 ecode += GET(ecode, 1);
881 }
882 /* Control never reaches here. */
883
884 /* Conditional group: compilation checked that there are no more than
885 two branches. If the condition is false, skipping the first branch takes us
886 past the end if there is only one branch, but that's OK because that is
887 exactly what going to the ket would do. As there is only one branch to be
888 obeyed, we can use tail recursion to avoid using another stack frame. */
889
890 case OP_COND:
891 case OP_SCOND:
892 codelink= GET(ecode, 1);
893
894 /* Because of the way auto-callout works during compile, a callout item is
895 inserted between OP_COND and an assertion condition. */
896
897 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
898 {
899 if (pcre_callout != NULL)
900 {
901 pcre_callout_block cb;
902 cb.version = 1; /* Version 1 of the callout block */
903 cb.callout_number = ecode[LINK_SIZE+2];
904 cb.offset_vector = md->offset_vector;
905 cb.subject = (PCRE_SPTR)md->start_subject;
906 cb.subject_length = (int)(md->end_subject - md->start_subject);
907 cb.start_match = (int)(mstart - md->start_subject);
908 cb.current_position = (int)(eptr - md->start_subject);
909 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
910 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
911 cb.capture_top = offset_top/2;
912 cb.capture_last = md->capture_last;
913 cb.callout_data = md->callout_data;
914 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
915 if (rrc < 0) RRETURN(rrc);
916 }
917 ecode += _pcre_OP_lengths[OP_CALLOUT];
918 }
919
920 condcode = ecode[LINK_SIZE+1];
921
922 /* Now see what the actual condition is */
923
924 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
925 {
926 if (md->recursive == NULL) /* Not recursing => FALSE */
927 {
928 condition = FALSE;
929 ecode += GET(ecode, 1);
930 }
931 else
932 {
933 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
934 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
935
936 /* If the test is for recursion into a specific subpattern, and it is
937 false, but the test was set up by name, scan the table to see if the
938 name refers to any other numbers, and test them. The condition is true
939 if any one is set. */
940
941 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
942 {
943 uschar *slotA = md->name_table;
944 for (i = 0; i < md->name_count; i++)
945 {
946 if (GET2(slotA, 0) == recno) break;
947 slotA += md->name_entry_size;
948 }
949
950 /* Found a name for the number - there can be only one; duplicate
951 names for different numbers are allowed, but not vice versa. First
952 scan down for duplicates. */
953
954 if (i < md->name_count)
955 {
956 uschar *slotB = slotA;
957 while (slotB > md->name_table)
958 {
959 slotB -= md->name_entry_size;
960 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
961 {
962 condition = GET2(slotB, 0) == md->recursive->group_num;
963 if (condition) break;
964 }
965 else break;
966 }
967
968 /* Scan up for duplicates */
969
970 if (!condition)
971 {
972 slotB = slotA;
973 for (i++; i < md->name_count; i++)
974 {
975 slotB += md->name_entry_size;
976 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
977 {
978 condition = GET2(slotB, 0) == md->recursive->group_num;
979 if (condition) break;
980 }
981 else break;
982 }
983 }
984 }
985 }
986
987 /* Choose branch according to the condition */
988
989 ecode += condition? 3 : GET(ecode, 1);
990 }
991 }
992
993 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
994 {
995 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
996 condition = offset < offset_top && md->offset_vector[offset] >= 0;
997
998 /* If the numbered capture is unset, but the reference was by name,
999 scan the table to see if the name refers to any other numbers, and test
1000 them. The condition is true if any one is set. This is tediously similar
1001 to the code above, but not close enough to try to amalgamate. */
1002
1003 if (!condition && condcode == OP_NCREF)
1004 {
1005 int refno = offset >> 1;
1006 uschar *slotA = md->name_table;
1007
1008 for (i = 0; i < md->name_count; i++)
1009 {
1010 if (GET2(slotA, 0) == refno) break;
1011 slotA += md->name_entry_size;
1012 }
1013
1014 /* Found a name for the number - there can be only one; duplicate names
1015 for different numbers are allowed, but not vice versa. First scan down
1016 for duplicates. */
1017
1018 if (i < md->name_count)
1019 {
1020 uschar *slotB = slotA;
1021 while (slotB > md->name_table)
1022 {
1023 slotB -= md->name_entry_size;
1024 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1025 {
1026 offset = GET2(slotB, 0) << 1;
1027 condition = offset < offset_top &&
1028 md->offset_vector[offset] >= 0;
1029 if (condition) break;
1030 }
1031 else break;
1032 }
1033
1034 /* Scan up for duplicates */
1035
1036 if (!condition)
1037 {
1038 slotB = slotA;
1039 for (i++; i < md->name_count; i++)
1040 {
1041 slotB += md->name_entry_size;
1042 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1043 {
1044 offset = GET2(slotB, 0) << 1;
1045 condition = offset < offset_top &&
1046 md->offset_vector[offset] >= 0;
1047 if (condition) break;
1048 }
1049 else break;
1050 }
1051 }
1052 }
1053 }
1054
1055 /* Choose branch according to the condition */
1056
1057 ecode += condition? 3 : GET(ecode, 1);
1058 }
1059
1060 else if (condcode == OP_DEF) /* DEFINE - always false */
1061 {
1062 condition = FALSE;
1063 ecode += GET(ecode, 1);
1064 }
1065
1066 /* The condition is an assertion. Call match() to evaluate it - setting
1067 the final argument match_condassert causes it to stop at the end of an
1068 assertion. */
1069
1070 else
1071 {
1072 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1073 match_condassert, RM3);
1074 if (rrc == MATCH_MATCH)
1075 {
1076 condition = TRUE;
1077 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1078 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1079 }
1080 else if (rrc != MATCH_NOMATCH &&
1081 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1082 {
1083 RRETURN(rrc); /* Need braces because of following else */
1084 }
1085 else
1086 {
1087 condition = FALSE;
1088 ecode += codelink;
1089 }
1090 }
1091
1092 /* We are now at the branch that is to be obeyed. As there is only one,
1093 we can use tail recursion to avoid using another stack frame, except when
1094 match_cbegroup is required for an unlimited repeat of a possibly empty
1095 group. If the second alternative doesn't exist, we can just plough on. */
1096
1097 if (condition || *ecode == OP_ALT)
1098 {
1099 ecode += 1 + LINK_SIZE;
1100 if (op == OP_SCOND) /* Possibly empty group */
1101 {
1102 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1103 RRETURN(rrc);
1104 }
1105 else /* Group must match something */
1106 {
1107 flags = 0;
1108 goto TAIL_RECURSE;
1109 }
1110 }
1111 else /* Condition false & no alternative */
1112 {
1113 ecode += 1 + LINK_SIZE;
1114 }
1115 break;
1116
1117
1118 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1119 to close any currently open capturing brackets. */
1120
1121 case OP_CLOSE:
1122 number = GET2(ecode, 1);
1123 offset = number << 1;
1124
1125 #ifdef PCRE_DEBUG
1126 printf("end bracket %d at *ACCEPT", number);
1127 printf("\n");
1128 #endif
1129
1130 md->capture_last = number;
1131 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1132 {
1133 md->offset_vector[offset] =
1134 md->offset_vector[md->offset_end - number];
1135 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1136 if (offset_top <= offset) offset_top = offset + 2;
1137 }
1138 ecode += 3;
1139 break;
1140
1141
1142 /* End of the pattern, either real or forced. If we are in a top-level
1143 recursion, we should restore the offsets appropriately and continue from
1144 after the call. */
1145
1146 case OP_ACCEPT:
1147 case OP_END:
1148 if (md->recursive != NULL && md->recursive->group_num == 0)
1149 {
1150 recursion_info *rec = md->recursive;
1151 DPRINTF(("End of pattern in a (?0) recursion\n"));
1152 md->recursive = rec->prevrec;
1153 memmove(md->offset_vector, rec->offset_save,
1154 rec->saved_max * sizeof(int));
1155 offset_top = rec->save_offset_top;
1156 ims = original_ims;
1157 ecode = rec->after_call;
1158 break;
1159 }
1160
1161 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1162 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1163 the subject. In both cases, backtracking will then try other alternatives,
1164 if any. */
1165
1166 if (eptr == mstart &&
1167 (md->notempty ||
1168 (md->notempty_atstart &&
1169 mstart == md->start_subject + md->start_offset)))
1170 MRRETURN(MATCH_NOMATCH);
1171
1172 /* Otherwise, we have a match. */
1173
1174 md->end_match_ptr = eptr; /* Record where we ended */
1175 md->end_offset_top = offset_top; /* and how many extracts were taken */
1176 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1177
1178 /* For some reason, the macros don't work properly if an expression is
1179 given as the argument to MRRETURN when the heap is in use. */
1180
1181 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1182 MRRETURN(rrc);
1183
1184 /* Change option settings */
1185
1186 case OP_OPT:
1187 ims = ecode[1];
1188 ecode += 2;
1189 DPRINTF(("ims set to %02lx\n", ims));
1190 break;
1191
1192 /* Assertion brackets. Check the alternative branches in turn - the
1193 matching won't pass the KET for an assertion. If any one branch matches,
1194 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1195 start of each branch to move the current point backwards, so the code at
1196 this level is identical to the lookahead case. */
1197
1198 case OP_ASSERT:
1199 case OP_ASSERTBACK:
1200 do
1201 {
1202 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1203 RM4);
1204 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1205 {
1206 mstart = md->start_match_ptr; /* In case \K reset it */
1207 break;
1208 }
1209 if (rrc != MATCH_NOMATCH &&
1210 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1211 RRETURN(rrc);
1212 ecode += GET(ecode, 1);
1213 }
1214 while (*ecode == OP_ALT);
1215 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1216
1217 /* If checking an assertion for a condition, return MATCH_MATCH. */
1218
1219 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1220
1221 /* Continue from after the assertion, updating the offsets high water
1222 mark, since extracts may have been taken during the assertion. */
1223
1224 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1225 ecode += 1 + LINK_SIZE;
1226 offset_top = md->end_offset_top;
1227 continue;
1228
1229 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1230 PRUNE, or COMMIT means we must assume failure without checking subsequent
1231 branches. */
1232
1233 case OP_ASSERT_NOT:
1234 case OP_ASSERTBACK_NOT:
1235 do
1236 {
1237 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1238 RM5);
1239 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1240 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1241 {
1242 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1243 break;
1244 }
1245 if (rrc != MATCH_NOMATCH &&
1246 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1247 RRETURN(rrc);
1248 ecode += GET(ecode,1);
1249 }
1250 while (*ecode == OP_ALT);
1251
1252 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1253
1254 ecode += 1 + LINK_SIZE;
1255 continue;
1256
1257 /* Move the subject pointer back. This occurs only at the start of
1258 each branch of a lookbehind assertion. If we are too close to the start to
1259 move back, this match function fails. When working with UTF-8 we move
1260 back a number of characters, not bytes. */
1261
1262 case OP_REVERSE:
1263 #ifdef SUPPORT_UTF8
1264 if (utf8)
1265 {
1266 i = GET(ecode, 1);
1267 while (i-- > 0)
1268 {
1269 eptr--;
1270 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1271 BACKCHAR(eptr);
1272 }
1273 }
1274 else
1275 #endif
1276
1277 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1278
1279 {
1280 eptr -= GET(ecode, 1);
1281 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1282 }
1283
1284 /* Save the earliest consulted character, then skip to next op code */
1285
1286 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1287 ecode += 1 + LINK_SIZE;
1288 break;
1289
1290 /* The callout item calls an external function, if one is provided, passing
1291 details of the match so far. This is mainly for debugging, though the
1292 function is able to force a failure. */
1293
1294 case OP_CALLOUT:
1295 if (pcre_callout != NULL)
1296 {
1297 pcre_callout_block cb;
1298 cb.version = 1; /* Version 1 of the callout block */
1299 cb.callout_number = ecode[1];
1300 cb.offset_vector = md->offset_vector;
1301 cb.subject = (PCRE_SPTR)md->start_subject;
1302 cb.subject_length = (int)(md->end_subject - md->start_subject);
1303 cb.start_match = (int)(mstart - md->start_subject);
1304 cb.current_position = (int)(eptr - md->start_subject);
1305 cb.pattern_position = GET(ecode, 2);
1306 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1307 cb.capture_top = offset_top/2;
1308 cb.capture_last = md->capture_last;
1309 cb.callout_data = md->callout_data;
1310 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1311 if (rrc < 0) RRETURN(rrc);
1312 }
1313 ecode += 2 + 2*LINK_SIZE;
1314 break;
1315
1316 /* Recursion either matches the current regex, or some subexpression. The
1317 offset data is the offset to the starting bracket from the start of the
1318 whole pattern. (This is so that it works from duplicated subpatterns.)
1319
1320 If there are any capturing brackets started but not finished, we have to
1321 save their starting points and reinstate them after the recursion. However,
1322 we don't know how many such there are (offset_top records the completed
1323 total) so we just have to save all the potential data. There may be up to
1324 65535 such values, which is too large to put on the stack, but using malloc
1325 for small numbers seems expensive. As a compromise, the stack is used when
1326 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1327 is used. A problem is what to do if the malloc fails ... there is no way of
1328 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1329 values on the stack, and accept that the rest may be wrong.
1330
1331 There are also other values that have to be saved. We use a chained
1332 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1333 for the original version of this logic. */
1334
1335 case OP_RECURSE:
1336 {
1337 callpat = md->start_code + GET(ecode, 1);
1338 new_recursive.group_num = (callpat == md->start_code)? 0 :
1339 GET2(callpat, 1 + LINK_SIZE);
1340
1341 /* Add to "recursing stack" */
1342
1343 new_recursive.prevrec = md->recursive;
1344 md->recursive = &new_recursive;
1345
1346 /* Find where to continue from afterwards */
1347
1348 ecode += 1 + LINK_SIZE;
1349 new_recursive.after_call = ecode;
1350
1351 /* Now save the offset data. */
1352
1353 new_recursive.saved_max = md->offset_end;
1354 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1355 new_recursive.offset_save = stacksave;
1356 else
1357 {
1358 new_recursive.offset_save =
1359 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1360 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1361 }
1362
1363 memcpy(new_recursive.offset_save, md->offset_vector,
1364 new_recursive.saved_max * sizeof(int));
1365 new_recursive.save_offset_top = offset_top;
1366
1367 /* OK, now we can do the recursion. For each top-level alternative we
1368 restore the offset and recursion data. */
1369
1370 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1371 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1372 do
1373 {
1374 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1375 md, ims, eptrb, flags, RM6);
1376 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1377 {
1378 DPRINTF(("Recursion matched\n"));
1379 md->recursive = new_recursive.prevrec;
1380 if (new_recursive.offset_save != stacksave)
1381 (pcre_free)(new_recursive.offset_save);
1382 MRRETURN(MATCH_MATCH);
1383 }
1384 else if (rrc != MATCH_NOMATCH &&
1385 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1386 {
1387 DPRINTF(("Recursion gave error %d\n", rrc));
1388 if (new_recursive.offset_save != stacksave)
1389 (pcre_free)(new_recursive.offset_save);
1390 RRETURN(rrc);
1391 }
1392
1393 md->recursive = &new_recursive;
1394 memcpy(md->offset_vector, new_recursive.offset_save,
1395 new_recursive.saved_max * sizeof(int));
1396 callpat += GET(callpat, 1);
1397 }
1398 while (*callpat == OP_ALT);
1399
1400 DPRINTF(("Recursion didn't match\n"));
1401 md->recursive = new_recursive.prevrec;
1402 if (new_recursive.offset_save != stacksave)
1403 (pcre_free)(new_recursive.offset_save);
1404 MRRETURN(MATCH_NOMATCH);
1405 }
1406 /* Control never reaches here */
1407
1408 /* "Once" brackets are like assertion brackets except that after a match,
1409 the point in the subject string is not moved back. Thus there can never be
1410 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1411 Check the alternative branches in turn - the matching won't pass the KET
1412 for this kind of subpattern. If any one branch matches, we carry on as at
1413 the end of a normal bracket, leaving the subject pointer, but resetting
1414 the start-of-match value in case it was changed by \K. */
1415
1416 case OP_ONCE:
1417 prev = ecode;
1418 saved_eptr = eptr;
1419
1420 do
1421 {
1422 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1423 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1424 {
1425 mstart = md->start_match_ptr;
1426 break;
1427 }
1428 if (rrc != MATCH_NOMATCH &&
1429 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1430 RRETURN(rrc);
1431 ecode += GET(ecode,1);
1432 }
1433 while (*ecode == OP_ALT);
1434
1435 /* If hit the end of the group (which could be repeated), fail */
1436
1437 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1438
1439 /* Continue as from after the assertion, updating the offsets high water
1440 mark, since extracts may have been taken. */
1441
1442 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1443
1444 offset_top = md->end_offset_top;
1445 eptr = md->end_match_ptr;
1446
1447 /* For a non-repeating ket, just continue at this level. This also
1448 happens for a repeating ket if no characters were matched in the group.
1449 This is the forcible breaking of infinite loops as implemented in Perl
1450 5.005. If there is an options reset, it will get obeyed in the normal
1451 course of events. */
1452
1453 if (*ecode == OP_KET || eptr == saved_eptr)
1454 {
1455 ecode += 1+LINK_SIZE;
1456 break;
1457 }
1458
1459 /* The repeating kets try the rest of the pattern or restart from the
1460 preceding bracket, in the appropriate order. The second "call" of match()
1461 uses tail recursion, to avoid using another stack frame. We need to reset
1462 any options that changed within the bracket before re-running it, so
1463 check the next opcode. */
1464
1465 if (ecode[1+LINK_SIZE] == OP_OPT)
1466 {
1467 ims = (ims & ~PCRE_IMS) | ecode[4];
1468 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1469 }
1470
1471 if (*ecode == OP_KETRMIN)
1472 {
1473 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1474 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1475 ecode = prev;
1476 flags = 0;
1477 goto TAIL_RECURSE;
1478 }
1479 else /* OP_KETRMAX */
1480 {
1481 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1483 ecode += 1 + LINK_SIZE;
1484 flags = 0;
1485 goto TAIL_RECURSE;
1486 }
1487 /* Control never gets here */
1488
1489 /* An alternation is the end of a branch; scan along to find the end of the
1490 bracketed group and go to there. */
1491
1492 case OP_ALT:
1493 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1494 break;
1495
1496 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1497 indicating that it may occur zero times. It may repeat infinitely, or not
1498 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1499 with fixed upper repeat limits are compiled as a number of copies, with the
1500 optional ones preceded by BRAZERO or BRAMINZERO. */
1501
1502 case OP_BRAZERO:
1503 {
1504 next = ecode+1;
1505 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1506 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1507 do next += GET(next,1); while (*next == OP_ALT);
1508 ecode = next + 1 + LINK_SIZE;
1509 }
1510 break;
1511
1512 case OP_BRAMINZERO:
1513 {
1514 next = ecode+1;
1515 do next += GET(next, 1); while (*next == OP_ALT);
1516 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1517 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1518 ecode++;
1519 }
1520 break;
1521
1522 case OP_SKIPZERO:
1523 {
1524 next = ecode+1;
1525 do next += GET(next,1); while (*next == OP_ALT);
1526 ecode = next + 1 + LINK_SIZE;
1527 }
1528 break;
1529
1530 /* End of a group, repeated or non-repeating. */
1531
1532 case OP_KET:
1533 case OP_KETRMIN:
1534 case OP_KETRMAX:
1535 prev = ecode - GET(ecode, 1);
1536
1537 /* If this was a group that remembered the subject start, in order to break
1538 infinite repeats of empty string matches, retrieve the subject start from
1539 the chain. Otherwise, set it NULL. */
1540
1541 if (*prev >= OP_SBRA)
1542 {
1543 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1544 eptrb = eptrb->epb_prev; /* Backup to previous group */
1545 }
1546 else saved_eptr = NULL;
1547
1548 /* If we are at the end of an assertion group or an atomic group, stop
1549 matching and return MATCH_MATCH, but record the current high water mark for
1550 use by positive assertions. We also need to record the match start in case
1551 it was changed by \K. */
1552
1553 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1554 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1555 *prev == OP_ONCE)
1556 {
1557 md->end_match_ptr = eptr; /* For ONCE */
1558 md->end_offset_top = offset_top;
1559 md->start_match_ptr = mstart;
1560 MRRETURN(MATCH_MATCH);
1561 }
1562
1563 /* For capturing groups we have to check the group number back at the start
1564 and if necessary complete handling an extraction by setting the offsets and
1565 bumping the high water mark. Note that whole-pattern recursion is coded as
1566 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1567 when the OP_END is reached. Other recursion is handled here. */
1568
1569 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1570 {
1571 number = GET2(prev, 1+LINK_SIZE);
1572 offset = number << 1;
1573
1574 #ifdef PCRE_DEBUG
1575 printf("end bracket %d", number);
1576 printf("\n");
1577 #endif
1578
1579 md->capture_last = number;
1580 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1581 {
1582 md->offset_vector[offset] =
1583 md->offset_vector[md->offset_end - number];
1584 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1585 if (offset_top <= offset) offset_top = offset + 2;
1586 }
1587
1588 /* Handle a recursively called group. Restore the offsets
1589 appropriately and continue from after the call. */
1590
1591 if (md->recursive != NULL && md->recursive->group_num == number)
1592 {
1593 recursion_info *rec = md->recursive;
1594 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1595 md->recursive = rec->prevrec;
1596 memcpy(md->offset_vector, rec->offset_save,
1597 rec->saved_max * sizeof(int));
1598 offset_top = rec->save_offset_top;
1599 ecode = rec->after_call;
1600 ims = original_ims;
1601 break;
1602 }
1603 }
1604
1605 /* For both capturing and non-capturing groups, reset the value of the ims
1606 flags, in case they got changed during the group. */
1607
1608 ims = original_ims;
1609 DPRINTF(("ims reset to %02lx\n", ims));
1610
1611 /* For a non-repeating ket, just continue at this level. This also
1612 happens for a repeating ket if no characters were matched in the group.
1613 This is the forcible breaking of infinite loops as implemented in Perl
1614 5.005. If there is an options reset, it will get obeyed in the normal
1615 course of events. */
1616
1617 if (*ecode == OP_KET || eptr == saved_eptr)
1618 {
1619 ecode += 1 + LINK_SIZE;
1620 break;
1621 }
1622
1623 /* The repeating kets try the rest of the pattern or restart from the
1624 preceding bracket, in the appropriate order. In the second case, we can use
1625 tail recursion to avoid using another stack frame, unless we have an
1626 unlimited repeat of a group that can match an empty string. */
1627
1628 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1629
1630 if (*ecode == OP_KETRMIN)
1631 {
1632 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1634 if (flags != 0) /* Could match an empty string */
1635 {
1636 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1637 RRETURN(rrc);
1638 }
1639 ecode = prev;
1640 goto TAIL_RECURSE;
1641 }
1642 else /* OP_KETRMAX */
1643 {
1644 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1645 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1646 ecode += 1 + LINK_SIZE;
1647 flags = 0;
1648 goto TAIL_RECURSE;
1649 }
1650 /* Control never gets here */
1651
1652 /* Start of subject unless notbol, or after internal newline if multiline */
1653
1654 case OP_CIRC:
1655 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1656 if ((ims & PCRE_MULTILINE) != 0)
1657 {
1658 if (eptr != md->start_subject &&
1659 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1660 MRRETURN(MATCH_NOMATCH);
1661 ecode++;
1662 break;
1663 }
1664 /* ... else fall through */
1665
1666 /* Start of subject assertion */
1667
1668 case OP_SOD:
1669 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1670 ecode++;
1671 break;
1672
1673 /* Start of match assertion */
1674
1675 case OP_SOM:
1676 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1677 ecode++;
1678 break;
1679
1680 /* Reset the start of match point */
1681
1682 case OP_SET_SOM:
1683 mstart = eptr;
1684 ecode++;
1685 break;
1686
1687 /* Assert before internal newline if multiline, or before a terminating
1688 newline unless endonly is set, else end of subject unless noteol is set. */
1689
1690 case OP_DOLL:
1691 if ((ims & PCRE_MULTILINE) != 0)
1692 {
1693 if (eptr < md->end_subject)
1694 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1695 else
1696 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1697 ecode++;
1698 break;
1699 }
1700 else
1701 {
1702 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1703 if (!md->endonly)
1704 {
1705 if (eptr != md->end_subject &&
1706 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1707 MRRETURN(MATCH_NOMATCH);
1708 ecode++;
1709 break;
1710 }
1711 }
1712 /* ... else fall through for endonly */
1713
1714 /* End of subject assertion (\z) */
1715
1716 case OP_EOD:
1717 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1718 ecode++;
1719 break;
1720
1721 /* End of subject or ending \n assertion (\Z) */
1722
1723 case OP_EODN:
1724 if (eptr != md->end_subject &&
1725 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1726 MRRETURN(MATCH_NOMATCH);
1727 ecode++;
1728 break;
1729
1730 /* Word boundary assertions */
1731
1732 case OP_NOT_WORD_BOUNDARY:
1733 case OP_WORD_BOUNDARY:
1734 {
1735
1736 /* Find out if the previous and current characters are "word" characters.
1737 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1738 be "non-word" characters. Remember the earliest consulted character for
1739 partial matching. */
1740
1741 #ifdef SUPPORT_UTF8
1742 if (utf8)
1743 {
1744 /* Get status of previous character */
1745
1746 if (eptr == md->start_subject) prev_is_word = FALSE; else
1747 {
1748 USPTR lastptr = eptr - 1;
1749 while((*lastptr & 0xc0) == 0x80) lastptr--;
1750 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1751 GETCHAR(c, lastptr);
1752 #ifdef SUPPORT_UCP
1753 if (md->use_ucp)
1754 {
1755 if (c == '_') prev_is_word = TRUE; else
1756 {
1757 int cat = UCD_CATEGORY(c);
1758 prev_is_word = (cat == ucp_L || cat == ucp_N);
1759 }
1760 }
1761 else
1762 #endif
1763 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1764 }
1765
1766 /* Get status of next character */
1767
1768 if (eptr >= md->end_subject)
1769 {
1770 SCHECK_PARTIAL();
1771 cur_is_word = FALSE;
1772 }
1773 else
1774 {
1775 GETCHAR(c, eptr);
1776 #ifdef SUPPORT_UCP
1777 if (md->use_ucp)
1778 {
1779 if (c == '_') cur_is_word = TRUE; else
1780 {
1781 int cat = UCD_CATEGORY(c);
1782 cur_is_word = (cat == ucp_L || cat == ucp_N);
1783 }
1784 }
1785 else
1786 #endif
1787 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1788 }
1789 }
1790 else
1791 #endif
1792
1793 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1794 consistency with the behaviour of \w we do use it in this case. */
1795
1796 {
1797 /* Get status of previous character */
1798
1799 if (eptr == md->start_subject) prev_is_word = FALSE; else
1800 {
1801 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1802 #ifdef SUPPORT_UCP
1803 if (md->use_ucp)
1804 {
1805 c = eptr[-1];
1806 if (c == '_') prev_is_word = TRUE; else
1807 {
1808 int cat = UCD_CATEGORY(c);
1809 prev_is_word = (cat == ucp_L || cat == ucp_N);
1810 }
1811 }
1812 else
1813 #endif
1814 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1815 }
1816
1817 /* Get status of next character */
1818
1819 if (eptr >= md->end_subject)
1820 {
1821 SCHECK_PARTIAL();
1822 cur_is_word = FALSE;
1823 }
1824 else
1825 #ifdef SUPPORT_UCP
1826 if (md->use_ucp)
1827 {
1828 c = *eptr;
1829 if (c == '_') cur_is_word = TRUE; else
1830 {
1831 int cat = UCD_CATEGORY(c);
1832 cur_is_word = (cat == ucp_L || cat == ucp_N);
1833 }
1834 }
1835 else
1836 #endif
1837 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1838 }
1839
1840 /* Now see if the situation is what we want */
1841
1842 if ((*ecode++ == OP_WORD_BOUNDARY)?
1843 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1844 MRRETURN(MATCH_NOMATCH);
1845 }
1846 break;
1847
1848 /* Match a single character type; inline for speed */
1849
1850 case OP_ANY:
1851 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1852 /* Fall through */
1853
1854 case OP_ALLANY:
1855 if (eptr++ >= md->end_subject)
1856 {
1857 SCHECK_PARTIAL();
1858 MRRETURN(MATCH_NOMATCH);
1859 }
1860 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1861 ecode++;
1862 break;
1863
1864 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1865 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1866
1867 case OP_ANYBYTE:
1868 if (eptr++ >= md->end_subject)
1869 {
1870 SCHECK_PARTIAL();
1871 MRRETURN(MATCH_NOMATCH);
1872 }
1873 ecode++;
1874 break;
1875
1876 case OP_NOT_DIGIT:
1877 if (eptr >= md->end_subject)
1878 {
1879 SCHECK_PARTIAL();
1880 MRRETURN(MATCH_NOMATCH);
1881 }
1882 GETCHARINCTEST(c, eptr);
1883 if (
1884 #ifdef SUPPORT_UTF8
1885 c < 256 &&
1886 #endif
1887 (md->ctypes[c] & ctype_digit) != 0
1888 )
1889 MRRETURN(MATCH_NOMATCH);
1890 ecode++;
1891 break;
1892
1893 case OP_DIGIT:
1894 if (eptr >= md->end_subject)
1895 {
1896 SCHECK_PARTIAL();
1897 MRRETURN(MATCH_NOMATCH);
1898 }
1899 GETCHARINCTEST(c, eptr);
1900 if (
1901 #ifdef SUPPORT_UTF8
1902 c >= 256 ||
1903 #endif
1904 (md->ctypes[c] & ctype_digit) == 0
1905 )
1906 MRRETURN(MATCH_NOMATCH);
1907 ecode++;
1908 break;
1909
1910 case OP_NOT_WHITESPACE:
1911 if (eptr >= md->end_subject)
1912 {
1913 SCHECK_PARTIAL();
1914 MRRETURN(MATCH_NOMATCH);
1915 }
1916 GETCHARINCTEST(c, eptr);
1917 if (
1918 #ifdef SUPPORT_UTF8
1919 c < 256 &&
1920 #endif
1921 (md->ctypes[c] & ctype_space) != 0
1922 )
1923 MRRETURN(MATCH_NOMATCH);
1924 ecode++;
1925 break;
1926
1927 case OP_WHITESPACE:
1928 if (eptr >= md->end_subject)
1929 {
1930 SCHECK_PARTIAL();
1931 MRRETURN(MATCH_NOMATCH);
1932 }
1933 GETCHARINCTEST(c, eptr);
1934 if (
1935 #ifdef SUPPORT_UTF8
1936 c >= 256 ||
1937 #endif
1938 (md->ctypes[c] & ctype_space) == 0
1939 )
1940 MRRETURN(MATCH_NOMATCH);
1941 ecode++;
1942 break;
1943
1944 case OP_NOT_WORDCHAR:
1945 if (eptr >= md->end_subject)
1946 {
1947 SCHECK_PARTIAL();
1948 MRRETURN(MATCH_NOMATCH);
1949 }
1950 GETCHARINCTEST(c, eptr);
1951 if (
1952 #ifdef SUPPORT_UTF8
1953 c < 256 &&
1954 #endif
1955 (md->ctypes[c] & ctype_word) != 0
1956 )
1957 MRRETURN(MATCH_NOMATCH);
1958 ecode++;
1959 break;
1960
1961 case OP_WORDCHAR:
1962 if (eptr >= md->end_subject)
1963 {
1964 SCHECK_PARTIAL();
1965 MRRETURN(MATCH_NOMATCH);
1966 }
1967 GETCHARINCTEST(c, eptr);
1968 if (
1969 #ifdef SUPPORT_UTF8
1970 c >= 256 ||
1971 #endif
1972 (md->ctypes[c] & ctype_word) == 0
1973 )
1974 MRRETURN(MATCH_NOMATCH);
1975 ecode++;
1976 break;
1977
1978 case OP_ANYNL:
1979 if (eptr >= md->end_subject)
1980 {
1981 SCHECK_PARTIAL();
1982 MRRETURN(MATCH_NOMATCH);
1983 }
1984 GETCHARINCTEST(c, eptr);
1985 switch(c)
1986 {
1987 default: MRRETURN(MATCH_NOMATCH);
1988 case 0x000d:
1989 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1990 break;
1991
1992 case 0x000a:
1993 break;
1994
1995 case 0x000b:
1996 case 0x000c:
1997 case 0x0085:
1998 case 0x2028:
1999 case 0x2029:
2000 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2001 break;
2002 }
2003 ecode++;
2004 break;
2005
2006 case OP_NOT_HSPACE:
2007 if (eptr >= md->end_subject)
2008 {
2009 SCHECK_PARTIAL();
2010 MRRETURN(MATCH_NOMATCH);
2011 }
2012 GETCHARINCTEST(c, eptr);
2013 switch(c)
2014 {
2015 default: break;
2016 case 0x09: /* HT */
2017 case 0x20: /* SPACE */
2018 case 0xa0: /* NBSP */
2019 case 0x1680: /* OGHAM SPACE MARK */
2020 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2021 case 0x2000: /* EN QUAD */
2022 case 0x2001: /* EM QUAD */
2023 case 0x2002: /* EN SPACE */
2024 case 0x2003: /* EM SPACE */
2025 case 0x2004: /* THREE-PER-EM SPACE */
2026 case 0x2005: /* FOUR-PER-EM SPACE */
2027 case 0x2006: /* SIX-PER-EM SPACE */
2028 case 0x2007: /* FIGURE SPACE */
2029 case 0x2008: /* PUNCTUATION SPACE */
2030 case 0x2009: /* THIN SPACE */
2031 case 0x200A: /* HAIR SPACE */
2032 case 0x202f: /* NARROW NO-BREAK SPACE */
2033 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2034 case 0x3000: /* IDEOGRAPHIC SPACE */
2035 MRRETURN(MATCH_NOMATCH);
2036 }
2037 ecode++;
2038 break;
2039
2040 case OP_HSPACE:
2041 if (eptr >= md->end_subject)
2042 {
2043 SCHECK_PARTIAL();
2044 MRRETURN(MATCH_NOMATCH);
2045 }
2046 GETCHARINCTEST(c, eptr);
2047 switch(c)
2048 {
2049 default: MRRETURN(MATCH_NOMATCH);
2050 case 0x09: /* HT */
2051 case 0x20: /* SPACE */
2052 case 0xa0: /* NBSP */
2053 case 0x1680: /* OGHAM SPACE MARK */
2054 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2055 case 0x2000: /* EN QUAD */
2056 case 0x2001: /* EM QUAD */
2057 case 0x2002: /* EN SPACE */
2058 case 0x2003: /* EM SPACE */
2059 case 0x2004: /* THREE-PER-EM SPACE */
2060 case 0x2005: /* FOUR-PER-EM SPACE */
2061 case 0x2006: /* SIX-PER-EM SPACE */
2062 case 0x2007: /* FIGURE SPACE */
2063 case 0x2008: /* PUNCTUATION SPACE */
2064 case 0x2009: /* THIN SPACE */
2065 case 0x200A: /* HAIR SPACE */
2066 case 0x202f: /* NARROW NO-BREAK SPACE */
2067 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2068 case 0x3000: /* IDEOGRAPHIC SPACE */
2069 break;
2070 }
2071 ecode++;
2072 break;
2073
2074 case OP_NOT_VSPACE:
2075 if (eptr >= md->end_subject)
2076 {
2077 SCHECK_PARTIAL();
2078 MRRETURN(MATCH_NOMATCH);
2079 }
2080 GETCHARINCTEST(c, eptr);
2081 switch(c)
2082 {
2083 default: break;
2084 case 0x0a: /* LF */
2085 case 0x0b: /* VT */
2086 case 0x0c: /* FF */
2087 case 0x0d: /* CR */
2088 case 0x85: /* NEL */
2089 case 0x2028: /* LINE SEPARATOR */
2090 case 0x2029: /* PARAGRAPH SEPARATOR */
2091 MRRETURN(MATCH_NOMATCH);
2092 }
2093 ecode++;
2094 break;
2095
2096 case OP_VSPACE:
2097 if (eptr >= md->end_subject)
2098 {
2099 SCHECK_PARTIAL();
2100 MRRETURN(MATCH_NOMATCH);
2101 }
2102 GETCHARINCTEST(c, eptr);
2103 switch(c)
2104 {
2105 default: MRRETURN(MATCH_NOMATCH);
2106 case 0x0a: /* LF */
2107 case 0x0b: /* VT */
2108 case 0x0c: /* FF */
2109 case 0x0d: /* CR */
2110 case 0x85: /* NEL */
2111 case 0x2028: /* LINE SEPARATOR */
2112 case 0x2029: /* PARAGRAPH SEPARATOR */
2113 break;
2114 }
2115 ecode++;
2116 break;
2117
2118 #ifdef SUPPORT_UCP
2119 /* Check the next character by Unicode property. We will get here only
2120 if the support is in the binary; otherwise a compile-time error occurs. */
2121
2122 case OP_PROP:
2123 case OP_NOTPROP:
2124 if (eptr >= md->end_subject)
2125 {
2126 SCHECK_PARTIAL();
2127 MRRETURN(MATCH_NOMATCH);
2128 }
2129 GETCHARINCTEST(c, eptr);
2130 {
2131 const ucd_record *prop = GET_UCD(c);
2132
2133 switch(ecode[1])
2134 {
2135 case PT_ANY:
2136 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2137 break;
2138
2139 case PT_LAMP:
2140 if ((prop->chartype == ucp_Lu ||
2141 prop->chartype == ucp_Ll ||
2142 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2143 MRRETURN(MATCH_NOMATCH);
2144 break;
2145
2146 case PT_GC:
2147 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2148 MRRETURN(MATCH_NOMATCH);
2149 break;
2150
2151 case PT_PC:
2152 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2153 MRRETURN(MATCH_NOMATCH);
2154 break;
2155
2156 case PT_SC:
2157 if ((ecode[2] != prop->script) == (op == OP_PROP))
2158 MRRETURN(MATCH_NOMATCH);
2159 break;
2160
2161 /* These are specials */
2162
2163 case PT_ALNUM:
2164 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2165 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2166 MRRETURN(MATCH_NOMATCH);
2167 break;
2168
2169 case PT_SPACE: /* Perl space */
2170 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2171 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2172 == (op == OP_NOTPROP))
2173 MRRETURN(MATCH_NOMATCH);
2174 break;
2175
2176 case PT_PXSPACE: /* POSIX space */
2177 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2178 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2179 c == CHAR_FF || c == CHAR_CR)
2180 == (op == OP_NOTPROP))
2181 MRRETURN(MATCH_NOMATCH);
2182 break;
2183
2184 case PT_WORD:
2185 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2186 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2187 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2188 MRRETURN(MATCH_NOMATCH);
2189 break;
2190
2191 /* This should never occur */
2192
2193 default:
2194 RRETURN(PCRE_ERROR_INTERNAL);
2195 }
2196
2197 ecode += 3;
2198 }
2199 break;
2200
2201 /* Match an extended Unicode sequence. We will get here only if the support
2202 is in the binary; otherwise a compile-time error occurs. */
2203
2204 case OP_EXTUNI:
2205 if (eptr >= md->end_subject)
2206 {
2207 SCHECK_PARTIAL();
2208 MRRETURN(MATCH_NOMATCH);
2209 }
2210 GETCHARINCTEST(c, eptr);
2211 {
2212 int category = UCD_CATEGORY(c);
2213 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2214 while (eptr < md->end_subject)
2215 {
2216 int len = 1;
2217 if (!utf8) c = *eptr; else
2218 {
2219 GETCHARLEN(c, eptr, len);
2220 }
2221 category = UCD_CATEGORY(c);
2222 if (category != ucp_M) break;
2223 eptr += len;
2224 }
2225 }
2226 ecode++;
2227 break;
2228 #endif
2229
2230
2231 /* Match a back reference, possibly repeatedly. Look past the end of the
2232 item to see if there is repeat information following. The code is similar
2233 to that for character classes, but repeated for efficiency. Then obey
2234 similar code to character type repeats - written out again for speed.
2235 However, if the referenced string is the empty string, always treat
2236 it as matched, any number of times (otherwise there could be infinite
2237 loops). */
2238
2239 case OP_REF:
2240 {
2241 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2242 ecode += 3;
2243
2244 /* If the reference is unset, there are two possibilities:
2245
2246 (a) In the default, Perl-compatible state, set the length to be longer
2247 than the amount of subject left; this ensures that every attempt at a
2248 match fails. We can't just fail here, because of the possibility of
2249 quantifiers with zero minima.
2250
2251 (b) If the JavaScript compatibility flag is set, set the length to zero
2252 so that the back reference matches an empty string.
2253
2254 Otherwise, set the length to the length of what was matched by the
2255 referenced subpattern. */
2256
2257 if (offset >= offset_top || md->offset_vector[offset] < 0)
2258 length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
2259 else
2260 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2261
2262 /* Set up for repetition, or handle the non-repeated case */
2263
2264 switch (*ecode)
2265 {
2266 case OP_CRSTAR:
2267 case OP_CRMINSTAR:
2268 case OP_CRPLUS:
2269 case OP_CRMINPLUS:
2270 case OP_CRQUERY:
2271 case OP_CRMINQUERY:
2272 c = *ecode++ - OP_CRSTAR;
2273 minimize = (c & 1) != 0;
2274 min = rep_min[c]; /* Pick up values from tables; */
2275 max = rep_max[c]; /* zero for max => infinity */
2276 if (max == 0) max = INT_MAX;
2277 break;
2278
2279 case OP_CRRANGE:
2280 case OP_CRMINRANGE:
2281 minimize = (*ecode == OP_CRMINRANGE);
2282 min = GET2(ecode, 1);
2283 max = GET2(ecode, 3);
2284 if (max == 0) max = INT_MAX;
2285 ecode += 5;
2286 break;
2287
2288 default: /* No repeat follows */
2289 if (!match_ref(offset, eptr, length, md, ims))
2290 {
2291 CHECK_PARTIAL();
2292 MRRETURN(MATCH_NOMATCH);
2293 }
2294 eptr += length;
2295 continue; /* With the main loop */
2296 }
2297
2298 /* If the length of the reference is zero, just continue with the
2299 main loop. */
2300
2301 if (length == 0) continue;
2302
2303 /* First, ensure the minimum number of matches are present. We get back
2304 the length of the reference string explicitly rather than passing the
2305 address of eptr, so that eptr can be a register variable. */
2306
2307 for (i = 1; i <= min; i++)
2308 {
2309 if (!match_ref(offset, eptr, length, md, ims))
2310 {
2311 CHECK_PARTIAL();
2312 MRRETURN(MATCH_NOMATCH);
2313 }
2314 eptr += length;
2315 }
2316
2317 /* If min = max, continue at the same level without recursion.
2318 They are not both allowed to be zero. */
2319
2320 if (min == max) continue;
2321
2322 /* If minimizing, keep trying and advancing the pointer */
2323
2324 if (minimize)
2325 {
2326 for (fi = min;; fi++)
2327 {
2328 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2329 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2330 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2331 if (!match_ref(offset, eptr, length, md, ims))
2332 {
2333 CHECK_PARTIAL();
2334 MRRETURN(MATCH_NOMATCH);
2335 }
2336 eptr += length;
2337 }
2338 /* Control never gets here */
2339 }
2340
2341 /* If maximizing, find the longest string and work backwards */
2342
2343 else
2344 {
2345 pp = eptr;
2346 for (i = min; i < max; i++)
2347 {
2348 if (!match_ref(offset, eptr, length, md, ims))
2349 {
2350 CHECK_PARTIAL();
2351 break;
2352 }
2353 eptr += length;
2354 }
2355 while (eptr >= pp)
2356 {
2357 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2358 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2359 eptr -= length;
2360 }
2361 MRRETURN(MATCH_NOMATCH);
2362 }
2363 }
2364 /* Control never gets here */
2365
2366 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2367 used when all the characters in the class have values in the range 0-255,
2368 and either the matching is caseful, or the characters are in the range
2369 0-127 when UTF-8 processing is enabled. The only difference between
2370 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2371 encountered.
2372
2373 First, look past the end of the item to see if there is repeat information
2374 following. Then obey similar code to character type repeats - written out
2375 again for speed. */
2376
2377 case OP_NCLASS:
2378 case OP_CLASS:
2379 {
2380 data = ecode + 1; /* Save for matching */
2381 ecode += 33; /* Advance past the item */
2382
2383 switch (*ecode)
2384 {
2385 case OP_CRSTAR:
2386 case OP_CRMINSTAR:
2387 case OP_CRPLUS:
2388 case OP_CRMINPLUS:
2389 case OP_CRQUERY:
2390 case OP_CRMINQUERY:
2391 c = *ecode++ - OP_CRSTAR;
2392 minimize = (c & 1) != 0;
2393 min = rep_min[c]; /* Pick up values from tables; */
2394 max = rep_max[c]; /* zero for max => infinity */
2395 if (max == 0) max = INT_MAX;
2396 break;
2397
2398 case OP_CRRANGE:
2399 case OP_CRMINRANGE:
2400 minimize = (*ecode == OP_CRMINRANGE);
2401 min = GET2(ecode, 1);
2402 max = GET2(ecode, 3);
2403 if (max == 0) max = INT_MAX;
2404 ecode += 5;
2405 break;
2406
2407 default: /* No repeat follows */
2408 min = max = 1;
2409 break;
2410 }
2411
2412 /* First, ensure the minimum number of matches are present. */
2413
2414 #ifdef SUPPORT_UTF8
2415 /* UTF-8 mode */
2416 if (utf8)
2417 {
2418 for (i = 1; i <= min; i++)
2419 {
2420 if (eptr >= md->end_subject)
2421 {
2422 SCHECK_PARTIAL();
2423 MRRETURN(MATCH_NOMATCH);
2424 }
2425 GETCHARINC(c, eptr);
2426 if (c > 255)
2427 {
2428 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2429 }
2430 else
2431 {
2432 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2433 }
2434 }
2435 }
2436 else
2437 #endif
2438 /* Not UTF-8 mode */
2439 {
2440 for (i = 1; i <= min; i++)
2441 {
2442 if (eptr >= md->end_subject)
2443 {
2444 SCHECK_PARTIAL();
2445 MRRETURN(MATCH_NOMATCH);
2446 }
2447 c = *eptr++;
2448 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2449 }
2450 }
2451
2452 /* If max == min we can continue with the main loop without the
2453 need to recurse. */
2454
2455 if (min == max) continue;
2456
2457 /* If minimizing, keep testing the rest of the expression and advancing
2458 the pointer while it matches the class. */
2459
2460 if (minimize)
2461 {
2462 #ifdef SUPPORT_UTF8
2463 /* UTF-8 mode */
2464 if (utf8)
2465 {
2466 for (fi = min;; fi++)
2467 {
2468 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2469 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2470 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2471 if (eptr >= md->end_subject)
2472 {
2473 SCHECK_PARTIAL();
2474 MRRETURN(MATCH_NOMATCH);
2475 }
2476 GETCHARINC(c, eptr);
2477 if (c > 255)
2478 {
2479 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2480 }
2481 else
2482 {
2483 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2484 }
2485 }
2486 }
2487 else
2488 #endif
2489 /* Not UTF-8 mode */
2490 {
2491 for (fi = min;; fi++)
2492 {
2493 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2495 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2496 if (eptr >= md->end_subject)
2497 {
2498 SCHECK_PARTIAL();
2499 MRRETURN(MATCH_NOMATCH);
2500 }
2501 c = *eptr++;
2502 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2503 }
2504 }
2505 /* Control never gets here */
2506 }
2507
2508 /* If maximizing, find the longest possible run, then work backwards. */
2509
2510 else
2511 {
2512 pp = eptr;
2513
2514 #ifdef SUPPORT_UTF8
2515 /* UTF-8 mode */
2516 if (utf8)
2517 {
2518 for (i = min; i < max; i++)
2519 {
2520 int len = 1;
2521 if (eptr >= md->end_subject)
2522 {
2523 SCHECK_PARTIAL();
2524 break;
2525 }
2526 GETCHARLEN(c, eptr, len);
2527 if (c > 255)
2528 {
2529 if (op == OP_CLASS) break;
2530 }
2531 else
2532 {
2533 if ((data[c/8] & (1 << (c&7))) == 0) break;
2534 }
2535 eptr += len;
2536 }
2537 for (;;)
2538 {
2539 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2540 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2541 if (eptr-- == pp) break; /* Stop if tried at original pos */
2542 BACKCHAR(eptr);
2543 }
2544 }
2545 else
2546 #endif
2547 /* Not UTF-8 mode */
2548 {
2549 for (i = min; i < max; i++)
2550 {
2551 if (eptr >= md->end_subject)
2552 {
2553 SCHECK_PARTIAL();
2554 break;
2555 }
2556 c = *eptr;
2557 if ((data[c/8] & (1 << (c&7))) == 0) break;
2558 eptr++;
2559 }
2560 while (eptr >= pp)
2561 {
2562 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2563 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2564 eptr--;
2565 }
2566 }
2567
2568 MRRETURN(MATCH_NOMATCH);
2569 }
2570 }
2571 /* Control never gets here */
2572
2573
2574 /* Match an extended character class. This opcode is encountered only
2575 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2576 mode, because Unicode properties are supported in non-UTF-8 mode. */
2577
2578 #ifdef SUPPORT_UTF8
2579 case OP_XCLASS:
2580 {
2581 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2582 ecode += GET(ecode, 1); /* Advance past the item */
2583
2584 switch (*ecode)
2585 {
2586 case OP_CRSTAR:
2587 case OP_CRMINSTAR:
2588 case OP_CRPLUS:
2589 case OP_CRMINPLUS:
2590 case OP_CRQUERY:
2591 case OP_CRMINQUERY:
2592 c = *ecode++ - OP_CRSTAR;
2593 minimize = (c & 1) != 0;
2594 min = rep_min[c]; /* Pick up values from tables; */
2595 max = rep_max[c]; /* zero for max => infinity */
2596 if (max == 0) max = INT_MAX;
2597 break;
2598
2599 case OP_CRRANGE:
2600 case OP_CRMINRANGE:
2601 minimize = (*ecode == OP_CRMINRANGE);
2602 min = GET2(ecode, 1);
2603 max = GET2(ecode, 3);
2604 if (max == 0) max = INT_MAX;
2605 ecode += 5;
2606 break;
2607
2608 default: /* No repeat follows */
2609 min = max = 1;
2610 break;
2611 }
2612
2613 /* First, ensure the minimum number of matches are present. */
2614
2615 for (i = 1; i <= min; i++)
2616 {
2617 if (eptr >= md->end_subject)
2618 {
2619 SCHECK_PARTIAL();
2620 MRRETURN(MATCH_NOMATCH);
2621 }
2622 GETCHARINCTEST(c, eptr);
2623 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2624 }
2625
2626 /* If max == min we can continue with the main loop without the
2627 need to recurse. */
2628
2629 if (min == max) continue;
2630
2631 /* If minimizing, keep testing the rest of the expression and advancing
2632 the pointer while it matches the class. */
2633
2634 if (minimize)
2635 {
2636 for (fi = min;; fi++)
2637 {
2638 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2640 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2641 if (eptr >= md->end_subject)
2642 {
2643 SCHECK_PARTIAL();
2644 MRRETURN(MATCH_NOMATCH);
2645 }
2646 GETCHARINCTEST(c, eptr);
2647 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2648 }
2649 /* Control never gets here */
2650 }
2651
2652 /* If maximizing, find the longest possible run, then work backwards. */
2653
2654 else
2655 {
2656 pp = eptr;
2657 for (i = min; i < max; i++)
2658 {
2659 int len = 1;
2660 if (eptr >= md->end_subject)
2661 {
2662 SCHECK_PARTIAL();
2663 break;
2664 }
2665 GETCHARLENTEST(c, eptr, len);
2666 if (!_pcre_xclass(c, data)) break;
2667 eptr += len;
2668 }
2669 for(;;)
2670 {
2671 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2672 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2673 if (eptr-- == pp) break; /* Stop if tried at original pos */
2674 if (utf8) BACKCHAR(eptr);
2675 }
2676 MRRETURN(MATCH_NOMATCH);
2677 }
2678
2679 /* Control never gets here */
2680 }
2681 #endif /* End of XCLASS */
2682
2683 /* Match a single character, casefully */
2684
2685 case OP_CHAR:
2686 #ifdef SUPPORT_UTF8
2687 if (utf8)
2688 {
2689 length = 1;
2690 ecode++;
2691 GETCHARLEN(fc, ecode, length);
2692 if (length > md->end_subject - eptr)
2693 {
2694 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2695 MRRETURN(MATCH_NOMATCH);
2696 }
2697 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2698 }
2699 else
2700 #endif
2701
2702 /* Non-UTF-8 mode */
2703 {
2704 if (md->end_subject - eptr < 1)
2705 {
2706 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2707 MRRETURN(MATCH_NOMATCH);
2708 }
2709 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2710 ecode += 2;
2711 }
2712 break;
2713
2714 /* Match a single character, caselessly */
2715
2716 case OP_CHARNC:
2717 #ifdef SUPPORT_UTF8
2718 if (utf8)
2719 {
2720 length = 1;
2721 ecode++;
2722 GETCHARLEN(fc, ecode, length);
2723
2724 if (length > md->end_subject - eptr)
2725 {
2726 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2727 MRRETURN(MATCH_NOMATCH);
2728 }
2729
2730 /* If the pattern character's value is < 128, we have only one byte, and
2731 can use the fast lookup table. */
2732
2733 if (fc < 128)
2734 {
2735 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2736 }
2737
2738 /* Otherwise we must pick up the subject character */
2739
2740 else
2741 {
2742 unsigned int dc;
2743 GETCHARINC(dc, eptr);
2744 ecode += length;
2745
2746 /* If we have Unicode property support, we can use it to test the other
2747 case of the character, if there is one. */
2748
2749 if (fc != dc)
2750 {
2751 #ifdef SUPPORT_UCP
2752 if (dc != UCD_OTHERCASE(fc))
2753 #endif
2754 MRRETURN(MATCH_NOMATCH);
2755 }
2756 }
2757 }
2758 else
2759 #endif /* SUPPORT_UTF8 */
2760
2761 /* Non-UTF-8 mode */
2762 {
2763 if (md->end_subject - eptr < 1)
2764 {
2765 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2766 MRRETURN(MATCH_NOMATCH);
2767 }
2768 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2769 ecode += 2;
2770 }
2771 break;
2772
2773 /* Match a single character repeatedly. */
2774
2775 case OP_EXACT:
2776 min = max = GET2(ecode, 1);
2777 ecode += 3;
2778 goto REPEATCHAR;
2779
2780 case OP_POSUPTO:
2781 possessive = TRUE;
2782 /* Fall through */
2783
2784 case OP_UPTO:
2785 case OP_MINUPTO:
2786 min = 0;
2787 max = GET2(ecode, 1);
2788 minimize = *ecode == OP_MINUPTO;
2789 ecode += 3;
2790 goto REPEATCHAR;
2791
2792 case OP_POSSTAR:
2793 possessive = TRUE;
2794 min = 0;
2795 max = INT_MAX;
2796 ecode++;
2797 goto REPEATCHAR;
2798
2799 case OP_POSPLUS:
2800 possessive = TRUE;
2801 min = 1;
2802 max = INT_MAX;
2803 ecode++;
2804 goto REPEATCHAR;
2805
2806 case OP_POSQUERY:
2807 possessive = TRUE;
2808 min = 0;
2809 max = 1;
2810 ecode++;
2811 goto REPEATCHAR;
2812
2813 case OP_STAR:
2814 case OP_MINSTAR:
2815 case OP_PLUS:
2816 case OP_MINPLUS:
2817 case OP_QUERY:
2818 case OP_MINQUERY:
2819 c = *ecode++ - OP_STAR;
2820 minimize = (c & 1) != 0;
2821
2822 min = rep_min[c]; /* Pick up values from tables; */
2823 max = rep_max[c]; /* zero for max => infinity */
2824 if (max == 0) max = INT_MAX;
2825
2826 /* Common code for all repeated single-character matches. */
2827
2828 REPEATCHAR:
2829 #ifdef SUPPORT_UTF8
2830 if (utf8)
2831 {
2832 length = 1;
2833 charptr = ecode;
2834 GETCHARLEN(fc, ecode, length);
2835 ecode += length;
2836
2837 /* Handle multibyte character matching specially here. There is
2838 support for caseless matching if UCP support is present. */
2839
2840 if (length > 1)
2841 {
2842 #ifdef SUPPORT_UCP
2843 unsigned int othercase;
2844 if ((ims & PCRE_CASELESS) != 0 &&
2845 (othercase = UCD_OTHERCASE(fc)) != fc)
2846 oclength = _pcre_ord2utf8(othercase, occhars);
2847 else oclength = 0;
2848 #endif /* SUPPORT_UCP */
2849
2850 for (i = 1; i <= min; i++)
2851 {
2852 if (eptr <= md->end_subject - length &&
2853 memcmp(eptr, charptr, length) == 0) eptr += length;
2854 #ifdef SUPPORT_UCP
2855 else if (oclength > 0 &&
2856 eptr <= md->end_subject - oclength &&
2857 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2858 #endif /* SUPPORT_UCP */
2859 else
2860 {
2861 CHECK_PARTIAL();
2862 MRRETURN(MATCH_NOMATCH);
2863 }
2864 }
2865
2866 if (min == max) continue;
2867
2868 if (minimize)
2869 {
2870 for (fi = min;; fi++)
2871 {
2872 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2874 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2875 if (eptr <= md->end_subject - length &&
2876 memcmp(eptr, charptr, length) == 0) eptr += length;
2877 #ifdef SUPPORT_UCP
2878 else if (oclength > 0 &&
2879 eptr <= md->end_subject - oclength &&
2880 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2881 #endif /* SUPPORT_UCP */
2882 else
2883 {
2884 CHECK_PARTIAL();
2885 MRRETURN(MATCH_NOMATCH);
2886 }
2887 }
2888 /* Control never gets here */
2889 }
2890
2891 else /* Maximize */
2892 {
2893 pp = eptr;
2894 for (i = min; i < max; i++)
2895 {
2896 if (eptr <= md->end_subject - length &&
2897 memcmp(eptr, charptr, length) == 0) eptr += length;
2898 #ifdef SUPPORT_UCP
2899 else if (oclength > 0 &&
2900 eptr <= md->end_subject - oclength &&
2901 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2902 #endif /* SUPPORT_UCP */
2903 else
2904 {
2905 CHECK_PARTIAL();
2906 break;
2907 }
2908 }
2909
2910 if (possessive) continue;
2911
2912 for(;;)
2913 {
2914 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2916 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2917 #ifdef SUPPORT_UCP
2918 eptr--;
2919 BACKCHAR(eptr);
2920 #else /* without SUPPORT_UCP */
2921 eptr -= length;
2922 #endif /* SUPPORT_UCP */
2923 }
2924 }
2925 /* Control never gets here */
2926 }
2927
2928 /* If the length of a UTF-8 character is 1, we fall through here, and
2929 obey the code as for non-UTF-8 characters below, though in this case the
2930 value of fc will always be < 128. */
2931 }
2932 else
2933 #endif /* SUPPORT_UTF8 */
2934
2935 /* When not in UTF-8 mode, load a single-byte character. */
2936
2937 fc = *ecode++;
2938
2939 /* The value of fc at this point is always less than 256, though we may or
2940 may not be in UTF-8 mode. The code is duplicated for the caseless and
2941 caseful cases, for speed, since matching characters is likely to be quite
2942 common. First, ensure the minimum number of matches are present. If min =
2943 max, continue at the same level without recursing. Otherwise, if
2944 minimizing, keep trying the rest of the expression and advancing one
2945 matching character if failing, up to the maximum. Alternatively, if
2946 maximizing, find the maximum number of characters and work backwards. */
2947
2948 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2949 max, eptr));
2950
2951 if ((ims & PCRE_CASELESS) != 0)
2952 {
2953 fc = md->lcc[fc];
2954 for (i = 1; i <= min; i++)
2955 {
2956 if (eptr >= md->end_subject)
2957 {
2958 SCHECK_PARTIAL();
2959 MRRETURN(MATCH_NOMATCH);
2960 }
2961 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2962 }
2963 if (min == max) continue;
2964 if (minimize)
2965 {
2966 for (fi = min;; fi++)
2967 {
2968 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2970 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2971 if (eptr >= md->end_subject)
2972 {
2973 SCHECK_PARTIAL();
2974 MRRETURN(MATCH_NOMATCH);
2975 }
2976 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2977 }
2978 /* Control never gets here */
2979 }
2980 else /* Maximize */
2981 {
2982 pp = eptr;
2983 for (i = min; i < max; i++)
2984 {
2985 if (eptr >= md->end_subject)
2986 {
2987 SCHECK_PARTIAL();
2988 break;
2989 }
2990 if (fc != md->lcc[*eptr]) break;
2991 eptr++;
2992 }
2993
2994 if (possessive) continue;
2995
2996 while (eptr >= pp)
2997 {
2998 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2999 eptr--;
3000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001 }
3002 MRRETURN(MATCH_NOMATCH);
3003 }
3004 /* Control never gets here */
3005 }
3006
3007 /* Caseful comparisons (includes all multi-byte characters) */
3008
3009 else
3010 {
3011 for (i = 1; i <= min; i++)
3012 {
3013 if (eptr >= md->end_subject)
3014 {
3015 SCHECK_PARTIAL();
3016 MRRETURN(MATCH_NOMATCH);
3017 }
3018 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3019 }
3020
3021 if (min == max) continue;
3022
3023 if (minimize)
3024 {
3025 for (fi = min;; fi++)
3026 {
3027 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3029 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3030 if (eptr >= md->end_subject)
3031 {
3032 SCHECK_PARTIAL();
3033 MRRETURN(MATCH_NOMATCH);
3034 }
3035 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3036 }
3037 /* Control never gets here */
3038 }
3039 else /* Maximize */
3040 {
3041 pp = eptr;
3042 for (i = min; i < max; i++)
3043 {
3044 if (eptr >= md->end_subject)
3045 {
3046 SCHECK_PARTIAL();
3047 break;
3048 }
3049 if (fc != *eptr) break;
3050 eptr++;
3051 }
3052 if (possessive) continue;
3053
3054 while (eptr >= pp)
3055 {
3056 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3057 eptr--;
3058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3059 }
3060 MRRETURN(MATCH_NOMATCH);
3061 }
3062 }
3063 /* Control never gets here */
3064
3065 /* Match a negated single one-byte character. The character we are
3066 checking can be multibyte. */
3067
3068 case OP_NOT:
3069 if (eptr >= md->end_subject)
3070 {
3071 SCHECK_PARTIAL();
3072 MRRETURN(MATCH_NOMATCH);
3073 }
3074 ecode++;
3075 GETCHARINCTEST(c, eptr);
3076 if ((ims & PCRE_CASELESS) != 0)
3077 {
3078 #ifdef SUPPORT_UTF8
3079 if (c < 256)
3080 #endif
3081 c = md->lcc[c];
3082 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3083 }
3084 else
3085 {
3086 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3087 }
3088 break;
3089
3090 /* Match a negated single one-byte character repeatedly. This is almost a
3091 repeat of the code for a repeated single character, but I haven't found a
3092 nice way of commoning these up that doesn't require a test of the
3093 positive/negative option for each character match. Maybe that wouldn't add
3094 very much to the time taken, but character matching *is* what this is all
3095 about... */
3096
3097 case OP_NOTEXACT:
3098 min = max = GET2(ecode, 1);
3099 ecode += 3;
3100 goto REPEATNOTCHAR;
3101
3102 case OP_NOTUPTO:
3103 case OP_NOTMINUPTO:
3104 min = 0;
3105 max = GET2(ecode, 1);
3106 minimize = *ecode == OP_NOTMINUPTO;
3107 ecode += 3;
3108 goto REPEATNOTCHAR;
3109
3110 case OP_NOTPOSSTAR:
3111 possessive = TRUE;
3112 min = 0;
3113 max = INT_MAX;
3114 ecode++;
3115 goto REPEATNOTCHAR;
3116
3117 case OP_NOTPOSPLUS:
3118 possessive = TRUE;
3119 min = 1;
3120 max = INT_MAX;
3121 ecode++;
3122 goto REPEATNOTCHAR;
3123
3124 case OP_NOTPOSQUERY:
3125 possessive = TRUE;
3126 min = 0;
3127 max = 1;
3128 ecode++;
3129 goto REPEATNOTCHAR;
3130
3131 case OP_NOTPOSUPTO:
3132 possessive = TRUE;
3133 min = 0;
3134 max = GET2(ecode, 1);
3135 ecode += 3;
3136 goto REPEATNOTCHAR;
3137
3138 case OP_NOTSTAR:
3139 case OP_NOTMINSTAR:
3140 case OP_NOTPLUS:
3141 case OP_NOTMINPLUS:
3142 case OP_NOTQUERY:
3143 case OP_NOTMINQUERY:
3144 c = *ecode++ - OP_NOTSTAR;
3145 minimize = (c & 1) != 0;
3146 min = rep_min[c]; /* Pick up values from tables; */
3147 max = rep_max[c]; /* zero for max => infinity */
3148 if (max == 0) max = INT_MAX;
3149
3150 /* Common code for all repeated single-byte matches. */
3151
3152 REPEATNOTCHAR:
3153 fc = *ecode++;
3154
3155 /* The code is duplicated for the caseless and caseful cases, for speed,
3156 since matching characters is likely to be quite common. First, ensure the
3157 minimum number of matches are present. If min = max, continue at the same
3158 level without recursing. Otherwise, if minimizing, keep trying the rest of
3159 the expression and advancing one matching character if failing, up to the
3160 maximum. Alternatively, if maximizing, find the maximum number of
3161 characters and work backwards. */
3162
3163 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3164 max, eptr));
3165
3166 if ((ims & PCRE_CASELESS) != 0)
3167 {
3168 fc = md->lcc[fc];
3169
3170 #ifdef SUPPORT_UTF8
3171 /* UTF-8 mode */
3172 if (utf8)
3173 {
3174 register unsigned int d;
3175 for (i = 1; i <= min; i++)
3176 {
3177 if (eptr >= md->end_subject)
3178 {
3179 SCHECK_PARTIAL();
3180 MRRETURN(MATCH_NOMATCH);
3181 }
3182 GETCHARINC(d, eptr);
3183 if (d < 256) d = md->lcc[d];
3184 if (fc == d) MRRETURN(MATCH_NOMATCH);
3185 }
3186 }
3187 else
3188 #endif
3189
3190 /* Not UTF-8 mode */
3191 {
3192 for (i = 1; i <= min; i++)
3193 {
3194 if (eptr >= md->end_subject)
3195 {
3196 SCHECK_PARTIAL();
3197 MRRETURN(MATCH_NOMATCH);
3198 }
3199 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3200 }
3201 }
3202
3203 if (min == max) continue;
3204
3205 if (minimize)
3206 {
3207 #ifdef SUPPORT_UTF8
3208 /* UTF-8 mode */
3209 if (utf8)
3210 {
3211 register unsigned int d;
3212 for (fi = min;; fi++)
3213 {
3214 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3216 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3217 if (eptr >= md->end_subject)
3218 {
3219 SCHECK_PARTIAL();
3220 MRRETURN(MATCH_NOMATCH);
3221 }
3222 GETCHARINC(d, eptr);
3223 if (d < 256) d = md->lcc[d];
3224 if (fc == d) MRRETURN(MATCH_NOMATCH);
3225 }
3226 }
3227 else
3228 #endif
3229 /* Not UTF-8 mode */
3230 {
3231 for (fi = min;; fi++)
3232 {
3233 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3234 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3235 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3236 if (eptr >= md->end_subject)
3237 {
3238 SCHECK_PARTIAL();
3239 MRRETURN(MATCH_NOMATCH);
3240 }
3241 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3242 }
3243 }
3244 /* Control never gets here */
3245 }
3246
3247 /* Maximize case */
3248
3249 else
3250 {
3251 pp = eptr;
3252
3253 #ifdef SUPPORT_UTF8
3254 /* UTF-8 mode */
3255 if (utf8)
3256 {
3257 register unsigned int d;
3258 for (i = min; i < max; i++)
3259 {
3260 int len = 1;
3261 if (eptr >= md->end_subject)
3262 {
3263 SCHECK_PARTIAL();
3264 break;
3265 }
3266 GETCHARLEN(d, eptr, len);
3267 if (d < 256) d = md->lcc[d];
3268 if (fc == d) break;
3269 eptr += len;
3270 }
3271 if (possessive) continue;
3272 for(;;)
3273 {
3274 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3275 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3276 if (eptr-- == pp) break; /* Stop if tried at original pos */
3277 BACKCHAR(eptr);
3278 }
3279 }
3280 else
3281 #endif
3282 /* Not UTF-8 mode */
3283 {
3284 for (i = min; i < max; i++)
3285 {
3286 if (eptr >= md->end_subject)
3287 {
3288 SCHECK_PARTIAL();
3289 break;
3290 }
3291 if (fc == md->lcc[*eptr]) break;
3292 eptr++;
3293 }
3294 if (possessive) continue;
3295 while (eptr >= pp)
3296 {
3297 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3299 eptr--;
3300 }
3301 }
3302
3303 MRRETURN(MATCH_NOMATCH);
3304 }
3305 /* Control never gets here */
3306 }
3307
3308 /* Caseful comparisons */
3309
3310 else
3311 {
3312 #ifdef SUPPORT_UTF8
3313 /* UTF-8 mode */
3314 if (utf8)
3315 {
3316 register unsigned int d;
3317 for (i = 1; i <= min; i++)
3318 {
3319 if (eptr >= md->end_subject)
3320 {
3321 SCHECK_PARTIAL();
3322 MRRETURN(MATCH_NOMATCH);
3323 }
3324 GETCHARINC(d, eptr);
3325 if (fc == d) MRRETURN(MATCH_NOMATCH);
3326 }
3327 }
3328 else
3329 #endif
3330 /* Not UTF-8 mode */
3331 {
3332 for (i = 1; i <= min; i++)
3333 {
3334 if (eptr >= md->end_subject)
3335 {
3336 SCHECK_PARTIAL();
3337 MRRETURN(MATCH_NOMATCH);
3338 }
3339 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3340 }
3341 }
3342
3343 if (min == max) continue;
3344
3345 if (minimize)
3346 {
3347 #ifdef SUPPORT_UTF8
3348 /* UTF-8 mode */
3349 if (utf8)
3350 {
3351 register unsigned int d;
3352 for (fi = min;; fi++)
3353 {
3354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3357 if (eptr >= md->end_subject)
3358 {
3359 SCHECK_PARTIAL();
3360 MRRETURN(MATCH_NOMATCH);
3361 }
3362 GETCHARINC(d, eptr);
3363 if (fc == d) MRRETURN(MATCH_NOMATCH);
3364 }
3365 }
3366 else
3367 #endif
3368 /* Not UTF-8 mode */
3369 {
3370 for (fi = min;; fi++)
3371 {
3372 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3374 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3375 if (eptr >= md->end_subject)
3376 {
3377 SCHECK_PARTIAL();
3378 MRRETURN(MATCH_NOMATCH);
3379 }
3380 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3381 }
3382 }
3383 /* Control never gets here */
3384 }
3385
3386 /* Maximize case */
3387
3388 else
3389 {
3390 pp = eptr;
3391
3392 #ifdef SUPPORT_UTF8
3393 /* UTF-8 mode */
3394 if (utf8)
3395 {
3396 register unsigned int d;
3397 for (i = min; i < max; i++)
3398 {
3399 int len = 1;
3400 if (eptr >= md->end_subject)
3401 {
3402 SCHECK_PARTIAL();
3403 break;
3404 }
3405 GETCHARLEN(d, eptr, len);
3406 if (fc == d) break;
3407 eptr += len;
3408 }
3409 if (possessive) continue;
3410 for(;;)
3411 {
3412 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3413 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3414 if (eptr-- == pp) break; /* Stop if tried at original pos */
3415 BACKCHAR(eptr);
3416 }
3417 }
3418 else
3419 #endif
3420 /* Not UTF-8 mode */
3421 {
3422 for (i = min; i < max; i++)
3423 {
3424 if (eptr >= md->end_subject)
3425 {
3426 SCHECK_PARTIAL();
3427 break;
3428 }
3429 if (fc == *eptr) break;
3430 eptr++;
3431 }
3432 if (possessive) continue;
3433 while (eptr >= pp)
3434 {
3435 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3436 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3437 eptr--;
3438 }
3439 }
3440
3441 MRRETURN(MATCH_NOMATCH);
3442 }
3443 }
3444 /* Control never gets here */
3445
3446 /* Match a single character type repeatedly; several different opcodes
3447 share code. This is very similar to the code for single characters, but we
3448 repeat it in the interests of efficiency. */
3449
3450 case OP_TYPEEXACT:
3451 min = max = GET2(ecode, 1);
3452 minimize = TRUE;
3453 ecode += 3;
3454 goto REPEATTYPE;
3455
3456 case OP_TYPEUPTO:
3457 case OP_TYPEMINUPTO:
3458 min = 0;
3459 max = GET2(ecode, 1);
3460 minimize = *ecode == OP_TYPEMINUPTO;
3461 ecode += 3;
3462 goto REPEATTYPE;
3463
3464 case OP_TYPEPOSSTAR:
3465 possessive = TRUE;
3466 min = 0;
3467 max = INT_MAX;
3468 ecode++;
3469 goto REPEATTYPE;
3470
3471 case OP_TYPEPOSPLUS:
3472 possessive = TRUE;
3473 min = 1;
3474 max = INT_MAX;
3475 ecode++;
3476 goto REPEATTYPE;
3477
3478 case OP_TYPEPOSQUERY:
3479 possessive = TRUE;
3480 min = 0;
3481 max = 1;
3482 ecode++;
3483 goto REPEATTYPE;
3484
3485 case OP_TYPEPOSUPTO:
3486 possessive = TRUE;
3487 min = 0;
3488 max = GET2(ecode, 1);
3489 ecode += 3;
3490 goto REPEATTYPE;
3491
3492 case OP_TYPESTAR:
3493 case OP_TYPEMINSTAR:
3494 case OP_TYPEPLUS:
3495 case OP_TYPEMINPLUS:
3496 case OP_TYPEQUERY:
3497 case OP_TYPEMINQUERY:
3498 c = *ecode++ - OP_TYPESTAR;
3499 minimize = (c & 1) != 0;
3500 min = rep_min[c]; /* Pick up values from tables; */
3501 max = rep_max[c]; /* zero for max => infinity */
3502 if (max == 0) max = INT_MAX;
3503
3504 /* Common code for all repeated single character type matches. Note that
3505 in UTF-8 mode, '.' matches a character of any length, but for the other
3506 character types, the valid characters are all one-byte long. */
3507
3508 REPEATTYPE:
3509 ctype = *ecode++; /* Code for the character type */
3510
3511 #ifdef SUPPORT_UCP
3512 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3513 {
3514 prop_fail_result = ctype == OP_NOTPROP;
3515 prop_type = *ecode++;
3516 prop_value = *ecode++;
3517 }
3518 else prop_type = -1;
3519 #endif
3520
3521 /* First, ensure the minimum number of matches are present. Use inline
3522 code for maximizing the speed, and do the type test once at the start
3523 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3524 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3525 and single-bytes. */
3526
3527 if (min > 0)
3528 {
3529 #ifdef SUPPORT_UCP
3530 if (prop_type >= 0)
3531 {
3532 switch(prop_type)
3533 {
3534 case PT_ANY:
3535 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3536 for (i = 1; i <= min; i++)
3537 {
3538 if (eptr >= md->end_subject)
3539 {
3540 SCHECK_PARTIAL();
3541 MRRETURN(MATCH_NOMATCH);
3542 }
3543 GETCHARINCTEST(c, eptr);
3544 }
3545 break;
3546
3547 case PT_LAMP:
3548 for (i = 1; i <= min; i++)
3549 {
3550 if (eptr >= md->end_subject)
3551 {
3552 SCHECK_PARTIAL();
3553 MRRETURN(MATCH_NOMATCH);
3554 }
3555 GETCHARINCTEST(c, eptr);
3556 prop_chartype = UCD_CHARTYPE(c);
3557 if ((prop_chartype == ucp_Lu ||
3558 prop_chartype == ucp_Ll ||
3559 prop_chartype == ucp_Lt) == prop_fail_result)
3560 MRRETURN(MATCH_NOMATCH);
3561 }
3562 break;
3563
3564 case PT_GC:
3565 for (i = 1; i <= min; i++)
3566 {
3567 if (eptr >= md->end_subject)
3568 {
3569 SCHECK_PARTIAL();
3570 MRRETURN(MATCH_NOMATCH);
3571 }
3572 GETCHARINCTEST(c, eptr);
3573 prop_category = UCD_CATEGORY(c);
3574 if ((prop_category == prop_value) == prop_fail_result)
3575 MRRETURN(MATCH_NOMATCH);
3576 }
3577 break;
3578
3579 case PT_PC:
3580 for (i = 1; i <= min; i++)
3581 {
3582 if (eptr >= md->end_subject)
3583 {
3584 SCHECK_PARTIAL();
3585 MRRETURN(MATCH_NOMATCH);
3586 }
3587 GETCHARINCTEST(c, eptr);
3588 prop_chartype = UCD_CHARTYPE(c);
3589 if ((prop_chartype == prop_value) == prop_fail_result)
3590 MRRETURN(MATCH_NOMATCH);
3591 }
3592 break;
3593
3594 case PT_SC:
3595 for (i = 1; i <= min; i++)
3596 {
3597 if (eptr >= md->end_subject)
3598 {
3599 SCHECK_PARTIAL();
3600 MRRETURN(MATCH_NOMATCH);
3601 }
3602 GETCHARINCTEST(c, eptr);
3603 prop_script = UCD_SCRIPT(c);
3604 if ((prop_script == prop_value) == prop_fail_result)
3605 MRRETURN(MATCH_NOMATCH);
3606 }
3607 break;
3608
3609 case PT_ALNUM:
3610 for (i = 1; i <= min; i++)
3611 {
3612 if (eptr >= md->end_subject)
3613 {
3614 SCHECK_PARTIAL();
3615 MRRETURN(MATCH_NOMATCH);
3616 }
3617 GETCHARINCTEST(c, eptr);
3618 prop_category = UCD_CATEGORY(c);
3619 if ((prop_category == ucp_L || prop_category == ucp_N)
3620 == prop_fail_result)
3621 MRRETURN(MATCH_NOMATCH);
3622 }
3623 break;
3624
3625 case PT_SPACE: /* Perl space */
3626 for (i = 1; i <= min; i++)
3627 {
3628 if (eptr >= md->end_subject)
3629 {
3630 SCHECK_PARTIAL();
3631 MRRETURN(MATCH_NOMATCH);
3632 }
3633 GETCHARINCTEST(c, eptr);
3634 prop_category = UCD_CATEGORY(c);
3635 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3636 c == CHAR_FF || c == CHAR_CR)
3637 == prop_fail_result)
3638 MRRETURN(MATCH_NOMATCH);
3639 }
3640 break;
3641
3642 case PT_PXSPACE: /* POSIX space */
3643 for (i = 1; i <= min; i++)
3644 {
3645 if (eptr >= md->end_subject)
3646 {
3647 SCHECK_PARTIAL();
3648 MRRETURN(MATCH_NOMATCH);
3649 }
3650 GETCHARINCTEST(c, eptr);
3651 prop_category = UCD_CATEGORY(c);
3652 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3653 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3654 == prop_fail_result)
3655 MRRETURN(MATCH_NOMATCH);
3656 }
3657 break;
3658
3659 case PT_WORD:
3660 for (i = 1; i <= min; i++)
3661 {
3662 if (eptr >= md->end_subject)
3663 {
3664 SCHECK_PARTIAL();
3665 MRRETURN(MATCH_NOMATCH);
3666 }
3667 GETCHARINCTEST(c, eptr);
3668 prop_category = UCD_CATEGORY(c);
3669 if ((prop_category == ucp_L || prop_category == ucp_N ||
3670 c == CHAR_UNDERSCORE)
3671 == prop_fail_result)
3672 MRRETURN(MATCH_NOMATCH);
3673 }
3674 break;
3675
3676 /* This should not occur */
3677
3678 default:
3679 RRETURN(PCRE_ERROR_INTERNAL);
3680 }
3681 }
3682
3683 /* Match extended Unicode sequences. We will get here only if the
3684 support is in the binary; otherwise a compile-time error occurs. */
3685
3686 else if (ctype == OP_EXTUNI)
3687 {
3688 for (i = 1; i <= min; i++)
3689 {
3690 if (eptr >= md->end_subject)
3691 {
3692 SCHECK_PARTIAL();
3693 MRRETURN(MATCH_NOMATCH);
3694 }
3695 GETCHARINCTEST(c, eptr);
3696 prop_category = UCD_CATEGORY(c);
3697 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3698 while (eptr < md->end_subject)
3699 {
3700 int len = 1;
3701 if (!utf8) c = *eptr;
3702 else { GETCHARLEN(c, eptr, len); }
3703 prop_category = UCD_CATEGORY(c);
3704 if (prop_category != ucp_M) break;
3705 eptr += len;
3706 }
3707 }
3708 }
3709
3710 else
3711 #endif /* SUPPORT_UCP */
3712
3713 /* Handle all other cases when the coding is UTF-8 */
3714
3715 #ifdef SUPPORT_UTF8
3716 if (utf8) switch(ctype)
3717 {
3718 case OP_ANY:
3719 for (i = 1; i <= min; i++)
3720 {
3721 if (eptr >= md->end_subject)
3722 {
3723 SCHECK_PARTIAL();
3724 MRRETURN(MATCH_NOMATCH);
3725 }
3726 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3727 eptr++;
3728 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3729 }
3730 break;
3731
3732 case OP_ALLANY:
3733 for (i = 1; i <= min; i++)
3734 {
3735 if (eptr >= md->end_subject)
3736 {
3737 SCHECK_PARTIAL();
3738 MRRETURN(MATCH_NOMATCH);
3739 }
3740 eptr++;
3741 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3742 }
3743 break;
3744
3745 case OP_ANYBYTE:
3746 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3747 eptr += min;
3748 break;
3749
3750 case OP_ANYNL:
3751 for (i = 1; i <= min; i++)
3752 {
3753 if (eptr >= md->end_subject)
3754 {
3755 SCHECK_PARTIAL();
3756 MRRETURN(MATCH_NOMATCH);
3757 }
3758 GETCHARINC(c, eptr);
3759 switch(c)
3760 {
3761 default: MRRETURN(MATCH_NOMATCH);
3762 case 0x000d:
3763 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3764 break;
3765
3766 case 0x000a:
3767 break;
3768
3769 case 0x000b:
3770 case 0x000c:
3771 case 0x0085:
3772 case 0x2028:
3773 case 0x2029:
3774 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3775 break;
3776 }
3777 }
3778 break;
3779
3780 case OP_NOT_HSPACE:
3781 for (i = 1; i <= min; i++)
3782 {
3783 if (eptr >= md->end_subject)
3784 {
3785 SCHECK_PARTIAL();
3786 MRRETURN(MATCH_NOMATCH);
3787 }
3788 GETCHARINC(c, eptr);
3789 switch(c)
3790 {
3791 default: break;
3792 case 0x09: /* HT */
3793 case 0x20: /* SPACE */
3794 case 0xa0: /* NBSP */
3795 case 0x1680: /* OGHAM SPACE MARK */
3796 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3797 case 0x2000: /* EN QUAD */
3798 case 0x2001: /* EM QUAD */
3799 case 0x2002: /* EN SPACE */
3800 case 0x2003: /* EM SPACE */
3801 case 0x2004: /* THREE-PER-EM SPACE */
3802 case 0x2005: /* FOUR-PER-EM SPACE */
3803 case 0x2006: /* SIX-PER-EM SPACE */
3804 case 0x2007: /* FIGURE SPACE */
3805 case 0x2008: /* PUNCTUATION SPACE */
3806 case 0x2009: /* THIN SPACE */
3807 case 0x200A: /* HAIR SPACE */
3808 case 0x202f: /* NARROW NO-BREAK SPACE */
3809 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3810 case 0x3000: /* IDEOGRAPHIC SPACE */
3811 MRRETURN(MATCH_NOMATCH);
3812 }
3813 }
3814 break;
3815
3816 case OP_HSPACE:
3817 for (i = 1; i <= min; i++)
3818 {
3819 if (eptr >= md->end_subject)
3820 {
3821 SCHECK_PARTIAL();
3822 MRRETURN(MATCH_NOMATCH);
3823 }
3824 GETCHARINC(c, eptr);
3825 switch(c)
3826 {
3827 default: MRRETURN(MATCH_NOMATCH);
3828 case 0x09: /* HT */
3829 case 0x20: /* SPACE */
3830 case 0xa0: /* NBSP */
3831 case 0x1680: /* OGHAM SPACE MARK */
3832 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3833 case 0x2000: /* EN QUAD */
3834 case 0x2001: /* EM QUAD */
3835 case 0x2002: /* EN SPACE */
3836 case 0x2003: /* EM SPACE */
3837 case 0x2004: /* THREE-PER-EM SPACE */
3838 case 0x2005: /* FOUR-PER-EM SPACE */
3839 case 0x2006: /* SIX-PER-EM SPACE */
3840 case 0x2007: /* FIGURE SPACE */
3841 case 0x2008: /* PUNCTUATION SPACE */
3842 case 0x2009: /* THIN SPACE */
3843 case 0x200A: /* HAIR SPACE */
3844 case 0x202f: /* NARROW NO-BREAK SPACE */
3845 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3846 case 0x3000: /* IDEOGRAPHIC SPACE */
3847 break;
3848 }
3849 }
3850 break;
3851
3852 case OP_NOT_VSPACE:
3853 for (i = 1; i <= min; i++)
3854 {
3855 if (eptr >= md->end_subject)
3856 {
3857 SCHECK_PARTIAL();
3858 MRRETURN(MATCH_NOMATCH);
3859 }
3860 GETCHARINC(c, eptr);
3861 switch(c)
3862 {
3863 default: break;
3864 case 0x0a: /* LF */
3865 case 0x0b: /* VT */
3866 case 0x0c: /* FF */
3867 case 0x0d: /* CR */
3868 case 0x85: /* NEL */
3869 case 0x2028: /* LINE SEPARATOR */
3870 case 0x2029: /* PARAGRAPH SEPARATOR */
3871 MRRETURN(MATCH_NOMATCH);
3872 }
3873 }
3874 break;
3875
3876 case OP_VSPACE:
3877 for (i = 1; i <= min; i++)
3878 {
3879 if (eptr >= md->end_subject)
3880 {
3881 SCHECK_PARTIAL();
3882 MRRETURN(MATCH_NOMATCH);
3883 }
3884 GETCHARINC(c, eptr);
3885 switch(c)
3886 {
3887 default: MRRETURN(MATCH_NOMATCH);
3888 case 0x0a: /* LF */
3889 case 0x0b: /* VT */
3890 case 0x0c: /* FF */
3891 case 0x0d: /* CR */
3892 case 0x85: /* NEL */
3893 case 0x2028: /* LINE SEPARATOR */
3894 case 0x2029: /* PARAGRAPH SEPARATOR */
3895 break;
3896 }
3897 }
3898 break;
3899
3900 case OP_NOT_DIGIT:
3901 for (i = 1; i <= min; i++)
3902 {
3903 if (eptr >= md->end_subject)
3904 {
3905 SCHECK_PARTIAL();
3906 MRRETURN(MATCH_NOMATCH);
3907 }
3908 GETCHARINC(c, eptr);
3909 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3910 MRRETURN(MATCH_NOMATCH);
3911 }
3912 break;
3913
3914 case OP_DIGIT:
3915 for (i = 1; i <= min; i++)
3916 {
3917 if (eptr >= md->end_subject)
3918 {
3919 SCHECK_PARTIAL();
3920 MRRETURN(MATCH_NOMATCH);
3921 }
3922 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3923 MRRETURN(MATCH_NOMATCH);
3924 /* No need to skip more bytes - we know it's a 1-byte character */
3925 }
3926 break;
3927
3928 case OP_NOT_WHITESPACE:
3929 for (i = 1; i <= min; i++)
3930 {
3931 if (eptr >= md->end_subject)
3932 {
3933 SCHECK_PARTIAL();
3934 MRRETURN(MATCH_NOMATCH);
3935 }
3936 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3937 MRRETURN(MATCH_NOMATCH);
3938 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3939 }
3940 break;
3941
3942 case OP_WHITESPACE:
3943 for (i = 1; i <= min; i++)
3944 {
3945 if (eptr >= md->end_subject)
3946 {
3947 SCHECK_PARTIAL();
3948 MRRETURN(MATCH_NOMATCH);
3949 }
3950 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3951 MRRETURN(MATCH_NOMATCH);
3952 /* No need to skip more bytes - we know it's a 1-byte character */
3953 }
3954 break;
3955
3956 case OP_NOT_WORDCHAR:
3957 for (i = 1; i <= min; i++)
3958 {
3959 if (eptr >= md->end_subject)
3960 {
3961 SCHECK_PARTIAL();
3962 MRRETURN(MATCH_NOMATCH);
3963 }
3964 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3965 MRRETURN(MATCH_NOMATCH);
3966 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3967 }
3968 break;
3969
3970 case OP_WORDCHAR:
3971 for (i = 1; i <= min; i++)
3972 {
3973 if (eptr >= md->end_subject)
3974 {
3975 SCHECK_PARTIAL();
3976 MRRETURN(MATCH_NOMATCH);
3977 }
3978 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3979 MRRETURN(MATCH_NOMATCH);
3980 /* No need to skip more bytes - we know it's a 1-byte character */
3981 }
3982 break;
3983
3984 default:
3985 RRETURN(PCRE_ERROR_INTERNAL);
3986 } /* End switch(ctype) */
3987
3988 else
3989 #endif /* SUPPORT_UTF8 */
3990
3991 /* Code for the non-UTF-8 case for minimum matching of operators other
3992 than OP_PROP and OP_NOTPROP. */
3993
3994 switch(ctype)
3995 {
3996 case OP_ANY:
3997 for (i = 1; i <= min; i++)
3998 {
3999 if (eptr >= md->end_subject)
4000 {
4001 SCHECK_PARTIAL();
4002 MRRETURN(MATCH_NOMATCH);
4003 }
4004 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4005 eptr++;
4006 }
4007 break;
4008
4009 case OP_ALLANY:
4010 if (eptr > md->end_subject - min)
4011 {
4012 SCHECK_PARTIAL();
4013 MRRETURN(MATCH_NOMATCH);
4014 }
4015 eptr += min;
4016 break;
4017
4018 case OP_ANYBYTE:
4019 if (eptr > md->end_subject - min)
4020 {
4021 SCHECK_PARTIAL();
4022 MRRETURN(MATCH_NOMATCH);
4023 }
4024 eptr += min;
4025 break;
4026
4027 case OP_ANYNL:
4028 for (i = 1; i <= min; i++)
4029 {
4030 if (eptr >= md->end_subject)
4031 {
4032 SCHECK_PARTIAL();
4033 MRRETURN(MATCH_NOMATCH);
4034 }
4035 switch(*eptr++)
4036 {
4037 default: MRRETURN(MATCH_NOMATCH);
4038 case 0x000d:
4039 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4040 break;
4041 case 0x000a:
4042 break;
4043
4044 case 0x000b:
4045 case 0x000c:
4046 case 0x0085:
4047 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4048 break;
4049 }
4050 }
4051 break;
4052
4053 case OP_NOT_HSPACE:
4054 for (i = 1; i <= min; i++)
4055 {
4056 if (eptr >= md->end_subject)
4057 {
4058 SCHECK_PARTIAL();
4059 MRRETURN(MATCH_NOMATCH);
4060 }
4061 switch(*eptr++)
4062 {
4063 default: break;
4064 case 0x09: /* HT */
4065 case 0x20: /* SPACE */
4066 case 0xa0: /* NBSP */
4067 MRRETURN(MATCH_NOMATCH);
4068 }
4069 }
4070 break;
4071
4072 case OP_HSPACE:
4073 for (i = 1; i <= min; i++)
4074 {
4075 if (eptr >= md->end_subject)
4076 {
4077 SCHECK_PARTIAL();
4078 MRRETURN(MATCH_NOMATCH);
4079 }
4080 switch(*eptr++)
4081 {
4082 default: MRRETURN(MATCH_NOMATCH);
4083 case 0x09: /* HT */
4084 case 0x20: /* SPACE */
4085 case 0xa0: /* NBSP */
4086 break;
4087 }
4088 }
4089 break;
4090
4091 case OP_NOT_VSPACE:
4092 for (i = 1; i <= min; i++)
4093 {
4094 if (eptr >= md->end_subject)
4095 {
4096 SCHECK_PARTIAL();
4097 MRRETURN(MATCH_NOMATCH);
4098 }
4099 switch(*eptr++)
4100 {
4101 default: break;
4102 case 0x0a: /* LF */
4103 case 0x0b: /* VT */
4104 case 0x0c: /* FF */
4105 case 0x0d: /* CR */
4106 case 0x85: /* NEL */
4107 MRRETURN(MATCH_NOMATCH);
4108 }
4109 }
4110 break;
4111
4112 case OP_VSPACE:
4113 for (i = 1; i <= min; i++)
4114 {
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 MRRETURN(MATCH_NOMATCH);
4119 }
4120 switch(*eptr++)
4121 {
4122 default: MRRETURN(MATCH_NOMATCH);
4123 case 0x0a: /* LF */
4124 case 0x0b: /* VT */
4125 case 0x0c: /* FF */
4126 case 0x0d: /* CR */
4127 case 0x85: /* NEL */
4128 break;
4129 }
4130 }
4131 break;
4132
4133 case OP_NOT_DIGIT:
4134 for (i = 1; i <= min; i++)
4135 {
4136 if (eptr >= md->end_subject)
4137 {
4138 SCHECK_PARTIAL();
4139 MRRETURN(MATCH_NOMATCH);
4140 }
4141 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4142 }
4143 break;
4144
4145 case OP_DIGIT:
4146 for (i = 1; i <= min; i++)
4147 {
4148 if (eptr >= md->end_subject)
4149 {
4150 SCHECK_PARTIAL();
4151 MRRETURN(MATCH_NOMATCH);
4152 }
4153 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4154 }
4155 break;
4156
4157 case OP_NOT_WHITESPACE:
4158 for (i = 1; i <= min; i++)
4159 {
4160 if (eptr >= md->end_subject)
4161 {
4162 SCHECK_PARTIAL();
4163 MRRETURN(MATCH_NOMATCH);
4164 }
4165 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4166 }
4167 break;
4168
4169 case OP_WHITESPACE:
4170 for (i = 1; i <= min; i++)
4171 {
4172 if (eptr >= md->end_subject)
4173 {
4174 SCHECK_PARTIAL();
4175 MRRETURN(MATCH_NOMATCH);
4176 }
4177 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4178 }
4179 break;
4180
4181 case OP_NOT_WORDCHAR:
4182 for (i = 1; i <= min; i++)
4183 {
4184 if (eptr >= md->end_subject)
4185 {
4186 SCHECK_PARTIAL();
4187 MRRETURN(MATCH_NOMATCH);
4188 }
4189 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4190 MRRETURN(MATCH_NOMATCH);
4191 }
4192 break;
4193
4194 case OP_WORDCHAR:
4195 for (i = 1; i <= min; i++)
4196 {
4197 if (eptr >= md->end_subject)
4198 {
4199 SCHECK_PARTIAL();
4200 MRRETURN(MATCH_NOMATCH);
4201 }
4202 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4203 MRRETURN(MATCH_NOMATCH);
4204 }
4205 break;
4206
4207 default:
4208 RRETURN(PCRE_ERROR_INTERNAL);
4209 }
4210 }
4211
4212 /* If min = max, continue at the same level without recursing */
4213
4214 if (min == max) continue;
4215
4216 /* If minimizing, we have to test the rest of the pattern before each
4217 subsequent match. Again, separate the UTF-8 case for speed, and also
4218 separate the UCP cases. */
4219
4220 if (minimize)
4221 {
4222 #ifdef SUPPORT_UCP
4223 if (prop_type >= 0)
4224 {
4225 switch(prop_type)
4226 {
4227 case PT_ANY:
4228 for (fi = min;; fi++)
4229 {
4230 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4231 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4232 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4233 if (eptr >= md->end_subject)
4234 {
4235 SCHECK_PARTIAL();
4236 MRRETURN(MATCH_NOMATCH);
4237 }
4238 GETCHARINCTEST(c, eptr);
4239 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4240 }
4241 /* Control never gets here */
4242
4243 case PT_LAMP:
4244 for (fi = min;; fi++)
4245 {
4246 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4247 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4248 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4249 if (eptr >= md->end_subject)
4250 {
4251 SCHECK_PARTIAL();
4252 MRRETURN(MATCH_NOMATCH);
4253 }
4254 GETCHARINCTEST(c, eptr);
4255 prop_chartype = UCD_CHARTYPE(c);
4256 if ((prop_chartype == ucp_Lu ||
4257 prop_chartype == ucp_Ll ||
4258 prop_chartype == ucp_Lt) == prop_fail_result)
4259 MRRETURN(MATCH_NOMATCH);
4260 }
4261 /* Control never gets here */
4262
4263 case PT_GC:
4264 for (fi = min;; fi++)
4265 {
4266 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4267 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4268 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4269 if (eptr >= md->end_subject)
4270 {
4271 SCHECK_PARTIAL();
4272 MRRETURN(MATCH_NOMATCH);
4273 }
4274 GETCHARINCTEST(c, eptr);
4275 prop_category = UCD_CATEGORY(c);
4276 if ((prop_category == prop_value) == prop_fail_result)
4277 MRRETURN(MATCH_NOMATCH);
4278 }
4279 /* Control never gets here */
4280
4281 case PT_PC:
4282 for (fi = min;; fi++)
4283 {
4284 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4285 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4286 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4287 if (eptr >= md->end_subject)
4288 {
4289 SCHECK_PARTIAL();
4290 MRRETURN(MATCH_NOMATCH);
4291 }
4292 GETCHARINCTEST(c, eptr);
4293 prop_chartype = UCD_CHARTYPE(c);
4294 if ((prop_chartype == prop_value) == prop_fail_result)
4295 MRRETURN(MATCH_NOMATCH);
4296 }
4297 /* Control never gets here */
4298
4299 case PT_SC:
4300 for (fi = min;; fi++)
4301 {
4302 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4303 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4304 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4305 if (eptr >= md->end_subject)
4306 {
4307 SCHECK_PARTIAL();
4308 MRRETURN(MATCH_NOMATCH);
4309 }
4310 GETCHARINCTEST(c, eptr);
4311 prop_script = UCD_SCRIPT(c);
4312 if ((prop_script == prop_value) == prop_fail_result)
4313 MRRETURN(MATCH_NOMATCH);
4314 }
4315 /* Control never gets here */
4316
4317 case PT_ALNUM:
4318 for (fi = min;; fi++)
4319 {
4320 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4321 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4322 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 MRRETURN(MATCH_NOMATCH);
4327 }
4328 GETCHARINCTEST(c, eptr);
4329 prop_category = UCD_CATEGORY(c);
4330 if ((prop_category == ucp_L || prop_category == ucp_N)
4331 == prop_fail_result)
4332 MRRETURN(MATCH_NOMATCH);
4333 }
4334 /* Control never gets here */
4335
4336 case PT_SPACE: /* Perl space */
4337 for (fi = min;; fi++)
4338 {
4339 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4340 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4341 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4342 if (eptr >= md->end_subject)
4343 {
4344 SCHECK_PARTIAL();
4345 MRRETURN(MATCH_NOMATCH);
4346 }
4347 GETCHARINCTEST(c, eptr);
4348 prop_category = UCD_CATEGORY(c);
4349 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4350 c == CHAR_FF || c == CHAR_CR)
4351 == prop_fail_result)
4352 MRRETURN(MATCH_NOMATCH);
4353 }
4354 /* Control never gets here */
4355
4356 case PT_PXSPACE: /* POSIX space */
4357 for (fi = min;; fi++)
4358 {
4359 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4360 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4361 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4362 if (eptr >= md->end_subject)
4363 {
4364 SCHECK_PARTIAL();
4365 MRRETURN(MATCH_NOMATCH);
4366 }
4367 GETCHARINCTEST(c, eptr);
4368 prop_category = UCD_CATEGORY(c);
4369 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4370 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4371 == prop_fail_result)
4372 MRRETURN(MATCH_NOMATCH);
4373 }
4374 /* Control never gets here */
4375
4376 case PT_WORD:
4377 for (fi = min;; fi++)
4378 {
4379 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4380 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4381 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4382 if (eptr >= md->end_subject)
4383 {
4384 SCHECK_PARTIAL();
4385 MRRETURN(MATCH_NOMATCH);
4386 }
4387 GETCHARINCTEST(c, eptr);
4388 prop_category = UCD_CATEGORY(c);
4389 if ((prop_category == ucp_L ||
4390 prop_category == ucp_N ||
4391 c == CHAR_UNDERSCORE)
4392 == prop_fail_result)
4393 MRRETURN(MATCH_NOMATCH);
4394 }
4395 /* Control never gets here */
4396
4397 /* This should never occur */
4398
4399 default:
4400 RRETURN(PCRE_ERROR_INTERNAL);
4401 }
4402 }
4403
4404 /* Match extended Unicode sequences. We will get here only if the
4405 support is in the binary; otherwise a compile-time error occurs. */
4406
4407 else if (ctype == OP_EXTUNI)
4408 {
4409 for (fi = min;; fi++)
4410 {
4411 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4412 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4413 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4414 if (eptr >= md->end_subject)
4415 {
4416 SCHECK_PARTIAL();
4417 MRRETURN(MATCH_NOMATCH);
4418 }
4419 GETCHARINCTEST(c, eptr);
4420 prop_category = UCD_CATEGORY(c);
4421 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4422 while (eptr < md->end_subject)
4423 {
4424 int len = 1;
4425 if (!utf8) c = *eptr;
4426 else { GETCHARLEN(c, eptr, len); }
4427 prop_category = UCD_CATEGORY(c);
4428 if (prop_category != ucp_M) break;
4429 eptr += len;
4430 }
4431 }
4432 }
4433
4434 else
4435 #endif /* SUPPORT_UCP */
4436
4437 #ifdef SUPPORT_UTF8
4438 /* UTF-8 mode */
4439 if (utf8)
4440 {
4441 for (fi = min;; fi++)
4442 {
4443 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4445 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4446 if (eptr >= md->end_subject)
4447 {
4448 SCHECK_PARTIAL();
4449 MRRETURN(MATCH_NOMATCH);
4450 }
4451 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4452 MRRETURN(MATCH_NOMATCH);
4453 GETCHARINC(c, eptr);
4454 switch(ctype)
4455 {
4456 case OP_ANY: /* This is the non-NL case */
4457 case OP_ALLANY:
4458 case OP_ANYBYTE:
4459 break;
4460
4461 case OP_ANYNL:
4462 switch(c)
4463 {
4464 default: MRRETURN(MATCH_NOMATCH);
4465 case 0x000d:
4466 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4467 break;
4468 case 0x000a:
4469 break;
4470
4471 case 0x000b:
4472 case 0x000c:
4473 case 0x0085:
4474 case 0x2028:
4475 case 0x2029:
4476 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4477 break;
4478 }
4479 break;
4480
4481 case OP_NOT_HSPACE:
4482 switch(c)
4483 {
4484 default: break;
4485 case 0x09: /* HT */
4486 case 0x20: /* SPACE */
4487 case 0xa0: /* NBSP */
4488 case 0x1680: /* OGHAM SPACE MARK */
4489 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4490 case 0x2000: /* EN QUAD */
4491 case 0x2001: /* EM QUAD */
4492 case 0x2002: /* EN SPACE */
4493 case 0x2003: /* EM SPACE */
4494 case 0x2004: /* THREE-PER-EM SPACE */
4495 case 0x2005: /* FOUR-PER-EM SPACE */
4496 case 0x2006: /* SIX-PER-EM SPACE */
4497 case 0x2007: /* FIGURE SPACE */
4498 case 0x2008: /* PUNCTUATION SPACE */
4499 case 0x2009: /* THIN SPACE */
4500 case 0x200A: /* HAIR SPACE */
4501 case 0x202f: /* NARROW NO-BREAK SPACE */
4502 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4503 case 0x3000: /* IDEOGRAPHIC SPACE */
4504 MRRETURN(MATCH_NOMATCH);
4505 }
4506 break;
4507
4508 case OP_HSPACE:
4509 switch(c)
4510 {
4511 default: MRRETURN(MATCH_NOMATCH);
4512 case 0x09: /* HT */
4513 case 0x20: /* SPACE */
4514 case 0xa0: /* NBSP */
4515 case 0x1680: /* OGHAM SPACE MARK */
4516 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4517 case 0x2000: /* EN QUAD */
4518 case 0x2001: /* EM QUAD */
4519 case 0x2002: /* EN SPACE */
4520 case 0x2003: /* EM SPACE */
4521 case 0x2004: /* THREE-PER-EM SPACE */
4522 case 0x2005: /* FOUR-PER-EM SPACE */
4523 case 0x2006: /* SIX-PER-EM SPACE */
4524 case 0x2007: /* FIGURE SPACE */
4525 case 0x2008: /* PUNCTUATION SPACE */
4526 case 0x2009: /* THIN SPACE */
4527 case 0x200A: /* HAIR SPACE */
4528 case 0x202f: /* NARROW NO-BREAK SPACE */
4529 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4530 case 0x3000: /* IDEOGRAPHIC SPACE */
4531 break;
4532 }
4533 break;
4534
4535 case OP_NOT_VSPACE:
4536 switch(c)
4537 {
4538 default: break;
4539 case 0x0a: /* LF */
4540 case 0x0b: /* VT */
4541 case 0x0c: /* FF */
4542 case 0x0d: /* CR */
4543 case 0x85: /* NEL */
4544 case 0x2028: /* LINE SEPARATOR */
4545 case 0x2029: /* PARAGRAPH SEPARATOR */
4546 MRRETURN(MATCH_NOMATCH);
4547 }
4548 break;
4549
4550 case OP_VSPACE:
4551 switch(c)
4552 {
4553 default: MRRETURN(MATCH_NOMATCH);
4554 case 0x0a: /* LF */
4555 case 0x0b: /* VT */
4556 case 0x0c: /* FF */
4557 case 0x0d: /* CR */
4558 case 0x85: /* NEL */
4559 case 0x2028: /* LINE SEPARATOR */
4560 case 0x2029: /* PARAGRAPH SEPARATOR */
4561 break;
4562 }
4563 break;
4564
4565 case OP_NOT_DIGIT:
4566 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4567 MRRETURN(MATCH_NOMATCH);
4568 break;
4569
4570 case OP_DIGIT:
4571 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4572 MRRETURN(MATCH_NOMATCH);
4573 break;
4574
4575 case OP_NOT_WHITESPACE:
4576 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4577 MRRETURN(MATCH_NOMATCH);
4578 break;
4579
4580 case OP_WHITESPACE:
4581 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4582 MRRETURN(MATCH_NOMATCH);
4583 break;
4584
4585 case OP_NOT_WORDCHAR:
4586 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4587 MRRETURN(MATCH_NOMATCH);
4588 break;
4589
4590 case OP_WORDCHAR:
4591 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4592 MRRETURN(MATCH_NOMATCH);
4593 break;
4594
4595 default:
4596 RRETURN(PCRE_ERROR_INTERNAL);
4597 }
4598 }
4599 }
4600 else
4601 #endif
4602 /* Not UTF-8 mode */
4603 {
4604 for (fi = min;; fi++)
4605 {
4606 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4608 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4609 if (eptr >= md->end_subject)
4610 {
4611 SCHECK_PARTIAL();
4612 MRRETURN(MATCH_NOMATCH);
4613 }
4614 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4615 MRRETURN(MATCH_NOMATCH);
4616 c = *eptr++;
4617 switch(ctype)
4618 {
4619 case OP_ANY: /* This is the non-NL case */
4620 case OP_ALLANY:
4621 case OP_ANYBYTE:
4622 break;
4623
4624 case OP_ANYNL:
4625 switch(c)
4626 {
4627 default: MRRETURN(MATCH_NOMATCH);
4628 case 0x000d:
4629 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4630 break;
4631
4632 case 0x000a:
4633 break;
4634
4635 case 0x000b:
4636 case 0x000c:
4637 case 0x0085:
4638 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4639 break;
4640 }
4641 break;
4642
4643 case OP_NOT_HSPACE:
4644 switch(c)
4645 {
4646 default: break;
4647 case 0x09: /* HT */
4648 case 0x20: /* SPACE */
4649 case 0xa0: /* NBSP */
4650 MRRETURN(MATCH_NOMATCH);
4651 }
4652 break;
4653
4654 case OP_HSPACE:
4655 switch(c)
4656 {
4657 default: MRRETURN(MATCH_NOMATCH);
4658 case 0x09: /* HT */
4659 case 0x20: /* SPACE */
4660 case 0xa0: /* NBSP */
4661 break;
4662 }
4663 break;
4664
4665 case OP_NOT_VSPACE:
4666 switch(c)
4667 {
4668 default: break;
4669 case 0x0a: /* LF */
4670 case 0x0b: /* VT */
4671 case 0x0c: /* FF */
4672 case 0x0d: /* CR */
4673 case 0x85: /* NEL */
4674 MRRETURN(MATCH_NOMATCH);
4675 }
4676 break;
4677
4678 case OP_VSPACE:
4679 switch(c)
4680 {
4681 default: MRRETURN(MATCH_NOMATCH);
4682 case 0x0a: /* LF */
4683 case 0x0b: /* VT */
4684 case 0x0c: /* FF */
4685 case 0x0d: /* CR */
4686 case 0x85: /* NEL */
4687 break;
4688 }
4689 break;
4690
4691 case OP_NOT_DIGIT:
4692 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4693 break;
4694
4695 case OP_DIGIT:
4696 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4697 break;
4698
4699 case OP_NOT_WHITESPACE:
4700 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4701 break;
4702
4703 case OP_WHITESPACE:
4704 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4705 break;
4706
4707 case OP_NOT_WORDCHAR:
4708 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4709 break;
4710
4711 case OP_WORDCHAR:
4712 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4713 break;
4714
4715 default:
4716 RRETURN(PCRE_ERROR_INTERNAL);
4717 }
4718 }
4719 }
4720 /* Control never gets here */
4721 }
4722
4723 /* If maximizing, it is worth using inline code for speed, doing the type
4724 test once at the start (i.e. keep it out of the loop). Again, keep the
4725 UTF-8 and UCP stuff separate. */
4726
4727 else
4728 {
4729 pp = eptr; /* Remember where we started */
4730
4731 #ifdef SUPPORT_UCP
4732 if (prop_type >= 0)
4733 {
4734 switch(prop_type)
4735 {
4736 case PT_ANY:
4737 for (i = min; i < max; i++)
4738 {
4739 int len = 1;
4740 if (eptr >= md->end_subject)
4741 {
4742 SCHECK_PARTIAL();
4743 break;
4744 }
4745 GETCHARLENTEST(c, eptr, len);
4746 if (prop_fail_result) break;
4747 eptr+= len;
4748 }
4749 break;
4750
4751 case PT_LAMP:
4752 for (i = min; i < max; i++)
4753 {
4754 int len = 1;
4755 if (eptr >= md->end_subject)
4756 {
4757 SCHECK_PARTIAL();
4758 break;
4759 }
4760 GETCHARLENTEST(c, eptr, len);
4761 prop_chartype = UCD_CHARTYPE(c);
4762 if ((prop_chartype == ucp_Lu ||
4763 prop_chartype == ucp_Ll ||
4764 prop_chartype == ucp_Lt) == prop_fail_result)
4765 break;
4766 eptr+= len;
4767 }
4768 break;
4769
4770 case PT_GC:
4771 for (i = min; i < max; i++)
4772 {
4773 int len = 1;
4774 if (eptr >= md->end_subject)
4775 {
4776 SCHECK_PARTIAL();
4777 break;
4778 }
4779 GETCHARLENTEST(c, eptr, len);
4780 prop_category = UCD_CATEGORY(c);
4781 if ((prop_category == prop_value) == prop_fail_result)
4782 break;
4783 eptr+= len;
4784 }
4785 break;
4786
4787 case PT_PC:
4788 for (i = min; i < max; i++)
4789 {
4790 int len = 1;
4791 if (eptr >= md->end_subject)
4792 {
4793 SCHECK_PARTIAL();
4794 break;
4795 }
4796 GETCHARLENTEST(c, eptr, len);
4797 prop_chartype = UCD_CHARTYPE(c);
4798 if ((prop_chartype == prop_value) == prop_fail_result)
4799 break;
4800 eptr+= len;
4801 }
4802 break;
4803
4804 case PT_SC:
4805 for (i = min; i < max; i++)
4806 {
4807 int len = 1;
4808 if (eptr >= md->end_subject)
4809 {
4810 SCHECK_PARTIAL();
4811 break;
4812 }
4813 GETCHARLENTEST(c, eptr, len);
4814 prop_script = UCD_SCRIPT(c);
4815 if ((prop_script == prop_value) == prop_fail_result)
4816 break;
4817 eptr+= len;
4818 }
4819 break;
4820
4821 case PT_ALNUM:
4822 for (i = min; i < max; i++)
4823 {
4824 int len = 1;
4825 if (eptr >= md->end_subject)
4826 {
4827 SCHECK_PARTIAL();
4828 break;
4829 }
4830 GETCHARLENTEST(c, eptr, len);
4831 prop_category = UCD_CATEGORY(c);
4832 if ((prop_category == ucp_L || prop_category == ucp_N)
4833 == prop_fail_result)
4834 break;
4835 eptr+= len;
4836 }
4837 break;
4838
4839 case PT_SPACE: /* Perl space */
4840 for (i = min; i < max; i++)
4841 {
4842 int len = 1;
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 break;
4847 }
4848 GETCHARLENTEST(c, eptr, len);
4849 prop_category = UCD_CATEGORY(c);
4850 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4851 c == CHAR_FF || c == CHAR_CR)
4852 == prop_fail_result)
4853 break;
4854 eptr+= len;
4855 }
4856 break;
4857
4858 case PT_PXSPACE: /* POSIX space */
4859 for (i = min; i < max; i++)
4860 {
4861 int len = 1;
4862 if (eptr >= md->end_subject)
4863 {
4864 SCHECK_PARTIAL();
4865 break;
4866 }
4867 GETCHARLENTEST(c, eptr, len);
4868 prop_category = UCD_CATEGORY(c);
4869 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4870 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4871 == prop_fail_result)
4872 break;
4873 eptr+= len;
4874 }
4875 break;
4876
4877 case PT_WORD:
4878 for (i = min; i < max; i++)
4879 {
4880 int len = 1;
4881 if (eptr >= md->end_subject)
4882 {
4883 SCHECK_PARTIAL();
4884 break;
4885 }
4886 GETCHARLENTEST(c, eptr, len);
4887 prop_category = UCD_CATEGORY(c);
4888 if ((prop_category == ucp_L || prop_category == ucp_N ||
4889 c == CHAR_UNDERSCORE) == prop_fail_result)
4890 break;
4891 eptr+= len;
4892 }
4893 break;
4894
4895 default:
4896 RRETURN(PCRE_ERROR_INTERNAL);
4897 }
4898
4899 /* eptr is now past the end of the maximum run */
4900
4901 if (possessive) continue;
4902 for(;;)
4903 {
4904 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4906 if (eptr-- == pp) break; /* Stop if tried at original pos */
4907 if (utf8) BACKCHAR(eptr);
4908 }
4909 }
4910
4911 /* Match extended Unicode sequences. We will get here only if the
4912 support is in the binary; otherwise a compile-time error occurs. */
4913
4914 else if (ctype == OP_EXTUNI)
4915 {
4916 for (i = min; i < max; i++)
4917 {
4918 if (eptr >= md->end_subject)
4919 {
4920 SCHECK_PARTIAL();
4921 break;
4922 }
4923 GETCHARINCTEST(c, eptr);
4924 prop_category = UCD_CATEGORY(c);
4925 if (prop_category == ucp_M) break;
4926 while (eptr < md->end_subject)
4927 {
4928 int len = 1;
4929 if (!utf8) c = *eptr; else
4930 {
4931 GETCHARLEN(c, eptr, len);
4932 }
4933 prop_category = UCD_CATEGORY(c);
4934 if (prop_category != ucp_M) break;
4935 eptr += len;
4936 }
4937 }
4938
4939 /* eptr is now past the end of the maximum run */
4940
4941 if (possessive) continue;
4942
4943 for(;;)
4944 {
4945 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4947 if (eptr-- == pp) break; /* Stop if tried at original pos */
4948 for (;;) /* Move back over one extended */
4949 {
4950 int len = 1;
4951 if (!utf8) c = *eptr; else
4952 {
4953 BACKCHAR(eptr);
4954 GETCHARLEN(c, eptr, len);
4955 }
4956 prop_category = UCD_CATEGORY(c);
4957 if (prop_category != ucp_M) break;
4958 eptr--;
4959 }
4960 }
4961 }
4962
4963 else
4964 #endif /* SUPPORT_UCP */
4965
4966 #ifdef SUPPORT_UTF8
4967 /* UTF-8 mode */
4968
4969 if (utf8)
4970 {
4971 switch(ctype)
4972 {
4973 case OP_ANY:
4974 if (max < INT_MAX)
4975 {
4976 for (i = min; i < max; i++)
4977 {
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 break;
4982 }
4983 if (IS_NEWLINE(eptr)) break;
4984 eptr++;
4985 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4986 }
4987 }
4988
4989 /* Handle unlimited UTF-8 repeat */
4990
4991 else
4992 {
4993 for (i = min; i < max; i++)
4994 {
4995 if (eptr >= md->end_subject)
4996 {
4997 SCHECK_PARTIAL();
4998 break;
4999 }
5000 if (IS_NEWLINE(eptr)) break;
5001 eptr++;
5002 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5003 }
5004 }
5005 break;
5006
5007 case OP_ALLANY:
5008 if (max < INT_MAX)
5009 {
5010 for (i = min; i < max; i++)
5011 {
5012 if (eptr >= md->end_subject)
5013 {
5014 SCHECK_PARTIAL();
5015 break;
5016 }
5017 eptr++;
5018 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5019 }
5020 }
5021 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5022 break;
5023
5024 /* The byte case is the same as non-UTF8 */
5025
5026 case OP_ANYBYTE:
5027 c = max - min;
5028 if (c > (unsigned int)(md->end_subject - eptr))
5029 {
5030 eptr = md->end_subject;
5031 SCHECK_PARTIAL();
5032 }
5033 else eptr += c;
5034 break;
5035
5036 case OP_ANYNL:
5037 for (i = min; i < max; i++)
5038 {
5039 int len = 1;
5040 if (eptr >= md->end_subject)
5041 {
5042 SCHECK_PARTIAL();
5043 break;
5044 }
5045 GETCHARLEN(c, eptr, len);
5046 if (c == 0x000d)
5047 {
5048 if (++eptr >= md->end_subject) break;
5049 if (*eptr == 0x000a) eptr++;
5050 }
5051 else
5052 {
5053 if (c != 0x000a &&
5054 (md->bsr_anycrlf ||
5055 (c != 0x000b && c != 0x000c &&
5056 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5057 break;
5058 eptr += len;
5059 }
5060 }
5061 break;
5062
5063 case OP_NOT_HSPACE:
5064 case OP_HSPACE:
5065 for (i = min; i < max; i++)
5066 {
5067 BOOL gotspace;
5068 int len = 1;
5069 if (eptr >= md->end_subject)
5070 {
5071 SCHECK_PARTIAL();
5072 break;
5073 }
5074 GETCHARLEN(c, eptr, len);
5075 switch(c)
5076 {
5077 default: gotspace = FALSE; break;
5078 case 0x09: /* HT */
5079 case 0x20: /* SPACE */
5080 case 0xa0: /* NBSP */
5081 case 0x1680: /* OGHAM SPACE MARK */
5082 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5083 case 0x2000: /* EN QUAD */
5084 case 0x2001: /* EM QUAD */
5085 case 0x2002: /* EN SPACE */
5086 case 0x2003: /* EM SPACE */
5087 case 0x2004: /* THREE-PER-EM SPACE */
5088 case 0x2005: /* FOUR-PER-EM SPACE */
5089 case 0x2006: /* SIX-PER-EM SPACE */
5090 case 0x2007: /* FIGURE SPACE */
5091 case 0x2008: /* PUNCTUATION SPACE */
5092 case 0x2009: /* THIN SPACE */
5093 case 0x200A: /* HAIR SPACE */
5094 case 0x202f: /* NARROW NO-BREAK SPACE */
5095 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5096 case 0x3000: /* IDEOGRAPHIC SPACE */
5097 gotspace = TRUE;
5098 break;
5099 }
5100 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5101 eptr += len;
5102 }
5103 break;
5104
5105 case OP_NOT_VSPACE:
5106 case OP_VSPACE:
5107 for (i = min; i < max; i++)
5108 {
5109 BOOL gotspace;
5110 int len = 1;
5111 if (eptr >= md->end_subject)
5112 {
5113 SCHECK_PARTIAL();
5114 break;
5115 }
5116 GETCHARLEN(c, eptr, len);
5117 switch(c)
5118 {
5119 default: gotspace = FALSE; break;
5120 case 0x0a: /* LF */
5121 case 0x0b: /* VT */
5122 case 0x0c: /* FF */
5123 case 0x0d: /* CR */
5124 case 0x85: /* NEL */
5125 case 0x2028: /* LINE SEPARATOR */
5126 case 0x2029: /* PARAGRAPH SEPARATOR */
5127 gotspace = TRUE;
5128 break;
5129 }
5130 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5131 eptr += len;
5132 }
5133 break;
5134
5135 case OP_NOT_DIGIT:
5136 for (i = min; i < max; i++)
5137 {
5138 int len = 1;
5139 if (eptr >= md->end_subject)
5140 {
5141 SCHECK_PARTIAL();
5142 break;
5143 }
5144 GETCHARLEN(c, eptr, len);
5145 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5146 eptr+= len;
5147 }
5148 break;
5149
5150 case OP_DIGIT:
5151 for (i = min; i < max; i++)
5152 {
5153 int len = 1;
5154 if (eptr >= md->end_subject)
5155 {
5156 SCHECK_PARTIAL();
5157 break;
5158 }
5159 GETCHARLEN(c, eptr, len);
5160 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5161 eptr+= len;
5162 }
5163 break;
5164
5165 case OP_NOT_WHITESPACE:
5166 for (i = min; i < max; i++)
5167 {
5168 int len = 1;
5169 if (eptr >= md->end_subject)
5170 {
5171 SCHECK_PARTIAL();
5172 break;
5173 }
5174 GETCHARLEN(c, eptr, len);
5175 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5176 eptr+= len;
5177 }
5178 break;
5179
5180 case OP_WHITESPACE:
5181 for (i = min; i < max; i++)
5182 {
5183 int len = 1;
5184 if (eptr >= md->end_subject)
5185 {
5186 SCHECK_PARTIAL();
5187 break;
5188 }
5189 GETCHARLEN(c, eptr, len);
5190 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5191 eptr+= len;
5192 }
5193 break;
5194
5195 case OP_NOT_WORDCHAR:
5196 for (i = min; i < max; i++)
5197 {
5198 int len = 1;
5199 if (eptr >= md->end_subject)
5200 {
5201 SCHECK_PARTIAL();
5202 break;
5203 }
5204 GETCHARLEN(c, eptr, len);
5205 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5206 eptr+= len;
5207 }
5208 break;
5209
5210 case OP_WORDCHAR:
5211 for (i = min; i < max; i++)
5212 {
5213 int len = 1;
5214 if (eptr >= md->end_subject)
5215 {
5216 SCHECK_PARTIAL();
5217 break;
5218 }
5219 GETCHARLEN(c, eptr, len);
5220 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5221 eptr+= len;
5222 }
5223 break;
5224
5225 default:
5226 RRETURN(PCRE_ERROR_INTERNAL);
5227 }
5228
5229 /* eptr is now past the end of the maximum run */
5230
5231 if (possessive) continue;
5232 for(;;)
5233 {
5234 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5235 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5236 if (eptr-- == pp) break; /* Stop if tried at original pos */
5237 BACKCHAR(eptr);
5238 }
5239 }
5240 else
5241 #endif /* SUPPORT_UTF8 */
5242
5243 /* Not UTF-8 mode */
5244 {
5245 switch(ctype)
5246 {
5247 case OP_ANY:
5248 for (i = min; i < max; i++)
5249 {
5250 if (eptr >= md->end_subject)
5251 {
5252 SCHECK_PARTIAL();
5253 break;
5254 }
5255 if (IS_NEWLINE(eptr)) break;
5256 eptr++;
5257 }
5258 break;
5259
5260 case OP_ALLANY:
5261 case OP_ANYBYTE:
5262 c = max - min;
5263 if (c > (unsigned int)(md->end_subject - eptr))
5264 {
5265 eptr = md->end_subject;
5266 SCHECK_PARTIAL();
5267 }
5268 else eptr += c;
5269 break;
5270
5271 case OP_ANYNL:
5272 for (i = min; i < max; i++)
5273 {
5274 if (eptr >= md->end_subject)
5275 {
5276 SCHECK_PARTIAL();
5277 break;
5278 }
5279 c = *eptr;
5280 if (c == 0x000d)
5281 {
5282 if (++eptr >= md->end_subject) break;
5283 if (*eptr == 0x000a) eptr++;
5284 }
5285 else
5286 {
5287 if (c != 0x000a &&
5288 (md->bsr_anycrlf ||
5289 (c != 0x000b && c != 0x000c && c != 0x0085)))
5290 break;
5291 eptr++;
5292 }
5293 }
5294 break;
5295
5296 case OP_NOT_HSPACE:
5297 for (i = min; i < max; i++)
5298 {
5299 if (eptr >= md->end_subject)
5300 {
5301 SCHECK_PARTIAL();
5302 break;
5303 }
5304 c = *eptr;
5305 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5306 eptr++;
5307 }
5308 break;
5309
5310 case OP_HSPACE:
5311 for (i = min; i < max; i++)
5312 {
5313 if (eptr >= md->end_subject)
5314 {
5315 SCHECK_PARTIAL();
5316 break;
5317 }
5318 c = *eptr;
5319 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5320 eptr++;
5321 }
5322 break;
5323
5324 case OP_NOT_VSPACE:
5325 for (i = min; i < max; i++)
5326 {
5327 if (eptr >= md->end_subject)
5328 {
5329 SCHECK_PARTIAL();
5330 break;
5331 }
5332 c = *eptr;
5333 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5334 break;
5335 eptr++;
5336 }
5337 break;
5338
5339 case OP_VSPACE:
5340 for (i = min; i < max; i++)
5341 {
5342 if (eptr >= md->end_subject)
5343 {
5344 SCHECK_PARTIAL();
5345 break;
5346 }
5347 c = *eptr;
5348 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5349 break;
5350 eptr++;
5351 }
5352 break;
5353
5354 case OP_NOT_DIGIT:
5355 for (i = min; i < max; i++)
5356 {
5357 if (eptr >= md->end_subject)
5358 {
5359 SCHECK_PARTIAL();
5360 break;
5361 }
5362 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5363 eptr++;
5364 }
5365 break;
5366
5367 case OP_DIGIT:
5368 for (i = min; i < max; i++)
5369 {
5370 if (eptr >= md->end_subject)
5371 {
5372 SCHECK_PARTIAL();
5373 break;
5374 }
5375 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5376 eptr++;
5377 }
5378 break;
5379
5380 case OP_NOT_WHITESPACE:
5381 for (i = min; i < max; i++)
5382 {
5383 if (eptr >= md->end_subject)
5384 {
5385 SCHECK_PARTIAL();
5386 break;
5387 }
5388 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5389 eptr++;
5390 }
5391 break;
5392
5393 case OP_WHITESPACE:
5394 for (i = min; i < max; i++)
5395 {
5396 if (eptr >= md->end_subject)
5397 {
5398 SCHECK_PARTIAL();
5399 break;
5400 }
5401 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5402 eptr++;
5403 }
5404 break;
5405
5406 case OP_NOT_WORDCHAR:
5407 for (i = min; i < max; i++)
5408 {
5409 if (eptr >= md->end_subject)
5410 {
5411 SCHECK_PARTIAL();
5412 break;
5413 }
5414 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5415 eptr++;
5416 }
5417 break;
5418
5419 case OP_WORDCHAR:
5420 for (i = min; i < max; i++)
5421 {
5422 if (eptr >= md->end_subject)
5423 {
5424 SCHECK_PARTIAL();
5425 break;
5426 }
5427 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5428 eptr++;
5429 }
5430 break;
5431
5432 default:
5433 RRETURN(PCRE_ERROR_INTERNAL);
5434 }
5435
5436 /* eptr is now past the end of the maximum run */
5437
5438 if (possessive) continue;
5439 while (eptr >= pp)
5440 {
5441 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5442 eptr--;
5443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5444 }
5445 }
5446
5447 /* Get here if we can't make it match with any permitted repetitions */
5448
5449 MRRETURN(MATCH_NOMATCH);
5450 }
5451 /* Control never gets here */
5452
5453 /* There's been some horrible disaster. Arrival here can only mean there is
5454 something seriously wrong in the code above or the OP_xxx definitions. */
5455
5456 default:
5457 DPRINTF(("Unknown opcode %d\n", *ecode));
5458 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5459 }
5460
5461 /* Do not stick any code in here without much thought; it is assumed
5462 that "continue" in the code above comes out to here to repeat the main
5463 loop. */
5464
5465 } /* End of main loop */
5466 /* Control never reaches here */
5467
5468
5469 /* When compiling to use the heap rather than the stack for recursive calls to
5470 match(), the RRETURN() macro jumps here. The number that is saved in
5471 frame->Xwhere indicates which label we actually want to return to. */
5472
5473 #ifdef NO_RECURSE
5474 #define LBL(val) case val: goto L_RM##val;
5475 HEAP_RETURN:
5476 switch (frame->Xwhere)
5477 {
5478 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5479 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5480 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5481 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5482 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5483 #ifdef SUPPORT_UTF8
5484 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5485 LBL(32) LBL(34) LBL(42) LBL(46)
5486 #ifdef SUPPORT_UCP
5487 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5488 LBL(59) LBL(60) LBL(61) LBL(62)
5489 #endif /* SUPPORT_UCP */
5490 #endif /* SUPPORT_UTF8 */
5491 default:
5492 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5493 return PCRE_ERROR_INTERNAL;
5494 }
5495 #undef LBL
5496 #endif /* NO_RECURSE */
5497 }
5498
5499
5500 /***************************************************************************
5501 ****************************************************************************
5502 RECURSION IN THE match() FUNCTION
5503
5504 Undefine all the macros that were defined above to handle this. */
5505
5506 #ifdef NO_RECURSE /* heap-based recursion build: drop the macros defined above for match() */
5507 #undef eptr
5508 #undef ecode
5509 #undef mstart
5510 #undef offset_top
5511 #undef ims
5512 #undef eptrb
5513 #undef flags
5514
5515 #undef callpat
5516 #undef charptr
5517 #undef data
5518 #undef next
5519 #undef pp
5520 #undef prev
5521 #undef saved_eptr
5522
5523 #undef new_recursive
5524
5525 #undef cur_is_word
5526 #undef condition
5527 #undef prev_is_word
5528
5529 #undef original_ims
5530
5531 #undef ctype
5532 #undef length
5533 #undef max
5534 #undef min
5535 #undef number
5536 #undef offset
5537 #undef op
5538 #undef save_capture_last
5539 #undef save_offset1
5540 #undef save_offset2
5541 #undef save_offset3
5542 #undef stacksave
5543
5544 #undef newptrb
5545
5546 #endif /* NO_RECURSE */
5547
5548 /* These two are defined as macros in both cases (NO_RECURSE set or not), */
5549 /* so they are undefined outside the conditional block above. */
5550 #undef fc
5551 #undef fi
5552
5553 /***************************************************************************
5554 ***************************************************************************/
5555
5556
5557
5558 /*************************************************
5559 * Execute a Regular Expression *
5560 *************************************************/
5561
5562 /* This function applies a compiled re to a subject string and picks out
5563 portions of the string if it matches. Two elements in the vector are set for
5564 each substring: the offsets to the start and end of the substring.
5565
5566 Arguments:
5567 argument_re points to the compiled expression
5568 extra_data points to extra data or is NULL
5569 subject points to the subject string
5570 length length of subject string (may contain binary zeros)
5571 start_offset where to start in the subject string
5572 options option bits
5573 offsets points to a vector of ints to be filled in with offsets
5574 offsetcount the number of elements in the vector
5575
5576 Returns: > 0 => success; value is the number of elements filled in
5577 = 0 => success, but offsets is not big enough
5578 -1 => failed to match
5579 < -1 => some kind of unexpected problem
5580 */
5581
5582 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5583 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5584 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5585 int offsetcount)
5586 {
5587 int rc, resetcount, ocount;
5588 int first_byte = -1;
5589 int req_byte = -1;
5590 int req_byte2 = -1;
5591 int newline;
5592 unsigned long int ims;
5593 BOOL using_temporary_offsets = FALSE;
5594 BOOL anchored;
5595 BOOL startline;
5596 BOOL firstline;
5597 BOOL first_byte_caseless = FALSE;
5598 BOOL req_byte_caseless = FALSE;
5599 BOOL utf8;
5600 match_data match_block;
5601 match_data *md = &match_block;
5602 const uschar *tables;
5603 const uschar *start_bits = NULL;
5604 USPTR start_match = (USPTR)subject + start_offset;
5605 USPTR end_subject;
5606 USPTR start_partial = NULL;
5607 USPTR req_byte_ptr = start_match - 1;
5608
5609 pcre_study_data internal_study;
5610 const pcre_study_data *study;
5611
5612 real_pcre internal_re;
5613 const real_pcre *external_re = (const real_pcre *)argument_re;
5614 const real_pcre *re = external_re;
5615
5616 /* Plausibility checks */
5617
5618 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5619 if (re == NULL || subject == NULL ||
5620 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5621 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5622
5623 /* This information is for finding all the numbers associated with a given
5624 name, for condition testing. */
5625
5626 md->name_table = (uschar *)re + re->name_table_offset;
5627 md->name_count = re->name_count;
5628 md->name_entry_size = re->name_entry_size;
5629
5630 /* Fish out the optional data from the extra_data structure, first setting
5631 the default values. */
5632
5633 study = NULL;
5634 md->match_limit = MATCH_LIMIT;
5635 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5636 md->callout_data = NULL;
5637
5638 /* The table pointer is always in native byte order. */
5639
5640 tables = external_re->tables;
5641
5642 if (extra_data != NULL)
5643 {
5644 register unsigned int flags = extra_data->flags;
5645 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5646 study = (const pcre_study_data *)extra_data->study_data;
5647 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5648 md->match_limit = extra_data->match_limit;
5649 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5650 md->match_limit_recursion = extra_data->match_limit_recursion;
5651 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5652 md->callout_data = extra_data->callout_data;
5653 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5654 }
5655
5656 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5657 is a feature that makes it possible to save compiled regexes and re-use them
5658 in other programs later. */
5659
5660 if (tables == NULL) tables = _pcre_default_tables;
5661
5662 /* Check that the first field in the block is the magic number. If it is not,
5663 test for a regex that was compiled on a host of opposite endianness. If this is
5664 the case, flipped values are put in internal_re and internal_study if there was
5665 study data too. */
5666
5667 if (re->magic_number != MAGIC_NUMBER)
5668 {
5669 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5670 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5671 if (study != NULL) study = &internal_study;
5672 }
5673
5674 /* Set up other data */
5675
5676 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5677 startline = (re->flags & PCRE_STARTLINE) != 0;
5678 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5679
5680 /* The code starts after the real_pcre block and the capture name table. */
5681
5682 md->start_code = (const uschar *)external_re + re->name_table_offset +
5683 re->name_count * re->name_entry_size;
5684
5685 md->start_subject = (USPTR)subject;
5686 md->start_offset = start_offset;
5687 md->end_subject = md->start_subject + length;
5688 end_subject = md->end_subject;
5689
5690 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5691 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5692 md->use_ucp = (re->options & PCRE_UCP) != 0;
5693 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5694
5695 md->notbol = (options & PCRE_NOTBOL) != 0;
5696 md->noteol = (options & PCRE_NOTEOL) != 0;
5697 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5698 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5699 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5700 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5701 md->hitend = FALSE;
5702 md->mark = NULL; /* In case never set */
5703
5704 md->recursive = NULL; /* No recursion at top level */
5705
5706 md->lcc = tables + lcc_offset;
5707 md->ctypes = tables + ctypes_offset;
5708
5709 /* Handle different \R options. */
5710
5711 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5712 {
5713 case 0:
5714 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5715 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5716 else
5717 #ifdef BSR_ANYCRLF
5718 md->bsr_anycrlf = TRUE;
5719 #else
5720 md->bsr_anycrlf = FALSE;
5721 #endif
5722 break;
5723
5724 case PCRE_BSR_ANYCRLF:
5725 md->bsr_anycrlf = TRUE;
5726 break;
5727
5728 case PCRE_BSR_UNICODE:
5729 md->bsr_anycrlf = FALSE;
5730 break;
5731
5732 default: return PCRE_ERROR_BADNEWLINE;
5733 }
5734
5735 /* Handle different types of newline. The three bits give eight cases. If
5736 nothing is set at run time, whatever was used at compile time applies. */
5737
5738 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5739 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5740 {
5741 case 0: newline = NEWLINE; break; /* Compile-time default */
5742 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5743 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5744 case PCRE_NEWLINE_CR+
5745 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5746 case PCRE_NEWLINE_ANY: newline = -1; break;
5747 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5748 default: return PCRE_ERROR_BADNEWLINE;
5749 }
5750
5751 if (newline == -2)
5752 {
5753 md->nltype = NLTYPE_ANYCRLF;
5754 }
5755 else if (newline < 0)
5756 {
5757 md->nltype = NLTYPE_ANY;
5758 }
5759 else
5760 {
5761 md->nltype = NLTYPE_FIXED;
5762 if (newline > 255)
5763 {
5764 md->nllen = 2;
5765 md->nl[0] = (newline >> 8) & 255;
5766 md->nl[1] = newline & 255;
5767 }
5768 else
5769 {
5770 md->nllen = 1;
5771 md->nl[0] = newline;
5772 }
5773 }
5774
5775 /* Partial matching was originally supported only for a restricted set of
5776 regexes; from release 8.00 there are no restrictions, but the bits are still
5777 defined (though never set). So there's no harm in leaving this code. */
5778
5779 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5780 return PCRE_ERROR_BADPARTIAL;
5781
5782 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5783 back the character offset. */
5784
5785 #ifdef SUPPORT_UTF8
5786 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5787 {
5788 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5789 return PCRE_ERROR_BADUTF8;
5790 if (start_offset > 0 && start_offset < length)
5791 {
5792 int tb = ((USPTR)subject)[start_offset];
5793 if (tb > 127)
5794 {
5795 tb &= 0xc0;
5796 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5797 }
5798 }
5799 }
5800 #endif
5801
5802 /* The ims options can vary during the matching as a result of the presence
5803 of (?ims) items in the pattern. They are kept in a local variable so that
5804 restoring at the exit of a group is easy. */
5805
5806 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5807
5808 /* If the expression has got more back references than the offsets supplied can
5809 hold, we get a temporary chunk of working store to use during the matching.
5810 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5811 of 3. */
5812
5813 ocount = offsetcount - (offsetcount % 3);
5814
5815 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5816 {
5817 ocount = re->top_backref * 3 + 3;
5818 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5819 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5820 using_temporary_offsets = TRUE;
5821 DPRINTF(("Got memory to hold back references\n"));
5822 }
5823 else md->offset_vector = offsets;
5824
5825 md->offset_end = ocount;
5826 md->offset_max = (2*ocount)/3;
5827 md->offset_overflow = FALSE;
5828 md->capture_last = -1;
5829
5830 /* Compute the minimum number of offsets that we need to reset each time. Doing
5831 this makes a huge difference to execution time when there aren't many brackets
5832 in the pattern. */
5833
5834 resetcount = 2 + re->top_bracket * 2;
5835 if (resetcount > offsetcount) resetcount = ocount;
5836
5837 /* Reset the working variable associated with each extraction. These should
5838 never be used unless previously set, but they get saved and restored, and so we
5839 initialize them to avoid reading uninitialized locations. */
5840
5841 if (md->offset_vector != NULL)
5842 {
5843 register int *iptr = md->offset_vector + ocount;
5844 register int *iend = iptr - resetcount/2 + 1;
5845 while (--iptr >= iend) *iptr = -1;
5846 }
5847
5848 /* Set up the first character to match, if available. The first_byte value is
5849 never set for an anchored regular expression, but the anchoring may be forced
5850 at run time, so we have to test for anchoring. The first char may be unset for
5851 an unanchored pattern, of course. If there's no first char and the pattern was
5852 studied, there may be a bitmap of possible first characters. */
5853
5854 if (!anchored)
5855 {
5856 if ((re->flags & PCRE_FIRSTSET) != 0)
5857 {
5858 first_byte = re->first_byte & 255;
5859 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5860 first_byte = md->lcc[first_byte];
5861 }
5862 else
5863 if (!startline && study != NULL &&
5864 (study->flags & PCRE_STUDY_MAPPED) != 0)
5865 start_bits = study->start_bits;
5866 }
5867
5868 /* For anchored or unanchored matches, there may be a "last known required
5869 character" set. */
5870
5871 if ((re->flags & PCRE_REQCHSET) != 0)
5872 {
5873 req_byte = re->req_byte & 255;
5874 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5875 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5876 }
5877
5878
5879 /* ==========================================================================*/
5880
5881 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5882 the loop runs just once. */
5883
5884 for(;;)
5885 {
5886 USPTR save_end_subject = end_subject;
5887 USPTR new_start_match;
5888
5889 /* Reset the maximum number of extractions we might see. */
5890
5891 if (md->offset_vector != NULL)
5892 {
5893 register int *iptr = md->offset_vector;
5894 register int *iend = iptr + resetcount;
5895 while (iptr < iend) *iptr++ = -1;
5896 }
5897
5898 /* If firstline is TRUE, the start of the match is constrained to the first
5899 line of a multiline string. That is, the match must be before or at the first
5900 newline. Implement this by temporarily adjusting end_subject so that we stop
5901 scanning at a newline. If the match fails at the newline, later code breaks
5902 this loop. */
5903
5904 if (firstline)
5905 {
5906 USPTR t = start_match;
5907 #ifdef SUPPORT_UTF8
5908 if (utf8)
5909 {
5910 while (t < md->end_subject && !IS_NEWLINE(t))
5911 {
5912 t++;
5913 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5914 }
5915 }
5916 else
5917 #endif
5918 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5919 end_subject = t;
5920 }
5921
5922 /* There are some optimizations that avoid running the match if a known
5923 starting point is not found, or if a known later character is not present.
5924 However, there is an option that disables these, for testing and for ensuring
5925 that all callouts do actually occur. */
5926
5927 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5928 {
5929 /* Advance to a unique first byte if there is one. */
5930
5931 if (first_byte >= 0)
5932 {
5933 if (first_byte_caseless)
5934 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5935 start_match++;
5936 else
5937 while (start_match < end_subject && *start_match != first_byte)
5938 start_match++;
5939 }
5940
5941 /* Or to just after a linebreak for a multiline match */
5942
5943 else if (startline)
5944 {
5945 if (start_match > md->start_subject + start_offset)
5946 {
5947 #ifdef SUPPORT_UTF8
5948 if (utf8)
5949 {
5950 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5951 {
5952 start_match++;
5953 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5954 start_match++;
5955 }
5956 }
5957 else
5958 #endif
5959 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5960 start_match++;
5961
5962 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5963 and we are now at a LF, advance the match position by one more character.
5964 */
5965
5966 if (start_match[-1] == CHAR_CR &&
5967 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5968 start_match < end_subject &&
5969 *start_match == CHAR_NL)
5970 start_match++;
5971 }
5972 }
5973
5974 /* Or to a non-unique first byte after study */
5975
5976 else if (start_bits != NULL)
5977 {
5978 while (start_match < end_subject)
5979 {
5980 register unsigned int c = *start_match;
5981 if ((start_bits[c/8] & (1 << (c&7))) == 0)
5982 {
5983 start_match++;
5984 #ifdef SUPPORT_UTF8
5985 if (utf8)
5986 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5987 start_match++;
5988 #endif
5989 }
5990 else break;
5991 }
5992 }
5993 } /* Starting optimizations */
5994
5995 /* Restore fudged end_subject */
5996
5997 end_subject = save_end_subject;
5998
5999 /* The following two optimizations are disabled for partial matching or if
6000 disabling is explicitly requested. */
6001
6002 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6003 {
6004 /* If the pattern was studied, a minimum subject length may be set. This is
6005 a lower bound; there need not be any string of exactly that length that
6006 matches the pattern. Although the value is, strictly, in characters, we treat
6007 it as bytes to avoid spending too much time in this optimization. */
6008
6009 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6010 (pcre_uint32)(end_subject - start_match) < study->minlength)
6011 {
6012 rc = MATCH_NOMATCH;
6013 break;
6014 }
6015
6016 /* If req_byte is set, we know that that character must appear in the
6017 subject for the match to succeed. If the first character is set, req_byte
6018 must be later in the subject; otherwise the test starts at the match point.
6019 This optimization can save a huge amount of backtracking in patterns with
6020 nested unlimited repeats that aren't going to match. Writing separate code
6021 for cased/caseless versions makes it go faster, as does using an
6022 autoincrement and backing off on a match.
6023
6024 HOWEVER: when the subject string is very, very long, searching to its end
6025 can take a long time, and give bad performance on quite ordinary patterns.
6026 This showed up when somebody was matching something like /^\d+C/ on a
6027 32-megabyte string... so we don't do this when the string is sufficiently
6028 long. */
6029
6030 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6031 {
6032 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6033
6034 /* We don't need to repeat the search if we haven't yet reached the
6035 place we found it at last time. */
6036
6037 if (p > req_byte_ptr)
6038 {
6039 if (req_byte_caseless)
6040 {
6041 while (p < end_subject)
6042 {
6043 register int pp = *p++;
6044 if (pp == req_byte || pp == req_byte2) { p--; break; }
6045 }
6046 }
6047 else
6048 {
6049 while (p < end_subject)
6050 {
6051 if (*p++ == req_byte) { p--; break; }
6052 }
6053 }
6054
6055 /* If we can't find the required character, break the matching loop,
6056 forcing a match failure. */
6057
6058 if (p >= end_subject)
6059 {
6060 rc = MATCH_NOMATCH;
6061 break;
6062 }
6063
6064 /* If we have found the required character, save the point where we
6065 found it, so that we don't search again next time round the loop if
6066 the start hasn't passed this character yet. */
6067
6068 req_byte_ptr = p;
6069 }
6070 }
6071 }
6072
6073 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6074 printf(">>>> Match against: ");
6075 pchars(start_match, end_subject - start_match, TRUE, md);
6076 printf("\n");
6077 #endif
6078
6079 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6080 first starting point for which a partial match was found. */
6081
6082 md->start_match_ptr = start_match;
6083 md->start_used_ptr = start_match;
6084 md->match_call_count = 0;
6085 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6086 0, 0);
6087 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6088
6089 switch(rc)
6090 {
6091 /* SKIP passes back the next starting point explicitly, but if it is the
6092 same as the start of the attempt we have just made, treat it as NOMATCH. */
6093
6094 case MATCH_SKIP:
6095 if (md->start_match_ptr != start_match)
6096 {
6097 new_start_match = md->start_match_ptr;
6098 break;
6099 }
6100 /* Fall through */
6101
6102 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6103 the SKIP's arg was not found. We also treat this as NOMATCH. */
6104
6105 case MATCH_SKIP_ARG:
6106 /* Fall through */
6107
6108 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6109 exactly like PRUNE. */
6110
6111 case MATCH_NOMATCH:
6112 case MATCH_PRUNE:
6113 case MATCH_THEN:
6114 new_start_match = start_match + 1;
6115 #ifdef SUPPORT_UTF8
6116 if (utf8)
6117 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6118 new_start_match++;
6119 #endif
6120 break;
6121
6122 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6123
6124 case MATCH_COMMIT:
6125 rc = MATCH_NOMATCH;
6126 goto ENDLOOP;
6127
6128 /* Any other return is either a match, or some kind of error. */
6129
6130 default:
6131 goto ENDLOOP;
6132 }
6133
6134 /* Control reaches here for the various types of "no match at this point"
6135 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6136
6137 rc = MATCH_NOMATCH;
6138
6139 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6140 newline in the subject (though it may continue over the newline). Therefore,
6141 if we have just failed to match, starting at a newline, do not continue. */
6142
6143 if (firstline && IS_NEWLINE(start_match)) break;
6144
6145 /* Advance to new matching position */
6146
6147 start_match = new_start_match;
6148
6149 /* Break the loop if the pattern is anchored or if we have passed the end of
6150 the subject. */
6151
6152 if (anchored || start_match > end_subject) break;
6153
6154 /* If we have just passed a CR and we are now at a LF, and the pattern does
6155 not contain any explicit matches for \r or \n, and the newline option is CRLF
6156 or ANY or ANYCRLF, advance the match position by one more character. */
6157
6158 if (start_match[-1] == CHAR_CR &&
6159 start_match < end_subject &&
6160 *start_match == CHAR_NL &&
6161 (re->flags & PCRE_HASCRORLF) == 0 &&
6162 (md->nltype == NLTYPE_ANY ||
6163 md->nltype == NLTYPE_ANYCRLF ||
6164 md->nllen == 2))
6165 start_match++;
6166
6167 md->mark = NULL; /* Reset for start of next match attempt */
6168 } /* End of for(;;) "bumpalong" loop */
6169
6170 /* ==========================================================================*/
6171
6172 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6173 conditions is true:
6174
6175 (1) The pattern is anchored or the match was forced to fail by (*COMMIT);
6176
6177 (2) We are past the end of the subject;
6178
6179 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6180 this option requests that a match occur at or before the first newline in
6181 the subject.
6182
6183 When we have a match and the offset vector is big enough to deal with any
6184 backreferences, captured substring offsets will already be set up. In the case
6185 where we had to get some local store to hold offsets for backreference
6186 processing, copy those that we can. In this case there need not be overflow if
6187 certain parts of the pattern were not used, even though there are more
6188 capturing parentheses than vector slots. */
6189
6190 ENDLOOP:
6191
6192 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6193 {
6194 if (using_temporary_offsets)
6195 {
6196 if (offsetcount >= 4)
6197 {
6198 memcpy(offsets + 2, md->offset_vector + 2,
6199 (offsetcount - 2) * sizeof(int));
6200 DPRINTF(("Copied offsets from temporary memory\n"));
6201 }
6202 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6203 DPRINTF(("Freeing temporary memory\n"));
6204 (pcre_free)(md->offset_vector);
6205 }
6206
6207 /* Set the return code to the number of captured strings, or 0 if there are
6208 too many to fit into the vector. */
6209
6210 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6211
6212 /* If there is space, set up the whole thing as substring 0. The value of
6213 md->start_match_ptr might be modified if \K was encountered on the
6214 successful matching path. */
6215
6216 if (offsetcount < 2) rc = 0; else
6217 {
6218 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6219 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6220 }
6221
6222 DPRINTF((">>>> returning %d\n", rc));
6223 goto RETURN_MARK;
6224 }
6225
6226 /* Control gets here if there has been an error, or if the overall match
6227 attempt has failed at all permitted starting positions. */
6228
6229 if (using_temporary_offsets)
6230 {
6231 DPRINTF(("Freeing temporary memory\n"));
6232 (pcre_free)(md->offset_vector);
6233 }
6234
6235 /* For anything other than nomatch or partial match, just return the code. */
6236
6237 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6238 {
6239 DPRINTF((">>>> error: returning %d\n", rc));
6240 return rc;
6241 }
6242
6243 /* Handle partial matches - disable any mark data */
6244
6245 if (start_partial != NULL)
6246 {
6247 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6248 md->mark = NULL;
6249 if (offsetcount > 1)
6250 {
6251 offsets[0] = (int)(start_partial - (USPTR)subject);
6252 offsets[1] = (int)(end_subject - (USPTR)subject);
6253 }
6254 rc = PCRE_ERROR_PARTIAL;
6255 }
6256
6257 /* This is the classic nomatch case */
6258
6259 else
6260 {
6261 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6262 rc = PCRE_ERROR_NOMATCH;
6263 }
6264
6265 /* Return the MARK data if it has been requested. */
6266
6267 RETURN_MARK:
6268
6269 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6270 *(extra_data->mark) = (unsigned char *)(md->mark);
6271 return rc;
6272 }
6273
6274 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5