/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1135 - (show annotations)
Thu Oct 18 18:35:37 2012 UTC (7 years ago) by chpe
File MIME type: text/plain
File size: 212066 byte(s)
pcre32: Fix signed-unsigned compare
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
62
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
65
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
68
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
71
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
74
/* In the part of match() that handles OP_COMMIT, OP_PRUNE, OP_SKIP, OP_THEN
and their _ARG forms, MATCH_COMMIT, MATCH_PRUNE, MATCH_SKIP, MATCH_SKIP_ARG
and MATCH_THEN are the values handed back when the rest of the pattern fails.
Three of them carry an extra value in md->start_match_ptr: MATCH_SKIP the
current subject position, MATCH_SKIP_ARG the name argument of the SKIP, and
MATCH_THEN the address of the THEN opcode. */

75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995)
80 #define MATCH_SKIP (-994)
81 #define MATCH_SKIP_ARG (-993)
82 #define MATCH_THEN (-992)
83
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
87
/* This value is the dimension of the stacksave[] local in match() and of the
Xstacksave[] member of heapframe. */

88 #define REC_STACK_SAVE_MAX 30
89
90 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
91
/* NOTE(review): the code that indexes these two tables is not in this part of
the file. Presumably the six entries line up with the six simple repeat
opcodes (greedy and minimizing forms of star, plus and query) - confirm at the
use sites before changing the order. */

92 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
93 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
94
95 #ifdef PCRE_DEBUG
96 /*************************************************
97 * Debugging function to print chars *
98 *************************************************/
99
100 /* Print a sequence of chars in printable format, stopping at the end of the
101 subject if the requested.
102
103 Arguments:
104 p points to characters
105 length number to print
106 is_subject TRUE if printing from within md->start_subject
107 md pointer to matching data block, if is_subject is TRUE
108
109 Returns: nothing
110 */
111
112 static void
113 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
114 {
115 pcre_uint32 c;
116 BOOL utf = md->utf;
117 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
118 while (length-- > 0)
119 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
120 }
121 #endif
122
123
124
125 /*************************************************
126 * Match a back-reference *
127 *************************************************/
128
129 /* Normally, if a back reference hasn't been set, the length that is passed is
130 negative, so the match always fails. However, in JavaScript compatibility mode,
131 the length passed is zero. Note that in caseless UTF-8 mode, the number of
132 subject bytes matched may be different to the number of reference bytes.
133
134 Arguments:
135 offset index into the offset vector
136 eptr pointer into the subject
137 length length of reference to be matched (number of bytes)
138 md points to match data block
139 caseless TRUE if caseless
140
141 Returns: >= 0 the number of subject bytes matched
142 -1 no match
143 -2 partial match; always given if at end subject
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152 BOOL utf = md->utf;
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if reference not set (and not JavaScript compatible - in that
168 case the length is passed as zero). */
169
170 if (length < 0) return -1;
171
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
175
176 if (caseless)
177 {
178 #ifdef SUPPORT_UTF
179 #ifdef SUPPORT_UCP
180 if (utf)
181 {
182 /* Match characters up to the end of the reference. NOTE: the number of
183 data units matched may differ, because in UTF-8 there are some characters
184 whose upper and lower case versions code have different numbers of bytes.
185 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
186 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
187 sequence of two of the latter. It is important, therefore, to check the
188 length along the reference, not along the subject (earlier code did this
189 wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 pcre_uint32 c, d;
195 const ucd_record *ur;
196 if (eptr >= md->end_subject) return -2; /* Partial match */
197 GETCHARINC(c, eptr);
198 GETCHARINC(d, p);
199 ur = GET_UCD(d);
200 if (c != d && c != d + ur->other_case)
201 {
202 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
203 for (;;)
204 {
205 if (c < *pp) return -1;
206 if (c == *pp++) break;
207 }
208 }
209 }
210 }
211 else
212 #endif
213 #endif
214
215 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
216 is no UCP support. */
217 {
218 while (length-- > 0)
219 {
220 pcre_uchar cc, cp;
221 if (eptr >= md->end_subject) return -2; /* Partial match */
222 cc = RAWUCHARTEST(eptr);
223 cp = RAWUCHARTEST(p);
224 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
225 p++;
226 eptr++;
227 }
228 }
229 }
230
231 /* In the caseful case, we can just compare the bytes, whether or not we
232 are in UTF-8 mode. */
233
234 else
235 {
236 while (length-- > 0)
237 {
238 if (eptr >= md->end_subject) return -2; /* Partial match */
239 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
240 }
241 }
242
243 return (int)(eptr - eptr_start);
244 }
245
246
247
248 /***************************************************************************
249 ****************************************************************************
250 RECURSION IN THE match() FUNCTION
251
252 The match() function is highly recursive, though not every recursive call
253 increases the recursive depth. Nevertheless, some regular expressions can cause
254 it to recurse to a great depth. I was writing for Unix, so I just let it call
255 itself recursively. This uses the stack for saving everything that has to be
256 saved for a recursive call. On Unix, the stack can be large, and this works
257 fine.
258
259 It turns out that on some non-Unix-like systems there are problems with
260 programs that use a lot of stack. (This despite the fact that every last chip
261 has oodles of memory these days, and techniques for extending the stack have
262 been known for decades.) So....
263
264 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
265 calls by keeping local variables that need to be preserved in blocks of memory
266 obtained from malloc() instead of on the stack. Macros are used to
267 achieve this so that the actual code doesn't look very different to what it
268 always used to.
269
270 The original heap-recursive code used longjmp(). However, it seems that this
271 can be very slow on some operating systems. Following a suggestion from Stan
272 Switzer, the use of longjmp() has been abolished, at the cost of having to
273 provide a unique number for each call to RMATCH. There is no way of generating
274 a sequence of numbers at compile time in C. I have given them names, to make
275 them stand out more clearly.
276
277 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
278 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
279 tests. Furthermore, not using longjmp() means that local dynamic variables
280 don't have indeterminate values; this has meant that the frame size can be
281 reduced because the result can be "passed back" by straight setting of the
282 variable instead of being passed in the frame.
283 ****************************************************************************
284 ***************************************************************************/
285
286 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
287 below must be updated in sync. */
288
/* Each RMATCH call site passes one of these names as its last argument. The
NO_RECURSE form of RMATCH stores the value in frame->Xwhere and also pastes
the name into a label (L_RM1, L_RM2, ...) placed just after the jump to
HEAP_RECURSE; the stack-based forms of RMATCH ignore the argument. Numbering
starts at 1. */

289 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
290 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
291 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
292 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
293 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
294 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
295 RM61, RM62, RM63, RM64, RM65, RM66 };
296
297 /* These versions of the macros use the stack, as normal. There are debugging
298 versions and production versions. Note that the "rw" argument of RMATCH isn't
299 actually used in this definition. */
300
301 #ifndef NO_RECURSE
/* REGISTER qualifies the eptr and ecode parameters of match(); in this
(stack-recursion) build they really are register variables. */

302 #define REGISTER register
303
/* RMATCH makes a genuine recursive call to match(). Its arguments line up
with match()'s parameters as ra=eptr, rb=ecode, rc=offset_top, rd=md and
re=eptrb; mstart and rdepth are picked up from the enclosing scope, with the
depth incremented by one. The result is left in the variable rrc. RRETURN is
a plain return.

NOTE(review): the debugging forms below expand to a brace-enclosed block and
evaluate RRETURN's argument twice, whereas the production forms are a single
expression/statement. A use such as "if (x) RRETURN(y); else ..." therefore
compiles only in the production form, and an argument with side effects would
behave differently between the two. */

304 #ifdef PCRE_DEBUG
305 #define RMATCH(ra,rb,rc,rd,re,rw) \
306 { \
307 printf("match() called in line %d\n", __LINE__); \
308 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
309 printf("to line %d\n", __LINE__); \
310 }
311 #define RRETURN(ra) \
312 { \
313 printf("match() returned %d from line %d\n", ra, __LINE__); \
314 return ra; \
315 }
316 #else
317 #define RMATCH(ra,rb,rc,rd,re,rw) \
318 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
319 #define RRETURN(ra) return ra
320 #endif
321
322 #else
323
324
325 /* These versions of the macros manage a private stack on the heap. Note that
326 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
327 argument of match(), which never changes. */
328
/* In the heap build REGISTER expands to nothing. */

329 #define REGISTER
330
/* RMATCH for the heap build. It simulates a call to match() by switching to
a new frame and jumping to HEAP_RECURSE:

  . The frame after the current one (frame->Xnextframe) is reused if it
    exists. Otherwise a new one is obtained from PUBL(stack_malloc) and linked
    in; if that fails, RRETURN(PCRE_ERROR_NOMEMORY) is executed.
  . rw is recorded in the current frame's Xwhere.
  . ra, rb, rc and re are stored in the new frame as eptr, ecode, offset_top
    and eptrb; mstart is copied from the current scope and the depth is the
    current frame's Xrdepth plus one. rd is not stored anywhere.
  . The label L_<rw> follows the goto and is where execution continues when
    control comes back to this call site.

The expansion is a brace-enclosed block that declares a local (newframe) and
a label, so each rw value can be used at only one call site in a function. */

331 #define RMATCH(ra,rb,rc,rd,re,rw)\
332 {\
333 heapframe *newframe = frame->Xnextframe;\
334 if (newframe == NULL)\
335 {\
336 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
337 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
338 newframe->Xnextframe = NULL;\
339 frame->Xnextframe = newframe;\
340 }\
341 frame->Xwhere = rw;\
342 newframe->Xeptr = ra;\
343 newframe->Xecode = rb;\
344 newframe->Xmstart = mstart;\
345 newframe->Xoffset_top = rc;\
346 newframe->Xeptrb = re;\
347 newframe->Xrdepth = frame->Xrdepth + 1;\
348 newframe->Xprevframe = frame;\
349 frame = newframe;\
350 DPRINTF(("restarting from line %d\n", __LINE__));\
351 goto HEAP_RECURSE;\
352 L_##rw:\
353 DPRINTF(("jumped back to line %d\n", __LINE__));\
354 }
355
/* RRETURN for the heap build. It steps back to the previous frame. If there
is one, the value is placed in rrc and control goes to HEAP_RETURN (which is
further down the file, outside the part documented here). If there is no
previous frame, this is the outermost level and the value is returned from
match() itself. The frame being left is not freed here; it stays linked via
Xnextframe so that a later RMATCH can reuse it. */

356 #define RRETURN(ra)\
357 {\
358 heapframe *oldframe = frame;\
359 frame = oldframe->Xprevframe;\
360 if (frame != NULL)\
361 {\
362 rrc = ra;\
363 goto HEAP_RETURN;\
364 }\
365 return ra;\
366 }
368
369 /* Structure for remembering the local variables in a private frame */
370
371 typedef struct heapframe {
372 struct heapframe *Xprevframe;
373 struct heapframe *Xnextframe;
374
375 /* Function arguments that may change */
376
377 PCRE_PUCHAR Xeptr;
378 const pcre_uchar *Xecode;
379 PCRE_PUCHAR Xmstart;
380 int Xoffset_top;
381 eptrblock *Xeptrb;
382 unsigned int Xrdepth;
383
384 /* Function local variables */
385
386 PCRE_PUCHAR Xcallpat;
387 #ifdef SUPPORT_UTF
388 PCRE_PUCHAR Xcharptr;
389 #endif
390 PCRE_PUCHAR Xdata;
391 PCRE_PUCHAR Xnext;
392 PCRE_PUCHAR Xpp;
393 PCRE_PUCHAR Xprev;
394 PCRE_PUCHAR Xsaved_eptr;
395
396 recursion_info Xnew_recursive;
397
398 BOOL Xcur_is_word;
399 BOOL Xcondition;
400 BOOL Xprev_is_word;
401
402 #ifdef SUPPORT_UCP
403 int Xprop_type;
404 unsigned int Xprop_value;
405 int Xprop_fail_result;
406 int Xoclength;
407 pcre_uchar Xocchars[6];
408 #endif
409
410 int Xcodelink;
411 int Xctype;
412 unsigned int Xfc;
413 int Xfi;
414 int Xlength;
415 int Xmax;
416 int Xmin;
417 int Xnumber;
418 int Xoffset;
419 int Xop;
420 int Xsave_capture_last;
421 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
422 int Xstacksave[REC_STACK_SAVE_MAX];
423
424 eptrblock Xnewptrb;
425
426 /* Where to jump back to */
427
428 int Xwhere;
429
430 } heapframe;
431
432 #endif
433
434
435 /***************************************************************************
436 ***************************************************************************/
437
438
439
440 /*************************************************
441 * Match from current position *
442 *************************************************/
443
444 /* This function is called recursively in many circumstances. Whenever it
445 returns a negative (error) response, the outer incarnation must also return the
446 same response. */
447
448 /* These macros pack up tests that are used for partial matching, and which
449 appear several times in the code. We set the "hit end" flag if the pointer is
450 at the end of the subject and also past the start of the subject (i.e.
451 something has been matched). For hard partial matching, we then return
452 immediately. The second one is used when we already know we are past the end of
453 the subject. */
454
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0), eptr
has reached md->end_subject and eptr is beyond md->start_used_ptr, set
md->hitend; if md->partial is greater than 1, also leave match() with
PCRE_ERROR_PARTIAL via RRETURN. The macro reads md and eptr from the scope in
which it is used.

NOTE(review): the expansion is a bare "if" statement, not a do { } while (0)
block. Using it as the unbraced body of an outer "if" that has an "else"
would attach that "else" to this inner "if". */

455 #define CHECK_PARTIAL()\
456 if (md->partial != 0 && eptr >= md->end_subject && \
457 eptr > md->start_used_ptr) \
458 { \
459 md->hitend = TRUE; \
460 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
461 }
462
/* SCHECK_PARTIAL: the same as CHECK_PARTIAL but without the comparison
against md->end_subject, for call sites that have already established that
the end of the subject has been reached. The same bare-"if" caveat applies. */

463 #define SCHECK_PARTIAL()\
464 if (md->partial != 0 && eptr > md->start_used_ptr) \
465 { \
466 md->hitend = TRUE; \
467 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
468 }
469
470
471 /* Performance note: It might be tempting to extract commonly used fields from
472 the md structure (e.g. utf, end_subject) into individual variables to improve
473 performance. Tests using gcc on a SPARC disproved this; in the first case, it
474 made performance worse.
475
476 Arguments:
477 eptr pointer to current character in subject
478 ecode pointer to current position in compiled code
479 mstart pointer to the current match start position (can be modified
480 by encountering \K)
481 offset_top current top pointer
482 md pointer to "static" info for the match
483 eptrb pointer to chain of blocks containing eptr at start of
484 brackets - for testing for empty matches
485 rdepth the recursion depth
486
487 Returns: MATCH_MATCH if matched ) these values are >= 0
488 MATCH_NOMATCH if failed to match )
489 a negative MATCH_xxx value for PRUNE, SKIP, etc
490 a negative PCRE_ERROR_xxx value if aborted by an error condition
491 (e.g. stopped by repeated call or recursion limit)
492 */
493
494 static int
495 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
496 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
497 unsigned int rdepth)
498 {
499 /* These variables do not need to be preserved over recursion in this function,
500 so they can be ordinary variables in all cases. Mark some of them with
501 "register" because they are used a lot in loops. */
502
503 register int rrc; /* Returns from recursive calls */
504 register int i; /* Used for loops not involving calls to RMATCH() */
505 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
506 register BOOL utf; /* Local copy of UTF flag for speed */
507
508 BOOL minimize, possessive; /* Quantifier options */
509 BOOL caseless;
510 int condcode;
511
512 /* When recursion is not being used, all "local" variables that have to be
513 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
514 frame on the stack here; subsequent instantiations are obtained from the heap
515 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
516 the top-level on the stack rather than malloc-ing them all gives a performance
517 boost in many cases where there is not much "recursion". */
518
519 #ifdef NO_RECURSE
520 heapframe *frame = (heapframe *)md->match_frames_base;
521
522 /* Copy in the original argument variables */
523
524 frame->Xeptr = eptr;
525 frame->Xecode = ecode;
526 frame->Xmstart = mstart;
527 frame->Xoffset_top = offset_top;
528 frame->Xeptrb = eptrb;
529 frame->Xrdepth = rdepth;
530
531 /* This is where control jumps back to to effect "recursion" */
532
533 HEAP_RECURSE:
534
535 /* Macros make the argument variables come from the current frame */
536
537 #define eptr frame->Xeptr
538 #define ecode frame->Xecode
539 #define mstart frame->Xmstart
540 #define offset_top frame->Xoffset_top
541 #define eptrb frame->Xeptrb
542 #define rdepth frame->Xrdepth
543
544 /* Ditto for the local variables */
545
546 #ifdef SUPPORT_UTF
547 #define charptr frame->Xcharptr
548 #endif
549 #define callpat frame->Xcallpat
550 #define codelink frame->Xcodelink
551 #define data frame->Xdata
552 #define next frame->Xnext
553 #define pp frame->Xpp
554 #define prev frame->Xprev
555 #define saved_eptr frame->Xsaved_eptr
556
557 #define new_recursive frame->Xnew_recursive
558
559 #define cur_is_word frame->Xcur_is_word
560 #define condition frame->Xcondition
561 #define prev_is_word frame->Xprev_is_word
562
563 #ifdef SUPPORT_UCP
564 #define prop_type frame->Xprop_type
565 #define prop_value frame->Xprop_value
566 #define prop_fail_result frame->Xprop_fail_result
567 #define oclength frame->Xoclength
568 #define occhars frame->Xocchars
569 #endif
570
571 #define ctype frame->Xctype
572 #define fc frame->Xfc
573 #define fi frame->Xfi
574 #define length frame->Xlength
575 #define max frame->Xmax
576 #define min frame->Xmin
577 #define number frame->Xnumber
578 #define offset frame->Xoffset
579 #define op frame->Xop
580 #define save_capture_last frame->Xsave_capture_last
581 #define save_offset1 frame->Xsave_offset1
582 #define save_offset2 frame->Xsave_offset2
583 #define save_offset3 frame->Xsave_offset3
584 #define stacksave frame->Xstacksave
585
586 #define newptrb frame->Xnewptrb
587
588 /* When recursion is being used, local variables are allocated on the stack and
589 get preserved during recursion in the normal way. In this environment, fi and
590 i, and fc and c, can be the same variables. */
591
592 #else /* NO_RECURSE not defined */
593 #define fi i
594 #define fc c
595
596 /* Many of the following variables are used only in small blocks of the code.
597 My normal style of coding would have declared them within each of those blocks.
598 However, in order to accommodate the version of this code that uses an external
599 "stack" implemented on the heap, it is easier to declare them all here, so the
600 declarations can be cut out in a block. The only declarations within blocks
601 below are for variables that do not have to be preserved over a recursive call
602 to RMATCH(). */
603
604 #ifdef SUPPORT_UTF
605 const pcre_uchar *charptr;
606 #endif
607 const pcre_uchar *callpat;
608 const pcre_uchar *data;
609 const pcre_uchar *next;
610 PCRE_PUCHAR pp;
611 const pcre_uchar *prev;
612 PCRE_PUCHAR saved_eptr;
613
614 recursion_info new_recursive;
615
616 BOOL cur_is_word;
617 BOOL condition;
618 BOOL prev_is_word;
619
620 #ifdef SUPPORT_UCP
621 int prop_type;
622 unsigned int prop_value;
623 int prop_fail_result;
624 int oclength;
625 pcre_uchar occhars[6];
626 #endif
627
628 int codelink;
629 int ctype;
630 int length;
631 int max;
632 int min;
633 int number;
634 int offset;
635 pcre_uchar op;
636 int save_capture_last;
637 int save_offset1, save_offset2, save_offset3;
638 int stacksave[REC_STACK_SAVE_MAX];
639
640 eptrblock newptrb;
641
642 /* There is a special fudge for calling match() in a way that causes it to
643 measure the size of its basic stack frame when the stack is being used for
644 recursion. The second argument (ecode) being NULL triggers this behaviour. It
645 cannot normally ever be NULL. The return is the negated value of the frame
646 size. */
647
648 if (ecode == NULL)
649 {
650 if (rdepth == 0)
651 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
652 else
653 {
654 int len = (char *)&rdepth - (char *)eptr;
655 return (len > 0)? -len : len;
656 }
657 }
658 #endif /* NO_RECURSE */
659
660 /* To save space on the stack and in the heap frame, I have doubled up on some
661 of the local variables that are used only in localised parts of the code, but
662 still need to be preserved over recursive calls of match(). These macros define
663 the alternative names that are used. */
664
665 #define allow_zero cur_is_word
666 #define cbegroup condition
667 #define code_offset codelink
668 #define condassert condition
669 #define matched_once prev_is_word
670 #define foc number
671 #define save_mark data
672
673 /* These statements are here to stop the compiler complaining about unitialized
674 variables. */
675
676 #ifdef SUPPORT_UCP
677 prop_value = 0;
678 prop_fail_result = 0;
679 #endif
680
681
682 /* This label is used for tail recursion, which is used in a few cases even
683 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
684 used. Thanks to Ian Taylor for noticing this possibility and sending the
685 original patch. */
686
687 TAIL_RECURSE:
688
689 /* OK, now we can get on with the real code of the function. Recursive calls
690 are specified by the macro RMATCH and RRETURN is used to return. When
691 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
692 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
693 defined). However, RMATCH isn't like a function call because it's quite a
694 complicated macro. It has to be used in one particular way. This shouldn't,
695 however, impact performance when true recursion is being used. */
696
697 #ifdef SUPPORT_UTF
698 utf = md->utf; /* Local copy of the flag */
699 #else
700 utf = FALSE;
701 #endif
702
703 /* First check that we haven't called match() too many times, or that we
704 haven't exceeded the recursive call limit. */
705
706 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
707 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
708
709 /* At the start of a group with an unlimited repeat that may match an empty
710 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
711 done this way to save having to use another function argument, which would take
712 up space on the stack. See also MATCH_CONDASSERT below.
713
714 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
715 such remembered pointers, to be checked when we hit the closing ket, in order
716 to break infinite loops that match no characters. When match() is called in
717 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
718 NOT be used with tail recursion, because the memory block that is used is on
719 the stack, so a new one may be required for each match(). */
720
721 if (md->match_function_type == MATCH_CBEGROUP)
722 {
723 newptrb.epb_saved_eptr = eptr;
724 newptrb.epb_prev = eptrb;
725 eptrb = &newptrb;
726 md->match_function_type = 0;
727 }
728
729 /* Now start processing the opcodes. */
730
731 for (;;)
732 {
733 minimize = possessive = FALSE;
734 op = *ecode;
735
736 switch(op)
737 {
738 case OP_MARK:
739 md->nomatch_mark = ecode + 2;
740 md->mark = NULL; /* In case previously set by assertion */
741 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
742 eptrb, RM55);
743 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
744 md->mark == NULL) md->mark = ecode + 2;
745
746 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
747 argument, and we must check whether that argument matches this MARK's
748 argument. It is passed back in md->start_match_ptr (an overloading of that
749 variable). If it does match, we reset that variable to the current subject
750 position and return MATCH_SKIP. Otherwise, pass back the return code
751 unaltered. */
752
753 else if (rrc == MATCH_SKIP_ARG &&
754 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
755 {
756 md->start_match_ptr = eptr;
757 RRETURN(MATCH_SKIP);
758 }
759 RRETURN(rrc);
760
761 case OP_FAIL:
762 RRETURN(MATCH_NOMATCH);
763
764 /* COMMIT overrides PRUNE, SKIP, and THEN */
765
766 case OP_COMMIT:
767 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
768 eptrb, RM52);
769 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
770 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
771 rrc != MATCH_THEN)
772 RRETURN(rrc);
773 RRETURN(MATCH_COMMIT);
774
775 /* PRUNE overrides THEN */
776
777 case OP_PRUNE:
778 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
779 eptrb, RM51);
780 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
781 RRETURN(MATCH_PRUNE);
782
783 case OP_PRUNE_ARG:
784 md->nomatch_mark = ecode + 2;
785 md->mark = NULL; /* In case previously set by assertion */
786 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
787 eptrb, RM56);
788 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
789 md->mark == NULL) md->mark = ecode + 2;
790 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
791 RRETURN(MATCH_PRUNE);
792
793 /* SKIP overrides PRUNE and THEN */
794
795 case OP_SKIP:
796 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
797 eptrb, RM53);
798 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
799 RRETURN(rrc);
800 md->start_match_ptr = eptr; /* Pass back current position */
801 RRETURN(MATCH_SKIP);
802
803 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
804 nomatch_mark. There is a flag that disables this opcode when re-matching a
805 pattern that ended with a SKIP for which there was not a matching MARK. */
806
807 case OP_SKIP_ARG:
808 if (md->ignore_skip_arg)
809 {
810 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
811 break;
812 }
813 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
814 eptrb, RM57);
815 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
816 RRETURN(rrc);
817
818 /* Pass back the current skip name by overloading md->start_match_ptr and
819 returning the special MATCH_SKIP_ARG return code. This will either be
820 caught by a matching MARK, or get to the top, where it causes a rematch
821 with the md->ignore_skip_arg flag set. */
822
823 md->start_match_ptr = ecode + 2;
824 RRETURN(MATCH_SKIP_ARG);
825
826 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
827 the branch in which it occurs can be determined. Overload the start of
828 match pointer to do this. */
829
830 case OP_THEN:
831 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
832 eptrb, RM54);
833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
834 md->start_match_ptr = ecode;
835 RRETURN(MATCH_THEN);
836
837 case OP_THEN_ARG:
838 md->nomatch_mark = ecode + 2;
839 md->mark = NULL; /* In case previously set by assertion */
840 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
841 md, eptrb, RM58);
842 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
843 md->mark == NULL) md->mark = ecode + 2;
844 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
845 md->start_match_ptr = ecode;
846 RRETURN(MATCH_THEN);
847
848 /* Handle an atomic group that does not contain any capturing parentheses.
849 This can be handled like an assertion. Prior to 8.13, all atomic groups
850 were handled this way. In 8.13, the code was changed as below for ONCE, so
851 that backups pass through the group and thereby reset captured values.
852 However, this uses a lot more stack, so in 8.20, atomic groups that do not
853 contain any captures generate OP_ONCE_NC, which can be handled in the old,
854 less stack intensive way.
855
856 Check the alternative branches in turn - the matching won't pass the KET
857 for this kind of subpattern. If any one branch matches, we carry on as at
858 the end of a normal bracket, leaving the subject pointer, but resetting
859 the start-of-match value in case it was changed by \K. */
860
861 case OP_ONCE_NC:
862 prev = ecode;
863 saved_eptr = eptr;
864 save_mark = md->mark;
865 do
866 {
867 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
868 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
869 {
870 mstart = md->start_match_ptr;
871 break;
872 }
873 if (rrc == MATCH_THEN)
874 {
875 next = ecode + GET(ecode,1);
876 if (md->start_match_ptr < next &&
877 (*ecode == OP_ALT || *next == OP_ALT))
878 rrc = MATCH_NOMATCH;
879 }
880
881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
882 ecode += GET(ecode,1);
883 md->mark = save_mark;
884 }
885 while (*ecode == OP_ALT);
886
887 /* If hit the end of the group (which could be repeated), fail */
888
889 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
890
891 /* Continue as from after the group, updating the offsets high water
892 mark, since extracts may have been taken. */
893
894 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
895
896 offset_top = md->end_offset_top;
897 eptr = md->end_match_ptr;
898
899 /* For a non-repeating ket, just continue at this level. This also
900 happens for a repeating ket if no characters were matched in the group.
901 This is the forcible breaking of infinite loops as implemented in Perl
902 5.005. */
903
904 if (*ecode == OP_KET || eptr == saved_eptr)
905 {
906 ecode += 1+LINK_SIZE;
907 break;
908 }
909
910 /* The repeating kets try the rest of the pattern or restart from the
911 preceding bracket, in the appropriate order. The second "call" of match()
912 uses tail recursion, to avoid using another stack frame. */
913
914 if (*ecode == OP_KETRMIN)
915 {
916 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
917 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
918 ecode = prev;
919 goto TAIL_RECURSE;
920 }
921 else /* OP_KETRMAX */
922 {
923 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
925 ecode += 1 + LINK_SIZE;
926 goto TAIL_RECURSE;
927 }
928 /* Control never gets here */
929
930 /* Handle a capturing bracket, other than those that are possessive with an
931 unlimited repeat. If there is space in the offset vector, save the current
932 subject position in the working slot at the top of the vector. We mustn't
933 change the current values of the data slot, because they may be set from a
934 previous iteration of this group, and be referred to by a reference inside
935 the group. A failure to match might occur after the group has succeeded,
936 if something later on doesn't match. For this reason, we need to restore
937 the working value and also the values of the final offsets, in case they
938 were set by a previous iteration of the same bracket.
939
940 If there isn't enough space in the offset vector, treat this as if it were
941 a non-capturing bracket. Don't worry about setting the flag for the error
942 case here; that is handled in the code for KET. */
943
944 case OP_CBRA:
945 case OP_SCBRA:
946 number = GET2(ecode, 1+LINK_SIZE);
947 offset = number << 1;
948
949 #ifdef PCRE_DEBUG
950 printf("start bracket %d\n", number);
951 printf("subject=");
952 pchars(eptr, 16, TRUE, md);
953 printf("\n");
954 #endif
955
956 if (offset < md->offset_max)
957 {
958 save_offset1 = md->offset_vector[offset];
959 save_offset2 = md->offset_vector[offset+1];
960 save_offset3 = md->offset_vector[md->offset_end - number];
961 save_capture_last = md->capture_last;
962 save_mark = md->mark;
963
964 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
965 md->offset_vector[md->offset_end - number] =
966 (int)(eptr - md->start_subject);
967
968 for (;;)
969 {
970 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
971 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
972 eptrb, RM1);
973 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
974
975 /* If we backed up to a THEN, check whether it is within the current
976 branch by comparing the address of the THEN that is passed back with
977 the end of the branch. If it is within the current branch, and the
978 branch is one of two or more alternatives (it either starts or ends
979 with OP_ALT), we have reached the limit of THEN's action, so convert
980 the return code to NOMATCH, which will cause normal backtracking to
981 happen from now on. Otherwise, THEN is passed back to an outer
982 alternative. This implements Perl's treatment of parenthesized groups,
983 where a group not containing | does not affect the current alternative,
984 that is, (X) is NOT the same as (X|(*F)). */
985
986 if (rrc == MATCH_THEN)
987 {
988 next = ecode + GET(ecode,1);
989 if (md->start_match_ptr < next &&
990 (*ecode == OP_ALT || *next == OP_ALT))
991 rrc = MATCH_NOMATCH;
992 }
993
994 /* Anything other than NOMATCH is passed back. */
995
996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
997 md->capture_last = save_capture_last;
998 ecode += GET(ecode, 1);
999 md->mark = save_mark;
1000 if (*ecode != OP_ALT) break;
1001 }
1002
1003 DPRINTF(("bracket %d failed\n", number));
1004 md->offset_vector[offset] = save_offset1;
1005 md->offset_vector[offset+1] = save_offset2;
1006 md->offset_vector[md->offset_end - number] = save_offset3;
1007
1008 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1009
1010 RRETURN(rrc);
1011 }
1012
1013 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1014 as a non-capturing bracket. */
1015
1016 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1017 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1018
1019 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1020
1021 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1022 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1023
1024 /* Non-capturing or atomic group, except for possessive with unlimited
1025 repeat and ONCE group with no captures. Loop for all the alternatives.
1026
1027 When we get to the final alternative within the brackets, we used to return
1028 the result of a recursive call to match() whatever happened so it was
1029 possible to reduce stack usage by turning this into a tail recursion,
1030 except in the case of a possibly empty group. However, now that there is
1031 the possibility of (*THEN) occurring in the final alternative, this
1032 optimization is no longer always possible.
1033
1034 We can optimize if we know there are no (*THEN)s in the pattern; at present
1035 this is the best that can be done.
1036
1037 MATCH_ONCE is returned when the end of an atomic group is successfully
1038 reached, but subsequent matching fails. It passes back up the tree (causing
1039 captured values to be reset) until the original atomic group level is
1040 reached. This is tested by comparing md->once_target with the start of the
1041 group. At this point, the return is converted into MATCH_NOMATCH so that
1042 previous backup points can be taken. */
1043
1044 case OP_ONCE:
1045 case OP_BRA:
1046 case OP_SBRA:
1047 DPRINTF(("start non-capturing bracket\n"));
1048
1049 for (;;)
1050 {
1051 if (op >= OP_SBRA || op == OP_ONCE)
1052 md->match_function_type = MATCH_CBEGROUP;
1053
1054 /* If this is not a possibly empty group, and there are no (*THEN)s in
1055 the pattern, and this is the final alternative, optimize as described
1056 above. */
1057
1058 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1059 {
1060 ecode += PRIV(OP_lengths)[*ecode];
1061 goto TAIL_RECURSE;
1062 }
1063
1064 /* In all other cases, we have to make another call to match(). */
1065
1066 save_mark = md->mark;
1067 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1068 RM2);
1069
1070 /* See comment in the code for capturing groups above about handling
1071 THEN. */
1072
1073 if (rrc == MATCH_THEN)
1074 {
1075 next = ecode + GET(ecode,1);
1076 if (md->start_match_ptr < next &&
1077 (*ecode == OP_ALT || *next == OP_ALT))
1078 rrc = MATCH_NOMATCH;
1079 }
1080
1081 if (rrc != MATCH_NOMATCH)
1082 {
1083 if (rrc == MATCH_ONCE)
1084 {
1085 const pcre_uchar *scode = ecode;
1086 if (*scode != OP_ONCE) /* If not at start, find it */
1087 {
1088 while (*scode == OP_ALT) scode += GET(scode, 1);
1089 scode -= GET(scode, 1);
1090 }
1091 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1092 }
1093 RRETURN(rrc);
1094 }
1095 ecode += GET(ecode, 1);
1096 md->mark = save_mark;
1097 if (*ecode != OP_ALT) break;
1098 }
1099
1100 RRETURN(MATCH_NOMATCH);
1101
1102 /* Handle possessive capturing brackets with an unlimited repeat. We come
1103 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1104 handled similarly to the normal case above. However, the matching is
1105 different. The end of these brackets will always be OP_KETRPOS, which
1106 returns MATCH_KETRPOS without going further in the pattern. By this means
1107 we can handle the group by iteration rather than recursion, thereby
1108 reducing the amount of stack needed. */
1109
1110 case OP_CBRAPOS:
1111 case OP_SCBRAPOS:
1112 allow_zero = FALSE;
1113
1114 POSSESSIVE_CAPTURE:
1115 number = GET2(ecode, 1+LINK_SIZE);
1116 offset = number << 1;
1117
1118 #ifdef PCRE_DEBUG
1119 printf("start possessive bracket %d\n", number);
1120 printf("subject=");
1121 pchars(eptr, 16, TRUE, md);
1122 printf("\n");
1123 #endif
1124
1125 if (offset < md->offset_max)
1126 {
1127 matched_once = FALSE;
1128 code_offset = (int)(ecode - md->start_code);
1129
1130 save_offset1 = md->offset_vector[offset];
1131 save_offset2 = md->offset_vector[offset+1];
1132 save_offset3 = md->offset_vector[md->offset_end - number];
1133 save_capture_last = md->capture_last;
1134
1135 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1136
1137 /* Each time round the loop, save the current subject position for use
1138 when the group matches. For MATCH_MATCH, the group has matched, so we
1139 restart it with a new subject starting position, remembering that we had
1140 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1141 usual. If we haven't matched any alternatives in any iteration, check to
1142 see if a previous iteration matched. If so, the group has matched;
1143 continue from afterwards. Otherwise it has failed; restore the previous
1144 capture values before returning NOMATCH. */
1145
1146 for (;;)
1147 {
1148 md->offset_vector[md->offset_end - number] =
1149 (int)(eptr - md->start_subject);
1150 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1151 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1152 eptrb, RM63);
1153 if (rrc == MATCH_KETRPOS)
1154 {
1155 offset_top = md->end_offset_top;
1156 eptr = md->end_match_ptr;
1157 ecode = md->start_code + code_offset;
1158 save_capture_last = md->capture_last;
1159 matched_once = TRUE;
1160 continue;
1161 }
1162
1163 /* See comment in the code for capturing groups above about handling
1164 THEN. */
1165
1166 if (rrc == MATCH_THEN)
1167 {
1168 next = ecode + GET(ecode,1);
1169 if (md->start_match_ptr < next &&
1170 (*ecode == OP_ALT || *next == OP_ALT))
1171 rrc = MATCH_NOMATCH;
1172 }
1173
1174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1175 md->capture_last = save_capture_last;
1176 ecode += GET(ecode, 1);
1177 if (*ecode != OP_ALT) break;
1178 }
1179
1180 if (!matched_once)
1181 {
1182 md->offset_vector[offset] = save_offset1;
1183 md->offset_vector[offset+1] = save_offset2;
1184 md->offset_vector[md->offset_end - number] = save_offset3;
1185 }
1186
1187 if (allow_zero || matched_once)
1188 {
1189 ecode += 1 + LINK_SIZE;
1190 break;
1191 }
1192
1193 RRETURN(MATCH_NOMATCH);
1194 }
1195
1196 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1197 as a non-capturing bracket. */
1198
1199 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1200 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1201
1202 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1203
1204 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1205 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1206
1207 /* Non-capturing possessive bracket with unlimited repeat. We come here
1208 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1209 without the capturing complication. It is written out separately for speed
1210 and cleanliness. */
1211
1212 case OP_BRAPOS:
1213 case OP_SBRAPOS:
1214 allow_zero = FALSE;
1215
1216 POSSESSIVE_NON_CAPTURE:
1217 matched_once = FALSE;
1218 code_offset = (int)(ecode - md->start_code);
1219
1220 for (;;)
1221 {
1222 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1223 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1224 eptrb, RM48);
1225 if (rrc == MATCH_KETRPOS)
1226 {
1227 offset_top = md->end_offset_top;
1228 eptr = md->end_match_ptr;
1229 ecode = md->start_code + code_offset;
1230 matched_once = TRUE;
1231 continue;
1232 }
1233
1234 /* See comment in the code for capturing groups above about handling
1235 THEN. */
1236
1237 if (rrc == MATCH_THEN)
1238 {
1239 next = ecode + GET(ecode,1);
1240 if (md->start_match_ptr < next &&
1241 (*ecode == OP_ALT || *next == OP_ALT))
1242 rrc = MATCH_NOMATCH;
1243 }
1244
1245 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1246 ecode += GET(ecode, 1);
1247 if (*ecode != OP_ALT) break;
1248 }
1249
1250 if (matched_once || allow_zero)
1251 {
1252 ecode += 1 + LINK_SIZE;
1253 break;
1254 }
1255 RRETURN(MATCH_NOMATCH);
1256
1257 /* Control never reaches here. */
1258
1259 /* Conditional group: compilation checked that there are no more than
1260 two branches. If the condition is false, skipping the first branch takes us
1261 past the end if there is only one branch, but that's OK because that is
1262 exactly what going to the ket would do. */
1263
1264 case OP_COND:
1265 case OP_SCOND:
1266 codelink = GET(ecode, 1);
1267
1268 /* Because of the way auto-callout works during compile, a callout item is
1269 inserted between OP_COND and an assertion condition. */
1270
1271 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1272 {
1273 if (PUBL(callout) != NULL)
1274 {
1275 PUBL(callout_block) cb;
1276 cb.version = 2; /* Version 2 of the callout block */
1277 cb.callout_number = ecode[LINK_SIZE+2];
1278 cb.offset_vector = md->offset_vector;
1279 #if defined COMPILE_PCRE8
1280 cb.subject = (PCRE_SPTR)md->start_subject;
1281 #elif defined COMPILE_PCRE16
1282 cb.subject = (PCRE_SPTR16)md->start_subject;
1283 #elif defined COMPILE_PCRE32
1284 cb.subject = (PCRE_SPTR32)md->start_subject;
1285 #endif
1286 cb.subject_length = (int)(md->end_subject - md->start_subject);
1287 cb.start_match = (int)(mstart - md->start_subject);
1288 cb.current_position = (int)(eptr - md->start_subject);
1289 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1290 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1291 cb.capture_top = offset_top/2;
1292 cb.capture_last = md->capture_last;
1293 cb.callout_data = md->callout_data;
1294 cb.mark = md->nomatch_mark;
1295 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1296 if (rrc < 0) RRETURN(rrc);
1297 }
1298 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1299 }
1300
1301 condcode = ecode[LINK_SIZE+1];
1302
1303 /* Now see what the actual condition is */
1304
1305 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1306 {
1307 if (md->recursive == NULL) /* Not recursing => FALSE */
1308 {
1309 condition = FALSE;
1310 ecode += GET(ecode, 1);
1311 }
1312 else
1313 {
1314 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1315 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1316
1317 /* If the test is for recursion into a specific subpattern, and it is
1318 false, but the test was set up by name, scan the table to see if the
1319 name refers to any other numbers, and test them. The condition is true
1320 if any one is set. */
1321
1322 if (!condition && condcode == OP_NRREF)
1323 {
1324 pcre_uchar *slotA = md->name_table;
1325 for (i = 0; i < md->name_count; i++)
1326 {
1327 if (GET2(slotA, 0) == recno) break;
1328 slotA += md->name_entry_size;
1329 }
1330
1331 /* Found a name for the number - there can be only one; duplicate
1332 names for different numbers are allowed, but not vice versa. First
1333 scan down for duplicates. */
1334
1335 if (i < md->name_count)
1336 {
1337 pcre_uchar *slotB = slotA;
1338 while (slotB > md->name_table)
1339 {
1340 slotB -= md->name_entry_size;
1341 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1342 {
1343 condition = GET2(slotB, 0) == md->recursive->group_num;
1344 if (condition) break;
1345 }
1346 else break;
1347 }
1348
1349 /* Scan up for duplicates */
1350
1351 if (!condition)
1352 {
1353 slotB = slotA;
1354 for (i++; i < md->name_count; i++)
1355 {
1356 slotB += md->name_entry_size;
1357 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1358 {
1359 condition = GET2(slotB, 0) == md->recursive->group_num;
1360 if (condition) break;
1361 }
1362 else break;
1363 }
1364 }
1365 }
1366 }
1367
1368 /* Choose branch according to the condition */
1369
1370 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1371 }
1372 }
1373
1374 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1375 {
1376 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1377 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1378
1379 /* If the numbered capture is unset, but the reference was by name,
1380 scan the table to see if the name refers to any other numbers, and test
1381 them. The condition is true if any one is set. This is tediously similar
1382 to the code above, but not close enough to try to amalgamate. */
1383
1384 if (!condition && condcode == OP_NCREF)
1385 {
1386 int refno = offset >> 1;
1387 pcre_uchar *slotA = md->name_table;
1388
1389 for (i = 0; i < md->name_count; i++)
1390 {
1391 if (GET2(slotA, 0) == refno) break;
1392 slotA += md->name_entry_size;
1393 }
1394
1395 /* Found a name for the number - there can be only one; duplicate names
1396 for different numbers are allowed, but not vice versa. First scan down
1397 for duplicates. */
1398
1399 if (i < md->name_count)
1400 {
1401 pcre_uchar *slotB = slotA;
1402 while (slotB > md->name_table)
1403 {
1404 slotB -= md->name_entry_size;
1405 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1406 {
1407 offset = GET2(slotB, 0) << 1;
1408 condition = offset < offset_top &&
1409 md->offset_vector[offset] >= 0;
1410 if (condition) break;
1411 }
1412 else break;
1413 }
1414
1415 /* Scan up for duplicates */
1416
1417 if (!condition)
1418 {
1419 slotB = slotA;
1420 for (i++; i < md->name_count; i++)
1421 {
1422 slotB += md->name_entry_size;
1423 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1424 {
1425 offset = GET2(slotB, 0) << 1;
1426 condition = offset < offset_top &&
1427 md->offset_vector[offset] >= 0;
1428 if (condition) break;
1429 }
1430 else break;
1431 }
1432 }
1433 }
1434 }
1435
1436 /* Choose branch according to the condition */
1437
1438 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1439 }
1440
1441 else if (condcode == OP_DEF) /* DEFINE - always false */
1442 {
1443 condition = FALSE;
1444 ecode += GET(ecode, 1);
1445 }
1446
1447 /* The condition is an assertion. Call match() to evaluate it - setting
1448 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1449 an assertion. */
1450
1451 else
1452 {
1453 md->match_function_type = MATCH_CONDASSERT;
1454 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1455 if (rrc == MATCH_MATCH)
1456 {
1457 if (md->end_offset_top > offset_top)
1458 offset_top = md->end_offset_top; /* Captures may have happened */
1459 condition = TRUE;
1460 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1461 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1462 }
1463
1464 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1465 assertion; it is therefore treated as NOMATCH. */
1466
1467 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1468 {
1469 RRETURN(rrc); /* Need braces because of following else */
1470 }
1471 else
1472 {
1473 condition = FALSE;
1474 ecode += codelink;
1475 }
1476 }
1477
1478 /* We are now at the branch that is to be obeyed. As there is only one, can
1479 use tail recursion to avoid using another stack frame, except when there is
1480 unlimited repeat of a possibly empty group. In the latter case, a recursive
1481 call to match() is always required, unless the second alternative doesn't
1482 exist, in which case we can just plough on. Note that, for compatibility
1483 with Perl, the | in a conditional group is NOT treated as creating two
1484 alternatives. If a THEN is encountered in the branch, it propagates out to
1485 the enclosing alternative (unless nested in a deeper set of alternatives,
1486 of course). */
1487
1488 if (condition || *ecode == OP_ALT)
1489 {
1490 if (op != OP_SCOND)
1491 {
1492 ecode += 1 + LINK_SIZE;
1493 goto TAIL_RECURSE;
1494 }
1495
1496 md->match_function_type = MATCH_CBEGROUP;
1497 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1498 RRETURN(rrc);
1499 }
1500
1501 /* Condition false & no alternative; continue after the group. */
1502
1503 else
1504 {
1505 ecode += 1 + LINK_SIZE;
1506 }
1507 break;
1508
1509
1510 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1511 to close any currently open capturing brackets. */
1512
1513 case OP_CLOSE:
1514 number = GET2(ecode, 1);
1515 offset = number << 1;
1516
1517 #ifdef PCRE_DEBUG
1518 printf("end bracket %d at *ACCEPT", number);
1519 printf("\n");
1520 #endif
1521
1522 md->capture_last = number;
1523 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1524 {
1525 md->offset_vector[offset] =
1526 md->offset_vector[md->offset_end - number];
1527 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1528 if (offset_top <= offset) offset_top = offset + 2;
1529 }
1530 ecode += 1 + IMM2_SIZE;
1531 break;
1532
1533
1534 /* End of the pattern, either real or forced. */
1535
1536 case OP_END:
1537 case OP_ACCEPT:
1538 case OP_ASSERT_ACCEPT:
1539
1540 /* If we have matched an empty string, fail if not in an assertion and not
1541 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1542 is set and we have matched at the start of the subject. In both cases,
1543 backtracking will then try other alternatives, if any. */
1544
1545 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1546 md->recursive == NULL &&
1547 (md->notempty ||
1548 (md->notempty_atstart &&
1549 mstart == md->start_subject + md->start_offset)))
1550 RRETURN(MATCH_NOMATCH);
1551
1552 /* Otherwise, we have a match. */
1553
1554 md->end_match_ptr = eptr; /* Record where we ended */
1555 md->end_offset_top = offset_top; /* and how many extracts were taken */
1556 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1557
1558 /* For some reason, the macros don't work properly if an expression is
1559 given as the argument to RRETURN when the heap is in use. */
1560
1561 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1562 RRETURN(rrc);
1563
1564 /* Assertion brackets. Check the alternative branches in turn - the
1565 matching won't pass the KET for an assertion. If any one branch matches,
1566 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1567 start of each branch to move the current point backwards, so the code at
1568 this level is identical to the lookahead case. When the assertion is part
1569 of a condition, we want to return immediately afterwards. The caller of
1570 this incarnation of the match() function will have set MATCH_CONDASSERT in
1571 md->match_function_type, and one of these opcodes will be the first opcode
1572 that is processed. We use a local variable that is preserved over calls to
1573 match() to remember this case. */
1574
1575 case OP_ASSERT:
1576 case OP_ASSERTBACK:
1577 save_mark = md->mark;
1578 if (md->match_function_type == MATCH_CONDASSERT)
1579 {
1580 condassert = TRUE;
1581 md->match_function_type = 0;
1582 }
1583 else condassert = FALSE;
1584
1585 do
1586 {
1587 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1588 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1589 {
1590 mstart = md->start_match_ptr; /* In case \K reset it */
1591 break;
1592 }
1593 md->mark = save_mark;
1594
1595 /* A COMMIT failure must fail the entire assertion, without trying any
1596 subsequent branches. */
1597
1598 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1599
1600 /* PCRE does not allow THEN to escape beyond an assertion; it
1601 is treated as NOMATCH. */
1602
1603 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1604 ecode += GET(ecode, 1);
1605 }
1606 while (*ecode == OP_ALT);
1607
1608 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1609
1610 /* If checking an assertion for a condition, return MATCH_MATCH. */
1611
1612 if (condassert) RRETURN(MATCH_MATCH);
1613
1614 /* Continue from after the assertion, updating the offsets high water
1615 mark, since extracts may have been taken during the assertion. */
1616
1617 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1618 ecode += 1 + LINK_SIZE;
1619 offset_top = md->end_offset_top;
1620 continue;
1621
1622 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1623 PRUNE, or COMMIT means we must assume failure without checking subsequent
1624 branches. */
1625
1626 case OP_ASSERT_NOT:
1627 case OP_ASSERTBACK_NOT:
1628 save_mark = md->mark;
1629 if (md->match_function_type == MATCH_CONDASSERT)
1630 {
1631 condassert = TRUE;
1632 md->match_function_type = 0;
1633 }
1634 else condassert = FALSE;
1635
1636 do
1637 {
1638 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1639 md->mark = save_mark;
1640 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1641 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1642 {
1643 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1644 break;
1645 }
1646
1647 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1648 as NOMATCH. */
1649
1650 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1651 ecode += GET(ecode,1);
1652 }
1653 while (*ecode == OP_ALT);
1654
1655 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1656
1657 ecode += 1 + LINK_SIZE;
1658 continue;
1659
1660 /* Move the subject pointer back. This occurs only at the start of
1661 each branch of a lookbehind assertion. If we are too close to the start to
1662 move back, this match function fails. When working with UTF-8 we move
1663 back a number of characters, not bytes. */
1664
1665 case OP_REVERSE:
1666 #ifdef SUPPORT_UTF
1667 if (utf)
1668 {
1669 i = GET(ecode, 1);
1670 while (i-- > 0)
1671 {
1672 eptr--;
1673 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1674 BACKCHAR(eptr);
1675 }
1676 }
1677 else
1678 #endif
1679
1680 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1681
1682 {
1683 eptr -= GET(ecode, 1);
1684 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1685 }
1686
1687 /* Save the earliest consulted character, then skip to next op code */
1688
1689 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1690 ecode += 1 + LINK_SIZE;
1691 break;
1692
1693 /* The callout item calls an external function, if one is provided, passing
1694 details of the match so far. This is mainly for debugging, though the
1695 function is able to force a failure. */
1696
1697 case OP_CALLOUT:
1698 if (PUBL(callout) != NULL)
1699 {
1700 PUBL(callout_block) cb;
1701 cb.version = 2; /* Version 2 of the callout block */
1702 cb.callout_number = ecode[1];
1703 cb.offset_vector = md->offset_vector;
1704 #if defined COMPILE_PCRE8
1705 cb.subject = (PCRE_SPTR)md->start_subject;
1706 #elif defined COMPILE_PCRE16
1707 cb.subject = (PCRE_SPTR16)md->start_subject;
1708 #elif defined COMPILE_PCRE32
1709 cb.subject = (PCRE_SPTR32)md->start_subject;
1710 #endif
1711 cb.subject_length = (int)(md->end_subject - md->start_subject);
1712 cb.start_match = (int)(mstart - md->start_subject);
1713 cb.current_position = (int)(eptr - md->start_subject);
1714 cb.pattern_position = GET(ecode, 2);
1715 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1716 cb.capture_top = offset_top/2;
1717 cb.capture_last = md->capture_last;
1718 cb.callout_data = md->callout_data;
1719 cb.mark = md->nomatch_mark;
1720 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1721 if (rrc < 0) RRETURN(rrc);
1722 }
1723 ecode += 2 + 2*LINK_SIZE;
1724 break;
1725
1726 /* Recursion either matches the current regex, or some subexpression. The
1727 offset data is the offset to the starting bracket from the start of the
1728 whole pattern. (This is so that it works from duplicated subpatterns.)
1729
1730 The state of the capturing groups is preserved over recursion, and
1731 re-instated afterwards. We don't know how many are started and not yet
1732 finished (offset_top records the completed total) so we just have to save
1733 all the potential data. There may be up to 65535 such values, which is too
1734 large to put on the stack, but using malloc for small numbers seems
1735 expensive. As a compromise, the stack is used when there are no more than
1736 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1737
1738 There are also other values that have to be saved. We use a chained
1739 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1740 for the original version of this logic. It has, however, been hacked around
1741 a lot, so he is not to blame for the current way it works. */
1742
1743 case OP_RECURSE:
1744 {
1745 recursion_info *ri;
1746 int recno;
1747
1748 callpat = md->start_code + GET(ecode, 1);
1749 recno = (callpat == md->start_code)? 0 :
1750 GET2(callpat, 1 + LINK_SIZE);
1751
1752 /* Check for repeating a recursion without advancing the subject pointer.
1753 This should catch convoluted mutual recursions. (Some simple cases are
1754 caught at compile time.) */
1755
1756 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1757 if (recno == ri->group_num && eptr == ri->subject_position)
1758 RRETURN(PCRE_ERROR_RECURSELOOP);
1759
1760 /* Add to "recursing stack" */
1761
1762 new_recursive.group_num = recno;
1763 new_recursive.subject_position = eptr;
1764 new_recursive.prevrec = md->recursive;
1765 md->recursive = &new_recursive;
1766
1767 /* Where to continue from afterwards */
1768
1769 ecode += 1 + LINK_SIZE;
1770
1771 /* Now save the offset data */
1772
1773 new_recursive.saved_max = md->offset_end;
1774 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1775 new_recursive.offset_save = stacksave;
1776 else
1777 {
1778 new_recursive.offset_save =
1779 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1780 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1781 }
1782 memcpy(new_recursive.offset_save, md->offset_vector,
1783 new_recursive.saved_max * sizeof(int));
1784
1785 /* OK, now we can do the recursion. After processing each alternative,
1786 restore the offset data. If there were nested recursions, md->recursive
1787 might be changed, so reset it before looping. */
1788
1789 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1790 cbegroup = (*callpat >= OP_SBRA);
1791 do
1792 {
1793 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1794 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1795 md, eptrb, RM6);
1796 memcpy(md->offset_vector, new_recursive.offset_save,
1797 new_recursive.saved_max * sizeof(int));
1798 md->recursive = new_recursive.prevrec;
1799 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1800 {
1801 DPRINTF(("Recursion matched\n"));
1802 if (new_recursive.offset_save != stacksave)
1803 (PUBL(free))(new_recursive.offset_save);
1804
1805 /* Set where we got to in the subject, and reset the start in case
1806 it was changed by \K. This *is* propagated back out of a recursion,
1807 for Perl compatibility. */
1808
1809 eptr = md->end_match_ptr;
1810 mstart = md->start_match_ptr;
1811 goto RECURSION_MATCHED; /* Exit loop; end processing */
1812 }
1813
1814 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1815 is treated as NOMATCH. */
1816
1817 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1818 rrc != MATCH_COMMIT)
1819 {
1820 DPRINTF(("Recursion gave error %d\n", rrc));
1821 if (new_recursive.offset_save != stacksave)
1822 (PUBL(free))(new_recursive.offset_save);
1823 RRETURN(rrc);
1824 }
1825
1826 md->recursive = &new_recursive;
1827 callpat += GET(callpat, 1);
1828 }
1829 while (*callpat == OP_ALT);
1830
1831 DPRINTF(("Recursion didn't match\n"));
1832 md->recursive = new_recursive.prevrec;
1833 if (new_recursive.offset_save != stacksave)
1834 (PUBL(free))(new_recursive.offset_save);
1835 RRETURN(MATCH_NOMATCH);
1836 }
1837
1838 RECURSION_MATCHED:
1839 break;
1840
1841 /* An alternation is the end of a branch; scan along to find the end of the
1842 bracketed group and go to there. */
1843
1844 case OP_ALT:
1845 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1846 break;
1847
1848 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1849 indicating that it may occur zero times. It may repeat infinitely, or not
1850 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1851 with fixed upper repeat limits are compiled as a number of copies, with the
1852 optional ones preceded by BRAZERO or BRAMINZERO. */
1853
1854 case OP_BRAZERO:
1855 next = ecode + 1;
1856 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1858 do next += GET(next, 1); while (*next == OP_ALT);
1859 ecode = next + 1 + LINK_SIZE;
1860 break;
1861
1862 case OP_BRAMINZERO:
1863 next = ecode + 1;
1864 do next += GET(next, 1); while (*next == OP_ALT);
1865 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1867 ecode++;
1868 break;
1869
1870 case OP_SKIPZERO:
1871 next = ecode+1;
1872 do next += GET(next,1); while (*next == OP_ALT);
1873 ecode = next + 1 + LINK_SIZE;
1874 break;
1875
1876 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1877 here; just jump to the group, with allow_zero set TRUE. */
1878
1879 case OP_BRAPOSZERO:
1880 op = *(++ecode);
1881 allow_zero = TRUE;
1882 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1883 goto POSSESSIVE_NON_CAPTURE;
1884
1885 /* End of a group, repeated or non-repeating. */
1886
1887 case OP_KET:
1888 case OP_KETRMIN:
1889 case OP_KETRMAX:
1890 case OP_KETRPOS:
1891 prev = ecode - GET(ecode, 1);
1892
1893 /* If this was a group that remembered the subject start, in order to break
1894 infinite repeats of empty string matches, retrieve the subject start from
1895 the chain. Otherwise, set it NULL. */
1896
1897 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1898 {
1899 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1900 eptrb = eptrb->epb_prev; /* Backup to previous group */
1901 }
1902 else saved_eptr = NULL;
1903
1904 /* If we are at the end of an assertion group or a non-capturing atomic
1905 group, stop matching and return MATCH_MATCH, but record the current high
1906 water mark for use by positive assertions. We also need to record the match
1907 start in case it was changed by \K. */
1908
1909 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1910 *prev == OP_ONCE_NC)
1911 {
1912 md->end_match_ptr = eptr; /* For ONCE_NC */
1913 md->end_offset_top = offset_top;
1914 md->start_match_ptr = mstart;
1915 RRETURN(MATCH_MATCH); /* Sets md->mark */
1916 }
1917
1918 /* For capturing groups we have to check the group number back at the start
1919 and if necessary complete handling an extraction by setting the offsets and
1920 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1921 into group 0, so it won't be picked up here. Instead, we catch it when the
1922 OP_END is reached. Other recursion is handled here. We just have to record
1923 the current subject position and start match pointer and give a MATCH
1924 return. */
1925
1926 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1927 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1928 {
1929 number = GET2(prev, 1+LINK_SIZE);
1930 offset = number << 1;
1931
1932 #ifdef PCRE_DEBUG
1933 printf("end bracket %d", number);
1934 printf("\n");
1935 #endif
1936
1937 /* Handle a recursively called group. */
1938
1939 if (md->recursive != NULL && md->recursive->group_num == number)
1940 {
1941 md->end_match_ptr = eptr;
1942 md->start_match_ptr = mstart;
1943 RRETURN(MATCH_MATCH);
1944 }
1945
1946 /* Deal with capturing */
1947
1948 md->capture_last = number;
1949 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1950 {
1951 /* If offset is greater than offset_top, it means that we are
1952 "skipping" a capturing group, and that group's offsets must be marked
1953 unset. In earlier versions of PCRE, all the offsets were unset at the
1954 start of matching, but this doesn't work because atomic groups and
1955 assertions can cause a value to be set that should later be unset.
1956 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1957 part of the atomic group, but this is not on the final matching path,
1958 so must be unset when 2 is set. (If there is no group 2, there is no
1959 problem, because offset_top will then be 2, indicating no capture.) */
1960
1961 if (offset > offset_top)
1962 {
1963 register int *iptr = md->offset_vector + offset_top;
1964 register int *iend = md->offset_vector + offset;
1965 while (iptr < iend) *iptr++ = -1;
1966 }
1967
1968 /* Now make the extraction */
1969
1970 md->offset_vector[offset] =
1971 md->offset_vector[md->offset_end - number];
1972 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1973 if (offset_top <= offset) offset_top = offset + 2;
1974 }
1975 }
1976
1977 /* For an ordinary non-repeating ket, just continue at this level. This
1978 also happens for a repeating ket if no characters were matched in the
1979 group. This is the forcible breaking of infinite loops as implemented in
1980 Perl 5.005. For a non-repeating atomic group that includes captures,
1981 establish a backup point by processing the rest of the pattern at a lower
1982 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1983 original OP_ONCE level, thereby bypassing intermediate backup points, but
1984 resetting any captures that happened along the way. */
1985
1986 if (*ecode == OP_KET || eptr == saved_eptr)
1987 {
1988 if (*prev == OP_ONCE)
1989 {
1990 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1992 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1993 RRETURN(MATCH_ONCE);
1994 }
1995 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1996 break;
1997 }
1998
1999 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2000 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2001 at a time from the outer level, thus saving stack. */
2002
2003 if (*ecode == OP_KETRPOS)
2004 {
2005 md->end_match_ptr = eptr;
2006 md->end_offset_top = offset_top;
2007 RRETURN(MATCH_KETRPOS);
2008 }
2009
2010 /* The normal repeating kets try the rest of the pattern or restart from
2011 the preceding bracket, in the appropriate order. In the second case, we can
2012 use tail recursion to avoid using another stack frame, unless we have an
2013 atomic group or an unlimited repeat of a group that can match an empty
2014 string. */
2015
2016 if (*ecode == OP_KETRMIN)
2017 {
2018 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2019 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020 if (*prev == OP_ONCE)
2021 {
2022 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2024 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2025 RRETURN(MATCH_ONCE);
2026 }
2027 if (*prev >= OP_SBRA) /* Could match an empty string */
2028 {
2029 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2030 RRETURN(rrc);
2031 }
2032 ecode = prev;
2033 goto TAIL_RECURSE;
2034 }
2035 else /* OP_KETRMAX */
2036 {
2037 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2038 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2039 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2040 if (*prev == OP_ONCE)
2041 {
2042 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2044 md->once_target = prev;
2045 RRETURN(MATCH_ONCE);
2046 }
2047 ecode += 1 + LINK_SIZE;
2048 goto TAIL_RECURSE;
2049 }
2050 /* Control never gets here */
2051
2052 /* Not multiline mode: start of subject assertion, unless notbol. */
2053
2054 case OP_CIRC:
2055 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2056
2057 /* Start of subject assertion */
2058
2059 case OP_SOD:
2060 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2061 ecode++;
2062 break;
2063
2064 /* Multiline mode: start of subject unless notbol, or after any newline. */
2065
2066 case OP_CIRCM:
2067 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2068 if (eptr != md->start_subject &&
2069 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2070 RRETURN(MATCH_NOMATCH);
2071 ecode++;
2072 break;
2073
2074 /* Start of match assertion */
2075
2076 case OP_SOM:
2077 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2078 ecode++;
2079 break;
2080
2081 /* Reset the start of match point */
2082
2083 case OP_SET_SOM:
2084 mstart = eptr;
2085 ecode++;
2086 break;
2087
2088 /* Multiline mode: assert before any newline, or before end of subject
2089 unless noteol is set. */
2090
2091 case OP_DOLLM:
2092 if (eptr < md->end_subject)
2093 {
2094 if (!IS_NEWLINE(eptr))
2095 {
2096 if (md->partial != 0 &&
2097 eptr + 1 >= md->end_subject &&
2098 NLBLOCK->nltype == NLTYPE_FIXED &&
2099 NLBLOCK->nllen == 2 &&
2100 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2101 {
2102 md->hitend = TRUE;
2103 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2104 }
2105 RRETURN(MATCH_NOMATCH);
2106 }
2107 }
2108 else
2109 {
2110 if (md->noteol) RRETURN(MATCH_NOMATCH);
2111 SCHECK_PARTIAL();
2112 }
2113 ecode++;
2114 break;
2115
2116 /* Not multiline mode: assert before a terminating newline or before end of
2117 subject unless noteol is set. */
2118
2119 case OP_DOLL:
2120 if (md->noteol) RRETURN(MATCH_NOMATCH);
2121 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2122
2123 /* ... else fall through for endonly */
2124
2125 /* End of subject assertion (\z) */
2126
2127 case OP_EOD:
2128 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2129 SCHECK_PARTIAL();
2130 ecode++;
2131 break;
2132
2133 /* End of subject or ending \n assertion (\Z) */
2134
2135 case OP_EODN:
2136 ASSERT_NL_OR_EOS:
2137 if (eptr < md->end_subject &&
2138 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2139 {
2140 if (md->partial != 0 &&
2141 eptr + 1 >= md->end_subject &&
2142 NLBLOCK->nltype == NLTYPE_FIXED &&
2143 NLBLOCK->nllen == 2 &&
2144 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2145 {
2146 md->hitend = TRUE;
2147 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2148 }
2149 RRETURN(MATCH_NOMATCH);
2150 }
2151
2152 /* Either at end of string or \n before end. */
2153
2154 SCHECK_PARTIAL();
2155 ecode++;
2156 break;
2157
2158 /* Word boundary assertions */
2159
2160 case OP_NOT_WORD_BOUNDARY:
2161 case OP_WORD_BOUNDARY:
2162 {
2163
2164 /* Find out if the previous and current characters are "word" characters.
2165 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2166 be "non-word" characters. Remember the earliest consulted character for
2167 partial matching. */
2168
2169 #ifdef SUPPORT_UTF
2170 if (utf)
2171 {
2172 /* Get status of previous character */
2173
2174 if (eptr == md->start_subject) prev_is_word = FALSE; else
2175 {
2176 PCRE_PUCHAR lastptr = eptr - 1;
2177 BACKCHAR(lastptr);
2178 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2179 GETCHAR(c, lastptr);
2180 #ifdef SUPPORT_UCP
2181 if (md->use_ucp)
2182 {
2183 if (c == '_') prev_is_word = TRUE; else
2184 {
2185 int cat = UCD_CATEGORY(c);
2186 prev_is_word = (cat == ucp_L || cat == ucp_N);
2187 }
2188 }
2189 else
2190 #endif
2191 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2192 }
2193
2194 /* Get status of next character */
2195
2196 if (eptr >= md->end_subject)
2197 {
2198 SCHECK_PARTIAL();
2199 cur_is_word = FALSE;
2200 }
2201 else
2202 {
2203 GETCHAR(c, eptr);
2204 #ifdef SUPPORT_UCP
2205 if (md->use_ucp)
2206 {
2207 if (c == '_') cur_is_word = TRUE; else
2208 {
2209 int cat = UCD_CATEGORY(c);
2210 cur_is_word = (cat == ucp_L || cat == ucp_N);
2211 }
2212 }
2213 else
2214 #endif
2215 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2216 }
2217 }
2218 else
2219 #endif
2220
2221 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2222 consistency with the behaviour of \w we do use it in this case. */
2223
2224 {
2225 /* Get status of previous character */
2226
2227 if (eptr == md->start_subject) prev_is_word = FALSE; else
2228 {
2229 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2230 #ifdef SUPPORT_UCP
2231 if (md->use_ucp)
2232 {
2233 c = eptr[-1];
2234 if (c == '_') prev_is_word = TRUE; else
2235 {
2236 int cat = UCD_CATEGORY(c);
2237 prev_is_word = (cat == ucp_L || cat == ucp_N);
2238 }
2239 }
2240 else
2241 #endif
2242 prev_is_word = MAX_255(eptr[-1])
2243 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2244 }
2245
2246 /* Get status of next character */
2247
2248 if (eptr >= md->end_subject)
2249 {
2250 SCHECK_PARTIAL();
2251 cur_is_word = FALSE;
2252 }
2253 else
2254 #ifdef SUPPORT_UCP
2255 if (md->use_ucp)
2256 {
2257 c = *eptr;
2258 if (c == '_') cur_is_word = TRUE; else
2259 {
2260 int cat = UCD_CATEGORY(c);
2261 cur_is_word = (cat == ucp_L || cat == ucp_N);
2262 }
2263 }
2264 else
2265 #endif
2266 cur_is_word = MAX_255(*eptr)
2267 && ((md->ctypes[*eptr] & ctype_word) != 0);
2268 }
2269
2270 /* Now see if the situation is what we want */
2271
2272 if ((*ecode++ == OP_WORD_BOUNDARY)?
2273 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2274 RRETURN(MATCH_NOMATCH);
2275 }
2276 break;
2277
2278 /* Match any single character type except newline; have to take care with
2279 CRLF newlines and partial matching. */
2280
2281 case OP_ANY:
2282 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2283 if (md->partial != 0 &&
2284 eptr + 1 >= md->end_subject &&
2285 NLBLOCK->nltype == NLTYPE_FIXED &&
2286 NLBLOCK->nllen == 2 &&
2287 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2288 {
2289 md->hitend = TRUE;
2290 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2291 }
2292
2293 /* Fall through */
2294
2295 /* Match any single character whatsoever. */
2296
2297 case OP_ALLANY:
2298 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2299 { /* not be updated before SCHECK_PARTIAL. */
2300 SCHECK_PARTIAL();
2301 RRETURN(MATCH_NOMATCH);
2302 }
2303 eptr++;
2304 #ifdef SUPPORT_UTF
2305 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2306 #endif
2307 ecode++;
2308 break;
2309
2310 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2311 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2312
2313 case OP_ANYBYTE:
2314 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2315 { /* not be updated before SCHECK_PARTIAL. */
2316 SCHECK_PARTIAL();
2317 RRETURN(MATCH_NOMATCH);
2318 }
2319 eptr++;
2320 ecode++;
2321 break;
2322
2323 case OP_NOT_DIGIT:
2324 if (eptr >= md->end_subject)
2325 {
2326 SCHECK_PARTIAL();
2327 RRETURN(MATCH_NOMATCH);
2328 }
2329 GETCHARINCTEST(c, eptr);
2330 if (
2331 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2332 c < 256 &&
2333 #endif
2334 (md->ctypes[c] & ctype_digit) != 0
2335 )
2336 RRETURN(MATCH_NOMATCH);
2337 ecode++;
2338 break;
2339
2340 case OP_DIGIT:
2341 if (eptr >= md->end_subject)
2342 {
2343 SCHECK_PARTIAL();
2344 RRETURN(MATCH_NOMATCH);
2345 }
2346 GETCHARINCTEST(c, eptr);
2347 if (
2348 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2349 c > 255 ||
2350 #endif
2351 (md->ctypes[c] & ctype_digit) == 0
2352 )
2353 RRETURN(MATCH_NOMATCH);
2354 ecode++;
2355 break;
2356
2357 case OP_NOT_WHITESPACE:
2358 if (eptr >= md->end_subject)
2359 {
2360 SCHECK_PARTIAL();
2361 RRETURN(MATCH_NOMATCH);
2362 }
2363 GETCHARINCTEST(c, eptr);
2364 if (
2365 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2366 c < 256 &&
2367 #endif
2368 (md->ctypes[c] & ctype_space) != 0
2369 )
2370 RRETURN(MATCH_NOMATCH);
2371 ecode++;
2372 break;
2373
2374 case OP_WHITESPACE:
2375 if (eptr >= md->end_subject)
2376 {
2377 SCHECK_PARTIAL();
2378 RRETURN(MATCH_NOMATCH);
2379 }
2380 GETCHARINCTEST(c, eptr);
2381 if (
2382 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2383 c > 255 ||
2384 #endif
2385 (md->ctypes[c] & ctype_space) == 0
2386 )
2387 RRETURN(MATCH_NOMATCH);
2388 ecode++;
2389 break;
2390
2391 case OP_NOT_WORDCHAR:
2392 if (eptr >= md->end_subject)
2393 {
2394 SCHECK_PARTIAL();
2395 RRETURN(MATCH_NOMATCH);
2396 }
2397 GETCHARINCTEST(c, eptr);
2398 if (
2399 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2400 c < 256 &&
2401 #endif
2402 (md->ctypes[c] & ctype_word) != 0
2403 )
2404 RRETURN(MATCH_NOMATCH);
2405 ecode++;
2406 break;
2407
2408 case OP_WORDCHAR:
2409 if (eptr >= md->end_subject)
2410 {
2411 SCHECK_PARTIAL();
2412 RRETURN(MATCH_NOMATCH);
2413 }
2414 GETCHARINCTEST(c, eptr);
2415 if (
2416 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2417 c > 255 ||
2418 #endif
2419 (md->ctypes[c] & ctype_word) == 0
2420 )
2421 RRETURN(MATCH_NOMATCH);
2422 ecode++;
2423 break;
2424
2425 case OP_ANYNL:
2426 if (eptr >= md->end_subject)
2427 {
2428 SCHECK_PARTIAL();
2429 RRETURN(MATCH_NOMATCH);
2430 }
2431 GETCHARINCTEST(c, eptr);
2432 switch(c)
2433 {
2434 default: RRETURN(MATCH_NOMATCH);
2435
2436 case CHAR_CR:
2437 if (eptr >= md->end_subject)
2438 {
2439 SCHECK_PARTIAL();
2440 }
2441 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2442 break;
2443
2444 case CHAR_LF:
2445 break;
2446
2447 case CHAR_VT:
2448 case CHAR_FF:
2449 case CHAR_NEL:
2450 #ifndef EBCDIC
2451 case 0x2028:
2452 case 0x2029:
2453 #endif /* Not EBCDIC */
2454 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2455 break;
2456 }
2457 ecode++;
2458 break;
2459
2460 case OP_NOT_HSPACE:
2461 if (eptr >= md->end_subject)
2462 {
2463 SCHECK_PARTIAL();
2464 RRETURN(MATCH_NOMATCH);
2465 }
2466 GETCHARINCTEST(c, eptr);
2467 switch(c)
2468 {
2469 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2470 default: break;
2471 }
2472 ecode++;
2473 break;
2474
2475 case OP_HSPACE:
2476 if (eptr >= md->end_subject)
2477 {
2478 SCHECK_PARTIAL();
2479 RRETURN(MATCH_NOMATCH);
2480 }
2481 GETCHARINCTEST(c, eptr);
2482 switch(c)
2483 {
2484 HSPACE_CASES: break; /* Byte and multibyte cases */
2485 default: RRETURN(MATCH_NOMATCH);
2486 }
2487 ecode++;
2488 break;
2489
2490 case OP_NOT_VSPACE:
2491 if (eptr >= md->end_subject)
2492 {
2493 SCHECK_PARTIAL();
2494 RRETURN(MATCH_NOMATCH);
2495 }
2496 GETCHARINCTEST(c, eptr);
2497 switch(c)
2498 {
2499 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2500 default: break;
2501 }
2502 ecode++;
2503 break;
2504
2505 case OP_VSPACE:
2506 if (eptr >= md->end_subject)
2507 {
2508 SCHECK_PARTIAL();
2509 RRETURN(MATCH_NOMATCH);
2510 }
2511 GETCHARINCTEST(c, eptr);
2512 switch(c)
2513 {
2514 VSPACE_CASES: break;
2515 default: RRETURN(MATCH_NOMATCH);
2516 }
2517 ecode++;
2518 break;
2519
2520 #ifdef SUPPORT_UCP
2521 /* Check the next character by Unicode property. We will get here only
2522 if the support is in the binary; otherwise a compile-time error occurs. */
2523
2524 case OP_PROP:
2525 case OP_NOTPROP:
2526 if (eptr >= md->end_subject)
2527 {
2528 SCHECK_PARTIAL();
2529 RRETURN(MATCH_NOMATCH);
2530 }
2531 GETCHARINCTEST(c, eptr);
2532 {
2533 const pcre_uint32 *cp;
2534 const ucd_record *prop = GET_UCD(c);
2535
2536 switch(ecode[1])
2537 {
2538 case PT_ANY:
2539 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2540 break;
2541
2542 case PT_LAMP:
2543 if ((prop->chartype == ucp_Lu ||
2544 prop->chartype == ucp_Ll ||
2545 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2546 RRETURN(MATCH_NOMATCH);
2547 break;
2548
2549 case PT_GC:
2550 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2551 RRETURN(MATCH_NOMATCH);
2552 break;
2553
2554 case PT_PC:
2555 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2556 RRETURN(MATCH_NOMATCH);
2557 break;
2558
2559 case PT_SC:
2560 if ((ecode[2] != prop->script) == (op == OP_PROP))
2561 RRETURN(MATCH_NOMATCH);
2562 break;
2563
2564 /* These are specials */
2565
2566 case PT_ALNUM:
2567 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2568 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2569 RRETURN(MATCH_NOMATCH);
2570 break;
2571
2572 case PT_SPACE: /* Perl space */
2573 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2574 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2575 == (op == OP_NOTPROP))
2576 RRETURN(MATCH_NOMATCH);
2577 break;
2578
2579 case PT_PXSPACE: /* POSIX space */
2580 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2581 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2582 c == CHAR_FF || c == CHAR_CR)
2583 == (op == OP_NOTPROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2586
2587 case PT_WORD:
2588 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2589 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2590 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2591 RRETURN(MATCH_NOMATCH);
2592 break;
2593
2594 case PT_CLIST:
2595 cp = PRIV(ucd_caseless_sets) + prop->caseset;
2596 for (;;)
2597 {
2598 if (c < *cp)
2599 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2600 if (c == *cp++)
2601 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2602 }
2603 break;
2604
2605 /* This should never occur */
2606
2607 default:
2608 RRETURN(PCRE_ERROR_INTERNAL);
2609 }
2610
2611 ecode += 3;
2612 }
2613 break;
2614
2615 /* Match an extended Unicode sequence. We will get here only if the support
2616 is in the binary; otherwise a compile-time error occurs. */
2617
2618 case OP_EXTUNI:
2619 if (eptr >= md->end_subject)
2620 {
2621 SCHECK_PARTIAL();
2622 RRETURN(MATCH_NOMATCH);
2623 }
2624 else
2625 {
2626 int lgb, rgb;
2627 GETCHARINCTEST(c, eptr);
2628 lgb = UCD_GRAPHBREAK(c);
2629 while (eptr < md->end_subject)
2630 {
2631 int len = 1;
2632 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2633 rgb = UCD_GRAPHBREAK(c);
2634 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2635 lgb = rgb;
2636 eptr += len;
2637 }
2638 }
2639 CHECK_PARTIAL();
2640 ecode++;
2641 break;
2642 #endif /* SUPPORT_UCP */
2643
2644
2645 /* Match a back reference, possibly repeatedly. Look past the end of the
2646 item to see if there is repeat information following. The code is similar
2647 to that for character classes, but repeated for efficiency. Then obey
2648 similar code to character type repeats - written out again for speed.
2649 However, if the referenced string is the empty string, always treat
2650 it as matched, any number of times (otherwise there could be infinite
2651 loops). */
2652
2653 case OP_REF:
2654 case OP_REFI:
2655 caseless = op == OP_REFI;
2656 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2657 ecode += 1 + IMM2_SIZE;
2658
2659 /* If the reference is unset, there are two possibilities:
2660
2661 (a) In the default, Perl-compatible state, set the length negative;
2662 this ensures that every attempt at a match fails. We can't just fail
2663 here, because of the possibility of quantifiers with zero minima.
2664
2665 (b) If the JavaScript compatibility flag is set, set the length to zero
2666 so that the back reference matches an empty string.
2667
2668 Otherwise, set the length to the length of what was matched by the
2669 referenced subpattern. */
2670
2671 if (offset >= offset_top || md->offset_vector[offset] < 0)
2672 length = (md->jscript_compat)? 0 : -1;
2673 else
2674 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2675
2676 /* Set up for repetition, or handle the non-repeated case */
2677
2678 switch (*ecode)
2679 {
2680 case OP_CRSTAR:
2681 case OP_CRMINSTAR:
2682 case OP_CRPLUS:
2683 case OP_CRMINPLUS:
2684 case OP_CRQUERY:
2685 case OP_CRMINQUERY:
2686 c = *ecode++ - OP_CRSTAR;
2687 minimize = (c & 1) != 0;
2688 min = rep_min[c]; /* Pick up values from tables; */
2689 max = rep_max[c]; /* zero for max => infinity */
2690 if (max == 0) max = INT_MAX;
2691 break;
2692
2693 case OP_CRRANGE:
2694 case OP_CRMINRANGE:
2695 minimize = (*ecode == OP_CRMINRANGE);
2696 min = GET2(ecode, 1);
2697 max = GET2(ecode, 1 + IMM2_SIZE);
2698 if (max == 0) max = INT_MAX;
2699 ecode += 1 + 2 * IMM2_SIZE;
2700 break;
2701
2702 default: /* No repeat follows */
2703 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2704 {
2705 if (length == -2) eptr = md->end_subject; /* Partial match */
2706 CHECK_PARTIAL();
2707 RRETURN(MATCH_NOMATCH);
2708 }
2709 eptr += length;
2710 continue; /* With the main loop */
2711 }
2712
2713 /* Handle repeated back references. If the length of the reference is
2714 zero, just continue with the main loop. If the length is negative, it
2715 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2716 zero, we can continue at the same level without recursion. For any other
2717 minimum, carrying on will result in NOMATCH. */
2718
2719 if (length == 0) continue;
2720 if (length < 0 && min == 0) continue;
2721
2722 /* First, ensure the minimum number of matches are present. We get back
2723 the length of the reference string explicitly rather than passing the
2724 address of eptr, so that eptr can be a register variable. */
2725
2726 for (i = 1; i <= min; i++)
2727 {
2728 int slength;
2729 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2730 {
2731 if (slength == -2) eptr = md->end_subject; /* Partial match */
2732 CHECK_PARTIAL();
2733 RRETURN(MATCH_NOMATCH);
2734 }
2735 eptr += slength;
2736 }
2737
2738 /* If min = max, continue at the same level without recursion.
2739 They are not both allowed to be zero. */
2740
2741 if (min == max) continue;
2742
2743 /* If minimizing, keep trying and advancing the pointer */
2744
2745 if (minimize)
2746 {
2747 for (fi = min;; fi++)
2748 {
2749 int slength;
2750 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2751 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2752 if (fi >= max) RRETURN(MATCH_NOMATCH);
2753 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2754 {
2755 if (slength == -2) eptr = md->end_subject; /* Partial match */
2756 CHECK_PARTIAL();
2757 RRETURN(MATCH_NOMATCH);
2758 }
2759 eptr += slength;
2760 }
2761 /* Control never gets here */
2762 }
2763
2764 /* If maximizing, find the longest string and work backwards */
2765
2766 else
2767 {
2768 pp = eptr;
2769 for (i = min; i < max; i++)
2770 {
2771 int slength;
2772 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2773 {
2774 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2775 the soft partial matching case. */
2776
2777 if (slength == -2 && md->partial != 0 &&
2778 md->end_subject > md->start_used_ptr)
2779 {
2780 md->hitend = TRUE;
2781 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2782 }
2783 break;
2784 }
2785 eptr += slength;
2786 }
2787
2788 while (eptr >= pp)
2789 {
2790 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2792 eptr -= length;
2793 }
2794 RRETURN(MATCH_NOMATCH);
2795 }
2796 /* Control never gets here */
2797
2798 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2799 used when all the characters in the class have values in the range 0-255,
2800 and either the matching is caseful, or the characters are in the range
2801 0-127 when UTF-8 processing is enabled. The only difference between
2802 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2803 encountered.
2804
2805 First, look past the end of the item to see if there is repeat information
2806 following. Then obey similar code to character type repeats - written out
2807 again for speed. */
2808
2809 case OP_NCLASS:
2810 case OP_CLASS:
2811 {
2812 /* The data variable is saved across frames, so the byte map needs to
2813 be stored there. */
2814 #define BYTE_MAP ((pcre_uint8 *)data)
2815 data = ecode + 1; /* Save for matching */
2816 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2817
2818 switch (*ecode)
2819 {
2820 case OP_CRSTAR:
2821 case OP_CRMINSTAR:
2822 case OP_CRPLUS:
2823 case OP_CRMINPLUS:
2824 case OP_CRQUERY:
2825 case OP_CRMINQUERY:
2826 c = *ecode++ - OP_CRSTAR;
2827 minimize = (c & 1) != 0;
2828 min = rep_min[c]; /* Pick up values from tables; */
2829 max = rep_max[c]; /* zero for max => infinity */
2830 if (max == 0) max = INT_MAX;
2831 break;
2832
2833 case OP_CRRANGE:
2834 case OP_CRMINRANGE:
2835 minimize = (*ecode == OP_CRMINRANGE);
2836 min = GET2(ecode, 1);
2837 max = GET2(ecode, 1 + IMM2_SIZE);
2838 if (max == 0) max = INT_MAX;
2839 ecode += 1 + 2 * IMM2_SIZE;
2840 break;
2841
2842 default: /* No repeat follows */
2843 min = max = 1;
2844 break;
2845 }
2846
2847 /* First, ensure the minimum number of matches are present. */
2848
2849 #ifdef SUPPORT_UTF
2850 if (utf)
2851 {
2852 for (i = 1; i <= min; i++)
2853 {
2854 if (eptr >= md->end_subject)
2855 {
2856 SCHECK_PARTIAL();
2857 RRETURN(MATCH_NOMATCH);
2858 }
2859 GETCHARINC(c, eptr);
2860 if (c > 255)
2861 {
2862 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2863 }
2864 else
2865 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2866 }
2867 }
2868 else
2869 #endif
2870 /* Not UTF mode */
2871 {
2872 for (i = 1; i <= min; i++)
2873 {
2874 if (eptr >= md->end_subject)
2875 {
2876 SCHECK_PARTIAL();
2877 RRETURN(MATCH_NOMATCH);
2878 }
2879 c = *eptr++;
2880 #ifndef COMPILE_PCRE8
2881 if (c > 255)
2882 {
2883 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2884 }
2885 else
2886 #endif
2887 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2888 }
2889 }
2890
2891 /* If max == min we can continue with the main loop without the
2892 need to recurse. */
2893
2894 if (min == max) continue;
2895
2896 /* If minimizing, keep testing the rest of the expression and advancing
2897 the pointer while it matches the class. */
2898
2899 if (minimize)
2900 {
2901 #ifdef SUPPORT_UTF
2902 if (utf)
2903 {
2904 for (fi = min;; fi++)
2905 {
2906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2908 if (fi >= max) RRETURN(MATCH_NOMATCH);
2909 if (eptr >= md->end_subject)
2910 {
2911 SCHECK_PARTIAL();
2912 RRETURN(MATCH_NOMATCH);
2913 }
2914 GETCHARINC(c, eptr);
2915 if (c > 255)
2916 {
2917 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2918 }
2919 else
2920 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2921 }
2922 }
2923 else
2924 #endif
2925 /* Not UTF mode */
2926 {
2927 for (fi = min;; fi++)
2928 {
2929 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2930 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2931 if (fi >= max) RRETURN(MATCH_NOMATCH);
2932 if (eptr >= md->end_subject)
2933 {
2934 SCHECK_PARTIAL();
2935 RRETURN(MATCH_NOMATCH);
2936 }
2937 c = *eptr++;
2938 #ifndef COMPILE_PCRE8
2939 if (c > 255)
2940 {
2941 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2942 }
2943 else
2944 #endif
2945 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2946 }
2947 }
2948 /* Control never gets here */
2949 }
2950
2951 /* If maximizing, find the longest possible run, then work backwards. */
2952
2953 else
2954 {
2955 pp = eptr;
2956
2957 #ifdef SUPPORT_UTF
2958 if (utf)
2959 {
2960 for (i = min; i < max; i++)
2961 {
2962 int len = 1;
2963 if (eptr >= md->end_subject)
2964 {
2965 SCHECK_PARTIAL();
2966 break;
2967 }
2968 GETCHARLEN(c, eptr, len);
2969 if (c > 255)
2970 {
2971 if (op == OP_CLASS) break;
2972 }
2973 else
2974 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2975 eptr += len;
2976 }
2977 for (;;)
2978 {
2979 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2981 if (eptr-- == pp) break; /* Stop if tried at original pos */
2982 BACKCHAR(eptr);
2983 }
2984 }
2985 else
2986 #endif
2987 /* Not UTF mode */
2988 {
2989 for (i = min; i < max; i++)
2990 {
2991 if (eptr >= md->end_subject)
2992 {
2993 SCHECK_PARTIAL();
2994 break;
2995 }
2996 c = *eptr;
2997 #ifndef COMPILE_PCRE8
2998 if (c > 255)
2999 {
3000 if (op == OP_CLASS) break;
3001 }
3002 else
3003 #endif
3004 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3005 eptr++;
3006 }
3007 while (eptr >= pp)
3008 {
3009 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3011 eptr--;
3012 }
3013 }
3014
3015 RRETURN(MATCH_NOMATCH);
3016 }
3017 #undef BYTE_MAP
3018 }
3019 /* Control never gets here */
3020
3021
3022 /* Match an extended character class. This opcode is encountered only
3023 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3024 mode, because Unicode properties are supported in non-UTF-8 mode. */
3025
3026 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3027 case OP_XCLASS:
3028 {
3029 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3030 ecode += GET(ecode, 1); /* Advance past the item */
3031
3032 switch (*ecode)
3033 {
3034 case OP_CRSTAR:
3035 case OP_CRMINSTAR:
3036 case OP_CRPLUS:
3037 case OP_CRMINPLUS:
3038 case OP_CRQUERY:
3039 case OP_CRMINQUERY:
3040 c = *ecode++ - OP_CRSTAR;
3041 minimize = (c & 1) != 0;
3042 min = rep_min[c]; /* Pick up values from tables; */
3043 max = rep_max[c]; /* zero for max => infinity */
3044 if (max == 0) max = INT_MAX;
3045 break;
3046
3047 case OP_CRRANGE:
3048 case OP_CRMINRANGE:
3049 minimize = (*ecode == OP_CRMINRANGE);
3050 min = GET2(ecode, 1);
3051 max = GET2(ecode, 1 + IMM2_SIZE);
3052 if (max == 0) max = INT_MAX;
3053 ecode += 1 + 2 * IMM2_SIZE;
3054 break;
3055
3056 default: /* No repeat follows */
3057 min = max = 1;
3058 break;
3059 }
3060
3061 /* First, ensure the minimum number of matches are present. */
3062
3063 for (i = 1; i <= min; i++)
3064 {
3065 if (eptr >= md->end_subject)
3066 {
3067 SCHECK_PARTIAL();
3068 RRETURN(MATCH_NOMATCH);
3069 }
3070 GETCHARINCTEST(c, eptr);
3071 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3072 }
3073
3074 /* If max == min we can continue with the main loop without the
3075 need to recurse. */
3076
3077 if (min == max) continue;
3078
3079 /* If minimizing, keep testing the rest of the expression and advancing
3080 the pointer while it matches the class. */
3081
3082 if (minimize)
3083 {
3084 for (fi = min;; fi++)
3085 {
3086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3088 if (fi >= max) RRETURN(MATCH_NOMATCH);
3089 if (eptr >= md->end_subject)
3090 {
3091 SCHECK_PARTIAL();
3092 RRETURN(MATCH_NOMATCH);
3093 }
3094 GETCHARINCTEST(c, eptr);
3095 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3096 }
3097 /* Control never gets here */
3098 }
3099
3100 /* If maximizing, find the longest possible run, then work backwards. */
3101
3102 else
3103 {
3104 pp = eptr;
3105 for (i = min; i < max; i++)
3106 {
3107 int len = 1;
3108 if (eptr >= md->end_subject)
3109 {
3110 SCHECK_PARTIAL();
3111 break;
3112 }
3113 #ifdef SUPPORT_UTF
3114 GETCHARLENTEST(c, eptr, len);
3115 #else
3116 c = *eptr;
3117 #endif
3118 if (!PRIV(xclass)(c, data, utf)) break;
3119 eptr += len;
3120 }
3121 for(;;)
3122 {
3123 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3124 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3125 if (eptr-- == pp) break; /* Stop if tried at original pos */
3126 #ifdef SUPPORT_UTF
3127 if (utf) BACKCHAR(eptr);
3128 #endif
3129 }
3130 RRETURN(MATCH_NOMATCH);
3131 }
3132
3133 /* Control never gets here */
3134 }
3135 #endif /* End of XCLASS */
3136
3137 /* Match a single character, casefully */
3138
3139 case OP_CHAR:
3140 #ifdef SUPPORT_UTF
3141 if (utf)
3142 {
3143 length = 1;
3144 ecode++;
3145 GETCHARLEN(fc, ecode, length);
3146 if (length > md->end_subject - eptr)
3147 {
3148 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3149 RRETURN(MATCH_NOMATCH);
3150 }
3151 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3152 }
3153 else
3154 #endif
3155 /* Not UTF mode */
3156 {
3157 if (md->end_subject - eptr < 1)
3158 {
3159 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3160 RRETURN(MATCH_NOMATCH);
3161 }
3162 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3163 ecode += 2;
3164 }
3165 break;
3166
3167 /* Match a single character, caselessly. If we are at the end of the
3168 subject, give up immediately. */
3169
3170 case OP_CHARI:
3171 if (eptr >= md->end_subject)
3172 {
3173 SCHECK_PARTIAL();
3174 RRETURN(MATCH_NOMATCH);
3175 }
3176
3177 #ifdef SUPPORT_UTF
3178 if (utf)
3179 {
3180 length = 1;
3181 ecode++;
3182 GETCHARLEN(fc, ecode, length);
3183
3184 /* If the pattern character's value is < 128, we have only one byte, and
3185 we know that its other case must also be one byte long, so we can use the
3186 fast lookup table. We know that there is at least one byte left in the
3187 subject. */
3188
3189 if (fc < 128)
3190 {
3191 pcre_uchar cc = RAWUCHAR(eptr);
3192 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3193 ecode++;
3194 eptr++;
3195 }
3196
3197 /* Otherwise we must pick up the subject character. Note that we cannot
3198 use the value of "length" to check for sufficient bytes left, because the
3199 other case of the character may have more or fewer bytes. */
3200
3201 else
3202 {
3203 pcre_uint32 dc;
3204 GETCHARINC(dc, eptr);
3205 ecode += length;
3206
3207 /* If we have Unicode property support, we can use it to test the other
3208 case of the character, if there is one. */
3209
3210 if (fc != dc)
3211 {
3212 #ifdef SUPPORT_UCP
3213 if (dc != UCD_OTHERCASE(fc))
3214 #endif
3215 RRETURN(MATCH_NOMATCH);
3216 }
3217 }
3218 }
3219 else
3220 #endif /* SUPPORT_UTF */
3221
3222 /* Not UTF mode */
3223 {
3224 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3225 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3226 eptr++;
3227 ecode += 2;
3228 }
3229 break;
3230
3231 /* Match a single character repeatedly. */
3232
3233 case OP_EXACT:
3234 case OP_EXACTI:
3235 min = max = GET2(ecode, 1);
3236 ecode += 1 + IMM2_SIZE;
3237 goto REPEATCHAR;
3238
3239 case OP_POSUPTO:
3240 case OP_POSUPTOI:
3241 possessive = TRUE;
3242 /* Fall through */
3243
3244 case OP_UPTO:
3245 case OP_UPTOI:
3246 case OP_MINUPTO:
3247 case OP_MINUPTOI:
3248 min = 0;
3249 max = GET2(ecode, 1);
3250 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3251 ecode += 1 + IMM2_SIZE;
3252 goto REPEATCHAR;
3253
3254 case OP_POSSTAR:
3255 case OP_POSSTARI:
3256 possessive = TRUE;
3257 min = 0;
3258 max = INT_MAX;
3259 ecode++;
3260 goto REPEATCHAR;
3261
3262 case OP_POSPLUS:
3263 case OP_POSPLUSI:
3264 possessive = TRUE;
3265 min = 1;
3266 max = INT_MAX;
3267 ecode++;
3268 goto REPEATCHAR;
3269
3270 case OP_POSQUERY:
3271 case OP_POSQUERYI:
3272 possessive = TRUE;
3273 min = 0;
3274 max = 1;
3275 ecode++;
3276 goto REPEATCHAR;
3277
3278 case OP_STAR:
3279 case OP_STARI:
3280 case OP_MINSTAR:
3281 case OP_MINSTARI:
3282 case OP_PLUS:
3283 case OP_PLUSI:
3284 case OP_MINPLUS:
3285 case OP_MINPLUSI:
3286 case OP_QUERY:
3287 case OP_QUERYI:
3288 case OP_MINQUERY:
3289 case OP_MINQUERYI:
3290 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3291 minimize = (c & 1) != 0;
3292 min = rep_min[c]; /* Pick up values from tables; */
3293 max = rep_max[c]; /* zero for max => infinity */
3294 if (max == 0) max = INT_MAX;
3295
3296 /* Common code for all repeated single-character matches. */
3297
3298 REPEATCHAR:
3299 #ifdef SUPPORT_UTF
3300 if (utf)
3301 {
3302 length = 1;
3303 charptr = ecode;
3304 GETCHARLEN(fc, ecode, length);
3305 ecode += length;
3306
3307 /* Handle multibyte character matching specially here. There is
3308 support for caseless matching if UCP support is present. */
3309
3310 if (length > 1)
3311 {
3312 #ifdef SUPPORT_UCP
3313 pcre_uint32 othercase;
3314 if (op >= OP_STARI && /* Caseless */
3315 (othercase = UCD_OTHERCASE(fc)) != fc)
3316 oclength = PRIV(ord2utf)(othercase, occhars);
3317 else oclength = 0;
3318 #endif /* SUPPORT_UCP */
3319
3320 for (i = 1; i <= min; i++)
3321 {
3322 if (eptr <= md->end_subject - length &&
3323 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3324 #ifdef SUPPORT_UCP
3325 else if (oclength > 0 &&
3326 eptr <= md->end_subject - oclength &&
3327 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3328 #endif /* SUPPORT_UCP */
3329 else
3330 {
3331 CHECK_PARTIAL();
3332 RRETURN(MATCH_NOMATCH);
3333 }
3334 }
3335
3336 if (min == max) continue;
3337
3338 if (minimize)
3339 {
3340 for (fi = min;; fi++)
3341 {
3342 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3343 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3344 if (fi >= max) RRETURN(MATCH_NOMATCH);
3345 if (eptr <= md->end_subject - length &&
3346 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3347 #ifdef SUPPORT_UCP
3348 else if (oclength > 0 &&
3349 eptr <= md->end_subject - oclength &&
3350 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3351 #endif /* SUPPORT_UCP */
3352 else
3353 {
3354 CHECK_PARTIAL();
3355 RRETURN(MATCH_NOMATCH);
3356 }
3357 }
3358 /* Control never gets here */
3359 }
3360
3361 else /* Maximize */
3362 {
3363 pp = eptr;
3364 for (i = min; i < max; i++)
3365 {
3366 if (eptr <= md->end_subject - length &&
3367 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3368 #ifdef SUPPORT_UCP
3369 else if (oclength > 0 &&
3370 eptr <= md->end_subject - oclength &&
3371 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3372 #endif /* SUPPORT_UCP */
3373 else
3374 {
3375 CHECK_PARTIAL();
3376 break;
3377 }
3378 }
3379
3380 if (possessive) continue;
3381
3382 for(;;)
3383 {
3384 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3385 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3386 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3387 #ifdef SUPPORT_UCP
3388 eptr--;
3389 BACKCHAR(eptr);
3390 #else /* without SUPPORT_UCP */
3391 eptr -= length;
3392 #endif /* SUPPORT_UCP */
3393 }
3394 }
3395 /* Control never gets here */
3396 }
3397
3398 /* If the length of a UTF-8 character is 1, we fall through here, and
3399 obey the code as for non-UTF-8 characters below, though in this case the
3400 value of fc will always be < 128. */
3401 }
3402 else
3403 #endif /* SUPPORT_UTF */
3404 /* When not in UTF-8 mode, load a single-byte character. */
3405 fc = *ecode++;
3406
3407 /* The value of fc at this point is always one character, though we may
3408 or may not be in UTF mode. The code is duplicated for the caseless and
3409 caseful cases, for speed, since matching characters is likely to be quite
3410 common. First, ensure the minimum number of matches are present. If min =
3411 max, continue at the same level without recursing. Otherwise, if
3412 minimizing, keep trying the rest of the expression and advancing one
3413 matching character if failing, up to the maximum. Alternatively, if
3414 maximizing, find the maximum number of characters and work backwards. */
3415
3416 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3417 max, (char *)eptr));
3418
3419 if (op >= OP_STARI) /* Caseless */
3420 {
3421 #ifdef COMPILE_PCRE8
3422 /* fc must be < 128 if UTF is enabled. */
3423 foc = md->fcc[fc];
3424 #else
3425 #ifdef SUPPORT_UTF
3426 #ifdef SUPPORT_UCP
3427 if (utf && fc > 127)
3428 foc = UCD_OTHERCASE(fc);
3429 #else
3430 if (utf && fc > 127)
3431 foc = fc;
3432 #endif /* SUPPORT_UCP */
3433 else
3434 #endif /* SUPPORT_UTF */
3435 foc = TABLE_GET(fc, md->fcc, fc);
3436 #endif /* COMPILE_PCRE8 */
3437
3438 for (i = 1; i <= min; i++)
3439 {
3440 pcre_uchar cc;
3441
3442 if (eptr >= md->end_subject)
3443 {
3444 SCHECK_PARTIAL();
3445 RRETURN(MATCH_NOMATCH);
3446 }
3447 cc = RAWUCHARTEST(eptr);
3448 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3449 eptr++;
3450 }
3451 if (min == max) continue;
3452 if (minimize)
3453 {
3454 for (fi = min;; fi++)
3455 {
3456 pcre_uchar cc;
3457
3458 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3460 if (fi >= max) RRETURN(MATCH_NOMATCH);
3461 if (eptr >= md->end_subject)
3462 {
3463 SCHECK_PARTIAL();
3464 RRETURN(MATCH_NOMATCH);
3465 }
3466 cc = RAWUCHARTEST(eptr);
3467 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3468 eptr++;
3469 }
3470 /* Control never gets here */
3471 }
3472 else /* Maximize */
3473 {
3474 pp = eptr;
3475 for (i = min; i < max; i++)
3476 {
3477 pcre_uchar cc;
3478
3479 if (eptr >= md->end_subject)
3480 {
3481 SCHECK_PARTIAL();
3482 break;
3483 }
3484 cc = RAWUCHARTEST(eptr);
3485 if (fc != cc && foc != cc) break;
3486 eptr++;
3487 }
3488
3489 if (possessive) continue;
3490
3491 while (eptr >= pp)
3492 {
3493 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3494 eptr--;
3495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3496 }
3497 RRETURN(MATCH_NOMATCH);
3498 }
3499 /* Control never gets here */
3500 }
3501
3502 /* Caseful comparisons (includes all multi-byte characters) */
3503
3504 else
3505 {
3506 for (i = 1; i <= min; i++)
3507 {
3508 if (eptr >= md->end_subject)
3509 {
3510 SCHECK_PARTIAL();
3511 RRETURN(MATCH_NOMATCH);
3512 }
3513 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3514 }
3515
3516 if (min == max) continue;
3517
3518 if (minimize)
3519 {
3520 for (fi = min;; fi++)
3521 {
3522 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3523 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3524 if (fi >= max) RRETURN(MATCH_NOMATCH);
3525 if (eptr >= md->end_subject)
3526 {
3527 SCHECK_PARTIAL();
3528 RRETURN(MATCH_NOMATCH);
3529 }
3530 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3531 }
3532 /* Control never gets here */
3533 }
3534 else /* Maximize */
3535 {
3536 pp = eptr;
3537 for (i = min; i < max; i++)
3538 {
3539 if (eptr >= md->end_subject)
3540 {
3541 SCHECK_PARTIAL();
3542 break;
3543 }
3544 if (fc != RAWUCHARTEST(eptr)) break;
3545 eptr++;
3546 }
3547 if (possessive) continue;
3548
3549 while (eptr >= pp)
3550 {
3551 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3552 eptr--;
3553 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3554 }
3555 RRETURN(MATCH_NOMATCH);
3556 }
3557 }
3558 /* Control never gets here */
3559
3560 /* Match a negated single one-byte character. The character we are
3561 checking can be multibyte. */
3562
3563 case OP_NOT:
3564 case OP_NOTI:
3565 if (eptr >= md->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 #ifdef SUPPORT_UTF
3571 if (utf)
3572 {
3573 register pcre_uint32 ch, och;
3574
3575 ecode++;
3576 GETCHARINC(ch, ecode);
3577 GETCHARINC(c, eptr);
3578
3579 if (op == OP_NOT)
3580 {
3581 if (ch == c) RRETURN(MATCH_NOMATCH);
3582 }
3583 else
3584 {
3585 #ifdef SUPPORT_UCP
3586 if (ch > 127)
3587 och = UCD_OTHERCASE(ch);
3588 #else
3589 if (ch > 127)
3590 och = ch;
3591 #endif /* SUPPORT_UCP */
3592 else
3593 och = TABLE_GET(ch, md->fcc, ch);
3594 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3595 }
3596 }
3597 else
3598 #endif
3599 {
3600 register pcre_uint32 ch = ecode[1];
3601 c = *eptr++;
3602 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3603 RRETURN(MATCH_NOMATCH);
3604 ecode += 2;
3605 }
3606 break;
3607
3608 /* Match a negated single one-byte character repeatedly. This is almost a
3609 repeat of the code for a repeated single character, but I haven't found a
3610 nice way of commoning these up that doesn't require a test of the
3611 positive/negative option for each character match. Maybe that wouldn't add
3612 very much to the time taken, but character matching *is* what this is all
3613 about... */
3614
3615 case OP_NOTEXACT:
3616 case OP_NOTEXACTI:
3617 min = max = GET2(ecode, 1);
3618 ecode += 1 + IMM2_SIZE;
3619 goto REPEATNOTCHAR;
3620
3621 case OP_NOTUPTO:
3622 case OP_NOTUPTOI:
3623 case OP_NOTMINUPTO:
3624 case OP_NOTMINUPTOI:
3625 min = 0;
3626 max = GET2(ecode, 1);
3627 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3628 ecode += 1 + IMM2_SIZE;
3629 goto REPEATNOTCHAR;
3630
3631 case OP_NOTPOSSTAR:
3632 case OP_NOTPOSSTARI:
3633 possessive = TRUE;
3634 min = 0;
3635 max = INT_MAX;
3636 ecode++;
3637 goto REPEATNOTCHAR;
3638
3639 case OP_NOTPOSPLUS:
3640 case OP_NOTPOSPLUSI:
3641 possessive = TRUE;
3642 min = 1;
3643 max = INT_MAX;
3644 ecode++;
3645 goto REPEATNOTCHAR;
3646
3647 case OP_NOTPOSQUERY:
3648 case OP_NOTPOSQUERYI:
3649 possessive = TRUE;
3650 min = 0;
3651 max = 1;
3652 ecode++;
3653 goto REPEATNOTCHAR;
3654
3655 case OP_NOTPOSUPTO:
3656 case OP_NOTPOSUPTOI:
3657 possessive = TRUE;
3658 min = 0;
3659 max = GET2(ecode, 1);
3660 ecode += 1 + IMM2_SIZE;
3661 goto REPEATNOTCHAR;
3662
3663 case OP_NOTSTAR:
3664 case OP_NOTSTARI:
3665 case OP_NOTMINSTAR:
3666 case OP_NOTMINSTARI:
3667 case OP_NOTPLUS:
3668 case OP_NOTPLUSI:
3669 case OP_NOTMINPLUS:
3670 case OP_NOTMINPLUSI:
3671 case OP_NOTQUERY:
3672 case OP_NOTQUERYI:
3673 case OP_NOTMINQUERY:
3674 case OP_NOTMINQUERYI:
3675 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3676 minimize = (c & 1) != 0;
3677 min = rep_min[c]; /* Pick up values from tables; */
3678 max = rep_max[c]; /* zero for max => infinity */
3679 if (max == 0) max = INT_MAX;
3680
3681 /* Common code for all repeated single-byte matches. */
3682
3683 REPEATNOTCHAR:
3684 GETCHARINCTEST(fc, ecode);
3685
3686 /* The code is duplicated for the caseless and caseful cases, for speed,
3687 since matching characters is likely to be quite common. First, ensure the
3688 minimum number of matches are present. If min = max, continue at the same
3689 level without recursing. Otherwise, if minimizing, keep trying the rest of
3690 the expression and advancing one matching character if failing, up to the
3691 maximum. Alternatively, if maximizing, find the maximum number of
3692 characters and work backwards. */
3693
3694 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3695 max, (char *)eptr));
3696
3697 if (op >= OP_NOTSTARI) /* Caseless */
3698 {
3699 #ifdef SUPPORT_UTF
3700 #ifdef SUPPORT_UCP
3701 if (utf && fc > 127)
3702 foc = UCD_OTHERCASE(fc);
3703 #else
3704 if (utf && fc > 127)
3705 foc = fc;
3706 #endif /* SUPPORT_UCP */
3707 else
3708 #endif /* SUPPORT_UTF */
3709 foc = TABLE_GET(fc, md->fcc, fc);
3710
3711 #ifdef SUPPORT_UTF
3712 if (utf)
3713 {
3714 register pcre_uint32 d;
3715 for (i = 1; i <= min; i++)
3716 {
3717 if (eptr >= md->end_subject)
3718 {
3719 SCHECK_PARTIAL();
3720 RRETURN(MATCH_NOMATCH);
3721 }
3722 GETCHARINC(d, eptr);
3723 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3724 }
3725 }
3726 else
3727 #endif
3728 /* Not UTF mode */
3729 {
3730 for (i = 1; i <= min; i++)
3731 {
3732 if (eptr >= md->end_subject)
3733 {
3734 SCHECK_PARTIAL();
3735 RRETURN(MATCH_NOMATCH);
3736 }
3737 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3738 eptr++;
3739 }
3740 }
3741
3742 if (min == max) continue;
3743
3744 if (minimize)
3745 {
3746 #ifdef SUPPORT_UTF
3747 if (utf)
3748 {
3749 register pcre_uint32 d;
3750 for (fi = min;; fi++)
3751 {
3752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3754 if (fi >= max) RRETURN(MATCH_NOMATCH);
3755 if (eptr >= md->end_subject)
3756 {
3757 SCHECK_PARTIAL();
3758 RRETURN(MATCH_NOMATCH);
3759 }
3760 GETCHARINC(d, eptr);
3761 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3762 }
3763 }
3764 else
3765 #endif
3766 /* Not UTF mode */
3767 {
3768 for (fi = min;; fi++)
3769 {
3770 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3771 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3772 if (fi >= max) RRETURN(MATCH_NOMATCH);
3773 if (eptr >= md->end_subject)
3774 {
3775 SCHECK_PARTIAL();
3776 RRETURN(MATCH_NOMATCH);
3777 }
3778 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3779 eptr++;
3780 }
3781 }
3782 /* Control never gets here */
3783 }
3784
3785 /* Maximize case */
3786
3787 else
3788 {
3789 pp = eptr;
3790
3791 #ifdef SUPPORT_UTF
3792 if (utf)
3793 {
3794 register pcre_uint32 d;
3795 for (i = min; i < max; i++)
3796 {
3797 int len = 1;
3798 if (eptr >= md->end_subject)
3799 {
3800 SCHECK_PARTIAL();
3801 break;
3802 }
3803 GETCHARLEN(d, eptr, len);
3804 if (fc == d || (unsigned int)foc == d) break;
3805 eptr += len;
3806 }
3807 if (possessive) continue;
3808 for(;;)
3809 {
3810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3812 if (eptr-- == pp) break; /* Stop if tried at original pos */
3813 BACKCHAR(eptr);
3814 }
3815 }
3816 else
3817 #endif
3818 /* Not UTF mode */
3819 {
3820 for (i = min; i < max; i++)
3821 {
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 break;
3826 }
3827 if (fc == *eptr || foc == *eptr) break;
3828 eptr++;
3829 }
3830 if (possessive) continue;
3831 while (eptr >= pp)
3832 {
3833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3835 eptr--;
3836 }
3837 }
3838
3839 RRETURN(MATCH_NOMATCH);
3840 }
3841 /* Control never gets here */
3842 }
3843
3844 /* Caseful comparisons */
3845
3846 else
3847 {
3848 #ifdef SUPPORT_UTF
3849 if (utf)
3850 {
3851 register pcre_uint32 d;
3852 for (i = 1; i <= min; i++)
3853 {
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 RRETURN(MATCH_NOMATCH);
3858 }
3859 GETCHARINC(d, eptr);
3860 if (fc == d) RRETURN(MATCH_NOMATCH);
3861 }
3862 }
3863 else
3864 #endif
3865 /* Not UTF mode */
3866 {
3867 for (i = 1; i <= min; i++)
3868 {
3869 if (eptr >= md->end_subject)
3870 {
3871 SCHECK_PARTIAL();
3872 RRETURN(MATCH_NOMATCH);
3873 }
3874 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3875 }
3876 }
3877
3878 if (min == max) continue;
3879
3880 if (minimize)
3881 {
3882 #ifdef SUPPORT_UTF
3883 if (utf)
3884 {
3885 register pcre_uint32 d;
3886 for (fi = min;; fi++)
3887 {
3888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3890 if (fi >= max) RRETURN(MATCH_NOMATCH);
3891 if (eptr >= md->end_subject)
3892 {
3893 SCHECK_PARTIAL();
3894 RRETURN(MATCH_NOMATCH);
3895 }
3896 GETCHARINC(d, eptr);
3897 if (fc == d) RRETURN(MATCH_NOMATCH);
3898 }
3899 }
3900 else
3901 #endif
3902 /* Not UTF mode */
3903 {
3904 for (fi = min;; fi++)
3905 {
3906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3908 if (fi >= max) RRETURN(MATCH_NOMATCH);
3909 if (eptr >= md->end_subject)
3910 {
3911 SCHECK_PARTIAL();
3912 RRETURN(MATCH_NOMATCH);
3913 }
3914 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3915 }
3916 }
3917 /* Control never gets here */
3918 }
3919
3920 /* Maximize case */
3921
3922 else
3923 {
3924 pp = eptr;
3925
3926 #ifdef SUPPORT_UTF
3927 if (utf)
3928 {
3929 register pcre_uint32 d;
3930 for (i = min; i < max; i++)
3931 {
3932 int len = 1;
3933 if (eptr >= md->end_subject)
3934 {
3935 SCHECK_PARTIAL();
3936 break;
3937 }
3938 GETCHARLEN(d, eptr, len);
3939 if (fc == d) break;
3940 eptr += len;
3941 }
3942 if (possessive) continue;
3943 for(;;)
3944 {
3945 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3947 if (eptr-- == pp) break; /* Stop if tried at original pos */
3948 BACKCHAR(eptr);
3949 }
3950 }
3951 else
3952 #endif
3953 /* Not UTF mode */
3954 {
3955 for (i = min; i < max; i++)
3956 {
3957 if (eptr >= md->end_subject)
3958 {
3959 SCHECK_PARTIAL();
3960 break;
3961 }
3962 if (fc == *eptr) break;
3963 eptr++;
3964 }
3965 if (possessive) continue;
3966 while (eptr >= pp)
3967 {
3968 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3970 eptr--;
3971 }
3972 }
3973
3974 RRETURN(MATCH_NOMATCH);
3975 }
3976 }
3977 /* Control never gets here */
3978
3979 /* Match a single character type repeatedly; several different opcodes
3980 share code. This is very similar to the code for single characters, but we
3981 repeat it in the interests of efficiency. */
3982
3983 case OP_TYPEEXACT:
3984 min = max = GET2(ecode, 1);
3985 minimize = TRUE;
3986 ecode += 1 + IMM2_SIZE;
3987 goto REPEATTYPE;
3988
3989 case OP_TYPEUPTO:
3990 case OP_TYPEMINUPTO:
3991 min = 0;
3992 max = GET2(ecode, 1);
3993 minimize = *ecode == OP_TYPEMINUPTO;
3994 ecode += 1 + IMM2_SIZE;
3995 goto REPEATTYPE;
3996
3997 case OP_TYPEPOSSTAR:
3998 possessive = TRUE;
3999 min = 0;
4000 max = INT_MAX;
4001 ecode++;
4002 goto REPEATTYPE;
4003
4004 case OP_TYPEPOSPLUS:
4005 possessive = TRUE;
4006 min = 1;
4007 max = INT_MAX;
4008 ecode++;
4009 goto REPEATTYPE;
4010
4011 case OP_TYPEPOSQUERY:
4012 possessive = TRUE;
4013 min = 0;
4014 max = 1;
4015 ecode++;
4016 goto REPEATTYPE;
4017
4018 case OP_TYPEPOSUPTO:
4019 possessive = TRUE;
4020 min = 0;
4021 max = GET2(ecode, 1);
4022 ecode += 1 + IMM2_SIZE;
4023 goto REPEATTYPE;
4024
4025 case OP_TYPESTAR:
4026 case OP_TYPEMINSTAR:
4027 case OP_TYPEPLUS:
4028 case OP_TYPEMINPLUS:
4029 case OP_TYPEQUERY:
4030 case OP_TYPEMINQUERY:
4031 c = *ecode++ - OP_TYPESTAR;
4032 minimize = (c & 1) != 0;
4033 min = rep_min[c]; /* Pick up values from tables; */
4034 max = rep_max[c]; /* zero for max => infinity */
4035 if (max == 0) max = INT_MAX;
4036
4037 /* Common code for all repeated single character type matches. Note that
4038 in UTF-8 mode, '.' matches a character of any length, but for the other
4039 character types, the valid characters are all one-byte long. */
4040
4041 REPEATTYPE:
4042 ctype = *ecode++; /* Code for the character type */
4043
4044 #ifdef SUPPORT_UCP
4045 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4046 {
4047 prop_fail_result = ctype == OP_NOTPROP;
4048 prop_type = *ecode++;
4049 prop_value = *ecode++;
4050 }
4051 else prop_type = -1;
4052 #endif
4053
4054 /* First, ensure the minimum number of matches are present. Use inline
4055 code for maximizing the speed, and do the type test once at the start
4056 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4057 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4058 and single-bytes. */
4059
4060 if (min > 0)
4061 {
4062 #ifdef SUPPORT_UCP
4063 if (prop_type >= 0)
4064 {
4065 switch(prop_type)
4066 {
4067 case PT_ANY:
4068 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4069 for (i = 1; i <= min; i++)
4070 {
4071 if (eptr >= md->end_subject)
4072 {
4073 SCHECK_PARTIAL();
4074 RRETURN(MATCH_NOMATCH);
4075 }
4076 GETCHARINCTEST(c, eptr);
4077 }
4078 break;
4079
4080 case PT_LAMP:
4081 for (i = 1; i <= min; i++)
4082 {
4083 int chartype;
4084 if (eptr >= md->end_subject)
4085 {
4086 SCHECK_PARTIAL();
4087 RRETURN(MATCH_NOMATCH);
4088 }
4089 GETCHARINCTEST(c, eptr);
4090 chartype = UCD_CHARTYPE(c);
4091 if ((chartype == ucp_Lu ||
4092 chartype == ucp_Ll ||
4093 chartype == ucp_Lt) == prop_fail_result)
4094 RRETURN(MATCH_NOMATCH);
4095 }
4096 break;
4097
4098 case PT_GC:
4099 for (i = 1; i <= min; i++)
4100 {
4101 if (eptr >= md->end_subject)
4102 {
4103 SCHECK_PARTIAL();
4104 RRETURN(MATCH_NOMATCH);
4105 }
4106 GETCHARINCTEST(c, eptr);
4107 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4108 RRETURN(MATCH_NOMATCH);
4109 }
4110 break;
4111
4112 case PT_PC:
4113 for (i = 1; i <= min; i++)
4114 {
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 GETCHARINCTEST(c, eptr);
4121 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4122 RRETURN(MATCH_NOMATCH);
4123 }
4124 break;
4125
4126 case PT_SC:
4127 for (i = 1; i <= min; i++)
4128 {
4129 if (eptr >= md->end_subject)
4130 {
4131 SCHECK_PARTIAL();
4132 RRETURN(MATCH_NOMATCH);
4133 }
4134 GETCHARINCTEST(c, eptr);
4135 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4136 RRETURN(MATCH_NOMATCH);
4137 }
4138 break;
4139
4140 case PT_ALNUM:
4141 for (i = 1; i <= min; i++)
4142 {
4143 int category;
4144 if (eptr >= md->end_subject)
4145 {
4146 SCHECK_PARTIAL();
4147 RRETURN(MATCH_NOMATCH);
4148 }
4149 GETCHARINCTEST(c, eptr);
4150 category = UCD_CATEGORY(c);
4151 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4152 RRETURN(MATCH_NOMATCH);
4153 }
4154 break;
4155
4156 case PT_SPACE: /* Perl space */
4157 for (i = 1; i <= min; i++)
4158 {
4159 if (eptr >= md->end_subject)
4160 {
4161 SCHECK_PARTIAL();
4162 RRETURN(MATCH_NOMATCH);
4163 }
4164 GETCHARINCTEST(c, eptr);
4165 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4166 c == CHAR_FF || c == CHAR_CR)
4167 == prop_fail_result)
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case PT_PXSPACE: /* POSIX space */
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 GETCHARINCTEST(c, eptr);
4181 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4182 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4183 == prop_fail_result)
4184 RRETURN(MATCH_NOMATCH);
4185 }
4186 break;
4187
4188 case PT_WORD:
4189 for (i = 1; i <= min; i++)
4190 {
4191 int category;
4192 if (eptr >= md->end_subject)
4193 {
4194 SCHECK_PARTIAL();
4195 RRETURN(MATCH_NOMATCH);
4196 }
4197 GETCHARINCTEST(c, eptr);
4198 category = UCD_CATEGORY(c);
4199 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4200 == prop_fail_result)
4201 RRETURN(MATCH_NOMATCH);
4202 }
4203 break;
4204
4205 case PT_CLIST:
4206 for (i = 1; i <= min; i++)
4207 {
4208 const pcre_uint32 *cp;
4209 if (eptr >= md->end_subject)
4210 {
4211 SCHECK_PARTIAL();
4212 RRETURN(MATCH_NOMATCH);
4213 }
4214 GETCHARINCTEST(c, eptr);
4215 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
4216 for (;;)
4217 {
4218 if (c < *cp)
4219 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4220 if (c == *cp++)
4221 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4222 }
4223 }
4224 break;
4225
4226 /* This should not occur */
4227
4228 default:
4229 RRETURN(PCRE_ERROR_INTERNAL);
4230 }
4231 }
4232
4233 /* Match extended Unicode sequences. We will get here only if the
4234 support is in the binary; otherwise a compile-time error occurs. */
4235
4236 else if (ctype == OP_EXTUNI)
4237 {
4238 for (i = 1; i <= min; i++)
4239 {
4240 if (eptr >= md->end_subject)
4241 {
4242 SCHECK_PARTIAL();
4243 RRETURN(MATCH_NOMATCH);
4244 }
4245 else
4246 {
4247 int lgb, rgb;
4248 GETCHARINCTEST(c, eptr);
4249 lgb = UCD_GRAPHBREAK(c);
4250 while (eptr < md->end_subject)
4251 {
4252 int len = 1;
4253 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4254 rgb = UCD_GRAPHBREAK(c);
4255 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4256 lgb = rgb;
4257 eptr += len;
4258 }
4259 }
4260 CHECK_PARTIAL();
4261 }
4262 }
4263
4264 else
4265 #endif /* SUPPORT_UCP */
4266
4267 /* Handle all other cases when the coding is UTF-8 */
4268
4269 #ifdef SUPPORT_UTF
4270 if (utf) switch(ctype)
4271 {
4272 case OP_ANY:
4273 for (i = 1; i <= min; i++)
4274 {
4275 if (eptr >= md->end_subject)
4276 {
4277 SCHECK_PARTIAL();
4278 RRETURN(MATCH_NOMATCH);
4279 }
4280 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4281 if (md->partial != 0 &&
4282 eptr + 1 >= md->end_subject &&
4283 NLBLOCK->nltype == NLTYPE_FIXED &&
4284 NLBLOCK->nllen == 2 &&
4285 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4286 {
4287 md->hitend = TRUE;
4288 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4289 }
4290 eptr++;
4291 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4292 }
4293 break;
4294
4295 case OP_ALLANY:
4296 for (i = 1; i <= min; i++)
4297 {
4298 if (eptr >= md->end_subject)
4299 {
4300 SCHECK_PARTIAL();
4301 RRETURN(MATCH_NOMATCH);
4302 }
4303 eptr++;
4304 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4305 }
4306 break;
4307
4308 case OP_ANYBYTE:
4309 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4310 eptr += min;
4311 break;
4312
4313 case OP_ANYNL:
4314 for (i = 1; i <= min; i++)
4315 {
4316 if (eptr >= md->end_subject)
4317 {
4318 SCHECK_PARTIAL();
4319 RRETURN(MATCH_NOMATCH);
4320 }
4321 GETCHARINC(c, eptr);
4322 switch(c)
4323 {
4324 default: RRETURN(MATCH_NOMATCH);
4325
4326 case CHAR_CR:
4327 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4328 break;
4329
4330 case CHAR_LF:
4331 break;
4332
4333 case CHAR_VT:
4334 case CHAR_FF:
4335 case CHAR_NEL:
4336 #ifndef EBCDIC
4337 case 0x2028:
4338 case 0x2029:
4339 #endif /* Not EBCDIC */
4340 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4341 break;
4342 }
4343 }
4344 break;
4345
4346 case OP_NOT_HSPACE:
4347 for (i = 1; i <= min; i++)
4348 {
4349 if (eptr >= md->end_subject)
4350 {
4351 SCHECK_PARTIAL();
4352 RRETURN(MATCH_NOMATCH);
4353 }
4354 GETCHARINC(c, eptr);
4355 switch(c)
4356 {
4357 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4358 default: break;
4359 }
4360 }
4361 break;
4362
4363 case OP_HSPACE:
4364 for (i = 1; i <= min; i++)
4365 {
4366 if (eptr >= md->end_subject)
4367 {
4368 SCHECK_PARTIAL();
4369 RRETURN(MATCH_NOMATCH);
4370 }
4371 GETCHARINC(c, eptr);
4372 switch(c)
4373 {
4374 HSPACE_CASES: break; /* Byte and multibyte cases */
4375 default: RRETURN(MATCH_NOMATCH);
4376 }
4377 }
4378 break;
4379
4380 case OP_NOT_VSPACE:
4381 for (i = 1; i <= min; i++)
4382 {
4383 if (eptr >= md->end_subject)
4384 {
4385 SCHECK_PARTIAL();
4386 RRETURN(MATCH_NOMATCH);
4387 }
4388 GETCHARINC(c, eptr);
4389 switch(c)
4390 {
4391 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4392 default: break;
4393 }
4394 }
4395 break;
4396
4397 case OP_VSPACE:
4398 for (i = 1; i <= min; i++)
4399 {
4400 if (eptr >= md->end_subject)
4401 {
4402 SCHECK_PARTIAL();
4403 RRETURN(MATCH_NOMATCH);
4404 }
4405 GETCHARINC(c, eptr);
4406 switch(c)
4407 {
4408 VSPACE_CASES: break;
4409 default: RRETURN(MATCH_NOMATCH);
4410 }
4411 }
4412 break;
4413
4414 case OP_NOT_DIGIT:
4415 for (i = 1; i <= min; i++)
4416 {
4417 if (eptr >= md->end_subject)
4418 {
4419 SCHECK_PARTIAL();
4420 RRETURN(MATCH_NOMATCH);
4421 }
4422 GETCHARINC(c, eptr);
4423 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4424 RRETURN(MATCH_NOMATCH);
4425 }
4426 break;
4427
4428 case OP_DIGIT:
4429 for (i = 1; i <= min; i++)
4430 {
4431 pcre_uchar cc;
4432
4433 if (eptr >= md->end_subject)
4434 {
4435 SCHECK_PARTIAL();
4436 RRETURN(MATCH_NOMATCH);
4437 }
4438 cc = RAWUCHAR(eptr);
4439 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4440 RRETURN(MATCH_NOMATCH);
4441 eptr++;
4442 /* No need to skip more bytes - we know it's a 1-byte character */
4443 }
4444 break;
4445
4446 case OP_NOT_WHITESPACE:
4447 for (i = 1; i <= min; i++)
4448 {
4449 pcre_uchar cc;
4450
4451 if (eptr >= md->end_subject)
4452 {
4453 SCHECK_PARTIAL();
4454 RRETURN(MATCH_NOMATCH);
4455 }
4456 cc = RAWUCHAR(eptr);
4457 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4458 RRETURN(MATCH_NOMATCH);
4459 eptr++;
4460 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4461 }
4462 break;
4463
4464 case OP_WHITESPACE:
4465 for (i = 1; i <= min; i++)
4466 {
4467 pcre_uchar cc;
4468
4469 if (eptr >= md->end_subject)
4470 {
4471 SCHECK_PARTIAL();
4472 RRETURN(MATCH_NOMATCH);
4473 }
4474 cc = RAWUCHAR(eptr);
4475 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4476 RRETURN(MATCH_NOMATCH);
4477 eptr++;
4478 /* No need to skip more bytes - we know it's a 1-byte character */
4479 }
4480 break;
4481
4482 case OP_NOT_WORDCHAR:
4483 for (i = 1; i <= min; i++)
4484 {
4485 pcre_uchar cc;
4486
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 RRETURN(MATCH_NOMATCH);
4491 }
4492 cc = RAWUCHAR(eptr);
4493 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4494 RRETURN(MATCH_NOMATCH);
4495 eptr++;
4496 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4497 }
4498 break;
4499
4500 case OP_WORDCHAR:
4501 for (i = 1; i <= min; i++)
4502 {
4503 pcre_uchar cc;
4504
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 RRETURN(MATCH_NOMATCH);
4509 }
4510 cc = RAWUCHAR(eptr);
4511 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4512 RRETURN(MATCH_NOMATCH);
4513 eptr++;
4514 /* No need to skip more bytes - we know it's a 1-byte character */
4515 }
4516 break;
4517
4518 default:
4519 RRETURN(PCRE_ERROR_INTERNAL);
4520 } /* End switch(ctype) */
4521
4522 else
4523 #endif /* SUPPORT_UTF */
4524
4525 /* Code for the non-UTF case for minimum matching of operators other
4526 than OP_PROP and OP_NOTPROP. */
4527
4528 switch(ctype)
4529 {
4530 case OP_ANY:
4531 for (i = 1; i <= min; i++)
4532 {
4533 if (eptr >= md->end_subject)
4534 {
4535 SCHECK_PARTIAL();
4536 RRETURN(MATCH_NOMATCH);
4537 }
4538 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4539 if (md->partial != 0 &&
4540 eptr + 1 >= md->end_subject &&
4541 NLBLOCK->nltype == NLTYPE_FIXED &&
4542 NLBLOCK->nllen == 2 &&
4543 *eptr == NLBLOCK->nl[0])
4544 {
4545 md->hitend = TRUE;
4546 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4547 }
4548 eptr++;
4549 }
4550 break;
4551
4552 case OP_ALLANY:
4553 if (eptr > md->end_subject - min)
4554 {
4555 SCHECK_PARTIAL();
4556 RRETURN(MATCH_NOMATCH);
4557 }
4558 eptr += min;
4559 break;
4560
4561 case OP_ANYBYTE:
4562 if (eptr > md->end_subject - min)
4563 {
4564 SCHECK_PARTIAL();
4565 RRETURN(MATCH_NOMATCH);
4566 }
4567 eptr += min;
4568 break;
4569
4570 case OP_ANYNL:
4571 for (i = 1; i <= min; i++)
4572 {
4573 if (eptr >= md->end_subject)
4574 {
4575 SCHECK_PARTIAL();
4576 RRETURN(MATCH_NOMATCH);
4577 }
4578 switch(*eptr++)
4579 {
4580 default: RRETURN(MATCH_NOMATCH);
4581
4582 case CHAR_CR:
4583 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4584 break;
4585
4586 case CHAR_LF:
4587 break;
4588
4589 case CHAR_VT:
4590 case CHAR_FF:
4591 case CHAR_NEL:
4592 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4593 case 0x2028:
4594 case 0x2029:
4595 #endif
4596 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4597 break;
4598 }
4599 }
4600 break;
4601
4602 case OP_NOT_HSPACE:
4603 for (i = 1; i <= min; i++)
4604 {
4605 if (eptr >= md->end_subject)
4606 {
4607 SCHECK_PARTIAL();
4608 RRETURN(MATCH_NOMATCH);
4609 }
4610 switch(*eptr++)
4611 {
4612 default: break;
4613 HSPACE_BYTE_CASES:
4614 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4615 HSPACE_MULTIBYTE_CASES:
4616 #endif
4617 RRETURN(MATCH_NOMATCH);
4618 }
4619 }
4620 break;
4621
4622 case OP_HSPACE:
4623 for (i = 1; i <= min; i++)
4624 {
4625 if (eptr >= md->end_subject)
4626 {
4627 SCHECK_PARTIAL();
4628 RRETURN(MATCH_NOMATCH);
4629 }
4630 switch(*eptr++)
4631 {
4632 default: RRETURN(MATCH_NOMATCH);
4633 HSPACE_BYTE_CASES:
4634 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4635 HSPACE_MULTIBYTE_CASES:
4636 #endif
4637 break;
4638 }
4639 }
4640 break;
4641
4642 case OP_NOT_VSPACE:
4643 for (i = 1; i <= min; i++)
4644 {
4645 if (eptr >= md->end_subject)
4646 {
4647 SCHECK_PARTIAL();
4648 RRETURN(MATCH_NOMATCH);
4649 }
4650 switch(*eptr++)
4651 {
4652 VSPACE_BYTE_CASES:
4653 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4654 VSPACE_MULTIBYTE_CASES:
4655 #endif
4656 RRETURN(MATCH_NOMATCH);
4657 default: break;
4658 }
4659 }
4660 break;
4661
4662 case OP_VSPACE:
4663 for (i = 1; i <= min; i++)
4664 {
4665 if (eptr >= md->end_subject)
4666 {
4667 SCHECK_PARTIAL();
4668 RRETURN(MATCH_NOMATCH);
4669 }
4670 switch(*eptr++)
4671 {
4672 default: RRETURN(MATCH_NOMATCH);
4673 VSPACE_BYTE_CASES:
4674 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4675 VSPACE_MULTIBYTE_CASES:
4676 #endif
4677 break;
4678 }
4679 }
4680 break;
4681
4682 case OP_NOT_DIGIT:
4683 for (i = 1; i <= min; i++)
4684 {
4685 if (eptr >= md->end_subject)
4686 {
4687 SCHECK_PARTIAL();
4688 RRETURN(MATCH_NOMATCH);
4689 }
4690 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4691 RRETURN(MATCH_NOMATCH);
4692 eptr++;
4693 }
4694 break;
4695
4696 case OP_DIGIT:
4697 for (i = 1; i <= min; i++)
4698 {
4699 if (eptr >= md->end_subject)
4700 {
4701 SCHECK_PARTIAL();
4702 RRETURN(MATCH_NOMATCH);
4703 }
4704 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4705 RRETURN(MATCH_NOMATCH);
4706 eptr++;
4707 }
4708 break;
4709
4710 case OP_NOT_WHITESPACE:
4711 for (i = 1; i <= min; i++)
4712 {
4713 if (eptr >= md->end_subject)
4714 {
4715 SCHECK_PARTIAL();
4716 RRETURN(MATCH_NOMATCH);
4717 }
4718 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4719 RRETURN(MATCH_NOMATCH);
4720 eptr++;
4721 }
4722 break;
4723
4724 case OP_WHITESPACE:
4725 for (i = 1; i <= min; i++)
4726 {
4727 if (eptr >= md->end_subject)
4728 {
4729 SCHECK_PARTIAL();
4730 RRETURN(MATCH_NOMATCH);
4731 }
4732 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4733 RRETURN(MATCH_NOMATCH);
4734 eptr++;
4735 }
4736 break;
4737
4738 case OP_NOT_WORDCHAR:
4739 for (i = 1; i <= min; i++)
4740 {
4741 if (eptr >= md->end_subject)
4742 {
4743 SCHECK_PARTIAL();
4744 RRETURN(MATCH_NOMATCH);
4745 }
4746 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4747 RRETURN(MATCH_NOMATCH);
4748 eptr++;
4749 }
4750 break;
4751
4752 case OP_WORDCHAR:
4753 for (i = 1; i <= min; i++)
4754 {
4755 if (eptr >= md->end_subject)
4756 {
4757 SCHECK_PARTIAL();
4758 RRETURN(MATCH_NOMATCH);
4759 }
4760 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4761 RRETURN(MATCH_NOMATCH);
4762 eptr++;
4763 }
4764 break;
4765
4766 default:
4767 RRETURN(PCRE_ERROR_INTERNAL);
4768 }
4769 }
4770
4771 /* If min = max, continue at the same level without recursing */
4772
4773 if (min == max) continue;
4774
4775 /* If minimizing, we have to test the rest of the pattern before each
4776 subsequent match. Again, separate the UTF-8 case for speed, and also
4777 separate the UCP cases. */
4778
4779 if (minimize)
4780 {
4781 #ifdef SUPPORT_UCP
4782 if (prop_type >= 0)
4783 {
4784 switch(prop_type)
4785 {
4786 case PT_ANY:
4787 for (fi = min;; fi++)
4788 {
4789 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4791 if (fi >= max) RRETURN(MATCH_NOMATCH);
4792 if (eptr >= md->end_subject)
4793 {
4794 SCHECK_PARTIAL();
4795 RRETURN(MATCH_NOMATCH);
4796 }
4797 GETCHARINCTEST(c, eptr);
4798 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4799 }
4800 /* Control never gets here */
4801
4802 case PT_LAMP:
4803 for (fi = min;; fi++)
4804 {
4805 int chartype;
4806 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4808 if (fi >= max) RRETURN(MATCH_NOMATCH);
4809 if (eptr >= md->end_subject)
4810 {
4811 SCHECK_PARTIAL();
4812 RRETURN(MATCH_NOMATCH);
4813 }
4814 GETCHARINCTEST(c, eptr);
4815 chartype = UCD_CHARTYPE(c);
4816 if ((chartype == ucp_Lu ||
4817 chartype == ucp_Ll ||
4818 chartype == ucp_Lt) == prop_fail_result)
4819 RRETURN(MATCH_NOMATCH);
4820 }
4821 /* Control never gets here */
4822
4823 case PT_GC:
4824 for (fi = min;; fi++)
4825 {
4826 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4828 if (fi >= max) RRETURN(MATCH_NOMATCH);
4829 if (eptr >= md->end_subject)
4830 {
4831 SCHECK_PARTIAL();
4832 RRETURN(MATCH_NOMATCH);
4833 }
4834 GETCHARINCTEST(c, eptr);
4835 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4836 RRETURN(MATCH_NOMATCH);
4837 }
4838 /* Control never gets here */
4839
4840 case PT_PC:
4841 for (fi = min;; fi++)
4842 {
4843 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4844 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4845 if (fi >= max) RRETURN(MATCH_NOMATCH);
4846 if (eptr >= md->end_subject)
4847 {
4848 SCHECK_PARTIAL();
4849 RRETURN(MATCH_NOMATCH);
4850 }
4851 GETCHARINCTEST(c, eptr);
4852 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4853 RRETURN(MATCH_NOMATCH);
4854 }
4855 /* Control never gets here */
4856
4857 case PT_SC:
4858 for (fi = min;; fi++)
4859 {
4860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4862 if (fi >= max) RRETURN(MATCH_NOMATCH);
4863 if (eptr >= md->end_subject)
4864 {
4865 SCHECK_PARTIAL();
4866 RRETURN(MATCH_NOMATCH);
4867 }
4868 GETCHARINCTEST(c, eptr);
4869 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4870 RRETURN(MATCH_NOMATCH);
4871 }
4872 /* Control never gets here */
4873
4874 case PT_ALNUM:
4875 for (fi = min;; fi++)
4876 {
4877 int category;
4878 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4880 if (fi >= max) RRETURN(MATCH_NOMATCH);
4881 if (eptr >= md->end_subject)
4882 {
4883 SCHECK_PARTIAL();
4884 RRETURN(MATCH_NOMATCH);
4885 }
4886 GETCHARINCTEST(c, eptr);
4887 category = UCD_CATEGORY(c);
4888 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4889 RRETURN(MATCH_NOMATCH);
4890 }
4891 /* Control never gets here */
4892
4893 case PT_SPACE: /* Perl space */
4894 for (fi = min;; fi++)
4895 {
4896 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4898 if (fi >= max) RRETURN(MATCH_NOMATCH);
4899 if (eptr >= md->end_subject)
4900 {
4901 SCHECK_PARTIAL();
4902 RRETURN(MATCH_NOMATCH);
4903 }
4904 GETCHARINCTEST(c, eptr);
4905 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4906 c == CHAR_FF || c == CHAR_CR)
4907 == prop_fail_result)
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 /* Control never gets here */
4911
4912 case PT_PXSPACE: /* POSIX space */
4913 for (fi = min;; fi++)
4914 {
4915 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4917 if (fi >= max) RRETURN(MATCH_NOMATCH);
4918 if (eptr >= md->end_subject)
4919 {
4920 SCHECK_PARTIAL();
4921 RRETURN(MATCH_NOMATCH);
4922 }
4923 GETCHARINCTEST(c, eptr);
4924 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4925 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4926 == prop_fail_result)
4927 RRETURN(MATCH_NOMATCH);
4928 }
4929 /* Control never gets here */
4930
4931 case PT_WORD:
4932 for (fi = min;; fi++)
4933 {
4934 int category;
4935 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4937 if (fi >= max) RRETURN(MATCH_NOMATCH);
4938 if (eptr >= md->end_subject)
4939 {
4940 SCHECK_PARTIAL();
4941 RRETURN(MATCH_NOMATCH);
4942 }
4943 GETCHARINCTEST(c, eptr);
4944 category = UCD_CATEGORY(c);
4945 if ((category == ucp_L ||
4946 category == ucp_N ||
4947 c == CHAR_UNDERSCORE)
4948 == prop_fail_result)
4949 RRETURN(MATCH_NOMATCH);
4950 }
4951 /* Control never gets here */
4952
4953 case PT_CLIST: /* Minimizing repeat: test c against the list at ucd_caseless_sets + UCD_CASESET(c) */
4954 for (fi = min;; fi++)
4955 {
4956 const pcre_uint32 *cp;
4957 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62); /* NOTE(review): RM62 is also the tag passed by the PT_WORD case just above; if the tag must be unique per RMATCH site (TODO confirm against the RMATCH definition), this call needs its own value */
4958 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4959 if (fi >= max) RRETURN(MATCH_NOMATCH);
4960 if (eptr >= md->end_subject)
4961 {
4962 SCHECK_PARTIAL();
4963 RRETURN(MATCH_NOMATCH);
4964 }
4965 GETCHARINCTEST(c, eptr);
4966 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
4967 for (;;)
4968 {
4969 if (c < *cp) /* Treated as "c is not in the list"; presumably the list is in ascending order - confirm */
4970 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4971 if (c == *cp++) /* c is in the list */
4972 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4973 }
4974 }
4975 /* Control never gets here */
4976
4977 /* This should never occur */
4978 default:
4979 RRETURN(PCRE_ERROR_INTERNAL);
4980 }
4981 }
4982
4983 /* Match extended Unicode sequences. We will get here only if the
4984 support is in the binary; otherwise a compile-time error occurs. */
4985
4986 else if (ctype == OP_EXTUNI)
4987 {
4988 for (fi = min;; fi++)
4989 {
4990 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4992 if (fi >= max) RRETURN(MATCH_NOMATCH);
4993 if (eptr >= md->end_subject)
4994 {
4995 SCHECK_PARTIAL();
4996 RRETURN(MATCH_NOMATCH);
4997 }
4998 else
4999 {
5000 int lgb, rgb;
5001 GETCHARINCTEST(c, eptr);
5002 lgb = UCD_GRAPHBREAK(c);
5003 while (eptr < md->end_subject)
5004 {
5005 int len = 1;
5006 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5007 rgb = UCD_GRAPHBREAK(c);
5008 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5009 lgb = rgb;
5010 eptr += len;
5011 }
5012 }
5013 CHECK_PARTIAL();
5014 }
5015 }
5016 else
5017 #endif /* SUPPORT_UCP */
5018
5019 #ifdef SUPPORT_UTF
5020 if (utf)
5021 {
5022 for (fi = min;; fi++)
5023 {
5024 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5026 if (fi >= max) RRETURN(MATCH_NOMATCH);
5027 if (eptr >= md->end_subject)
5028 {
5029 SCHECK_PARTIAL();
5030 RRETURN(MATCH_NOMATCH);
5031 }
5032 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5033 RRETURN(MATCH_NOMATCH);
5034 GETCHARINC(c, eptr);
5035 switch(ctype)
5036 {
5037 case OP_ANY: /* This is the non-NL case */
5038 if (md->partial != 0 && /* Take care with CRLF partial */
5039 eptr >= md->end_subject &&
5040 NLBLOCK->nltype == NLTYPE_FIXED &&
5041 NLBLOCK->nllen == 2 &&
5042 c == NLBLOCK->nl[0])
5043 {
5044 md->hitend = TRUE;
5045 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5046 }
5047 break;
5048
5049 case OP_ALLANY:
5050 case OP_ANYBYTE:
5051 break;
5052
5053 case OP_ANYNL:
5054 switch(c)
5055 {
5056 default: RRETURN(MATCH_NOMATCH);
5057 case CHAR_CR:
5058 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5059 break;
5060
5061 case CHAR_LF:
5062 break;
5063
5064 case CHAR_VT:
5065 case CHAR_FF:
5066 case CHAR_NEL:
5067 #ifndef EBCDIC
5068 case 0x2028:
5069 case 0x2029:
5070 #endif /* Not EBCDIC */
5071 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5072 break;
5073 }
5074 break;
5075
5076 case OP_NOT_HSPACE:
5077 switch(c)
5078 {
5079 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5080 default: break;
5081 }
5082 break;
5083
5084 case OP_HSPACE:
5085 switch(c)
5086 {
5087 HSPACE_CASES: break;
5088 default: RRETURN(MATCH_NOMATCH);
5089 }
5090 break;
5091
5092 case OP_NOT_VSPACE:
5093 switch(c)
5094 {
5095 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5096 default: break;
5097 }
5098 break;
5099
5100 case OP_VSPACE:
5101 switch(c)
5102 {
5103 VSPACE_CASES: break;
5104 default: RRETURN(MATCH_NOMATCH);
5105 }
5106 break;
5107
5108 case OP_NOT_DIGIT:
5109 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5110 RRETURN(MATCH_NOMATCH);
5111 break;
5112
5113 case OP_DIGIT:
5114 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5115 RRETURN(MATCH_NOMATCH);
5116 break;
5117
5118 case OP_NOT_WHITESPACE:
5119 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5120 RRETURN(MATCH_NOMATCH);
5121 break;
5122
5123 case OP_WHITESPACE:
5124 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5125 RRETURN(MATCH_NOMATCH);
5126 break;
5127
5128 case OP_NOT_WORDCHAR:
5129 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5130 RRETURN(MATCH_NOMATCH);
5131 break;
5132
5133 case OP_WORDCHAR:
5134 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5135 RRETURN(MATCH_NOMATCH);
5136 break;
5137
5138 default:
5139 RRETURN(PCRE_ERROR_INTERNAL);
5140 }
5141 }
5142 }
5143 else
5144 #endif
5145 /* Not UTF mode */
5146 {
5147 for (fi = min;; fi++)
5148 {
5149 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5150 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5151 if (fi >= max) RRETURN(MATCH_NOMATCH);
5152 if (eptr >= md->end_subject)
5153 {
5154 SCHECK_PARTIAL();
5155 RRETURN(MATCH_NOMATCH);
5156 }
5157 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5158 RRETURN(MATCH_NOMATCH);
5159 c = *eptr++;
5160 switch(ctype)
5161 {
5162 case OP_ANY: /* This is the non-NL case */
5163 if (md->partial != 0 && /* Take care with CRLF partial */
5164 eptr >= md->end_subject &&
5165 NLBLOCK->nltype == NLTYPE_FIXED &&
5166 NLBLOCK->nllen == 2 &&
5167 c == NLBLOCK->nl[0])
5168 {
5169 md->hitend = TRUE;
5170 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5171 }
5172 break;
5173
5174 case OP_ALLANY:
5175 case OP_ANYBYTE:
5176 break;
5177
5178 case OP_ANYNL:
5179 switch(c)
5180 {
5181 default: RRETURN(MATCH_NOMATCH);
5182 case CHAR_CR:
5183 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5184 break;
5185
5186 case CHAR_LF:
5187 break;
5188
5189 case CHAR_VT:
5190 case CHAR_FF:
5191 case CHAR_NEL:
5192 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5193 case 0x2028:
5194 case 0x2029:
5195 #endif
5196 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5197 break;
5198 }
5199 break;
5200
5201 case OP_NOT_HSPACE:
5202 switch(c)
5203 {
5204 default: break;
5205 HSPACE_BYTE_CASES:
5206 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5207 HSPACE_MULTIBYTE_CASES:
5208 #endif
5209 RRETURN(MATCH_NOMATCH);
5210 }
5211 break;
5212
5213 case OP_HSPACE:
5214 switch(c)
5215 {
5216 default: RRETURN(MATCH_NOMATCH);
5217 HSPACE_BYTE_CASES:
5218 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5219 HSPACE_MULTIBYTE_CASES:
5220 #endif
5221 break;
5222 }
5223 break;
5224
5225 case OP_NOT_VSPACE:
5226 switch(c)
5227 {
5228 default: break;
5229 VSPACE_BYTE_CASES:
5230 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5231 VSPACE_MULTIBYTE_CASES:
5232 #endif
5233 RRETURN(MATCH_NOMATCH);
5234 }
5235 break;
5236
5237 case OP_VSPACE:
5238 switch(c)
5239 {
5240 default: RRETURN(MATCH_NOMATCH);
5241 VSPACE_BYTE_CASES:
5242 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5243 VSPACE_MULTIBYTE_CASES:
5244 #endif
5245 break;
5246 }
5247 break;
5248
5249 case OP_NOT_DIGIT:
5250 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5251 break;
5252
5253 case OP_DIGIT:
5254 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5255 break;
5256
5257 case OP_NOT_WHITESPACE:
5258 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5259 break;
5260
5261 case OP_WHITESPACE:
5262 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5263 break;
5264
5265 case OP_NOT_WORDCHAR:
5266 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5267 break;
5268
5269 case OP_WORDCHAR:
5270 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5271 break;
5272
5273 default:
5274 RRETURN(PCRE_ERROR_INTERNAL);
5275 }
5276 }
5277 }
5278 /* Control never gets here */
5279 }
5280
5281 /* If maximizing, it is worth using inline code for speed, doing the type
5282 test once at the start (i.e. keep it out of the loop). Again, keep the
5283 UTF-8 and UCP stuff separate. */
5284
5285 else
5286 {
5287 pp = eptr; /* Remember where we started */
5288
5289 #ifdef SUPPORT_UCP
5290 if (prop_type >= 0)
5291 {
5292 switch(prop_type)
5293 {
5294 case PT_ANY:
5295 for (i = min; i < max; i++)
5296 {
5297 int len = 1;
5298 if (eptr >= md->end_subject)
5299 {
5300 SCHECK_PARTIAL();
5301 break;
5302 }
5303 GETCHARLENTEST(c, eptr, len);
5304 if (prop_fail_result) break;
5305 eptr+= len;
5306 }
5307 break;
5308
5309 case PT_LAMP:
5310 for (i = min; i < max; i++)
5311 {
5312 int chartype;
5313 int len = 1;
5314 if (eptr >= md->end_subject)
5315 {
5316 SCHECK_PARTIAL();
5317 break;
5318 }
5319 GETCHARLENTEST(c, eptr, len);
5320 chartype = UCD_CHARTYPE(c);
5321 if ((chartype == ucp_Lu ||
5322 chartype == ucp_Ll ||
5323 chartype == ucp_Lt) == prop_fail_result)
5324 break;
5325 eptr+= len;
5326 }
5327 break;
5328
5329 case PT_GC:
5330 for (i = min; i < max; i++)
5331 {
5332 int len = 1;
5333 if (eptr >= md->end_subject)
5334 {
5335 SCHECK_PARTIAL();
5336 break;
5337 }
5338 GETCHARLENTEST(c, eptr, len);
5339 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5340 eptr+= len;
5341 }
5342 break;
5343
5344 case PT_PC:
5345 for (i = min; i < max; i++)
5346 {
5347 int len = 1;
5348 if (eptr >= md->end_subject)
5349 {
5350 SCHECK_PARTIAL();
5351 break;
5352 }
5353 GETCHARLENTEST(c, eptr, len);
5354 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5355 eptr+= len;
5356 }
5357 break;
5358
5359 case PT_SC:
5360 for (i = min; i < max; i++)
5361 {
5362 int len = 1;
5363 if (eptr >= md->end_subject)
5364 {
5365 SCHECK_PARTIAL();
5366 break;
5367 }
5368 GETCHARLENTEST(c, eptr, len);
5369 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5370 eptr+= len;
5371 }
5372 break;
5373
5374 case PT_ALNUM:
5375 for (i = min; i < max; i++)
5376 {
5377 int category;
5378 int len = 1;
5379 if (eptr >= md->end_subject)
5380 {
5381 SCHECK_PARTIAL();
5382 break;
5383 }
5384 GETCHARLENTEST(c, eptr, len);
5385 category = UCD_CATEGORY(c);
5386 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5387 break;
5388 eptr+= len;
5389 }
5390 break;
5391
5392 case PT_SPACE: /* Perl space */
5393 for (i = min; i < max; i++)
5394 {
5395 int len = 1;
5396 if (eptr >= md->end_subject)
5397 {
5398 SCHECK_PARTIAL();
5399 break;
5400 }
5401 GETCHARLENTEST(c, eptr, len);
5402 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5403 c == CHAR_FF || c == CHAR_CR)
5404 == prop_fail_result)
5405 break;
5406 eptr+= len;
5407 }
5408 break;
5409
5410 case PT_PXSPACE: /* POSIX space */
5411 for (i = min; i < max; i++)
5412 {
5413 int len = 1;
5414 if (eptr >= md->end_subject)
5415 {
5416 SCHECK_PARTIAL();
5417 break;
5418 }
5419 GETCHARLENTEST(c, eptr, len);
5420 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5421 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5422 == prop_fail_result)
5423 break;
5424 eptr+= len;
5425 }
5426 break;
5427
5428 case PT_WORD:
5429 for (i = min; i < max; i++)
5430 {
5431 int category;
5432 int len = 1;
5433 if (eptr >= md->end_subject)
5434 {
5435 SCHECK_PARTIAL();
5436 break;
5437 }
5438 GETCHARLENTEST(c, eptr, len);
5439 category = UCD_CATEGORY(c);
5440 if ((category == ucp_L || category == ucp_N ||
5441 c == CHAR_UNDERSCORE) == prop_fail_result)
5442 break;
5443 eptr+= len;
5444 }
5445 break;
5446
5447 case PT_CLIST:
5448 for (i = min; i < max; i++)
5449 {
5450 const pcre_uint32 *cp;
5451 int len = 1;
5452 if (eptr >= md->end_subject)
5453 {
5454 SCHECK_PARTIAL();
5455 break;
5456 }
5457 GETCHARLENTEST(c, eptr, len);
5458 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
5459 for (;;)
5460 {
5461 if (c < *cp)
5462 { if (prop_fail_result) break; else goto GOT_MAX; }
5463 if (c == *cp++)
5464 { if (prop_fail_result) goto GOT_MAX; else break; }
5465 }
5466 eptr += len;
5467 }
5468 GOT_MAX:
5469 break;
5470
5471 default:
5472 RRETURN(PCRE_ERROR_INTERNAL);
5473 }
5474
5475 /* eptr is now past the end of the maximum run */
5476
5477 if (possessive) continue;
5478 for(;;)
5479 {
5480 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5481 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5482 if (eptr-- == pp) break; /* Stop if tried at original pos */
5483 if (utf) BACKCHAR(eptr);
5484 }
5485 }
5486
5487 /* Match extended Unicode sequences. We will get here only if the
5488 support is in the binary; otherwise a compile-time error occurs. */
5489
5490 else if (ctype == OP_EXTUNI)
5491 {
5492 for (i = min; i < max; i++)
5493 {
5494 if (eptr >= md->end_subject)
5495 {
5496 SCHECK_PARTIAL();
5497 break;
5498 }
5499 else
5500 {
5501 int lgb, rgb;
5502 GETCHARINCTEST(c, eptr);
5503 lgb = UCD_GRAPHBREAK(c);
5504 while (eptr < md->end_subject)
5505 {
5506 int len = 1;
5507 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5508 rgb = UCD_GRAPHBREAK(c);
5509 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5510 lgb = rgb;
5511 eptr += len;
5512 }
5513 }
5514 CHECK_PARTIAL();
5515 }
5516
5517 /* eptr is now past the end of the maximum run */
5518
5519 if (possessive) continue;
5520
5521 for(;;)
5522 {
5523 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5524 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5525 if (eptr-- == pp) break; /* Stop if tried at original pos */
5526 for (;;) /* Move back over one extended */
5527 {
5528 if (!utf) c = *eptr; else
5529 {
5530 BACKCHAR(eptr);
5531 GETCHAR(c, eptr);
5532 }
5533 if (UCD_CATEGORY(c) != ucp_M) break;
5534 eptr--;
5535 }
5536 }
5537 }
5538
5539 else
5540 #endif /* SUPPORT_UCP */
5541
5542 #ifdef SUPPORT_UTF
5543 if (utf)
5544 {
5545 switch(ctype)
5546 {
5547 case OP_ANY:
5548 if (max < INT_MAX)
5549 {
5550 for (i = min; i < max; i++)
5551 {
5552 if (eptr >= md->end_subject)
5553 {
5554 SCHECK_PARTIAL();
5555 break;
5556 }
5557 if (IS_NEWLINE(eptr)) break;
5558 if (md->partial != 0 && /* Take care with CRLF partial */
5559 eptr + 1 >= md->end_subject &&
5560 NLBLOCK->nltype == NLTYPE_FIXED &&
5561 NLBLOCK->nllen == 2 &&
5562 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5563 {
5564 md->hitend = TRUE;
5565 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5566 }
5567 eptr++;
5568 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5569 }
5570 }
5571
5572 /* Handle unlimited UTF-8 repeat */
5573
5574 else
5575 {
5576 for (i = min; i < max; i++)
5577 {
5578 if (eptr >= md->end_subject)
5579 {
5580 SCHECK_PARTIAL();
5581 break;
5582 }
5583 if (IS_NEWLINE(eptr)) break;
5584 if (md->partial != 0 && /* Take care with CRLF partial */
5585 eptr + 1 >= md->end_subject &&
5586 NLBLOCK->nltype == NLTYPE_FIXED &&
5587 NLBLOCK->nllen == 2 &&
5588 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5589 {
5590 md->hitend = TRUE;
5591 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5592 }
5593 eptr++;
5594 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5595 }
5596 }
5597 break;
5598
5599 case OP_ALLANY:
5600 if (max < INT_MAX)
5601 {
5602 for (i = min; i < max; i++)
5603 {
5604 if (eptr >= md->end_subject)
5605 {
5606 SCHECK_PARTIAL();
5607 break;
5608 }
5609 eptr++;
5610 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5611 }
5612 }
5613 else
5614 {
5615 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5616 SCHECK_PARTIAL();
5617 }
5618 break;
5619
5620 /* The byte case is the same as non-UTF */
5621
5622 case OP_ANYBYTE:
5623 c = max - min;
5624 if (c > (unsigned int)(md->end_subject - eptr))
5625 {
5626 eptr = md->end_subject;
5627 SCHECK_PARTIAL();
5628 }
5629 else eptr += c;
5630 break;
5631
5632 case OP_ANYNL:
5633 for (i = min; i < max; i++)
5634 {
5635 int len = 1;
5636 if (eptr >= md->end_subject)
5637 {
5638 SCHECK_PARTIAL();
5639 break;
5640 }
5641 GETCHARLEN(c, eptr, len);
5642 if (c == CHAR_CR)
5643 {
5644 if (++eptr >= md->end_subject) break;
5645 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5646 }
5647 else
5648 {
5649 if (c != CHAR_LF &&
5650 (md->bsr_anycrlf ||
5651 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5652 #ifndef EBCDIC
5653 && c != 0x2028 && c != 0x2029
5654 #endif /* Not EBCDIC */
5655 )))
5656 break;
5657 eptr += len;
5658 }
5659 }
5660 break;
5661
5662 case OP_NOT_HSPACE:
5663 case OP_HSPACE:
5664 for (i = min; i < max; i++)
5665 {
5666 BOOL gotspace;
5667 int len = 1;
5668 if (eptr >= md->end_subject)
5669 {
5670 SCHECK_PARTIAL();
5671 break;
5672 }
5673 GETCHARLEN(c, eptr, len);
5674 switch(c)
5675 {
5676 HSPACE_CASES: gotspace = TRUE; break;
5677 default: gotspace = FALSE; break;
5678 }
5679 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5680 eptr += len;
5681 }
5682 break;
5683
5684 case OP_NOT_VSPACE:
5685 case OP_VSPACE:
5686 for (i = min; i < max; i++)
5687 {
5688 BOOL gotspace;
5689 int len = 1;
5690 if (eptr >= md->end_subject)
5691 {
5692 SCHECK_PARTIAL();
5693 break;
5694 }
5695 GETCHARLEN(c, eptr, len);
5696 switch(c)
5697 {
5698 VSPACE_CASES: gotspace = TRUE; break;
5699 default: gotspace = FALSE; break;
5700 }
5701 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5702 eptr += len;
5703 }
5704 break;
5705
5706 case OP_NOT_DIGIT:
5707 for (i = min; i < max; i++)
5708 {
5709 int len = 1;
5710 if (eptr >= md->end_subject)
5711 {
5712 SCHECK_PARTIAL();
5713 break;
5714 }
5715 GETCHARLEN(c, eptr, len);
5716 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5717 eptr+= len;
5718 }
5719 break;
5720
5721 case OP_DIGIT:
5722 for (i = min; i < max; i++)
5723 {
5724 int len = 1;
5725 if (eptr >= md->end_subject)
5726 {
5727 SCHECK_PARTIAL();
5728 break;
5729 }
5730 GETCHARLEN(c, eptr, len);
5731 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5732 eptr+= len;
5733 }
5734 break;
5735
5736 case OP_NOT_WHITESPACE:
5737 for (i = min; i < max; i++)
5738 {
5739 int len = 1;
5740 if (eptr >= md->end_subject)
5741 {
5742 SCHECK_PARTIAL();
5743 break;
5744 }
5745 GETCHARLEN(c, eptr, len);
5746 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5747 eptr+= len;
5748 }
5749 break;
5750
5751 case OP_WHITESPACE:
5752 for (i = min; i < max; i++)
5753 {
5754 int len = 1;
5755 if (eptr >= md->end_subject)
5756 {
5757 SCHECK_PARTIAL();
5758 break;
5759 }
5760 GETCHARLEN(c, eptr, len);
5761 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5762 eptr+= len;
5763 }
5764 break;
5765
5766 case OP_NOT_WORDCHAR:
5767 for (i = min; i < max; i++)
5768 {
5769 int len = 1;
5770 if (eptr >= md->end_subject)
5771 {
5772 SCHECK_PARTIAL();
5773 break;
5774 }
5775 GETCHARLEN(c, eptr, len);
5776 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5777 eptr+= len;
5778 }
5779 break;
5780
5781 case OP_WORDCHAR:
5782 for (i = min; i < max; i++)
5783 {
5784 int len = 1;
5785 if (eptr >= md->end_subject)
5786 {
5787 SCHECK_PARTIAL();
5788 break;
5789 }
5790 GETCHARLEN(c, eptr, len);
5791 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5792 eptr+= len;
5793 }
5794 break;
5795
5796 default:
5797 RRETURN(PCRE_ERROR_INTERNAL);
5798 }
5799
5800 /* eptr is now past the end of the maximum run. If possessive, we are
5801 done (no backing up). Otherwise, match at this position; anything other
5802 than no match is immediately returned. For nomatch, back up one
5803 character, unless we are matching \R and the last thing matched was
5804 \r\n, in which case, back up two code units. */
5805
5806 if (possessive) continue;
5807 for(;;)
5808 {
5809 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5811 if (eptr-- == pp) break; /* Stop if tried at original pos */
5812 BACKCHAR(eptr);
5813 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5814 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5815 }
5816 }
5817 else
5818 #endif /* SUPPORT_UTF */
5819 /* Not UTF mode */
5820 {
5821 switch(ctype)
5822 {
5823 case OP_ANY:
5824 for (i = min; i < max; i++)
5825 {
5826 if (eptr >= md->end_subject)
5827 {
5828 SCHECK_PARTIAL();
5829 break;
5830 }
5831 if (IS_NEWLINE(eptr)) break;
5832 if (md->partial != 0 && /* Take care with CRLF partial */
5833 eptr + 1 >= md->end_subject &&
5834 NLBLOCK->nltype == NLTYPE_FIXED &&
5835 NLBLOCK->nllen == 2 &&
5836 *eptr == NLBLOCK->nl[0])
5837 {
5838 md->hitend = TRUE;
5839 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5840 }
5841 eptr++;
5842 }
5843 break;
5844
5845 case OP_ALLANY:
5846 case OP_ANYBYTE:
5847 c = max - min;
5848 if (c > (unsigned int)(md->end_subject - eptr))
5849 {
5850 eptr = md->end_subject;
5851 SCHECK_PARTIAL();
5852 }
5853 else eptr += c;
5854 break;
5855
5856 case OP_ANYNL:
5857 for (i = min; i < max; i++)
5858 {
5859 if (eptr >= md->end_subject)
5860 {
5861 SCHECK_PARTIAL();
5862 break;
5863 }
5864 c = *eptr;
5865 if (c == CHAR_CR)
5866 {
5867 if (++eptr >= md->end_subject) break;
5868 if (*eptr == CHAR_LF) eptr++;
5869 }
5870 else
5871 {
5872 if (c != CHAR_LF && (md->bsr_anycrlf ||
5873 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5874 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5875 && c != 0x2028 && c != 0x2029
5876 #endif
5877 ))) break;
5878 eptr++;
5879 }
5880 }
5881 break;
5882
5883 case OP_NOT_HSPACE:
5884 for (i = min; i < max; i++)
5885 {
5886 if (eptr >= md->end_subject)
5887 {
5888 SCHECK_PARTIAL();
5889 break;
5890 }
5891 switch(*eptr)
5892 {
5893 default: eptr++; break;
5894 HSPACE_BYTE_CASES:
5895 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5896 HSPACE_MULTIBYTE_CASES:
5897 #endif
5898 goto ENDLOOP00;
5899 }
5900 }
5901 ENDLOOP00:
5902 break;
5903
5904 case OP_HSPACE:
5905 for (i = min; i < max; i++)
5906 {
5907 if (eptr >= md->end_subject)
5908 {
5909 SCHECK_PARTIAL();
5910 break;
5911 }
5912 switch(*eptr)
5913 {
5914 default: goto ENDLOOP01;
5915 HSPACE_BYTE_CASES:
5916 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5917 HSPACE_MULTIBYTE_CASES:
5918 #endif
5919 eptr++; break;
5920 }
5921 }
5922 ENDLOOP01:
5923 break;
5924
5925 case OP_NOT_VSPACE:
5926 for (i = min; i < max; i++)
5927 {
5928 if (eptr >= md->end_subject)
5929 {
5930 SCHECK_PARTIAL();
5931 break;
5932 }
5933 switch(*eptr)
5934 {
5935 default: eptr++; break;
5936 VSPACE_BYTE_CASES:
5937 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5938 VSPACE_MULTIBYTE_CASES:
5939 #endif
5940 goto ENDLOOP02;
5941 }
5942 }
5943 ENDLOOP02:
5944 break;
5945
5946 case OP_VSPACE:
5947 for (i = min; i < max; i++)
5948 {
5949 if (eptr >= md->end_subject)
5950 {
5951 SCHECK_PARTIAL();
5952 break;
5953 }
5954 switch(*eptr)
5955 {
5956 default: goto ENDLOOP03;
5957 VSPACE_BYTE_CASES:
5958 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5959 VSPACE_MULTIBYTE_CASES:
5960 #endif
5961 eptr++; break;
5962 }
5963 }
5964 ENDLOOP03:
5965 break;
5966
5967 case OP_NOT_DIGIT:
5968 for (i = min; i < max; i++)
5969 {
5970 if (eptr >= md->end_subject)
5971 {
5972 SCHECK_PARTIAL();
5973 break;
5974 }
5975 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5976 eptr++;
5977 }
5978 break;
5979
5980 case OP_DIGIT:
5981 for (i = min; i < max; i++)
5982 {
5983 if (eptr >= md->end_subject)
5984 {
5985 SCHECK_PARTIAL();
5986 break;
5987 }
5988 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5989 eptr++;
5990 }
5991 break;
5992
5993 case OP_NOT_WHITESPACE:
5994 for (i = min; i < max; i++)
5995 {
5996 if (eptr >= md->end_subject)
5997 {
5998 SCHECK_PARTIAL();
5999 break;
6000 }
6001 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6002 eptr++;
6003 }
6004 break;
6005
6006 case OP_WHITESPACE:
6007 for (i = min; i < max; i++)
6008 {
6009 if (eptr >= md->end_subject)
6010 {
6011 SCHECK_PARTIAL();
6012 break;
6013 }
6014 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6015 eptr++;
6016 }
6017 break;
6018
6019 case OP_NOT_WORDCHAR:
6020 for (i = min; i < max; i++)
6021 {
6022 if (eptr >= md->end_subject)
6023 {
6024 SCHECK_PARTIAL();
6025 break;
6026 }
6027 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6028 eptr++;
6029 }
6030 break;
6031
6032 case OP_WORDCHAR:
6033 for (i = min; i < max; i++)
6034 {
6035 if (eptr >= md->end_subject)
6036 {
6037 SCHECK_PARTIAL();
6038 break;
6039 }
6040 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6041 eptr++;
6042 }
6043 break;
6044
6045 default:
6046 RRETURN(PCRE_ERROR_INTERNAL);
6047 }
6048
6049 /* eptr is now past the end of the maximum run. If possessive, we are
6050 done (no backing up). Otherwise, match at this position; anything other
6051 than no match is immediately returned. For nomatch, back up one
6052 character (byte), unless we are matching \R and the last thing matched
6053 was \r\n, in which case, back up two bytes. */
6054
6055 if (possessive) continue;
6056 while (eptr >= pp)
6057 {
6058 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6060 eptr--;
6061 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6062 eptr[-1] == CHAR_CR) eptr--;
6063 }
6064 }
6065
6066 /* Get here if we can't make it match with any permitted repetitions */
6067
6068 RRETURN(MATCH_NOMATCH);
6069 }
6070 /* Control never gets here */
6071
6072 /* There's been some horrible disaster. Arrival here can only mean there is
6073 something seriously wrong in the code above or the OP_xxx definitions. */
6074
6075 default:
6076 DPRINTF(("Unknown opcode %d\n", *ecode));
6077 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6078 }
6079
6080 /* Do not stick any code in here without much thought; it is assumed
6081 that "continue" in the code above comes out to here to repeat the main
6082 loop. */
6083
6084 } /* End of main loop */
6085 /* Control never reaches here */
6086
6087
6088 /* When compiling to use the heap rather than the stack for recursive calls to
6089 match(), the RRETURN() macro jumps here. The number that is saved in
6090 frame->Xwhere indicates which label we actually want to return to. */
6091
6092 #ifdef NO_RECURSE
6093 #define LBL(val) case val: goto L_RM##val;
6094 HEAP_RETURN:
6095 switch (frame->Xwhere)
6096 {
6097 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6098 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6099 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6100 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6101 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6102 LBL(65) LBL(66)
6103 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6104 LBL(21)
6105 #endif
6106 #ifdef SUPPORT_UTF
6107 LBL(16) LBL(18) LBL(20)
6108 LBL(22) LBL(23) LBL(28) LBL(30)
6109 LBL(32) LBL(34) LBL(42) LBL(46)
6110 #ifdef SUPPORT_UCP
6111 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6112 LBL(59) LBL(60) LBL(61) LBL(62)
6113 #endif /* SUPPORT_UCP */
6114 #endif /* SUPPORT_UTF */
6115 default:
6116 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6117
6118 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6119
6120 return PCRE_ERROR_INTERNAL;
6121 }
6122 #undef LBL
6123 #endif /* NO_RECURSE */
6124 }
6125
6126
6127 /***************************************************************************
6128 ****************************************************************************
6129 RECURSION IN THE match() FUNCTION
6130
6131 Undefine all the macros that were defined above to handle this. */
6132
6133 #ifdef NO_RECURSE /* the names in this group are macros only in this build */
6134 #undef eptr
6135 #undef ecode
6136 #undef mstart
6137 #undef offset_top
6138 #undef eptrb
6139 #undef flags /* reused as an ordinary local name later in this file */
6140
6141 #undef callpat
6142 #undef charptr
6143 #undef data
6144 #undef next
6145 #undef pp
6146 #undef prev
6147 #undef saved_eptr
6148
6149 #undef new_recursive
6150
6151 #undef cur_is_word
6152 #undef condition
6153 #undef prev_is_word
6154
6155 #undef ctype
6156 #undef length /* reused as a parameter name by the exec function below */
6157 #undef max
6158 #undef min
6159 #undef number
6160 #undef offset
6161 #undef op
6162 #undef save_capture_last
6163 #undef save_offset1
6164 #undef save_offset2
6165 #undef save_offset3
6166 #undef stacksave
6167
6168 #undef newptrb
6169
6170 #endif /* NO_RECURSE */
6171
6172 /* These two are macros whether or not NO_RECURSE is defined, so always undefine them */
6173
6174 #undef fc
6175 #undef fi
6176
6177 /***************************************************************************
6178 ***************************************************************************/
6179
6180
6181 #ifdef NO_RECURSE
6182 /*************************************************
6183 * Release allocated heap frames *
6184 *************************************************/
6185
6186 /* This function releases all the allocated frames. The base frame is on the
6187 machine stack, and so must not be freed.
6188
6189 Argument: the address of the base frame
6190 Returns: nothing
6191 */
6192
6193 static void
6194 release_match_heapframes (heapframe *frame_base)
6195 {
6196 heapframe *nextframe = frame_base->Xnextframe;
6197 while (nextframe != NULL)
6198 {
6199 heapframe *oldframe = nextframe;
6200 nextframe = nextframe->Xnextframe;
6201 (PUBL(stack_free))(oldframe);
6202 }
6203 }
6204 #endif
6205
6206
6207 /*************************************************
6208 * Execute a Regular Expression *
6209 *************************************************/
6210
6211 /* This function applies a compiled re to a subject string and picks out
6212 portions of the string if it matches. Two elements in the vector are set for
6213 each substring: the offsets to the start and end of the substring.
6214
6215 Arguments:
6216 argument_re points to the compiled expression
6217 extra_data points to extra data or is NULL
6218 subject points to the subject string
6219 length length of subject string (may contain binary zeros)
6220 start_offset where to start in the subject string
6221 options option bits
6222 offsets points to a vector of ints to be filled in with offsets
6223 offsetcount the number of elements in the vector
6224
6225 Returns: > 0 => success; value is the number of elements filled in
6226 = 0 => success, but offsets is not big enough
6227 -1 => failed to match
6228 < -1 => some kind of unexpected problem
6229 */
6230
6231 #if defined COMPILE_PCRE8
6232 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6233 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6234 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6235 int offsetcount)
6236 #elif defined COMPILE_PCRE16
6237 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6238 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6239 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6240 int offsetcount)
6241 #elif defined COMPILE_PCRE32
6242 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6243 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6244 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6245 int offsetcount)
6246 #endif
6247 {
6248 int rc, ocount, arg_offset_max;
6249 int newline;
6250 BOOL using_temporary_offsets = FALSE;
6251 BOOL anchored;
6252 BOOL startline;
6253 BOOL firstline;
6254 BOOL utf;
6255 BOOL has_first_char = FALSE;
6256 BOOL has_req_char = FALSE;
6257 pcre_uchar first_char = 0;
6258 pcre_uchar first_char2 = 0;
6259 pcre_uchar req_char = 0;
6260 pcre_uchar req_char2 = 0;
6261 match_data match_block;
6262 match_data *md = &match_block;
6263 const pcre_uint8 *tables;
6264 const pcre_uint8 *start_bits = NULL;
6265 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6266 PCRE_PUCHAR end_subject;
6267 PCRE_PUCHAR start_partial = NULL;
6268 PCRE_PUCHAR req_char_ptr = start_match - 1;
6269
6270 const pcre_study_data *study;
6271 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6272
6273 #ifdef NO_RECURSE
6274 heapframe frame_zero;
6275 frame_zero.Xprevframe = NULL; /* Marks the top level */
6276 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6277 md->match_frames_base = &frame_zero;
6278 #endif
6279
6280 /* Check for the special magic call that measures the size of the stack used
6281 per recursive call of match(). Without the funny casting for sizeof, a Windows
6282 compiler gave this error: "unary minus operator applied to unsigned type,
6283 result still unsigned". Hopefully the cast fixes that. */
6284
6285 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6286 start_offset == -999)
6287 #ifdef NO_RECURSE
6288 return -((int)sizeof(heapframe));
6289 #else
6290 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6291 #endif
6292
6293 /* Plausibility checks */
6294
6295 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6296 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6297 return PCRE_ERROR_NULL;
6298 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6299 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6300
6301 /* Check that the first field in the block is the magic number. If it is not,
6302 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6303 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6304 means that the pattern is likely compiled with different endianness. */
6305
6306 if (re->magic_number != MAGIC_NUMBER)
6307 return re->magic_number == REVERSED_MAGIC_NUMBER?
6308 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6309 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6310
6311 /* These two settings are used in the code for checking a UTF-8 string that
6312 follows immediately afterwards. Other values in the md block are used only
6313 during "normal" pcre_exec() processing, not when the JIT support is in use,
6314 so they are set up later. */
6315
6316 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6317 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6318 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6319 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6320
6321 /* Check a UTF-8 string if required. Pass back the character offset and error
6322 code for an invalid string if a results vector is available. */
6323
6324 #ifdef SUPPORT_UTF
6325 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6326 {
6327 int erroroffset;
6328 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6329 if (errorcode != 0)
6330 {
6331 if (offsetcount >= 2)
6332 {
6333 offsets[0] = erroroffset;
6334 offsets[1] = errorcode;
6335 }
6336 #if defined COMPILE_PCRE8
6337 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6338 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6339 #elif defined COMPILE_PCRE16
6340 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6341 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6342 #elif defined COMPILE_PCRE32
6343 return PCRE_ERROR_BADUTF32;
6344 #endif
6345 }
6346 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6347 /* Check that a start_offset points to the start of a UTF character. */
6348 if (start_offset > 0 && start_offset < length &&
6349 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6350 return PCRE_ERROR_BADUTF8_OFFSET;
6351 #endif
6352 }
6353 #endif
6354
6355 /* If the pattern was successfully studied with JIT support, run the JIT
6356 executable instead of the rest of this function. Most options must be set at
6357 compile time for the JIT code to be usable. Fallback to the normal code path if
6358 an unsupported flag is set. */
6359
6360 #ifdef SUPPORT_JIT
6361 if (extra_data != NULL
6362 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6363 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6364 && extra_data->executable_jit != NULL
6365 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6366 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6367 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6368 {
6369 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6370 start_offset, options, offsets, offsetcount);
6371
6372 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6373 mode is not compiled. In this case we simply fallback to interpreter. */
6374
6375 if (rc != PCRE_ERROR_NULL) return rc;
6376 }
6377 #endif
6378
6379 /* Carry on with non-JIT matching. This information is for finding all the
6380 numbers associated with a given name, for condition testing. */
6381
6382 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6383 md->name_count = re->name_count;
6384 md->name_entry_size = re->name_entry_size;
6385
6386 /* Fish out the optional data from the extra_data structure, first setting
6387 the default values. */
6388
6389 study = NULL;
6390 md->match_limit = MATCH_LIMIT;
6391 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6392 md->callout_data = NULL;
6393
6394 /* The table pointer is always in native byte order. */
6395
6396 tables = re->tables;
6397
6398 if (extra_data != NULL)
6399 {
6400 register unsigned int flags = extra_data->flags;
6401 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6402 study = (const pcre_study_data *)extra_data->study_data;
6403 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6404 md->match_limit = extra_data->match_limit;
6405 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6406 md->match_limit_recursion = extra_data->match_limit_recursion;
6407 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6408 md->callout_data = extra_data->callout_data;
6409 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6410 }
6411
6412 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6413 is a feature that makes it possible to save compiled regex and re-use them
6414 in other programs later. */
6415
6416 if (tables == NULL) tables = PRIV(default_tables);
6417
6418 /* Set up other data */
6419
6420 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6421 startline = (re->flags & PCRE_STARTLINE) != 0;
6422 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6423
6424 /* The code starts after the real_pcre block and the capture name table. */
6425
6426 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6427 re->name_count * re->name_entry_size;
6428
6429 md->start_subject = (PCRE_PUCHAR)subject;
6430 md->start_offset = start_offset;
6431 md->end_subject = md->start_subject + length;
6432 end_subject = md->end_subject;
6433
6434 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6435 md->use_ucp = (re->options & PCRE_UCP) != 0;
6436 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6437 md->ignore_skip_arg = FALSE;
6438
6439 /* Some options are unpacked into BOOL variables in the hope that testing
6440 them will be faster than individual option bits. */
6441
6442 md->notbol = (options & PCRE_NOTBOL) != 0;
6443 md->noteol = (options & PCRE_NOTEOL) != 0;
6444 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6445 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6446
6447 md->hitend = FALSE;
6448 md->mark = md->nomatch_mark = NULL; /* In case never set */
6449
6450 md->recursive = NULL; /* No recursion at top level */
6451 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6452
6453 md->lcc = tables + lcc_offset;
6454 md->fcc = tables + fcc_offset;
6455 md->ctypes = tables + ctypes_offset;
6456
6457 /* Handle different \R options. */
6458
6459 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6460 {
6461 case 0:
6462 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6463 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6464 else
6465 #ifdef BSR_ANYCRLF
6466 md->bsr_anycrlf = TRUE;
6467 #else
6468 md->bsr_anycrlf = FALSE;
6469 #endif
6470 break;
6471
6472 case PCRE_BSR_ANYCRLF:
6473 md->bsr_anycrlf = TRUE;
6474 break;
6475
6476 case PCRE_BSR_UNICODE:
6477 md->bsr_anycrlf = FALSE;
6478 break;
6479
6480 default: return PCRE_ERROR_BADNEWLINE;
6481 }
6482
6483 /* Handle different types of newline. The three bits give eight cases. If
6484 nothing is set at run time, whatever was used at compile time applies. */
6485
6486 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6487 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6488 {
6489 case 0: newline = NEWLINE; break; /* Compile-time default */
6490 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6491 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6492 case PCRE_NEWLINE_CR+
6493 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6494 case PCRE_NEWLINE_ANY: newline = -1; break;
6495 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6496 default: return PCRE_ERROR_BADNEWLINE;
6497 }
6498
6499 if (newline == -2)
6500 {
6501 md->nltype = NLTYPE_ANYCRLF;
6502 }
6503 else if (newline < 0)
6504 {
6505 md->nltype = NLTYPE_ANY;
6506 }
6507 else
6508 {
6509 md->nltype = NLTYPE_FIXED;
6510 if (newline > 255)
6511 {
6512 md->nllen = 2;
6513 md->nl[0] = (newline >> 8) & 255;
6514 md->nl[1] = newline & 255;
6515 }
6516 else
6517 {
6518 md->nllen = 1;
6519 md->nl[0] = newline;
6520 }
6521 }
6522
6523 /* Partial matching was originally supported only for a restricted set of
6524 regexes; from release 8.00 there are no restrictions, but the bits are still
6525 defined (though never set). So there's no harm in leaving this code. */
6526
6527 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6528 return PCRE_ERROR_BADPARTIAL;
6529
6530 /* If the expression has got more back references than the offsets supplied can
6531 hold, we get a temporary chunk of working store to use during the matching.
6532 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6533 of 3. */
6534
6535 ocount = offsetcount - (offsetcount % 3);
6536 arg_offset_max = (2*ocount)/3;
6537
6538 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6539 {
6540 ocount = re->top_backref * 3 + 3;
6541 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6542 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6543 using_temporary_offsets = TRUE;
6544 DPRINTF(("Got memory to hold back references\n"));
6545 }
6546 else md->offset_vector = offsets;
6547
6548 md->offset_end = ocount;
6549 md->offset_max = (2*ocount)/3;
6550 md->offset_overflow = FALSE;
6551 md->capture_last = -1;
6552
6553 /* Reset the working variable associated with each extraction. These should
6554 never be used unless previously set, but they get saved and restored, and so we
6555 initialize them to avoid reading uninitialized locations. Also, unset the
6556 offsets for the matched string. This is really just for tidiness with callouts,
6557 in case they inspect these fields. */
6558
6559 if (md->offset_vector != NULL)
6560 {
6561 register int *iptr = md->offset_vector + ocount;
6562 register int *iend = iptr - re->top_bracket;
6563 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6564 while (--iptr >= iend) *iptr = -1;
6565 md->offset_vector[0] = md->offset_vector[1] = -1;
6566 }
6567
6568 /* Set up the first character to match, if available. The first_char value is
6569 never set for an anchored regular expression, but the anchoring may be forced
6570 at run time, so we have to test for anchoring. The first char may be unset for
6571 an unanchored pattern, of course. If there's no first char and the pattern was
6572 studied, there may be a bitmap of possible first characters. */
6573
6574 if (!anchored)
6575 {
6576 if ((re->flags & PCRE_FIRSTSET) != 0)
6577 {
6578 has_first_char = TRUE;
6579 first_char = first_char2 = (pcre_uchar)(re->first_char);
6580 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6581 {
6582 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6583 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6584 if (utf && first_char > 127)
6585 first_char2 = UCD_OTHERCASE(first_char);
6586 #endif
6587 }
6588 }
6589 else
6590 if (!startline && study != NULL &&
6591 (study->flags & PCRE_STUDY_MAPPED) != 0)
6592 start_bits = study->start_bits;
6593 }
6594
6595 /* For anchored or unanchored matches, there may be a "last known required
6596 character" set. */
6597
6598 if ((re->flags & PCRE_REQCHSET) != 0)
6599 {
6600 has_req_char = TRUE;
6601 req_char = req_char2 = (pcre_uchar)(re->req_char);
6602 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6603 {
6604 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6605 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6606 if (utf && req_char > 127)
6607 req_char2 = UCD_OTHERCASE(req_char);
6608 #endif
6609 }
6610 }
6611
6612
6613 /* ==========================================================================*/
6614
6615 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6616 the loop runs just once. */
6617
6618 for(;;)
6619 {
6620 PCRE_PUCHAR save_end_subject = end_subject;
6621 PCRE_PUCHAR new_start_match;
6622
6623 /* If firstline is TRUE, the start of the match is constrained to the first
6624 line of a multiline string. That is, the match must be before or at the first
6625 newline. Implement this by temporarily adjusting end_subject so that we stop
6626 scanning at a newline. If the match fails at the newline, later code breaks
6627 this loop. */
6628
6629 if (firstline)
6630 {
6631 PCRE_PUCHAR t = start_match;
6632 #ifdef SUPPORT_UTF
6633 if (utf)
6634 {
6635 while (t < md->end_subject && !IS_NEWLINE(t))
6636 {
6637 t++;
6638 ACROSSCHAR(t < end_subject, *t, t++);
6639 }
6640 }
6641 else
6642 #endif
6643 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6644 end_subject = t;
6645 }
6646
6647 /* There are some optimizations that avoid running the match if a known
6648 starting point is not found, or if a known later character is not present.
6649 However, there is an option that disables these, for testing and for ensuring
6650 that all callouts do actually occur. The option can be set in the regex by
6651 (*NO_START_OPT) or passed in match-time options. */
6652
6653 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6654 {
6655 /* Advance to a unique first char if there is one. */
6656
6657 if (has_first_char)
6658 {
6659 pcre_uchar smc;
6660
6661 if (first_char != first_char2)
6662 while (start_match < end_subject &&
6663 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6664 start_match++;
6665 else
6666 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6667 start_match++;
6668 }
6669
6670 /* Or to just after a linebreak for a multiline match */
6671
6672 else if (startline)
6673 {
6674 if (start_match > md->start_subject + start_offset)
6675 {
6676 #ifdef SUPPORT_UTF
6677 if (utf)
6678 {
6679 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6680 {
6681 start_match++;
6682 ACROSSCHAR(start_match < end_subject, *start_match,
6683 start_match++);
6684 }
6685 }
6686 else
6687 #endif
6688 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6689 start_match++;
6690
6691 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6692 and we are now at a LF, advance the match position by one more character.
6693 */
6694
6695 if (start_match[-1] == CHAR_CR &&
6696 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6697 start_match < end_subject &&
6698 RAWUCHARTEST(start_match) == CHAR_NL)
6699 start_match++;
6700 }
6701 }
6702
6703 /* Or to a non-unique first byte after study */
6704
6705 else if (start_bits != NULL)
6706 {
6707 while (start_match < end_subject)
6708 {
6709 register pcre_uint32 c = RAWUCHARTEST(start_match);
6710 #ifndef COMPILE_PCRE8
6711 if (c > 255) c = 255;
6712 #endif
6713 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6714 {
6715 start_match++;
6716 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6717 /* In non 8-bit mode, the iteration will stop for
6718 characters > 255 at the beginning or not stop at all. */
6719 if (utf)
6720 ACROSSCHAR(start_match < end_subject, *start_match,
6721 start_match++);
6722 #endif
6723 }
6724 else break;
6725 }
6726 }
6727 } /* Starting optimizations */
6728
6729 /* Restore fudged end_subject */
6730
6731 end_subject = save_end_subject;
6732
6733 /* The following two optimizations are disabled for partial matching or if
6734 disabling is explicitly requested. */
6735
6736 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6737 {
6738 /* If the pattern was studied, a minimum subject length may be set. This is
6739 a lower bound; no actual string of that length may actually match the
6740 pattern. Although the value is, strictly, in characters, we treat it as
6741 bytes to avoid spending too much time in this optimization. */
6742
6743 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6744 (pcre_uint32)(end_subject - start_match) < study->minlength)
6745 {
6746 rc = MATCH_NOMATCH;
6747 break;
6748 }
6749
6750 /* If req_char is set, we know that that character must appear in the
6751 subject for the match to succeed. If the first character is set, req_char
6752 must be later in the subject; otherwise the test starts at the match point.
6753 This optimization can save a huge amount of backtracking in patterns with
6754 nested unlimited repeats that aren't going to match. Writing separate code
6755 for cased/caseless versions makes it go faster, as does using an
6756 autoincrement and backing off on a match.
6757
6758 HOWEVER: when the subject string is very, very long, searching to its end
6759 can take a long time, and give bad performance on quite ordinary patterns.
6760 This showed up when somebody was matching something like /^\d+C/ on a
6761 32-megabyte string... so we don't do this when the string is sufficiently
6762 long. */
6763
6764 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6765 {
6766 regi